diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,350033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.932860169163922, + "eval_steps": 5000000.0, + "global_step": 500000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.865720338327844e-05, + "grad_norm": 0.144638329744339, + "learning_rate": 2e-06, + "loss": 2.3034, + "step": 10 + }, + { + "epoch": 7.731440676655688e-05, + "grad_norm": 0.11194495111703873, + "learning_rate": 4e-06, + "loss": 2.309, + "step": 20 + }, + { + "epoch": 0.00011597161014983532, + "grad_norm": 0.10528688132762909, + "learning_rate": 6e-06, + "loss": 2.2992, + "step": 30 + }, + { + "epoch": 0.00015462881353311376, + "grad_norm": 0.09728169441223145, + "learning_rate": 8e-06, + "loss": 2.3064, + "step": 40 + }, + { + "epoch": 0.0001932860169163922, + "grad_norm": 0.09796875715255737, + "learning_rate": 1e-05, + "loss": 2.3046, + "step": 50 + }, + { + "epoch": 0.00023194322029967065, + "grad_norm": 0.09509941190481186, + "learning_rate": 1.2e-05, + "loss": 2.2819, + "step": 60 + }, + { + "epoch": 0.0002706004236829491, + "grad_norm": 0.09584011882543564, + "learning_rate": 1.4e-05, + "loss": 2.3002, + "step": 70 + }, + { + "epoch": 0.0003092576270662275, + "grad_norm": 0.09579554200172424, + "learning_rate": 1.6e-05, + "loss": 2.279, + "step": 80 + }, + { + "epoch": 0.000347914830449506, + "grad_norm": 0.09664827585220337, + "learning_rate": 1.8e-05, + "loss": 2.3028, + "step": 90 + }, + { + "epoch": 0.0003865720338327844, + "grad_norm": 0.0957031100988388, + "learning_rate": 2e-05, + "loss": 2.2977, + "step": 100 + }, + { + "epoch": 0.0004252292372160628, + "grad_norm": 0.09676313400268555, + "learning_rate": 2.2e-05, + "loss": 2.2983, + "step": 110 + }, + { + "epoch": 0.0004638864405993413, + "grad_norm": 0.0992269441485405, + "learning_rate": 2.4e-05, + "loss": 2.2717, + "step": 120 + }, + { + "epoch": 0.0005025436439826198, + "grad_norm": 0.11956988275051117, + "learning_rate": 2.6e-05, + "loss": 2.2923, + "step": 130 + }, + { + "epoch": 0.0005412008473658982, + "grad_norm": 0.09714414924383163, + "learning_rate": 2.8e-05, + "loss": 2.2973, + "step": 140 + }, + { + "epoch": 0.0005798580507491766, + "grad_norm": 0.09981994330883026, + "learning_rate": 3e-05, + "loss": 2.2948, + "step": 150 + }, + { + "epoch": 0.000618515254132455, + "grad_norm": 0.09482458233833313, + "learning_rate": 3.2e-05, + "loss": 2.28, + "step": 160 + }, + { + "epoch": 0.0006571724575157334, + "grad_norm": 0.10037554055452347, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.2722, + "step": 170 + }, + { + "epoch": 0.000695829660899012, + "grad_norm": 0.09889110177755356, + "learning_rate": 3.6e-05, + "loss": 2.288, + "step": 180 + }, + { + "epoch": 0.0007344868642822904, + "grad_norm": 0.09977748990058899, + "learning_rate": 3.8e-05, + "loss": 2.2815, + "step": 190 + }, + { + "epoch": 0.0007731440676655688, + "grad_norm": 0.10139628499746323, + "learning_rate": 4e-05, + "loss": 2.2811, + "step": 200 + }, + { + "epoch": 0.0008118012710488472, + "grad_norm": 0.09862680733203888, + "learning_rate": 4.2000000000000004e-05, + "loss": 2.2776, + "step": 210 + }, + { + "epoch": 0.0008504584744321256, + "grad_norm": 0.09958124905824661, + "learning_rate": 4.4e-05, + "loss": 2.2652, + "step": 220 + }, + { + "epoch": 0.0008891156778154042, + "grad_norm": 0.10025738924741745, + "learning_rate": 4.6e-05, + "loss": 2.2808, + "step": 230 + }, + { + "epoch": 0.0009277728811986826, + "grad_norm": 0.09997476637363434, + "learning_rate": 4.8e-05, + "loss": 2.2766, + "step": 240 + }, + { + "epoch": 0.000966430084581961, + "grad_norm": 0.10078670084476471, + "learning_rate": 5e-05, + "loss": 2.2808, + "step": 250 + }, + { + "epoch": 0.0010050872879652395, + "grad_norm": 0.09915891289710999, + "learning_rate": 5.2e-05, + "loss": 2.2705, + "step": 260 + }, + { + "epoch": 0.001043744491348518, + "grad_norm": 0.09878283739089966, + "learning_rate": 5.4e-05, + "loss": 2.2642, + "step": 270 + }, + { + "epoch": 0.0010824016947317964, + "grad_norm": 0.09960974007844925, + "learning_rate": 5.6e-05, + "loss": 2.2696, + "step": 280 + }, + { + "epoch": 0.0011210588981150748, + "grad_norm": 0.10109972953796387, + "learning_rate": 5.800000000000001e-05, + "loss": 2.2802, + "step": 290 + }, + { + "epoch": 0.0011597161014983532, + "grad_norm": 0.1021229699254036, + "learning_rate": 6e-05, + "loss": 2.2745, + "step": 300 + }, + { + "epoch": 0.0011983733048816316, + "grad_norm": 0.10506097227334976, + "learning_rate": 6.2e-05, + "loss": 2.2848, + "step": 310 + }, + { + "epoch": 0.00123703050826491, + "grad_norm": 0.10662543773651123, + "learning_rate": 6.4e-05, + "loss": 2.2669, + "step": 320 + }, + { + "epoch": 0.0012756877116481885, + "grad_norm": 0.11067084223031998, + "learning_rate": 6.6e-05, + "loss": 2.2745, + "step": 330 + }, + { + "epoch": 0.0013143449150314669, + "grad_norm": 0.10717500746250153, + "learning_rate": 6.800000000000001e-05, + "loss": 2.2709, + "step": 340 + }, + { + "epoch": 0.0013530021184147453, + "grad_norm": 0.10758475214242935, + "learning_rate": 7.000000000000001e-05, + "loss": 2.2624, + "step": 350 + }, + { + "epoch": 0.001391659321798024, + "grad_norm": 0.10585075616836548, + "learning_rate": 7.2e-05, + "loss": 2.2588, + "step": 360 + }, + { + "epoch": 0.0014303165251813024, + "grad_norm": 0.10558781772851944, + "learning_rate": 7.4e-05, + "loss": 2.2673, + "step": 370 + }, + { + "epoch": 0.0014689737285645808, + "grad_norm": 0.1085430234670639, + "learning_rate": 7.6e-05, + "loss": 2.2711, + "step": 380 + }, + { + "epoch": 0.0015076309319478592, + "grad_norm": 0.10785538703203201, + "learning_rate": 7.8e-05, + "loss": 2.2614, + "step": 390 + }, + { + "epoch": 0.0015462881353311376, + "grad_norm": 0.10400920361280441, + "learning_rate": 8e-05, + "loss": 2.2781, + "step": 400 + }, + { + "epoch": 0.001584945338714416, + "grad_norm": 0.10345766693353653, + "learning_rate": 8.2e-05, + "loss": 2.2441, + "step": 410 + }, + { + "epoch": 0.0016236025420976945, + "grad_norm": 0.1062808483839035, + "learning_rate": 8.400000000000001e-05, + "loss": 2.2522, + "step": 420 + }, + { + "epoch": 0.0016622597454809729, + "grad_norm": 0.11519643664360046, + "learning_rate": 8.599999999999999e-05, + "loss": 2.262, + "step": 430 + }, + { + "epoch": 0.0017009169488642513, + "grad_norm": 0.10753437131643295, + "learning_rate": 8.8e-05, + "loss": 2.261, + "step": 440 + }, + { + "epoch": 0.0017395741522475297, + "grad_norm": 0.11565092951059341, + "learning_rate": 8.999999999999999e-05, + "loss": 2.2629, + "step": 450 + }, + { + "epoch": 0.0017782313556308083, + "grad_norm": 0.11193150281906128, + "learning_rate": 9.2e-05, + "loss": 2.2495, + "step": 460 + }, + { + "epoch": 0.0018168885590140868, + "grad_norm": 0.10999926179647446, + "learning_rate": 9.400000000000001e-05, + "loss": 2.2677, + "step": 470 + }, + { + "epoch": 0.0018555457623973652, + "grad_norm": 0.1156141385436058, + "learning_rate": 9.6e-05, + "loss": 2.2475, + "step": 480 + }, + { + "epoch": 0.0018942029657806436, + "grad_norm": 0.10972867161035538, + "learning_rate": 9.800000000000001e-05, + "loss": 2.2666, + "step": 490 + }, + { + "epoch": 0.001932860169163922, + "grad_norm": 0.1137310117483139, + "learning_rate": 0.0001, + "loss": 2.2476, + "step": 500 + }, + { + "epoch": 0.0019715173725472004, + "grad_norm": 0.11586374044418335, + "learning_rate": 0.000102, + "loss": 2.2718, + "step": 510 + }, + { + "epoch": 0.002010174575930479, + "grad_norm": 0.10975225269794464, + "learning_rate": 0.000104, + "loss": 2.2705, + "step": 520 + }, + { + "epoch": 0.0020488317793137573, + "grad_norm": 0.1140342503786087, + "learning_rate": 0.000106, + "loss": 2.2574, + "step": 530 + }, + { + "epoch": 0.002087488982697036, + "grad_norm": 0.11401575058698654, + "learning_rate": 0.000108, + "loss": 2.2646, + "step": 540 + }, + { + "epoch": 0.002126146186080314, + "grad_norm": 0.11033317446708679, + "learning_rate": 0.00011, + "loss": 2.2426, + "step": 550 + }, + { + "epoch": 0.0021648033894635928, + "grad_norm": 0.12125371396541595, + "learning_rate": 0.000112, + "loss": 2.2444, + "step": 560 + }, + { + "epoch": 0.002203460592846871, + "grad_norm": 0.11703993380069733, + "learning_rate": 0.000114, + "loss": 2.2463, + "step": 570 + }, + { + "epoch": 0.0022421177962301496, + "grad_norm": 0.11883124709129333, + "learning_rate": 0.00011600000000000001, + "loss": 2.243, + "step": 580 + }, + { + "epoch": 0.002280774999613428, + "grad_norm": 0.12158622592687607, + "learning_rate": 0.000118, + "loss": 2.2576, + "step": 590 + }, + { + "epoch": 0.0023194322029967064, + "grad_norm": 0.11580552160739899, + "learning_rate": 0.00012, + "loss": 2.2597, + "step": 600 + }, + { + "epoch": 0.0023580894063799846, + "grad_norm": 0.12047332525253296, + "learning_rate": 0.000122, + "loss": 2.2684, + "step": 610 + }, + { + "epoch": 0.0023967466097632633, + "grad_norm": 0.11461437493562698, + "learning_rate": 0.000124, + "loss": 2.259, + "step": 620 + }, + { + "epoch": 0.002435403813146542, + "grad_norm": 0.12357478588819504, + "learning_rate": 0.000126, + "loss": 2.2549, + "step": 630 + }, + { + "epoch": 0.00247406101652982, + "grad_norm": 0.11461521685123444, + "learning_rate": 0.000128, + "loss": 2.2527, + "step": 640 + }, + { + "epoch": 0.0025127182199130987, + "grad_norm": 0.12529653310775757, + "learning_rate": 0.00013000000000000002, + "loss": 2.2535, + "step": 650 + }, + { + "epoch": 0.002551375423296377, + "grad_norm": 0.12418084591627121, + "learning_rate": 0.000132, + "loss": 2.2578, + "step": 660 + }, + { + "epoch": 0.0025900326266796556, + "grad_norm": 0.11342219263315201, + "learning_rate": 0.000134, + "loss": 2.2488, + "step": 670 + }, + { + "epoch": 0.0026286898300629338, + "grad_norm": 0.12109920382499695, + "learning_rate": 0.00013600000000000003, + "loss": 2.2513, + "step": 680 + }, + { + "epoch": 0.0026673470334462124, + "grad_norm": 0.1217515617609024, + "learning_rate": 0.00013800000000000002, + "loss": 2.2513, + "step": 690 + }, + { + "epoch": 0.0027060042368294906, + "grad_norm": 0.12609773874282837, + "learning_rate": 0.00014000000000000001, + "loss": 2.2452, + "step": 700 + }, + { + "epoch": 0.0027446614402127693, + "grad_norm": 0.12247970700263977, + "learning_rate": 0.00014199999999999998, + "loss": 2.2418, + "step": 710 + }, + { + "epoch": 0.002783318643596048, + "grad_norm": 0.12363874912261963, + "learning_rate": 0.000144, + "loss": 2.2465, + "step": 720 + }, + { + "epoch": 0.002821975846979326, + "grad_norm": 0.12146373093128204, + "learning_rate": 0.000146, + "loss": 2.2627, + "step": 730 + }, + { + "epoch": 0.0028606330503626047, + "grad_norm": 0.12576045095920563, + "learning_rate": 0.000148, + "loss": 2.2658, + "step": 740 + }, + { + "epoch": 0.002899290253745883, + "grad_norm": 0.1205977275967598, + "learning_rate": 0.00015, + "loss": 2.2444, + "step": 750 + }, + { + "epoch": 0.0029379474571291616, + "grad_norm": 0.12153708934783936, + "learning_rate": 0.000152, + "loss": 2.2632, + "step": 760 + }, + { + "epoch": 0.0029766046605124398, + "grad_norm": 0.11591946333646774, + "learning_rate": 0.000154, + "loss": 2.2524, + "step": 770 + }, + { + "epoch": 0.0030152618638957184, + "grad_norm": 0.12641796469688416, + "learning_rate": 0.000156, + "loss": 2.2611, + "step": 780 + }, + { + "epoch": 0.0030539190672789966, + "grad_norm": 0.12534596025943756, + "learning_rate": 0.000158, + "loss": 2.2364, + "step": 790 + }, + { + "epoch": 0.0030925762706622752, + "grad_norm": 0.1186433732509613, + "learning_rate": 0.00016, + "loss": 2.253, + "step": 800 + }, + { + "epoch": 0.0031312334740455534, + "grad_norm": 0.12480587512254715, + "learning_rate": 0.000162, + "loss": 2.2434, + "step": 810 + }, + { + "epoch": 0.003169890677428832, + "grad_norm": 0.1353996843099594, + "learning_rate": 0.000164, + "loss": 2.2472, + "step": 820 + }, + { + "epoch": 0.0032085478808121107, + "grad_norm": 0.12390103936195374, + "learning_rate": 0.00016600000000000002, + "loss": 2.258, + "step": 830 + }, + { + "epoch": 0.003247205084195389, + "grad_norm": 0.1263820379972458, + "learning_rate": 0.00016800000000000002, + "loss": 2.2552, + "step": 840 + }, + { + "epoch": 0.0032858622875786675, + "grad_norm": 0.12677688896656036, + "learning_rate": 0.00017, + "loss": 2.233, + "step": 850 + }, + { + "epoch": 0.0033245194909619457, + "grad_norm": 0.12664929032325745, + "learning_rate": 0.00017199999999999998, + "loss": 2.2527, + "step": 860 + }, + { + "epoch": 0.0033631766943452244, + "grad_norm": 0.14130549132823944, + "learning_rate": 0.000174, + "loss": 2.253, + "step": 870 + }, + { + "epoch": 0.0034018338977285026, + "grad_norm": 0.13592848181724548, + "learning_rate": 0.000176, + "loss": 2.2475, + "step": 880 + }, + { + "epoch": 0.0034404911011117812, + "grad_norm": 0.1275721788406372, + "learning_rate": 0.000178, + "loss": 2.2585, + "step": 890 + }, + { + "epoch": 0.0034791483044950594, + "grad_norm": 0.12690509855747223, + "learning_rate": 0.00017999999999999998, + "loss": 2.2525, + "step": 900 + }, + { + "epoch": 0.003517805507878338, + "grad_norm": 0.12834276258945465, + "learning_rate": 0.000182, + "loss": 2.2622, + "step": 910 + }, + { + "epoch": 0.0035564627112616167, + "grad_norm": 0.13139639794826508, + "learning_rate": 0.000184, + "loss": 2.2527, + "step": 920 + }, + { + "epoch": 0.003595119914644895, + "grad_norm": 0.13028880953788757, + "learning_rate": 0.000186, + "loss": 2.2579, + "step": 930 + }, + { + "epoch": 0.0036337771180281735, + "grad_norm": 0.14631029963493347, + "learning_rate": 0.00018800000000000002, + "loss": 2.2659, + "step": 940 + }, + { + "epoch": 0.0036724343214114517, + "grad_norm": 0.1378420740365982, + "learning_rate": 0.00019, + "loss": 2.257, + "step": 950 + }, + { + "epoch": 0.0037110915247947304, + "grad_norm": 0.12957599759101868, + "learning_rate": 0.000192, + "loss": 2.2533, + "step": 960 + }, + { + "epoch": 0.0037497487281780086, + "grad_norm": 0.12839531898498535, + "learning_rate": 0.000194, + "loss": 2.2618, + "step": 970 + }, + { + "epoch": 0.003788405931561287, + "grad_norm": 0.13206636905670166, + "learning_rate": 0.00019600000000000002, + "loss": 2.2564, + "step": 980 + }, + { + "epoch": 0.0038270631349445654, + "grad_norm": 0.13995692133903503, + "learning_rate": 0.00019800000000000002, + "loss": 2.2503, + "step": 990 + }, + { + "epoch": 0.003865720338327844, + "grad_norm": 0.13417716324329376, + "learning_rate": 0.0002, + "loss": 2.2494, + "step": 1000 + }, + { + "epoch": 0.0039043775417111222, + "grad_norm": 0.13489042222499847, + "learning_rate": 0.000202, + "loss": 2.2506, + "step": 1010 + }, + { + "epoch": 0.003943034745094401, + "grad_norm": 0.139221653342247, + "learning_rate": 0.000204, + "loss": 2.2493, + "step": 1020 + }, + { + "epoch": 0.003981691948477679, + "grad_norm": 0.14685332775115967, + "learning_rate": 0.000206, + "loss": 2.252, + "step": 1030 + }, + { + "epoch": 0.004020349151860958, + "grad_norm": 0.1311921775341034, + "learning_rate": 0.000208, + "loss": 2.2559, + "step": 1040 + }, + { + "epoch": 0.004059006355244236, + "grad_norm": 0.13858993351459503, + "learning_rate": 0.00021, + "loss": 2.2518, + "step": 1050 + }, + { + "epoch": 0.0040976635586275146, + "grad_norm": 0.15868879854679108, + "learning_rate": 0.000212, + "loss": 2.2542, + "step": 1060 + }, + { + "epoch": 0.004136320762010793, + "grad_norm": 0.13859611749649048, + "learning_rate": 0.000214, + "loss": 2.2512, + "step": 1070 + }, + { + "epoch": 0.004174977965394072, + "grad_norm": 0.12724171578884125, + "learning_rate": 0.000216, + "loss": 2.2692, + "step": 1080 + }, + { + "epoch": 0.00421363516877735, + "grad_norm": 0.13106082379817963, + "learning_rate": 0.000218, + "loss": 2.2533, + "step": 1090 + }, + { + "epoch": 0.004252292372160628, + "grad_norm": 0.1482999473810196, + "learning_rate": 0.00022, + "loss": 2.2379, + "step": 1100 + }, + { + "epoch": 0.0042909495755439064, + "grad_norm": 0.1275726705789566, + "learning_rate": 0.000222, + "loss": 2.254, + "step": 1110 + }, + { + "epoch": 0.0043296067789271855, + "grad_norm": 0.13930654525756836, + "learning_rate": 0.000224, + "loss": 2.2566, + "step": 1120 + }, + { + "epoch": 0.004368263982310464, + "grad_norm": 0.13760654628276825, + "learning_rate": 0.00022600000000000002, + "loss": 2.2536, + "step": 1130 + }, + { + "epoch": 0.004406921185693742, + "grad_norm": 0.13762839138507843, + "learning_rate": 0.000228, + "loss": 2.2569, + "step": 1140 + }, + { + "epoch": 0.004445578389077021, + "grad_norm": 0.14481611549854279, + "learning_rate": 0.00023, + "loss": 2.2445, + "step": 1150 + }, + { + "epoch": 0.004484235592460299, + "grad_norm": 0.13288311660289764, + "learning_rate": 0.00023200000000000003, + "loss": 2.246, + "step": 1160 + }, + { + "epoch": 0.004522892795843577, + "grad_norm": 0.15648561716079712, + "learning_rate": 0.00023400000000000002, + "loss": 2.248, + "step": 1170 + }, + { + "epoch": 0.004561549999226856, + "grad_norm": 0.13135381042957306, + "learning_rate": 0.000236, + "loss": 2.2464, + "step": 1180 + }, + { + "epoch": 0.004600207202610135, + "grad_norm": 0.14945846796035767, + "learning_rate": 0.00023799999999999998, + "loss": 2.2538, + "step": 1190 + }, + { + "epoch": 0.004638864405993413, + "grad_norm": 0.1418386697769165, + "learning_rate": 0.00024, + "loss": 2.2351, + "step": 1200 + }, + { + "epoch": 0.004677521609376691, + "grad_norm": 0.1434290111064911, + "learning_rate": 0.000242, + "loss": 2.263, + "step": 1210 + }, + { + "epoch": 0.004716178812759969, + "grad_norm": 0.14787034690380096, + "learning_rate": 0.000244, + "loss": 2.2561, + "step": 1220 + }, + { + "epoch": 0.004754836016143248, + "grad_norm": 0.13952895998954773, + "learning_rate": 0.000246, + "loss": 2.2382, + "step": 1230 + }, + { + "epoch": 0.0047934932195265265, + "grad_norm": 0.14909480512142181, + "learning_rate": 0.000248, + "loss": 2.2535, + "step": 1240 + }, + { + "epoch": 0.004832150422909805, + "grad_norm": 0.1471477746963501, + "learning_rate": 0.00025, + "loss": 2.2703, + "step": 1250 + }, + { + "epoch": 0.004870807626293084, + "grad_norm": 0.14406970143318176, + "learning_rate": 0.000252, + "loss": 2.2445, + "step": 1260 + }, + { + "epoch": 0.004909464829676362, + "grad_norm": 0.14098721742630005, + "learning_rate": 0.000254, + "loss": 2.2494, + "step": 1270 + }, + { + "epoch": 0.00494812203305964, + "grad_norm": 0.14734388887882233, + "learning_rate": 0.000256, + "loss": 2.2402, + "step": 1280 + }, + { + "epoch": 0.004986779236442918, + "grad_norm": 0.14964799582958221, + "learning_rate": 0.00025800000000000004, + "loss": 2.2578, + "step": 1290 + }, + { + "epoch": 0.0050254364398261975, + "grad_norm": 0.15391208231449127, + "learning_rate": 0.00026000000000000003, + "loss": 2.2633, + "step": 1300 + }, + { + "epoch": 0.005064093643209476, + "grad_norm": 0.152483731508255, + "learning_rate": 0.000262, + "loss": 2.2457, + "step": 1310 + }, + { + "epoch": 0.005102750846592754, + "grad_norm": 0.1448708474636078, + "learning_rate": 0.000264, + "loss": 2.2566, + "step": 1320 + }, + { + "epoch": 0.005141408049976033, + "grad_norm": 0.15382276475429535, + "learning_rate": 0.000266, + "loss": 2.2623, + "step": 1330 + }, + { + "epoch": 0.005180065253359311, + "grad_norm": 0.15795128047466278, + "learning_rate": 0.000268, + "loss": 2.2504, + "step": 1340 + }, + { + "epoch": 0.005218722456742589, + "grad_norm": 0.14650534093379974, + "learning_rate": 0.00027, + "loss": 2.2556, + "step": 1350 + }, + { + "epoch": 0.0052573796601258676, + "grad_norm": 0.15081769227981567, + "learning_rate": 0.00027200000000000005, + "loss": 2.2551, + "step": 1360 + }, + { + "epoch": 0.005296036863509147, + "grad_norm": 0.1477501541376114, + "learning_rate": 0.00027400000000000005, + "loss": 2.2585, + "step": 1370 + }, + { + "epoch": 0.005334694066892425, + "grad_norm": 0.16164270043373108, + "learning_rate": 0.00027600000000000004, + "loss": 2.2643, + "step": 1380 + }, + { + "epoch": 0.005373351270275703, + "grad_norm": 0.15059034526348114, + "learning_rate": 0.00027800000000000004, + "loss": 2.2505, + "step": 1390 + }, + { + "epoch": 0.005412008473658981, + "grad_norm": 0.14349956810474396, + "learning_rate": 0.00028000000000000003, + "loss": 2.264, + "step": 1400 + }, + { + "epoch": 0.00545066567704226, + "grad_norm": 0.14197014272212982, + "learning_rate": 0.00028199999999999997, + "loss": 2.2582, + "step": 1410 + }, + { + "epoch": 0.0054893228804255385, + "grad_norm": 0.1571299135684967, + "learning_rate": 0.00028399999999999996, + "loss": 2.2598, + "step": 1420 + }, + { + "epoch": 0.005527980083808817, + "grad_norm": 0.14361920952796936, + "learning_rate": 0.00028599999999999996, + "loss": 2.2426, + "step": 1430 + }, + { + "epoch": 0.005566637287192096, + "grad_norm": 0.14507678151130676, + "learning_rate": 0.000288, + "loss": 2.2576, + "step": 1440 + }, + { + "epoch": 0.005605294490575374, + "grad_norm": 0.1388143002986908, + "learning_rate": 0.00029, + "loss": 2.2526, + "step": 1450 + }, + { + "epoch": 0.005643951693958652, + "grad_norm": 0.15597301721572876, + "learning_rate": 0.000292, + "loss": 2.2675, + "step": 1460 + }, + { + "epoch": 0.00568260889734193, + "grad_norm": 0.13945583999156952, + "learning_rate": 0.000294, + "loss": 2.2717, + "step": 1470 + }, + { + "epoch": 0.0057212661007252094, + "grad_norm": 0.1599157154560089, + "learning_rate": 0.000296, + "loss": 2.256, + "step": 1480 + }, + { + "epoch": 0.005759923304108488, + "grad_norm": 0.14406763017177582, + "learning_rate": 0.000298, + "loss": 2.2692, + "step": 1490 + }, + { + "epoch": 0.005798580507491766, + "grad_norm": 0.15903228521347046, + "learning_rate": 0.0003, + "loss": 2.2734, + "step": 1500 + }, + { + "epoch": 0.005837237710875044, + "grad_norm": 0.14652226865291595, + "learning_rate": 0.000302, + "loss": 2.2475, + "step": 1510 + }, + { + "epoch": 0.005875894914258323, + "grad_norm": 0.16651105880737305, + "learning_rate": 0.000304, + "loss": 2.2459, + "step": 1520 + }, + { + "epoch": 0.005914552117641601, + "grad_norm": 0.15857480466365814, + "learning_rate": 0.000306, + "loss": 2.2536, + "step": 1530 + }, + { + "epoch": 0.0059532093210248795, + "grad_norm": 0.1656733900308609, + "learning_rate": 0.000308, + "loss": 2.2597, + "step": 1540 + }, + { + "epoch": 0.005991866524408159, + "grad_norm": 0.1586008071899414, + "learning_rate": 0.00031, + "loss": 2.2641, + "step": 1550 + }, + { + "epoch": 0.006030523727791437, + "grad_norm": 0.15786172449588776, + "learning_rate": 0.000312, + "loss": 2.2672, + "step": 1560 + }, + { + "epoch": 0.006069180931174715, + "grad_norm": 0.15883037447929382, + "learning_rate": 0.000314, + "loss": 2.2475, + "step": 1570 + }, + { + "epoch": 0.006107838134557993, + "grad_norm": 0.16221074759960175, + "learning_rate": 0.000316, + "loss": 2.258, + "step": 1580 + }, + { + "epoch": 0.006146495337941272, + "grad_norm": 0.14376521110534668, + "learning_rate": 0.00031800000000000003, + "loss": 2.2619, + "step": 1590 + }, + { + "epoch": 0.0061851525413245505, + "grad_norm": 0.1429993212223053, + "learning_rate": 0.00032, + "loss": 2.2329, + "step": 1600 + }, + { + "epoch": 0.006223809744707829, + "grad_norm": 0.15341663360595703, + "learning_rate": 0.000322, + "loss": 2.2501, + "step": 1610 + }, + { + "epoch": 0.006262466948091107, + "grad_norm": 0.15319260954856873, + "learning_rate": 0.000324, + "loss": 2.2727, + "step": 1620 + }, + { + "epoch": 0.006301124151474386, + "grad_norm": 0.16039767861366272, + "learning_rate": 0.000326, + "loss": 2.2525, + "step": 1630 + }, + { + "epoch": 0.006339781354857664, + "grad_norm": 0.1588415652513504, + "learning_rate": 0.000328, + "loss": 2.2577, + "step": 1640 + }, + { + "epoch": 0.006378438558240942, + "grad_norm": 0.1502636820077896, + "learning_rate": 0.00033, + "loss": 2.2542, + "step": 1650 + }, + { + "epoch": 0.006417095761624221, + "grad_norm": 0.1684446483850479, + "learning_rate": 0.00033200000000000005, + "loss": 2.2632, + "step": 1660 + }, + { + "epoch": 0.0064557529650075, + "grad_norm": 0.13977859914302826, + "learning_rate": 0.00033400000000000004, + "loss": 2.2662, + "step": 1670 + }, + { + "epoch": 0.006494410168390778, + "grad_norm": 0.14510294795036316, + "learning_rate": 0.00033600000000000004, + "loss": 2.2705, + "step": 1680 + }, + { + "epoch": 0.006533067371774056, + "grad_norm": 0.16292671859264374, + "learning_rate": 0.00033800000000000003, + "loss": 2.2546, + "step": 1690 + }, + { + "epoch": 0.006571724575157335, + "grad_norm": 0.15019813179969788, + "learning_rate": 0.00034, + "loss": 2.2561, + "step": 1700 + }, + { + "epoch": 0.006610381778540613, + "grad_norm": 0.1569780558347702, + "learning_rate": 0.000342, + "loss": 2.2519, + "step": 1710 + }, + { + "epoch": 0.0066490389819238915, + "grad_norm": 0.16433066129684448, + "learning_rate": 0.00034399999999999996, + "loss": 2.2739, + "step": 1720 + }, + { + "epoch": 0.006687696185307171, + "grad_norm": 0.1681896299123764, + "learning_rate": 0.000346, + "loss": 2.2628, + "step": 1730 + }, + { + "epoch": 0.006726353388690449, + "grad_norm": 0.17025180160999298, + "learning_rate": 0.000348, + "loss": 2.262, + "step": 1740 + }, + { + "epoch": 0.006765010592073727, + "grad_norm": 0.15026843547821045, + "learning_rate": 0.00035, + "loss": 2.2624, + "step": 1750 + }, + { + "epoch": 0.006803667795457005, + "grad_norm": 0.1481882929801941, + "learning_rate": 0.000352, + "loss": 2.2602, + "step": 1760 + }, + { + "epoch": 0.006842324998840284, + "grad_norm": 0.16018210351467133, + "learning_rate": 0.000354, + "loss": 2.2603, + "step": 1770 + }, + { + "epoch": 0.0068809822022235624, + "grad_norm": 0.15643790364265442, + "learning_rate": 0.000356, + "loss": 2.2671, + "step": 1780 + }, + { + "epoch": 0.006919639405606841, + "grad_norm": 0.1506490260362625, + "learning_rate": 0.000358, + "loss": 2.2785, + "step": 1790 + }, + { + "epoch": 0.006958296608990119, + "grad_norm": 0.15134088695049286, + "learning_rate": 0.00035999999999999997, + "loss": 2.2866, + "step": 1800 + }, + { + "epoch": 0.006996953812373398, + "grad_norm": 0.16592784225940704, + "learning_rate": 0.000362, + "loss": 2.2632, + "step": 1810 + }, + { + "epoch": 0.007035611015756676, + "grad_norm": 0.1507008671760559, + "learning_rate": 0.000364, + "loss": 2.2678, + "step": 1820 + }, + { + "epoch": 0.007074268219139954, + "grad_norm": 0.15520262718200684, + "learning_rate": 0.000366, + "loss": 2.2733, + "step": 1830 + }, + { + "epoch": 0.007112925422523233, + "grad_norm": 0.1662900447845459, + "learning_rate": 0.000368, + "loss": 2.2644, + "step": 1840 + }, + { + "epoch": 0.007151582625906512, + "grad_norm": 0.17302747070789337, + "learning_rate": 0.00037, + "loss": 2.265, + "step": 1850 + }, + { + "epoch": 0.00719023982928979, + "grad_norm": 0.167618989944458, + "learning_rate": 0.000372, + "loss": 2.2652, + "step": 1860 + }, + { + "epoch": 0.007228897032673068, + "grad_norm": 0.1645369678735733, + "learning_rate": 0.000374, + "loss": 2.2806, + "step": 1870 + }, + { + "epoch": 0.007267554236056347, + "grad_norm": 0.16747227311134338, + "learning_rate": 0.00037600000000000003, + "loss": 2.2729, + "step": 1880 + }, + { + "epoch": 0.007306211439439625, + "grad_norm": 0.17343640327453613, + "learning_rate": 0.000378, + "loss": 2.2777, + "step": 1890 + }, + { + "epoch": 0.0073448686428229035, + "grad_norm": 0.1752873957157135, + "learning_rate": 0.00038, + "loss": 2.2629, + "step": 1900 + }, + { + "epoch": 0.007383525846206182, + "grad_norm": 0.17244476079940796, + "learning_rate": 0.000382, + "loss": 2.2598, + "step": 1910 + }, + { + "epoch": 0.007422183049589461, + "grad_norm": 0.1787695288658142, + "learning_rate": 0.000384, + "loss": 2.2656, + "step": 1920 + }, + { + "epoch": 0.007460840252972739, + "grad_norm": 0.15853258967399597, + "learning_rate": 0.000386, + "loss": 2.2673, + "step": 1930 + }, + { + "epoch": 0.007499497456356017, + "grad_norm": 0.1486985981464386, + "learning_rate": 0.000388, + "loss": 2.2751, + "step": 1940 + }, + { + "epoch": 0.007538154659739296, + "grad_norm": 0.1753663718700409, + "learning_rate": 0.00039000000000000005, + "loss": 2.2727, + "step": 1950 + }, + { + "epoch": 0.007576811863122574, + "grad_norm": 0.18662576377391815, + "learning_rate": 0.00039200000000000004, + "loss": 2.2633, + "step": 1960 + }, + { + "epoch": 0.007615469066505853, + "grad_norm": 0.1988459974527359, + "learning_rate": 0.00039400000000000004, + "loss": 2.2634, + "step": 1970 + }, + { + "epoch": 0.007654126269889131, + "grad_norm": 0.17339687049388885, + "learning_rate": 0.00039600000000000003, + "loss": 2.2757, + "step": 1980 + }, + { + "epoch": 0.00769278347327241, + "grad_norm": 0.18046623468399048, + "learning_rate": 0.000398, + "loss": 2.2807, + "step": 1990 + }, + { + "epoch": 0.007731440676655688, + "grad_norm": 0.16285806894302368, + "learning_rate": 0.0004, + "loss": 2.261, + "step": 2000 + }, + { + "epoch": 0.007770097880038966, + "grad_norm": 0.17096810042858124, + "learning_rate": 0.000402, + "loss": 2.2811, + "step": 2010 + }, + { + "epoch": 0.0078087550834222445, + "grad_norm": 0.1719149798154831, + "learning_rate": 0.000404, + "loss": 2.2868, + "step": 2020 + }, + { + "epoch": 0.007847412286805524, + "grad_norm": 0.17674361169338226, + "learning_rate": 0.00040600000000000006, + "loss": 2.2721, + "step": 2030 + }, + { + "epoch": 0.007886069490188802, + "grad_norm": 0.15781450271606445, + "learning_rate": 0.000408, + "loss": 2.283, + "step": 2040 + }, + { + "epoch": 0.00792472669357208, + "grad_norm": 0.19582943618297577, + "learning_rate": 0.00041, + "loss": 2.2681, + "step": 2050 + }, + { + "epoch": 0.007963383896955358, + "grad_norm": 0.166877880692482, + "learning_rate": 0.000412, + "loss": 2.2836, + "step": 2060 + }, + { + "epoch": 0.008002041100338636, + "grad_norm": 0.17595386505126953, + "learning_rate": 0.000414, + "loss": 2.2562, + "step": 2070 + }, + { + "epoch": 0.008040698303721916, + "grad_norm": 0.17867539823055267, + "learning_rate": 0.000416, + "loss": 2.2676, + "step": 2080 + }, + { + "epoch": 0.008079355507105195, + "grad_norm": 0.16842317581176758, + "learning_rate": 0.00041799999999999997, + "loss": 2.2731, + "step": 2090 + }, + { + "epoch": 0.008118012710488473, + "grad_norm": 0.1676853597164154, + "learning_rate": 0.00042, + "loss": 2.2834, + "step": 2100 + }, + { + "epoch": 0.008156669913871751, + "grad_norm": 0.16343624889850616, + "learning_rate": 0.000422, + "loss": 2.2692, + "step": 2110 + }, + { + "epoch": 0.008195327117255029, + "grad_norm": 0.18455322086811066, + "learning_rate": 0.000424, + "loss": 2.2817, + "step": 2120 + }, + { + "epoch": 0.008233984320638307, + "grad_norm": 0.16690371930599213, + "learning_rate": 0.000426, + "loss": 2.2858, + "step": 2130 + }, + { + "epoch": 0.008272641524021586, + "grad_norm": 0.20441211760044098, + "learning_rate": 0.000428, + "loss": 2.2803, + "step": 2140 + }, + { + "epoch": 0.008311298727404865, + "grad_norm": 0.1717919260263443, + "learning_rate": 0.00043, + "loss": 2.2571, + "step": 2150 + }, + { + "epoch": 0.008349955930788144, + "grad_norm": 0.20094193518161774, + "learning_rate": 0.000432, + "loss": 2.2691, + "step": 2160 + }, + { + "epoch": 0.008388613134171422, + "grad_norm": 0.17125524580478668, + "learning_rate": 0.00043400000000000003, + "loss": 2.2859, + "step": 2170 + }, + { + "epoch": 0.0084272703375547, + "grad_norm": 0.18455886840820312, + "learning_rate": 0.000436, + "loss": 2.2732, + "step": 2180 + }, + { + "epoch": 0.008465927540937978, + "grad_norm": 0.20173197984695435, + "learning_rate": 0.000438, + "loss": 2.2928, + "step": 2190 + }, + { + "epoch": 0.008504584744321256, + "grad_norm": 0.18156647682189941, + "learning_rate": 0.00044, + "loss": 2.2725, + "step": 2200 + }, + { + "epoch": 0.008543241947704535, + "grad_norm": 0.1819404810667038, + "learning_rate": 0.000442, + "loss": 2.2693, + "step": 2210 + }, + { + "epoch": 0.008581899151087813, + "grad_norm": 0.18304607272148132, + "learning_rate": 0.000444, + "loss": 2.2843, + "step": 2220 + }, + { + "epoch": 0.008620556354471093, + "grad_norm": 0.15833936631679535, + "learning_rate": 0.000446, + "loss": 2.2818, + "step": 2230 + }, + { + "epoch": 0.008659213557854371, + "grad_norm": 0.1906866580247879, + "learning_rate": 0.000448, + "loss": 2.264, + "step": 2240 + }, + { + "epoch": 0.00869787076123765, + "grad_norm": 0.16470417380332947, + "learning_rate": 0.00045000000000000004, + "loss": 2.2943, + "step": 2250 + }, + { + "epoch": 0.008736527964620927, + "grad_norm": 0.16799645125865936, + "learning_rate": 0.00045200000000000004, + "loss": 2.2778, + "step": 2260 + }, + { + "epoch": 0.008775185168004206, + "grad_norm": 0.1809621900320053, + "learning_rate": 0.00045400000000000003, + "loss": 2.284, + "step": 2270 + }, + { + "epoch": 0.008813842371387484, + "grad_norm": 0.18328256905078888, + "learning_rate": 0.000456, + "loss": 2.272, + "step": 2280 + }, + { + "epoch": 0.008852499574770762, + "grad_norm": 0.22645479440689087, + "learning_rate": 0.000458, + "loss": 2.2805, + "step": 2290 + }, + { + "epoch": 0.008891156778154042, + "grad_norm": 0.21596001088619232, + "learning_rate": 0.00046, + "loss": 2.2835, + "step": 2300 + }, + { + "epoch": 0.00892981398153732, + "grad_norm": 0.16236698627471924, + "learning_rate": 0.000462, + "loss": 2.2845, + "step": 2310 + }, + { + "epoch": 0.008968471184920598, + "grad_norm": 0.16047120094299316, + "learning_rate": 0.00046400000000000006, + "loss": 2.2648, + "step": 2320 + }, + { + "epoch": 0.009007128388303877, + "grad_norm": 0.1953253298997879, + "learning_rate": 0.00046600000000000005, + "loss": 2.2981, + "step": 2330 + }, + { + "epoch": 0.009045785591687155, + "grad_norm": 0.16764451563358307, + "learning_rate": 0.00046800000000000005, + "loss": 2.2999, + "step": 2340 + }, + { + "epoch": 0.009084442795070433, + "grad_norm": 0.18153171241283417, + "learning_rate": 0.00047, + "loss": 2.2778, + "step": 2350 + }, + { + "epoch": 0.009123099998453711, + "grad_norm": 0.1875174194574356, + "learning_rate": 0.000472, + "loss": 2.2867, + "step": 2360 + }, + { + "epoch": 0.009161757201836991, + "grad_norm": 0.18244752287864685, + "learning_rate": 0.000474, + "loss": 2.2849, + "step": 2370 + }, + { + "epoch": 0.00920041440522027, + "grad_norm": 0.15618745982646942, + "learning_rate": 0.00047599999999999997, + "loss": 2.2917, + "step": 2380 + }, + { + "epoch": 0.009239071608603548, + "grad_norm": 0.1807391494512558, + "learning_rate": 0.00047799999999999996, + "loss": 2.2907, + "step": 2390 + }, + { + "epoch": 0.009277728811986826, + "grad_norm": 0.1593099981546402, + "learning_rate": 0.00048, + "loss": 2.2669, + "step": 2400 + }, + { + "epoch": 0.009316386015370104, + "grad_norm": 0.17011666297912598, + "learning_rate": 0.000482, + "loss": 2.2749, + "step": 2410 + }, + { + "epoch": 0.009355043218753382, + "grad_norm": 0.19175320863723755, + "learning_rate": 0.000484, + "loss": 2.2909, + "step": 2420 + }, + { + "epoch": 0.00939370042213666, + "grad_norm": 0.19758890569210052, + "learning_rate": 0.000486, + "loss": 2.2949, + "step": 2430 + }, + { + "epoch": 0.009432357625519939, + "grad_norm": 0.16885755956172943, + "learning_rate": 0.000488, + "loss": 2.2786, + "step": 2440 + }, + { + "epoch": 0.009471014828903218, + "grad_norm": 0.16191567480564117, + "learning_rate": 0.00049, + "loss": 2.2946, + "step": 2450 + }, + { + "epoch": 0.009509672032286497, + "grad_norm": 0.15698394179344177, + "learning_rate": 0.000492, + "loss": 2.2832, + "step": 2460 + }, + { + "epoch": 0.009548329235669775, + "grad_norm": 0.19043174386024475, + "learning_rate": 0.000494, + "loss": 2.2739, + "step": 2470 + }, + { + "epoch": 0.009586986439053053, + "grad_norm": 0.18678732216358185, + "learning_rate": 0.000496, + "loss": 2.2879, + "step": 2480 + }, + { + "epoch": 0.009625643642436331, + "grad_norm": 0.21114246547222137, + "learning_rate": 0.000498, + "loss": 2.2851, + "step": 2490 + }, + { + "epoch": 0.00966430084581961, + "grad_norm": 0.17600609362125397, + "learning_rate": 0.0005, + "loss": 2.2906, + "step": 2500 + }, + { + "epoch": 0.009702958049202888, + "grad_norm": 0.1817324459552765, + "learning_rate": 0.0005020000000000001, + "loss": 2.2937, + "step": 2510 + }, + { + "epoch": 0.009741615252586168, + "grad_norm": 0.19554492831230164, + "learning_rate": 0.000504, + "loss": 2.2969, + "step": 2520 + }, + { + "epoch": 0.009780272455969446, + "grad_norm": 0.1822926104068756, + "learning_rate": 0.000506, + "loss": 2.2901, + "step": 2530 + }, + { + "epoch": 0.009818929659352724, + "grad_norm": 0.19118402898311615, + "learning_rate": 0.000508, + "loss": 2.2959, + "step": 2540 + }, + { + "epoch": 0.009857586862736002, + "grad_norm": 0.1897551566362381, + "learning_rate": 0.00051, + "loss": 2.2919, + "step": 2550 + }, + { + "epoch": 0.00989624406611928, + "grad_norm": 0.17411769926548004, + "learning_rate": 0.000512, + "loss": 2.2832, + "step": 2560 + }, + { + "epoch": 0.009934901269502559, + "grad_norm": 0.16486330330371857, + "learning_rate": 0.000514, + "loss": 2.2867, + "step": 2570 + }, + { + "epoch": 0.009973558472885837, + "grad_norm": 0.18757574260234833, + "learning_rate": 0.0005160000000000001, + "loss": 2.3017, + "step": 2580 + }, + { + "epoch": 0.010012215676269117, + "grad_norm": 0.17420315742492676, + "learning_rate": 0.000518, + "loss": 2.2916, + "step": 2590 + }, + { + "epoch": 0.010050872879652395, + "grad_norm": 0.1847243458032608, + "learning_rate": 0.0005200000000000001, + "loss": 2.2889, + "step": 2600 + }, + { + "epoch": 0.010089530083035673, + "grad_norm": 0.23134857416152954, + "learning_rate": 0.000522, + "loss": 2.2994, + "step": 2610 + }, + { + "epoch": 0.010128187286418951, + "grad_norm": 0.250232458114624, + "learning_rate": 0.000524, + "loss": 2.2847, + "step": 2620 + }, + { + "epoch": 0.01016684448980223, + "grad_norm": 0.2091963291168213, + "learning_rate": 0.000526, + "loss": 2.2928, + "step": 2630 + }, + { + "epoch": 0.010205501693185508, + "grad_norm": 0.1857314556837082, + "learning_rate": 0.000528, + "loss": 2.3083, + "step": 2640 + }, + { + "epoch": 0.010244158896568786, + "grad_norm": 0.20166365802288055, + "learning_rate": 0.0005300000000000001, + "loss": 2.2853, + "step": 2650 + }, + { + "epoch": 0.010282816099952066, + "grad_norm": 0.1892065852880478, + "learning_rate": 0.000532, + "loss": 2.2838, + "step": 2660 + }, + { + "epoch": 0.010321473303335344, + "grad_norm": 0.20674696564674377, + "learning_rate": 0.0005340000000000001, + "loss": 2.3009, + "step": 2670 + }, + { + "epoch": 0.010360130506718622, + "grad_norm": 0.19799582660198212, + "learning_rate": 0.000536, + "loss": 2.3038, + "step": 2680 + }, + { + "epoch": 0.0103987877101019, + "grad_norm": 0.19329093396663666, + "learning_rate": 0.0005380000000000001, + "loss": 2.2932, + "step": 2690 + }, + { + "epoch": 0.010437444913485179, + "grad_norm": 0.21683421730995178, + "learning_rate": 0.00054, + "loss": 2.3102, + "step": 2700 + }, + { + "epoch": 0.010476102116868457, + "grad_norm": 0.16719898581504822, + "learning_rate": 0.0005420000000000001, + "loss": 2.3057, + "step": 2710 + }, + { + "epoch": 0.010514759320251735, + "grad_norm": 0.17605605721473694, + "learning_rate": 0.0005440000000000001, + "loss": 2.2882, + "step": 2720 + }, + { + "epoch": 0.010553416523635013, + "grad_norm": 0.1690768152475357, + "learning_rate": 0.000546, + "loss": 2.3003, + "step": 2730 + }, + { + "epoch": 0.010592073727018293, + "grad_norm": 0.175802081823349, + "learning_rate": 0.0005480000000000001, + "loss": 2.2874, + "step": 2740 + }, + { + "epoch": 0.010630730930401571, + "grad_norm": 0.17927300930023193, + "learning_rate": 0.00055, + "loss": 2.2925, + "step": 2750 + }, + { + "epoch": 0.01066938813378485, + "grad_norm": 0.19693294167518616, + "learning_rate": 0.0005520000000000001, + "loss": 2.2915, + "step": 2760 + }, + { + "epoch": 0.010708045337168128, + "grad_norm": 0.16840171813964844, + "learning_rate": 0.000554, + "loss": 2.2909, + "step": 2770 + }, + { + "epoch": 0.010746702540551406, + "grad_norm": 0.23003901541233063, + "learning_rate": 0.0005560000000000001, + "loss": 2.3001, + "step": 2780 + }, + { + "epoch": 0.010785359743934684, + "grad_norm": 0.17170651257038116, + "learning_rate": 0.000558, + "loss": 2.2891, + "step": 2790 + }, + { + "epoch": 0.010824016947317962, + "grad_norm": 0.19299864768981934, + "learning_rate": 0.0005600000000000001, + "loss": 2.3097, + "step": 2800 + }, + { + "epoch": 0.010862674150701242, + "grad_norm": 0.17300739884376526, + "learning_rate": 0.0005620000000000001, + "loss": 2.2869, + "step": 2810 + }, + { + "epoch": 0.01090133135408452, + "grad_norm": 0.20269937813282013, + "learning_rate": 0.0005639999999999999, + "loss": 2.3018, + "step": 2820 + }, + { + "epoch": 0.010939988557467799, + "grad_norm": 1.11091148853302, + "learning_rate": 0.000566, + "loss": 2.3178, + "step": 2830 + }, + { + "epoch": 0.010978645760851077, + "grad_norm": 0.1726454347372055, + "learning_rate": 0.0005679999999999999, + "loss": 2.3028, + "step": 2840 + }, + { + "epoch": 0.011017302964234355, + "grad_norm": 0.19507430493831635, + "learning_rate": 0.00057, + "loss": 2.3144, + "step": 2850 + }, + { + "epoch": 0.011055960167617633, + "grad_norm": 0.18410713970661163, + "learning_rate": 0.0005719999999999999, + "loss": 2.2999, + "step": 2860 + }, + { + "epoch": 0.011094617371000912, + "grad_norm": 0.18434615433216095, + "learning_rate": 0.000574, + "loss": 2.2988, + "step": 2870 + }, + { + "epoch": 0.011133274574384192, + "grad_norm": 0.203523188829422, + "learning_rate": 0.000576, + "loss": 2.3002, + "step": 2880 + }, + { + "epoch": 0.01117193177776747, + "grad_norm": 0.20426899194717407, + "learning_rate": 0.000578, + "loss": 2.3113, + "step": 2890 + }, + { + "epoch": 0.011210588981150748, + "grad_norm": 0.20032745599746704, + "learning_rate": 0.00058, + "loss": 2.2975, + "step": 2900 + }, + { + "epoch": 0.011249246184534026, + "grad_norm": 0.1720176339149475, + "learning_rate": 0.0005819999999999999, + "loss": 2.3101, + "step": 2910 + }, + { + "epoch": 0.011287903387917304, + "grad_norm": 0.19644276797771454, + "learning_rate": 0.000584, + "loss": 2.3017, + "step": 2920 + }, + { + "epoch": 0.011326560591300583, + "grad_norm": 0.17177051305770874, + "learning_rate": 0.0005859999999999999, + "loss": 2.3089, + "step": 2930 + }, + { + "epoch": 0.01136521779468386, + "grad_norm": 0.19372963905334473, + "learning_rate": 0.000588, + "loss": 2.2961, + "step": 2940 + }, + { + "epoch": 0.01140387499806714, + "grad_norm": 0.17544853687286377, + "learning_rate": 0.00059, + "loss": 2.3147, + "step": 2950 + }, + { + "epoch": 0.011442532201450419, + "grad_norm": 0.19046401977539062, + "learning_rate": 0.000592, + "loss": 2.3084, + "step": 2960 + }, + { + "epoch": 0.011481189404833697, + "grad_norm": 0.16877801716327667, + "learning_rate": 0.000594, + "loss": 2.3172, + "step": 2970 + }, + { + "epoch": 0.011519846608216975, + "grad_norm": 0.18222913146018982, + "learning_rate": 0.000596, + "loss": 2.3085, + "step": 2980 + }, + { + "epoch": 0.011558503811600254, + "grad_norm": 0.17426137626171112, + "learning_rate": 0.000598, + "loss": 2.3167, + "step": 2990 + }, + { + "epoch": 0.011597161014983532, + "grad_norm": 0.2104695737361908, + "learning_rate": 0.0006, + "loss": 2.3178, + "step": 3000 + }, + { + "epoch": 0.01163581821836681, + "grad_norm": 0.1998831331729889, + "learning_rate": 0.000602, + "loss": 2.3057, + "step": 3010 + }, + { + "epoch": 0.011674475421750088, + "grad_norm": 0.2088499665260315, + "learning_rate": 0.000604, + "loss": 2.3215, + "step": 3020 + }, + { + "epoch": 0.011713132625133368, + "grad_norm": 0.1630593240261078, + "learning_rate": 0.000606, + "loss": 2.3106, + "step": 3030 + }, + { + "epoch": 0.011751789828516646, + "grad_norm": 0.19042055308818817, + "learning_rate": 0.000608, + "loss": 2.3159, + "step": 3040 + }, + { + "epoch": 0.011790447031899924, + "grad_norm": 0.2393055111169815, + "learning_rate": 0.00061, + "loss": 2.3008, + "step": 3050 + }, + { + "epoch": 0.011829104235283203, + "grad_norm": 0.17863787710666656, + "learning_rate": 0.000612, + "loss": 2.2997, + "step": 3060 + }, + { + "epoch": 0.01186776143866648, + "grad_norm": 0.17681317031383514, + "learning_rate": 0.000614, + "loss": 2.295, + "step": 3070 + }, + { + "epoch": 0.011906418642049759, + "grad_norm": 0.20421043038368225, + "learning_rate": 0.000616, + "loss": 2.3029, + "step": 3080 + }, + { + "epoch": 0.011945075845433037, + "grad_norm": 0.1847294718027115, + "learning_rate": 0.0006180000000000001, + "loss": 2.3119, + "step": 3090 + }, + { + "epoch": 0.011983733048816317, + "grad_norm": 0.19557037949562073, + "learning_rate": 0.00062, + "loss": 2.3224, + "step": 3100 + }, + { + "epoch": 0.012022390252199595, + "grad_norm": 0.1692788153886795, + "learning_rate": 0.000622, + "loss": 2.3152, + "step": 3110 + }, + { + "epoch": 0.012061047455582874, + "grad_norm": 0.21557722985744476, + "learning_rate": 0.000624, + "loss": 2.3147, + "step": 3120 + }, + { + "epoch": 0.012099704658966152, + "grad_norm": 0.17801079154014587, + "learning_rate": 0.000626, + "loss": 2.3108, + "step": 3130 + }, + { + "epoch": 0.01213836186234943, + "grad_norm": 0.18151815235614777, + "learning_rate": 0.000628, + "loss": 2.3268, + "step": 3140 + }, + { + "epoch": 0.012177019065732708, + "grad_norm": 0.2068193405866623, + "learning_rate": 0.00063, + "loss": 2.3084, + "step": 3150 + }, + { + "epoch": 0.012215676269115986, + "grad_norm": 0.18960556387901306, + "learning_rate": 0.000632, + "loss": 2.3111, + "step": 3160 + }, + { + "epoch": 0.012254333472499266, + "grad_norm": 0.2341216653585434, + "learning_rate": 0.000634, + "loss": 2.3085, + "step": 3170 + }, + { + "epoch": 0.012292990675882545, + "grad_norm": 0.19195427000522614, + "learning_rate": 0.0006360000000000001, + "loss": 2.3069, + "step": 3180 + }, + { + "epoch": 0.012331647879265823, + "grad_norm": 0.16848890483379364, + "learning_rate": 0.000638, + "loss": 2.2939, + "step": 3190 + }, + { + "epoch": 0.012370305082649101, + "grad_norm": 0.1788295954465866, + "learning_rate": 0.00064, + "loss": 2.3208, + "step": 3200 + }, + { + "epoch": 0.01240896228603238, + "grad_norm": 0.19146698713302612, + "learning_rate": 0.000642, + "loss": 2.3245, + "step": 3210 + }, + { + "epoch": 0.012447619489415657, + "grad_norm": 0.18817923963069916, + "learning_rate": 0.000644, + "loss": 2.3363, + "step": 3220 + }, + { + "epoch": 0.012486276692798936, + "grad_norm": 0.23819443583488464, + "learning_rate": 0.000646, + "loss": 2.313, + "step": 3230 + }, + { + "epoch": 0.012524933896182214, + "grad_norm": 0.22465969622135162, + "learning_rate": 0.000648, + "loss": 2.3134, + "step": 3240 + }, + { + "epoch": 0.012563591099565494, + "grad_norm": 0.18059320747852325, + "learning_rate": 0.0006500000000000001, + "loss": 2.3073, + "step": 3250 + }, + { + "epoch": 0.012602248302948772, + "grad_norm": 0.22967374324798584, + "learning_rate": 0.000652, + "loss": 2.3114, + "step": 3260 + }, + { + "epoch": 0.01264090550633205, + "grad_norm": 0.18112795054912567, + "learning_rate": 0.0006540000000000001, + "loss": 2.3164, + "step": 3270 + }, + { + "epoch": 0.012679562709715328, + "grad_norm": 0.2173462063074112, + "learning_rate": 0.000656, + "loss": 2.3134, + "step": 3280 + }, + { + "epoch": 0.012718219913098606, + "grad_norm": 0.1925901472568512, + "learning_rate": 0.0006580000000000001, + "loss": 2.3193, + "step": 3290 + }, + { + "epoch": 0.012756877116481885, + "grad_norm": 0.18288016319274902, + "learning_rate": 0.00066, + "loss": 2.3186, + "step": 3300 + }, + { + "epoch": 0.012795534319865163, + "grad_norm": 0.1826305240392685, + "learning_rate": 0.000662, + "loss": 2.3374, + "step": 3310 + }, + { + "epoch": 0.012834191523248443, + "grad_norm": 0.23015402257442474, + "learning_rate": 0.0006640000000000001, + "loss": 2.3157, + "step": 3320 + }, + { + "epoch": 0.012872848726631721, + "grad_norm": 0.19382594525814056, + "learning_rate": 0.000666, + "loss": 2.3106, + "step": 3330 + }, + { + "epoch": 0.012911505930015, + "grad_norm": 0.18039023876190186, + "learning_rate": 0.0006680000000000001, + "loss": 2.3175, + "step": 3340 + }, + { + "epoch": 0.012950163133398277, + "grad_norm": 0.1734837144613266, + "learning_rate": 0.00067, + "loss": 2.3264, + "step": 3350 + }, + { + "epoch": 0.012988820336781556, + "grad_norm": 0.2026592344045639, + "learning_rate": 0.0006720000000000001, + "loss": 2.324, + "step": 3360 + }, + { + "epoch": 0.013027477540164834, + "grad_norm": 0.17693206667900085, + "learning_rate": 0.000674, + "loss": 2.3244, + "step": 3370 + }, + { + "epoch": 0.013066134743548112, + "grad_norm": 0.18763317167758942, + "learning_rate": 0.0006760000000000001, + "loss": 2.3247, + "step": 3380 + }, + { + "epoch": 0.013104791946931392, + "grad_norm": 0.21735341846942902, + "learning_rate": 0.0006780000000000001, + "loss": 2.3155, + "step": 3390 + }, + { + "epoch": 0.01314344915031467, + "grad_norm": 0.20110860466957092, + "learning_rate": 0.00068, + "loss": 2.3164, + "step": 3400 + }, + { + "epoch": 0.013182106353697948, + "grad_norm": 0.20035584270954132, + "learning_rate": 0.0006820000000000001, + "loss": 2.3293, + "step": 3410 + }, + { + "epoch": 0.013220763557081227, + "grad_norm": 0.19247788190841675, + "learning_rate": 0.000684, + "loss": 2.3253, + "step": 3420 + }, + { + "epoch": 0.013259420760464505, + "grad_norm": 0.2187040001153946, + "learning_rate": 0.0006860000000000001, + "loss": 2.3348, + "step": 3430 + }, + { + "epoch": 0.013298077963847783, + "grad_norm": 0.2174689620733261, + "learning_rate": 0.0006879999999999999, + "loss": 2.3274, + "step": 3440 + }, + { + "epoch": 0.013336735167231061, + "grad_norm": 0.17934581637382507, + "learning_rate": 0.00069, + "loss": 2.3291, + "step": 3450 + }, + { + "epoch": 0.013375392370614341, + "grad_norm": 0.1819523721933365, + "learning_rate": 0.000692, + "loss": 2.315, + "step": 3460 + }, + { + "epoch": 0.01341404957399762, + "grad_norm": 0.20084640383720398, + "learning_rate": 0.000694, + "loss": 2.3244, + "step": 3470 + }, + { + "epoch": 0.013452706777380898, + "grad_norm": 0.2400597631931305, + "learning_rate": 0.000696, + "loss": 2.3189, + "step": 3480 + }, + { + "epoch": 0.013491363980764176, + "grad_norm": 0.19409014284610748, + "learning_rate": 0.0006979999999999999, + "loss": 2.3209, + "step": 3490 + }, + { + "epoch": 0.013530021184147454, + "grad_norm": 0.20252855122089386, + "learning_rate": 0.0007, + "loss": 2.3371, + "step": 3500 + }, + { + "epoch": 0.013568678387530732, + "grad_norm": 0.21784944832324982, + "learning_rate": 0.0007019999999999999, + "loss": 2.3313, + "step": 3510 + }, + { + "epoch": 0.01360733559091401, + "grad_norm": 0.17790041863918304, + "learning_rate": 0.000704, + "loss": 2.3196, + "step": 3520 + }, + { + "epoch": 0.013645992794297289, + "grad_norm": 0.19118991494178772, + "learning_rate": 0.0007059999999999999, + "loss": 2.3475, + "step": 3530 + }, + { + "epoch": 0.013684649997680568, + "grad_norm": 0.17741745710372925, + "learning_rate": 0.000708, + "loss": 2.3288, + "step": 3540 + }, + { + "epoch": 0.013723307201063847, + "grad_norm": 0.20735114812850952, + "learning_rate": 0.00071, + "loss": 2.3246, + "step": 3550 + }, + { + "epoch": 0.013761964404447125, + "grad_norm": 0.1806357353925705, + "learning_rate": 0.000712, + "loss": 2.3364, + "step": 3560 + }, + { + "epoch": 0.013800621607830403, + "grad_norm": 0.21187496185302734, + "learning_rate": 0.000714, + "loss": 2.3171, + "step": 3570 + }, + { + "epoch": 0.013839278811213681, + "grad_norm": 0.22054551541805267, + "learning_rate": 0.000716, + "loss": 2.3246, + "step": 3580 + }, + { + "epoch": 0.01387793601459696, + "grad_norm": 0.19840335845947266, + "learning_rate": 0.000718, + "loss": 2.3202, + "step": 3590 + }, + { + "epoch": 0.013916593217980238, + "grad_norm": 0.20100551843643188, + "learning_rate": 0.0007199999999999999, + "loss": 2.3361, + "step": 3600 + }, + { + "epoch": 0.013955250421363518, + "grad_norm": 0.19318947196006775, + "learning_rate": 0.000722, + "loss": 2.3387, + "step": 3610 + }, + { + "epoch": 0.013993907624746796, + "grad_norm": 0.21331772208213806, + "learning_rate": 0.000724, + "loss": 2.3339, + "step": 3620 + }, + { + "epoch": 0.014032564828130074, + "grad_norm": 0.17189262807369232, + "learning_rate": 0.000726, + "loss": 2.3349, + "step": 3630 + }, + { + "epoch": 0.014071222031513352, + "grad_norm": 0.24295486509799957, + "learning_rate": 0.000728, + "loss": 2.338, + "step": 3640 + }, + { + "epoch": 0.01410987923489663, + "grad_norm": 0.16952365636825562, + "learning_rate": 0.00073, + "loss": 2.3384, + "step": 3650 + }, + { + "epoch": 0.014148536438279909, + "grad_norm": 0.2218736708164215, + "learning_rate": 0.000732, + "loss": 2.3221, + "step": 3660 + }, + { + "epoch": 0.014187193641663187, + "grad_norm": 0.29244163632392883, + "learning_rate": 0.000734, + "loss": 2.3219, + "step": 3670 + }, + { + "epoch": 0.014225850845046467, + "grad_norm": 0.21221856772899628, + "learning_rate": 0.000736, + "loss": 2.3372, + "step": 3680 + }, + { + "epoch": 0.014264508048429745, + "grad_norm": 0.19032420217990875, + "learning_rate": 0.000738, + "loss": 2.336, + "step": 3690 + }, + { + "epoch": 0.014303165251813023, + "grad_norm": 0.1942375898361206, + "learning_rate": 0.00074, + "loss": 2.3251, + "step": 3700 + }, + { + "epoch": 0.014341822455196301, + "grad_norm": 0.18120437860488892, + "learning_rate": 0.000742, + "loss": 2.3325, + "step": 3710 + }, + { + "epoch": 0.01438047965857958, + "grad_norm": 0.22333329916000366, + "learning_rate": 0.000744, + "loss": 2.3488, + "step": 3720 + }, + { + "epoch": 0.014419136861962858, + "grad_norm": 0.21351563930511475, + "learning_rate": 0.000746, + "loss": 2.3263, + "step": 3730 + }, + { + "epoch": 0.014457794065346136, + "grad_norm": 0.1748659312725067, + "learning_rate": 0.000748, + "loss": 2.3232, + "step": 3740 + }, + { + "epoch": 0.014496451268729414, + "grad_norm": 0.1662777066230774, + "learning_rate": 0.00075, + "loss": 2.3334, + "step": 3750 + }, + { + "epoch": 0.014535108472112694, + "grad_norm": 0.19590096175670624, + "learning_rate": 0.0007520000000000001, + "loss": 2.3427, + "step": 3760 + }, + { + "epoch": 0.014573765675495972, + "grad_norm": 0.2757102847099304, + "learning_rate": 0.000754, + "loss": 2.3256, + "step": 3770 + }, + { + "epoch": 0.01461242287887925, + "grad_norm": 0.1870422512292862, + "learning_rate": 0.000756, + "loss": 2.3414, + "step": 3780 + }, + { + "epoch": 0.014651080082262529, + "grad_norm": 0.2014084756374359, + "learning_rate": 0.000758, + "loss": 2.3311, + "step": 3790 + }, + { + "epoch": 0.014689737285645807, + "grad_norm": 0.21479026973247528, + "learning_rate": 0.00076, + "loss": 2.3392, + "step": 3800 + }, + { + "epoch": 0.014728394489029085, + "grad_norm": 0.21824988722801208, + "learning_rate": 0.000762, + "loss": 2.3235, + "step": 3810 + }, + { + "epoch": 0.014767051692412363, + "grad_norm": 0.18084734678268433, + "learning_rate": 0.000764, + "loss": 2.3392, + "step": 3820 + }, + { + "epoch": 0.014805708895795643, + "grad_norm": 0.1988394409418106, + "learning_rate": 0.0007660000000000001, + "loss": 2.3204, + "step": 3830 + }, + { + "epoch": 0.014844366099178921, + "grad_norm": 0.2254992574453354, + "learning_rate": 0.000768, + "loss": 2.333, + "step": 3840 + }, + { + "epoch": 0.0148830233025622, + "grad_norm": 0.2529672384262085, + "learning_rate": 0.0007700000000000001, + "loss": 2.3312, + "step": 3850 + }, + { + "epoch": 0.014921680505945478, + "grad_norm": 0.1985018253326416, + "learning_rate": 0.000772, + "loss": 2.346, + "step": 3860 + }, + { + "epoch": 0.014960337709328756, + "grad_norm": 0.1995285451412201, + "learning_rate": 0.0007740000000000001, + "loss": 2.3367, + "step": 3870 + }, + { + "epoch": 0.014998994912712034, + "grad_norm": 0.2558148205280304, + "learning_rate": 0.000776, + "loss": 2.3475, + "step": 3880 + }, + { + "epoch": 0.015037652116095312, + "grad_norm": 0.1844359040260315, + "learning_rate": 0.000778, + "loss": 2.3576, + "step": 3890 + }, + { + "epoch": 0.015076309319478592, + "grad_norm": 0.1759207844734192, + "learning_rate": 0.0007800000000000001, + "loss": 2.3413, + "step": 3900 + }, + { + "epoch": 0.01511496652286187, + "grad_norm": 0.2178059220314026, + "learning_rate": 0.000782, + "loss": 2.3271, + "step": 3910 + }, + { + "epoch": 0.015153623726245149, + "grad_norm": 0.19973725080490112, + "learning_rate": 0.0007840000000000001, + "loss": 2.338, + "step": 3920 + }, + { + "epoch": 0.015192280929628427, + "grad_norm": 0.17417024075984955, + "learning_rate": 0.000786, + "loss": 2.3511, + "step": 3930 + }, + { + "epoch": 0.015230938133011705, + "grad_norm": 0.20759367942810059, + "learning_rate": 0.0007880000000000001, + "loss": 2.3419, + "step": 3940 + }, + { + "epoch": 0.015269595336394983, + "grad_norm": 0.17729806900024414, + "learning_rate": 0.00079, + "loss": 2.3487, + "step": 3950 + }, + { + "epoch": 0.015308252539778262, + "grad_norm": 0.16963709890842438, + "learning_rate": 0.0007920000000000001, + "loss": 2.3435, + "step": 3960 + }, + { + "epoch": 0.015346909743161542, + "grad_norm": 0.24149620532989502, + "learning_rate": 0.0007940000000000001, + "loss": 2.3415, + "step": 3970 + }, + { + "epoch": 0.01538556694654482, + "grad_norm": 0.21152982115745544, + "learning_rate": 0.000796, + "loss": 2.348, + "step": 3980 + }, + { + "epoch": 0.015424224149928098, + "grad_norm": 0.18707045912742615, + "learning_rate": 0.0007980000000000001, + "loss": 2.3503, + "step": 3990 + }, + { + "epoch": 0.015462881353311376, + "grad_norm": 0.20199261605739594, + "learning_rate": 0.0008, + "loss": 2.3655, + "step": 4000 + }, + { + "epoch": 0.015501538556694654, + "grad_norm": 0.2197076380252838, + "learning_rate": 0.0008020000000000001, + "loss": 2.3468, + "step": 4010 + }, + { + "epoch": 0.015540195760077933, + "grad_norm": 0.21308410167694092, + "learning_rate": 0.000804, + "loss": 2.3371, + "step": 4020 + }, + { + "epoch": 0.01557885296346121, + "grad_norm": 0.25000447034835815, + "learning_rate": 0.0008060000000000001, + "loss": 2.3436, + "step": 4030 + }, + { + "epoch": 0.015617510166844489, + "grad_norm": 0.18997006118297577, + "learning_rate": 0.000808, + "loss": 2.3428, + "step": 4040 + }, + { + "epoch": 0.015656167370227767, + "grad_norm": 0.1858537495136261, + "learning_rate": 0.0008100000000000001, + "loss": 2.3409, + "step": 4050 + }, + { + "epoch": 0.015694824573611047, + "grad_norm": 0.20581378042697906, + "learning_rate": 0.0008120000000000001, + "loss": 2.3435, + "step": 4060 + }, + { + "epoch": 0.015733481776994324, + "grad_norm": 0.16140508651733398, + "learning_rate": 0.0008139999999999999, + "loss": 2.3431, + "step": 4070 + }, + { + "epoch": 0.015772138980377604, + "grad_norm": 0.20004132390022278, + "learning_rate": 0.000816, + "loss": 2.3354, + "step": 4080 + }, + { + "epoch": 0.015810796183760883, + "grad_norm": 0.16737546026706696, + "learning_rate": 0.0008179999999999999, + "loss": 2.3609, + "step": 4090 + }, + { + "epoch": 0.01584945338714416, + "grad_norm": 0.2625548243522644, + "learning_rate": 0.00082, + "loss": 2.3586, + "step": 4100 + }, + { + "epoch": 0.01588811059052744, + "grad_norm": 0.2821698486804962, + "learning_rate": 0.0008219999999999999, + "loss": 2.3525, + "step": 4110 + }, + { + "epoch": 0.015926767793910716, + "grad_norm": 0.21390102803707123, + "learning_rate": 0.000824, + "loss": 2.3458, + "step": 4120 + }, + { + "epoch": 0.015965424997293996, + "grad_norm": 0.20981815457344055, + "learning_rate": 0.000826, + "loss": 2.3465, + "step": 4130 + }, + { + "epoch": 0.016004082200677273, + "grad_norm": 0.1971135288476944, + "learning_rate": 0.000828, + "loss": 2.3513, + "step": 4140 + }, + { + "epoch": 0.016042739404060553, + "grad_norm": 0.32396647334098816, + "learning_rate": 0.00083, + "loss": 2.3451, + "step": 4150 + }, + { + "epoch": 0.016081396607443833, + "grad_norm": 0.24782757461071014, + "learning_rate": 0.000832, + "loss": 2.3713, + "step": 4160 + }, + { + "epoch": 0.01612005381082711, + "grad_norm": 0.17164631187915802, + "learning_rate": 0.000834, + "loss": 2.3502, + "step": 4170 + }, + { + "epoch": 0.01615871101421039, + "grad_norm": 0.16930937767028809, + "learning_rate": 0.0008359999999999999, + "loss": 2.343, + "step": 4180 + }, + { + "epoch": 0.016197368217593665, + "grad_norm": 0.18578386306762695, + "learning_rate": 0.000838, + "loss": 2.3461, + "step": 4190 + }, + { + "epoch": 0.016236025420976945, + "grad_norm": 0.2054670751094818, + "learning_rate": 0.00084, + "loss": 2.3541, + "step": 4200 + }, + { + "epoch": 0.016274682624360222, + "grad_norm": 0.2295895665884018, + "learning_rate": 0.000842, + "loss": 2.3583, + "step": 4210 + }, + { + "epoch": 0.016313339827743502, + "grad_norm": 0.2089163213968277, + "learning_rate": 0.000844, + "loss": 2.3494, + "step": 4220 + }, + { + "epoch": 0.016351997031126782, + "grad_norm": 0.1846083551645279, + "learning_rate": 0.000846, + "loss": 2.3604, + "step": 4230 + }, + { + "epoch": 0.016390654234510058, + "grad_norm": 0.20951008796691895, + "learning_rate": 0.000848, + "loss": 2.3517, + "step": 4240 + }, + { + "epoch": 0.016429311437893338, + "grad_norm": 0.18815088272094727, + "learning_rate": 0.00085, + "loss": 2.3601, + "step": 4250 + }, + { + "epoch": 0.016467968641276615, + "grad_norm": 0.19509609043598175, + "learning_rate": 0.000852, + "loss": 2.3516, + "step": 4260 + }, + { + "epoch": 0.016506625844659895, + "grad_norm": 0.2361738532781601, + "learning_rate": 0.000854, + "loss": 2.3644, + "step": 4270 + }, + { + "epoch": 0.01654528304804317, + "grad_norm": 0.23323406279087067, + "learning_rate": 0.000856, + "loss": 2.36, + "step": 4280 + }, + { + "epoch": 0.01658394025142645, + "grad_norm": 0.20140711963176727, + "learning_rate": 0.000858, + "loss": 2.3502, + "step": 4290 + }, + { + "epoch": 0.01662259745480973, + "grad_norm": 0.22504544258117676, + "learning_rate": 0.00086, + "loss": 2.3609, + "step": 4300 + }, + { + "epoch": 0.016661254658193007, + "grad_norm": 0.19168426096439362, + "learning_rate": 0.000862, + "loss": 2.3573, + "step": 4310 + }, + { + "epoch": 0.016699911861576287, + "grad_norm": 0.20893554389476776, + "learning_rate": 0.000864, + "loss": 2.3782, + "step": 4320 + }, + { + "epoch": 0.016738569064959564, + "grad_norm": 0.19215723872184753, + "learning_rate": 0.000866, + "loss": 2.3409, + "step": 4330 + }, + { + "epoch": 0.016777226268342844, + "grad_norm": 0.18347090482711792, + "learning_rate": 0.0008680000000000001, + "loss": 2.3683, + "step": 4340 + }, + { + "epoch": 0.01681588347172612, + "grad_norm": 0.19273754954338074, + "learning_rate": 0.00087, + "loss": 2.3631, + "step": 4350 + }, + { + "epoch": 0.0168545406751094, + "grad_norm": 0.28873759508132935, + "learning_rate": 0.000872, + "loss": 2.3594, + "step": 4360 + }, + { + "epoch": 0.016893197878492677, + "grad_norm": 0.19875763356685638, + "learning_rate": 0.000874, + "loss": 2.3485, + "step": 4370 + }, + { + "epoch": 0.016931855081875957, + "grad_norm": 0.2057722955942154, + "learning_rate": 0.000876, + "loss": 2.3577, + "step": 4380 + }, + { + "epoch": 0.016970512285259236, + "grad_norm": 0.23414702713489532, + "learning_rate": 0.000878, + "loss": 2.3543, + "step": 4390 + }, + { + "epoch": 0.017009169488642513, + "grad_norm": 0.19359450042247772, + "learning_rate": 0.00088, + "loss": 2.3571, + "step": 4400 + }, + { + "epoch": 0.017047826692025793, + "grad_norm": 0.19209226965904236, + "learning_rate": 0.000882, + "loss": 2.3508, + "step": 4410 + }, + { + "epoch": 0.01708648389540907, + "grad_norm": 0.2071743756532669, + "learning_rate": 0.000884, + "loss": 2.3479, + "step": 4420 + }, + { + "epoch": 0.01712514109879235, + "grad_norm": 0.25464510917663574, + "learning_rate": 0.0008860000000000001, + "loss": 2.3719, + "step": 4430 + }, + { + "epoch": 0.017163798302175626, + "grad_norm": 0.25327372550964355, + "learning_rate": 0.000888, + "loss": 2.3576, + "step": 4440 + }, + { + "epoch": 0.017202455505558906, + "grad_norm": 0.19814546406269073, + "learning_rate": 0.0008900000000000001, + "loss": 2.351, + "step": 4450 + }, + { + "epoch": 0.017241112708942186, + "grad_norm": 0.18401065468788147, + "learning_rate": 0.000892, + "loss": 2.3566, + "step": 4460 + }, + { + "epoch": 0.017279769912325462, + "grad_norm": 0.1848963350057602, + "learning_rate": 0.000894, + "loss": 2.3525, + "step": 4470 + }, + { + "epoch": 0.017318427115708742, + "grad_norm": 0.213156595826149, + "learning_rate": 0.000896, + "loss": 2.3474, + "step": 4480 + }, + { + "epoch": 0.01735708431909202, + "grad_norm": 0.1951395869255066, + "learning_rate": 0.000898, + "loss": 2.367, + "step": 4490 + }, + { + "epoch": 0.0173957415224753, + "grad_norm": 0.18687497079372406, + "learning_rate": 0.0009000000000000001, + "loss": 2.3521, + "step": 4500 + }, + { + "epoch": 0.017434398725858575, + "grad_norm": 0.24728751182556152, + "learning_rate": 0.000902, + "loss": 2.3744, + "step": 4510 + }, + { + "epoch": 0.017473055929241855, + "grad_norm": 0.23267033696174622, + "learning_rate": 0.0009040000000000001, + "loss": 2.3624, + "step": 4520 + }, + { + "epoch": 0.017511713132625135, + "grad_norm": 0.1700582057237625, + "learning_rate": 0.000906, + "loss": 2.3719, + "step": 4530 + }, + { + "epoch": 0.01755037033600841, + "grad_norm": 0.1689836084842682, + "learning_rate": 0.0009080000000000001, + "loss": 2.3659, + "step": 4540 + }, + { + "epoch": 0.01758902753939169, + "grad_norm": 0.20966678857803345, + "learning_rate": 0.00091, + "loss": 2.3525, + "step": 4550 + }, + { + "epoch": 0.017627684742774968, + "grad_norm": 0.18348130583763123, + "learning_rate": 0.000912, + "loss": 2.358, + "step": 4560 + }, + { + "epoch": 0.017666341946158248, + "grad_norm": 0.21361717581748962, + "learning_rate": 0.0009140000000000001, + "loss": 2.3657, + "step": 4570 + }, + { + "epoch": 0.017704999149541524, + "grad_norm": 0.19976146519184113, + "learning_rate": 0.000916, + "loss": 2.3555, + "step": 4580 + }, + { + "epoch": 0.017743656352924804, + "grad_norm": 0.19655726850032806, + "learning_rate": 0.0009180000000000001, + "loss": 2.3498, + "step": 4590 + }, + { + "epoch": 0.017782313556308084, + "grad_norm": 0.19483190774917603, + "learning_rate": 0.00092, + "loss": 2.3674, + "step": 4600 + }, + { + "epoch": 0.01782097075969136, + "grad_norm": 0.22144700586795807, + "learning_rate": 0.0009220000000000001, + "loss": 2.3779, + "step": 4610 + }, + { + "epoch": 0.01785962796307464, + "grad_norm": 0.2064056396484375, + "learning_rate": 0.000924, + "loss": 2.3547, + "step": 4620 + }, + { + "epoch": 0.017898285166457917, + "grad_norm": 0.19249336421489716, + "learning_rate": 0.0009260000000000001, + "loss": 2.3645, + "step": 4630 + }, + { + "epoch": 0.017936942369841197, + "grad_norm": 0.16990318894386292, + "learning_rate": 0.0009280000000000001, + "loss": 2.3738, + "step": 4640 + }, + { + "epoch": 0.017975599573224473, + "grad_norm": 0.1956743597984314, + "learning_rate": 0.00093, + "loss": 2.3491, + "step": 4650 + }, + { + "epoch": 0.018014256776607753, + "grad_norm": 0.4806165099143982, + "learning_rate": 0.0009320000000000001, + "loss": 2.3526, + "step": 4660 + }, + { + "epoch": 0.018052913979991033, + "grad_norm": 0.1825423687696457, + "learning_rate": 0.000934, + "loss": 2.3588, + "step": 4670 + }, + { + "epoch": 0.01809157118337431, + "grad_norm": 0.23481100797653198, + "learning_rate": 0.0009360000000000001, + "loss": 2.3541, + "step": 4680 + }, + { + "epoch": 0.01813022838675759, + "grad_norm": 0.21459338068962097, + "learning_rate": 0.0009379999999999999, + "loss": 2.3703, + "step": 4690 + }, + { + "epoch": 0.018168885590140866, + "grad_norm": 0.23080278933048248, + "learning_rate": 0.00094, + "loss": 2.3711, + "step": 4700 + }, + { + "epoch": 0.018207542793524146, + "grad_norm": 0.21888788044452667, + "learning_rate": 0.000942, + "loss": 2.368, + "step": 4710 + }, + { + "epoch": 0.018246199996907422, + "grad_norm": 0.1902155727148056, + "learning_rate": 0.000944, + "loss": 2.3778, + "step": 4720 + }, + { + "epoch": 0.018284857200290702, + "grad_norm": 0.18804939091205597, + "learning_rate": 0.000946, + "loss": 2.3749, + "step": 4730 + }, + { + "epoch": 0.018323514403673982, + "grad_norm": 0.23752015829086304, + "learning_rate": 0.000948, + "loss": 2.3782, + "step": 4740 + }, + { + "epoch": 0.01836217160705726, + "grad_norm": 0.20950154960155487, + "learning_rate": 0.00095, + "loss": 2.3584, + "step": 4750 + }, + { + "epoch": 0.01840082881044054, + "grad_norm": 0.2356835901737213, + "learning_rate": 0.0009519999999999999, + "loss": 2.3952, + "step": 4760 + }, + { + "epoch": 0.018439486013823815, + "grad_norm": 0.2130763977766037, + "learning_rate": 0.000954, + "loss": 2.3659, + "step": 4770 + }, + { + "epoch": 0.018478143217207095, + "grad_norm": 0.3312756419181824, + "learning_rate": 0.0009559999999999999, + "loss": 2.3651, + "step": 4780 + }, + { + "epoch": 0.01851680042059037, + "grad_norm": 0.21410676836967468, + "learning_rate": 0.000958, + "loss": 2.3887, + "step": 4790 + }, + { + "epoch": 0.01855545762397365, + "grad_norm": 0.18393675982952118, + "learning_rate": 0.00096, + "loss": 2.3764, + "step": 4800 + }, + { + "epoch": 0.01859411482735693, + "grad_norm": 0.22657392919063568, + "learning_rate": 0.000962, + "loss": 2.3905, + "step": 4810 + }, + { + "epoch": 0.018632772030740208, + "grad_norm": 0.1854638159275055, + "learning_rate": 0.000964, + "loss": 2.3755, + "step": 4820 + }, + { + "epoch": 0.018671429234123488, + "grad_norm": 0.17408819496631622, + "learning_rate": 0.000966, + "loss": 2.3733, + "step": 4830 + }, + { + "epoch": 0.018710086437506764, + "grad_norm": 0.21019886434078217, + "learning_rate": 0.000968, + "loss": 2.3909, + "step": 4840 + }, + { + "epoch": 0.018748743640890044, + "grad_norm": 0.28492727875709534, + "learning_rate": 0.0009699999999999999, + "loss": 2.3551, + "step": 4850 + }, + { + "epoch": 0.01878740084427332, + "grad_norm": 0.3036006689071655, + "learning_rate": 0.000972, + "loss": 2.3697, + "step": 4860 + }, + { + "epoch": 0.0188260580476566, + "grad_norm": 0.27126896381378174, + "learning_rate": 0.000974, + "loss": 2.3682, + "step": 4870 + }, + { + "epoch": 0.018864715251039877, + "grad_norm": 0.19037535786628723, + "learning_rate": 0.000976, + "loss": 2.3693, + "step": 4880 + }, + { + "epoch": 0.018903372454423157, + "grad_norm": 0.17468905448913574, + "learning_rate": 0.000978, + "loss": 2.3792, + "step": 4890 + }, + { + "epoch": 0.018942029657806437, + "grad_norm": 0.20166796445846558, + "learning_rate": 0.00098, + "loss": 2.3735, + "step": 4900 + }, + { + "epoch": 0.018980686861189713, + "grad_norm": 0.21851827204227448, + "learning_rate": 0.000982, + "loss": 2.3554, + "step": 4910 + }, + { + "epoch": 0.019019344064572993, + "grad_norm": 0.2518332898616791, + "learning_rate": 0.000984, + "loss": 2.3777, + "step": 4920 + }, + { + "epoch": 0.01905800126795627, + "grad_norm": 0.21647198498249054, + "learning_rate": 0.0009860000000000001, + "loss": 2.3772, + "step": 4930 + }, + { + "epoch": 0.01909665847133955, + "grad_norm": 0.19404593110084534, + "learning_rate": 0.000988, + "loss": 2.3813, + "step": 4940 + }, + { + "epoch": 0.019135315674722826, + "grad_norm": 0.219878688454628, + "learning_rate": 0.00099, + "loss": 2.3693, + "step": 4950 + }, + { + "epoch": 0.019173972878106106, + "grad_norm": 0.18607568740844727, + "learning_rate": 0.000992, + "loss": 2.3613, + "step": 4960 + }, + { + "epoch": 0.019212630081489386, + "grad_norm": 0.28682470321655273, + "learning_rate": 0.000994, + "loss": 2.3921, + "step": 4970 + }, + { + "epoch": 0.019251287284872663, + "grad_norm": 0.18783779442310333, + "learning_rate": 0.000996, + "loss": 2.3787, + "step": 4980 + }, + { + "epoch": 0.019289944488255942, + "grad_norm": 0.21272382140159607, + "learning_rate": 0.000998, + "loss": 2.3665, + "step": 4990 + }, + { + "epoch": 0.01932860169163922, + "grad_norm": 0.1703406572341919, + "learning_rate": 0.001, + "loss": 2.3749, + "step": 5000 + }, + { + "epoch": 0.0193672588950225, + "grad_norm": 0.23985524475574493, + "learning_rate": 0.001002, + "loss": 2.3814, + "step": 5010 + }, + { + "epoch": 0.019405916098405775, + "grad_norm": 0.21343666315078735, + "learning_rate": 0.0010040000000000001, + "loss": 2.3647, + "step": 5020 + }, + { + "epoch": 0.019444573301789055, + "grad_norm": 0.18966184556484222, + "learning_rate": 0.001006, + "loss": 2.3807, + "step": 5030 + }, + { + "epoch": 0.019483230505172335, + "grad_norm": 0.23002567887306213, + "learning_rate": 0.001008, + "loss": 2.379, + "step": 5040 + }, + { + "epoch": 0.01952188770855561, + "grad_norm": 0.1744360476732254, + "learning_rate": 0.00101, + "loss": 2.3835, + "step": 5050 + }, + { + "epoch": 0.01956054491193889, + "grad_norm": 0.213888481259346, + "learning_rate": 0.001012, + "loss": 2.3697, + "step": 5060 + }, + { + "epoch": 0.019599202115322168, + "grad_norm": 0.2396910935640335, + "learning_rate": 0.001014, + "loss": 2.3888, + "step": 5070 + }, + { + "epoch": 0.019637859318705448, + "grad_norm": 0.22268559038639069, + "learning_rate": 0.001016, + "loss": 2.3851, + "step": 5080 + }, + { + "epoch": 0.019676516522088724, + "grad_norm": 0.2205670177936554, + "learning_rate": 0.001018, + "loss": 2.3908, + "step": 5090 + }, + { + "epoch": 0.019715173725472004, + "grad_norm": 0.2212257832288742, + "learning_rate": 0.00102, + "loss": 2.3914, + "step": 5100 + }, + { + "epoch": 0.019753830928855284, + "grad_norm": 0.1979675441980362, + "learning_rate": 0.0010220000000000001, + "loss": 2.3785, + "step": 5110 + }, + { + "epoch": 0.01979248813223856, + "grad_norm": 0.275046706199646, + "learning_rate": 0.001024, + "loss": 2.3741, + "step": 5120 + }, + { + "epoch": 0.01983114533562184, + "grad_norm": 0.1912265121936798, + "learning_rate": 0.001026, + "loss": 2.3851, + "step": 5130 + }, + { + "epoch": 0.019869802539005117, + "grad_norm": 0.2097253054380417, + "learning_rate": 0.001028, + "loss": 2.3815, + "step": 5140 + }, + { + "epoch": 0.019908459742388397, + "grad_norm": 0.19920724630355835, + "learning_rate": 0.00103, + "loss": 2.3834, + "step": 5150 + }, + { + "epoch": 0.019947116945771674, + "grad_norm": 0.18724262714385986, + "learning_rate": 0.0010320000000000001, + "loss": 2.3754, + "step": 5160 + }, + { + "epoch": 0.019985774149154954, + "grad_norm": 0.20353613793849945, + "learning_rate": 0.001034, + "loss": 2.3732, + "step": 5170 + }, + { + "epoch": 0.020024431352538234, + "grad_norm": 0.24157506227493286, + "learning_rate": 0.001036, + "loss": 2.3814, + "step": 5180 + }, + { + "epoch": 0.02006308855592151, + "grad_norm": 0.2027304321527481, + "learning_rate": 0.001038, + "loss": 2.3868, + "step": 5190 + }, + { + "epoch": 0.02010174575930479, + "grad_norm": 0.20146556198596954, + "learning_rate": 0.0010400000000000001, + "loss": 2.3819, + "step": 5200 + }, + { + "epoch": 0.020140402962688066, + "grad_norm": 0.1771874725818634, + "learning_rate": 0.001042, + "loss": 2.3867, + "step": 5210 + }, + { + "epoch": 0.020179060166071346, + "grad_norm": 0.18880490958690643, + "learning_rate": 0.001044, + "loss": 2.3862, + "step": 5220 + }, + { + "epoch": 0.020217717369454623, + "grad_norm": 0.2295592725276947, + "learning_rate": 0.001046, + "loss": 2.38, + "step": 5230 + }, + { + "epoch": 0.020256374572837903, + "grad_norm": 0.20400582253932953, + "learning_rate": 0.001048, + "loss": 2.3833, + "step": 5240 + }, + { + "epoch": 0.020295031776221183, + "grad_norm": 0.20016399025917053, + "learning_rate": 0.0010500000000000002, + "loss": 2.3831, + "step": 5250 + }, + { + "epoch": 0.02033368897960446, + "grad_norm": 0.211043119430542, + "learning_rate": 0.001052, + "loss": 2.3928, + "step": 5260 + }, + { + "epoch": 0.02037234618298774, + "grad_norm": 0.16745540499687195, + "learning_rate": 0.001054, + "loss": 2.3754, + "step": 5270 + }, + { + "epoch": 0.020411003386371016, + "grad_norm": 0.2527609169483185, + "learning_rate": 0.001056, + "loss": 2.3707, + "step": 5280 + }, + { + "epoch": 0.020449660589754295, + "grad_norm": 0.24378624558448792, + "learning_rate": 0.0010580000000000001, + "loss": 2.3861, + "step": 5290 + }, + { + "epoch": 0.020488317793137572, + "grad_norm": 0.215751051902771, + "learning_rate": 0.0010600000000000002, + "loss": 2.3821, + "step": 5300 + }, + { + "epoch": 0.020526974996520852, + "grad_norm": 0.17920060455799103, + "learning_rate": 0.001062, + "loss": 2.374, + "step": 5310 + }, + { + "epoch": 0.020565632199904132, + "grad_norm": 0.19355903565883636, + "learning_rate": 0.001064, + "loss": 2.4042, + "step": 5320 + }, + { + "epoch": 0.02060428940328741, + "grad_norm": 0.1970899999141693, + "learning_rate": 0.001066, + "loss": 2.3778, + "step": 5330 + }, + { + "epoch": 0.020642946606670688, + "grad_norm": 0.21478936076164246, + "learning_rate": 0.0010680000000000002, + "loss": 2.3864, + "step": 5340 + }, + { + "epoch": 0.020681603810053965, + "grad_norm": 0.1935175359249115, + "learning_rate": 0.00107, + "loss": 2.3968, + "step": 5350 + }, + { + "epoch": 0.020720261013437245, + "grad_norm": 0.20764757692813873, + "learning_rate": 0.001072, + "loss": 2.3814, + "step": 5360 + }, + { + "epoch": 0.02075891821682052, + "grad_norm": 0.2129974067211151, + "learning_rate": 0.001074, + "loss": 2.377, + "step": 5370 + }, + { + "epoch": 0.0207975754202038, + "grad_norm": 0.1787700653076172, + "learning_rate": 0.0010760000000000001, + "loss": 2.3644, + "step": 5380 + }, + { + "epoch": 0.02083623262358708, + "grad_norm": 0.17959833145141602, + "learning_rate": 0.0010780000000000002, + "loss": 2.3947, + "step": 5390 + }, + { + "epoch": 0.020874889826970357, + "grad_norm": 0.1857430785894394, + "learning_rate": 0.00108, + "loss": 2.3993, + "step": 5400 + }, + { + "epoch": 0.020913547030353637, + "grad_norm": 0.21030008792877197, + "learning_rate": 0.001082, + "loss": 2.3942, + "step": 5410 + }, + { + "epoch": 0.020952204233736914, + "grad_norm": 0.2184520959854126, + "learning_rate": 0.0010840000000000001, + "loss": 2.3898, + "step": 5420 + }, + { + "epoch": 0.020990861437120194, + "grad_norm": 0.22430236637592316, + "learning_rate": 0.0010860000000000002, + "loss": 2.3829, + "step": 5430 + }, + { + "epoch": 0.02102951864050347, + "grad_norm": 0.20971611142158508, + "learning_rate": 0.0010880000000000002, + "loss": 2.381, + "step": 5440 + }, + { + "epoch": 0.02106817584388675, + "grad_norm": 0.19676510989665985, + "learning_rate": 0.00109, + "loss": 2.3834, + "step": 5450 + }, + { + "epoch": 0.021106833047270027, + "grad_norm": 0.2264118492603302, + "learning_rate": 0.001092, + "loss": 2.3947, + "step": 5460 + }, + { + "epoch": 0.021145490250653307, + "grad_norm": 0.1974431425333023, + "learning_rate": 0.0010940000000000001, + "loss": 2.3712, + "step": 5470 + }, + { + "epoch": 0.021184147454036586, + "grad_norm": 0.20495396852493286, + "learning_rate": 0.0010960000000000002, + "loss": 2.3856, + "step": 5480 + }, + { + "epoch": 0.021222804657419863, + "grad_norm": 0.19436782598495483, + "learning_rate": 0.001098, + "loss": 2.3905, + "step": 5490 + }, + { + "epoch": 0.021261461860803143, + "grad_norm": 0.1938960999250412, + "learning_rate": 0.0011, + "loss": 2.3965, + "step": 5500 + }, + { + "epoch": 0.02130011906418642, + "grad_norm": 0.18784378468990326, + "learning_rate": 0.0011020000000000001, + "loss": 2.3885, + "step": 5510 + }, + { + "epoch": 0.0213387762675697, + "grad_norm": 0.20048850774765015, + "learning_rate": 0.0011040000000000002, + "loss": 2.3738, + "step": 5520 + }, + { + "epoch": 0.021377433470952976, + "grad_norm": 0.2599906623363495, + "learning_rate": 0.0011060000000000002, + "loss": 2.4158, + "step": 5530 + }, + { + "epoch": 0.021416090674336256, + "grad_norm": 0.2207055687904358, + "learning_rate": 0.001108, + "loss": 2.393, + "step": 5540 + }, + { + "epoch": 0.021454747877719536, + "grad_norm": 0.2058638483285904, + "learning_rate": 0.00111, + "loss": 2.3804, + "step": 5550 + }, + { + "epoch": 0.021493405081102812, + "grad_norm": 0.22761179506778717, + "learning_rate": 0.0011120000000000001, + "loss": 2.3845, + "step": 5560 + }, + { + "epoch": 0.021532062284486092, + "grad_norm": 0.18034592270851135, + "learning_rate": 0.0011140000000000002, + "loss": 2.4026, + "step": 5570 + }, + { + "epoch": 0.02157071948786937, + "grad_norm": 0.21812139451503754, + "learning_rate": 0.001116, + "loss": 2.3913, + "step": 5580 + }, + { + "epoch": 0.02160937669125265, + "grad_norm": 0.20300228893756866, + "learning_rate": 0.001118, + "loss": 2.3818, + "step": 5590 + }, + { + "epoch": 0.021648033894635925, + "grad_norm": 0.1981932371854782, + "learning_rate": 0.0011200000000000001, + "loss": 2.407, + "step": 5600 + }, + { + "epoch": 0.021686691098019205, + "grad_norm": 0.21006079018115997, + "learning_rate": 0.0011220000000000002, + "loss": 2.3832, + "step": 5610 + }, + { + "epoch": 0.021725348301402485, + "grad_norm": 0.24926023185253143, + "learning_rate": 0.0011240000000000002, + "loss": 2.3968, + "step": 5620 + }, + { + "epoch": 0.02176400550478576, + "grad_norm": 0.21123667061328888, + "learning_rate": 0.0011259999999999998, + "loss": 2.3916, + "step": 5630 + }, + { + "epoch": 0.02180266270816904, + "grad_norm": 0.19651569426059723, + "learning_rate": 0.0011279999999999999, + "loss": 2.4016, + "step": 5640 + }, + { + "epoch": 0.021841319911552318, + "grad_norm": 0.17947527766227722, + "learning_rate": 0.00113, + "loss": 2.3748, + "step": 5650 + }, + { + "epoch": 0.021879977114935598, + "grad_norm": 0.25798648595809937, + "learning_rate": 0.001132, + "loss": 2.3818, + "step": 5660 + }, + { + "epoch": 0.021918634318318874, + "grad_norm": 0.2755715548992157, + "learning_rate": 0.001134, + "loss": 2.3983, + "step": 5670 + }, + { + "epoch": 0.021957291521702154, + "grad_norm": 0.19706271588802338, + "learning_rate": 0.0011359999999999999, + "loss": 2.3904, + "step": 5680 + }, + { + "epoch": 0.021995948725085434, + "grad_norm": 0.2231791466474533, + "learning_rate": 0.001138, + "loss": 2.4054, + "step": 5690 + }, + { + "epoch": 0.02203460592846871, + "grad_norm": 0.19517715275287628, + "learning_rate": 0.00114, + "loss": 2.3792, + "step": 5700 + }, + { + "epoch": 0.02207326313185199, + "grad_norm": 0.21162500977516174, + "learning_rate": 0.001142, + "loss": 2.3907, + "step": 5710 + }, + { + "epoch": 0.022111920335235267, + "grad_norm": 0.24791096150875092, + "learning_rate": 0.0011439999999999998, + "loss": 2.4084, + "step": 5720 + }, + { + "epoch": 0.022150577538618547, + "grad_norm": 0.22254496812820435, + "learning_rate": 0.0011459999999999999, + "loss": 2.4031, + "step": 5730 + }, + { + "epoch": 0.022189234742001823, + "grad_norm": 0.17276102304458618, + "learning_rate": 0.001148, + "loss": 2.3873, + "step": 5740 + }, + { + "epoch": 0.022227891945385103, + "grad_norm": 0.21049363911151886, + "learning_rate": 0.00115, + "loss": 2.3936, + "step": 5750 + }, + { + "epoch": 0.022266549148768383, + "grad_norm": 0.21012358367443085, + "learning_rate": 0.001152, + "loss": 2.3911, + "step": 5760 + }, + { + "epoch": 0.02230520635215166, + "grad_norm": 0.17598873376846313, + "learning_rate": 0.0011539999999999999, + "loss": 2.3986, + "step": 5770 + }, + { + "epoch": 0.02234386355553494, + "grad_norm": 0.21582140028476715, + "learning_rate": 0.001156, + "loss": 2.385, + "step": 5780 + }, + { + "epoch": 0.022382520758918216, + "grad_norm": 0.2273397147655487, + "learning_rate": 0.001158, + "loss": 2.401, + "step": 5790 + }, + { + "epoch": 0.022421177962301496, + "grad_norm": 0.16410350799560547, + "learning_rate": 0.00116, + "loss": 2.3746, + "step": 5800 + }, + { + "epoch": 0.022459835165684772, + "grad_norm": 0.19850672781467438, + "learning_rate": 0.0011619999999999998, + "loss": 2.3876, + "step": 5810 + }, + { + "epoch": 0.022498492369068052, + "grad_norm": 0.2051462084054947, + "learning_rate": 0.0011639999999999999, + "loss": 2.4057, + "step": 5820 + }, + { + "epoch": 0.022537149572451332, + "grad_norm": 0.19643060863018036, + "learning_rate": 0.001166, + "loss": 2.3852, + "step": 5830 + }, + { + "epoch": 0.02257580677583461, + "grad_norm": 0.21040578186511993, + "learning_rate": 0.001168, + "loss": 2.3928, + "step": 5840 + }, + { + "epoch": 0.02261446397921789, + "grad_norm": 0.19768066704273224, + "learning_rate": 0.00117, + "loss": 2.4053, + "step": 5850 + }, + { + "epoch": 0.022653121182601165, + "grad_norm": 0.21659426391124725, + "learning_rate": 0.0011719999999999999, + "loss": 2.409, + "step": 5860 + }, + { + "epoch": 0.022691778385984445, + "grad_norm": 0.23596347868442535, + "learning_rate": 0.001174, + "loss": 2.4044, + "step": 5870 + }, + { + "epoch": 0.02273043558936772, + "grad_norm": 0.1964534968137741, + "learning_rate": 0.001176, + "loss": 2.3805, + "step": 5880 + }, + { + "epoch": 0.022769092792751, + "grad_norm": 0.15665480494499207, + "learning_rate": 0.001178, + "loss": 2.3937, + "step": 5890 + }, + { + "epoch": 0.02280774999613428, + "grad_norm": 0.18110042810440063, + "learning_rate": 0.00118, + "loss": 2.3911, + "step": 5900 + }, + { + "epoch": 0.022846407199517558, + "grad_norm": 0.20500323176383972, + "learning_rate": 0.0011819999999999999, + "loss": 2.3963, + "step": 5910 + }, + { + "epoch": 0.022885064402900838, + "grad_norm": 0.21157889068126678, + "learning_rate": 0.001184, + "loss": 2.3959, + "step": 5920 + }, + { + "epoch": 0.022923721606284114, + "grad_norm": 0.17701420187950134, + "learning_rate": 0.001186, + "loss": 2.4032, + "step": 5930 + }, + { + "epoch": 0.022962378809667394, + "grad_norm": 0.20900875329971313, + "learning_rate": 0.001188, + "loss": 2.3954, + "step": 5940 + }, + { + "epoch": 0.02300103601305067, + "grad_norm": 0.18997344374656677, + "learning_rate": 0.0011899999999999999, + "loss": 2.3865, + "step": 5950 + }, + { + "epoch": 0.02303969321643395, + "grad_norm": 0.2688944935798645, + "learning_rate": 0.001192, + "loss": 2.4034, + "step": 5960 + }, + { + "epoch": 0.023078350419817227, + "grad_norm": 0.2342280000448227, + "learning_rate": 0.001194, + "loss": 2.4029, + "step": 5970 + }, + { + "epoch": 0.023117007623200507, + "grad_norm": 0.1937403380870819, + "learning_rate": 0.001196, + "loss": 2.4059, + "step": 5980 + }, + { + "epoch": 0.023155664826583787, + "grad_norm": 0.1973395198583603, + "learning_rate": 0.001198, + "loss": 2.3967, + "step": 5990 + }, + { + "epoch": 0.023194322029967063, + "grad_norm": 0.2075272798538208, + "learning_rate": 0.0012, + "loss": 2.3909, + "step": 6000 + }, + { + "epoch": 0.023232979233350343, + "grad_norm": 0.18843838572502136, + "learning_rate": 0.001202, + "loss": 2.4028, + "step": 6010 + }, + { + "epoch": 0.02327163643673362, + "grad_norm": 0.20422710478305817, + "learning_rate": 0.001204, + "loss": 2.3918, + "step": 6020 + }, + { + "epoch": 0.0233102936401169, + "grad_norm": 0.19518813490867615, + "learning_rate": 0.001206, + "loss": 2.3905, + "step": 6030 + }, + { + "epoch": 0.023348950843500176, + "grad_norm": 0.19929325580596924, + "learning_rate": 0.001208, + "loss": 2.3904, + "step": 6040 + }, + { + "epoch": 0.023387608046883456, + "grad_norm": 0.20663338899612427, + "learning_rate": 0.00121, + "loss": 2.4037, + "step": 6050 + }, + { + "epoch": 0.023426265250266736, + "grad_norm": 0.19592155516147614, + "learning_rate": 0.001212, + "loss": 2.3941, + "step": 6060 + }, + { + "epoch": 0.023464922453650013, + "grad_norm": 0.23385976254940033, + "learning_rate": 0.001214, + "loss": 2.4098, + "step": 6070 + }, + { + "epoch": 0.023503579657033292, + "grad_norm": 0.1804191768169403, + "learning_rate": 0.001216, + "loss": 2.4017, + "step": 6080 + }, + { + "epoch": 0.02354223686041657, + "grad_norm": 0.26508116722106934, + "learning_rate": 0.001218, + "loss": 2.403, + "step": 6090 + }, + { + "epoch": 0.02358089406379985, + "grad_norm": 0.23387646675109863, + "learning_rate": 0.00122, + "loss": 2.41, + "step": 6100 + }, + { + "epoch": 0.023619551267183125, + "grad_norm": 0.19134730100631714, + "learning_rate": 0.001222, + "loss": 2.4137, + "step": 6110 + }, + { + "epoch": 0.023658208470566405, + "grad_norm": 0.18082371354103088, + "learning_rate": 0.001224, + "loss": 2.4185, + "step": 6120 + }, + { + "epoch": 0.023696865673949685, + "grad_norm": 0.19267909228801727, + "learning_rate": 0.001226, + "loss": 2.3945, + "step": 6130 + }, + { + "epoch": 0.02373552287733296, + "grad_norm": 0.19761332869529724, + "learning_rate": 0.001228, + "loss": 2.3968, + "step": 6140 + }, + { + "epoch": 0.02377418008071624, + "grad_norm": 0.20080329477787018, + "learning_rate": 0.00123, + "loss": 2.4168, + "step": 6150 + }, + { + "epoch": 0.023812837284099518, + "grad_norm": 0.17178675532341003, + "learning_rate": 0.001232, + "loss": 2.4024, + "step": 6160 + }, + { + "epoch": 0.023851494487482798, + "grad_norm": 0.17782160639762878, + "learning_rate": 0.001234, + "loss": 2.3888, + "step": 6170 + }, + { + "epoch": 0.023890151690866075, + "grad_norm": 0.2577625513076782, + "learning_rate": 0.0012360000000000001, + "loss": 2.3935, + "step": 6180 + }, + { + "epoch": 0.023928808894249354, + "grad_norm": 0.17612770199775696, + "learning_rate": 0.001238, + "loss": 2.4003, + "step": 6190 + }, + { + "epoch": 0.023967466097632634, + "grad_norm": 0.24437908828258514, + "learning_rate": 0.00124, + "loss": 2.3939, + "step": 6200 + }, + { + "epoch": 0.02400612330101591, + "grad_norm": 0.21090902388095856, + "learning_rate": 0.001242, + "loss": 2.4046, + "step": 6210 + }, + { + "epoch": 0.02404478050439919, + "grad_norm": 0.18830551207065582, + "learning_rate": 0.001244, + "loss": 2.404, + "step": 6220 + }, + { + "epoch": 0.024083437707782467, + "grad_norm": 0.2057628333568573, + "learning_rate": 0.001246, + "loss": 2.3923, + "step": 6230 + }, + { + "epoch": 0.024122094911165747, + "grad_norm": 0.21243716776371002, + "learning_rate": 0.001248, + "loss": 2.4043, + "step": 6240 + }, + { + "epoch": 0.024160752114549024, + "grad_norm": 0.24437469244003296, + "learning_rate": 0.00125, + "loss": 2.3894, + "step": 6250 + }, + { + "epoch": 0.024199409317932304, + "grad_norm": 0.20179718732833862, + "learning_rate": 0.001252, + "loss": 2.4006, + "step": 6260 + }, + { + "epoch": 0.024238066521315584, + "grad_norm": 0.3111397325992584, + "learning_rate": 0.0012540000000000001, + "loss": 2.4114, + "step": 6270 + }, + { + "epoch": 0.02427672372469886, + "grad_norm": 0.2358233630657196, + "learning_rate": 0.001256, + "loss": 2.4033, + "step": 6280 + }, + { + "epoch": 0.02431538092808214, + "grad_norm": 0.2047593742609024, + "learning_rate": 0.001258, + "loss": 2.4007, + "step": 6290 + }, + { + "epoch": 0.024354038131465416, + "grad_norm": 0.18096667528152466, + "learning_rate": 0.00126, + "loss": 2.3984, + "step": 6300 + }, + { + "epoch": 0.024392695334848696, + "grad_norm": 0.24265827238559723, + "learning_rate": 0.001262, + "loss": 2.3881, + "step": 6310 + }, + { + "epoch": 0.024431352538231973, + "grad_norm": 0.17325134575366974, + "learning_rate": 0.001264, + "loss": 2.4013, + "step": 6320 + }, + { + "epoch": 0.024470009741615253, + "grad_norm": 0.19234231114387512, + "learning_rate": 0.001266, + "loss": 2.4039, + "step": 6330 + }, + { + "epoch": 0.024508666944998533, + "grad_norm": 0.24274741113185883, + "learning_rate": 0.001268, + "loss": 2.4045, + "step": 6340 + }, + { + "epoch": 0.02454732414838181, + "grad_norm": 0.22068293392658234, + "learning_rate": 0.00127, + "loss": 2.4088, + "step": 6350 + }, + { + "epoch": 0.02458598135176509, + "grad_norm": 0.5849812626838684, + "learning_rate": 0.0012720000000000001, + "loss": 2.4016, + "step": 6360 + }, + { + "epoch": 0.024624638555148366, + "grad_norm": 0.1535091996192932, + "learning_rate": 0.001274, + "loss": 2.4138, + "step": 6370 + }, + { + "epoch": 0.024663295758531645, + "grad_norm": 0.25961729884147644, + "learning_rate": 0.001276, + "loss": 2.4161, + "step": 6380 + }, + { + "epoch": 0.024701952961914922, + "grad_norm": 0.18405650556087494, + "learning_rate": 0.001278, + "loss": 2.4102, + "step": 6390 + }, + { + "epoch": 0.024740610165298202, + "grad_norm": 0.16772936284542084, + "learning_rate": 0.00128, + "loss": 2.4185, + "step": 6400 + }, + { + "epoch": 0.024779267368681482, + "grad_norm": 0.18456052243709564, + "learning_rate": 0.0012820000000000002, + "loss": 2.392, + "step": 6410 + }, + { + "epoch": 0.02481792457206476, + "grad_norm": 0.2476855367422104, + "learning_rate": 0.001284, + "loss": 2.4039, + "step": 6420 + }, + { + "epoch": 0.024856581775448038, + "grad_norm": 0.19195041060447693, + "learning_rate": 0.001286, + "loss": 2.4161, + "step": 6430 + }, + { + "epoch": 0.024895238978831315, + "grad_norm": 0.17237518727779388, + "learning_rate": 0.001288, + "loss": 2.3988, + "step": 6440 + }, + { + "epoch": 0.024933896182214595, + "grad_norm": 0.21002577245235443, + "learning_rate": 0.0012900000000000001, + "loss": 2.3972, + "step": 6450 + }, + { + "epoch": 0.02497255338559787, + "grad_norm": 0.2846126854419708, + "learning_rate": 0.001292, + "loss": 2.4008, + "step": 6460 + }, + { + "epoch": 0.02501121058898115, + "grad_norm": 0.18850302696228027, + "learning_rate": 0.001294, + "loss": 2.4067, + "step": 6470 + }, + { + "epoch": 0.025049867792364428, + "grad_norm": 0.19248883426189423, + "learning_rate": 0.001296, + "loss": 2.4116, + "step": 6480 + }, + { + "epoch": 0.025088524995747707, + "grad_norm": 0.2386600375175476, + "learning_rate": 0.0012980000000000001, + "loss": 2.412, + "step": 6490 + }, + { + "epoch": 0.025127182199130987, + "grad_norm": 0.20422658324241638, + "learning_rate": 0.0013000000000000002, + "loss": 2.3993, + "step": 6500 + }, + { + "epoch": 0.025165839402514264, + "grad_norm": 0.20139940083026886, + "learning_rate": 0.001302, + "loss": 2.3956, + "step": 6510 + }, + { + "epoch": 0.025204496605897544, + "grad_norm": 0.20745849609375, + "learning_rate": 0.001304, + "loss": 2.4091, + "step": 6520 + }, + { + "epoch": 0.02524315380928082, + "grad_norm": 0.21816179156303406, + "learning_rate": 0.001306, + "loss": 2.3967, + "step": 6530 + }, + { + "epoch": 0.0252818110126641, + "grad_norm": 0.18074050545692444, + "learning_rate": 0.0013080000000000001, + "loss": 2.402, + "step": 6540 + }, + { + "epoch": 0.025320468216047377, + "grad_norm": 0.1811356097459793, + "learning_rate": 0.0013100000000000002, + "loss": 2.4289, + "step": 6550 + }, + { + "epoch": 0.025359125419430657, + "grad_norm": 0.1797647923231125, + "learning_rate": 0.001312, + "loss": 2.3928, + "step": 6560 + }, + { + "epoch": 0.025397782622813937, + "grad_norm": 0.22218936681747437, + "learning_rate": 0.001314, + "loss": 2.4093, + "step": 6570 + }, + { + "epoch": 0.025436439826197213, + "grad_norm": 0.19117140769958496, + "learning_rate": 0.0013160000000000001, + "loss": 2.4005, + "step": 6580 + }, + { + "epoch": 0.025475097029580493, + "grad_norm": 0.2183677852153778, + "learning_rate": 0.0013180000000000002, + "loss": 2.4097, + "step": 6590 + }, + { + "epoch": 0.02551375423296377, + "grad_norm": 0.24778427183628082, + "learning_rate": 0.00132, + "loss": 2.3962, + "step": 6600 + }, + { + "epoch": 0.02555241143634705, + "grad_norm": 0.19872575998306274, + "learning_rate": 0.001322, + "loss": 2.3982, + "step": 6610 + }, + { + "epoch": 0.025591068639730326, + "grad_norm": 0.1927270144224167, + "learning_rate": 0.001324, + "loss": 2.408, + "step": 6620 + }, + { + "epoch": 0.025629725843113606, + "grad_norm": 0.19589394330978394, + "learning_rate": 0.0013260000000000001, + "loss": 2.4102, + "step": 6630 + }, + { + "epoch": 0.025668383046496886, + "grad_norm": 0.21557344496250153, + "learning_rate": 0.0013280000000000002, + "loss": 2.4112, + "step": 6640 + }, + { + "epoch": 0.025707040249880162, + "grad_norm": 0.23353618383407593, + "learning_rate": 0.00133, + "loss": 2.4121, + "step": 6650 + }, + { + "epoch": 0.025745697453263442, + "grad_norm": 0.1943051666021347, + "learning_rate": 0.001332, + "loss": 2.3952, + "step": 6660 + }, + { + "epoch": 0.02578435465664672, + "grad_norm": 0.18049027025699615, + "learning_rate": 0.0013340000000000001, + "loss": 2.4113, + "step": 6670 + }, + { + "epoch": 0.02582301186003, + "grad_norm": 0.17567066848278046, + "learning_rate": 0.0013360000000000002, + "loss": 2.4026, + "step": 6680 + }, + { + "epoch": 0.025861669063413275, + "grad_norm": 0.16737550497055054, + "learning_rate": 0.0013380000000000002, + "loss": 2.4194, + "step": 6690 + }, + { + "epoch": 0.025900326266796555, + "grad_norm": 0.1732235997915268, + "learning_rate": 0.00134, + "loss": 2.4126, + "step": 6700 + }, + { + "epoch": 0.025938983470179835, + "grad_norm": 0.2096925675868988, + "learning_rate": 0.001342, + "loss": 2.4245, + "step": 6710 + }, + { + "epoch": 0.02597764067356311, + "grad_norm": 0.1870320439338684, + "learning_rate": 0.0013440000000000001, + "loss": 2.3983, + "step": 6720 + }, + { + "epoch": 0.02601629787694639, + "grad_norm": 0.16722071170806885, + "learning_rate": 0.0013460000000000002, + "loss": 2.4046, + "step": 6730 + }, + { + "epoch": 0.026054955080329668, + "grad_norm": 0.25226983428001404, + "learning_rate": 0.001348, + "loss": 2.4059, + "step": 6740 + }, + { + "epoch": 0.026093612283712948, + "grad_norm": 0.18488508462905884, + "learning_rate": 0.00135, + "loss": 2.4231, + "step": 6750 + }, + { + "epoch": 0.026132269487096224, + "grad_norm": 0.294162392616272, + "learning_rate": 0.0013520000000000001, + "loss": 2.4038, + "step": 6760 + }, + { + "epoch": 0.026170926690479504, + "grad_norm": 0.18017978966236115, + "learning_rate": 0.0013540000000000002, + "loss": 2.4268, + "step": 6770 + }, + { + "epoch": 0.026209583893862784, + "grad_norm": 0.15233299136161804, + "learning_rate": 0.0013560000000000002, + "loss": 2.4033, + "step": 6780 + }, + { + "epoch": 0.02624824109724606, + "grad_norm": 0.25568193197250366, + "learning_rate": 0.001358, + "loss": 2.4252, + "step": 6790 + }, + { + "epoch": 0.02628689830062934, + "grad_norm": 0.2158443033695221, + "learning_rate": 0.00136, + "loss": 2.3924, + "step": 6800 + }, + { + "epoch": 0.026325555504012617, + "grad_norm": 0.2108936309814453, + "learning_rate": 0.0013620000000000001, + "loss": 2.3924, + "step": 6810 + }, + { + "epoch": 0.026364212707395897, + "grad_norm": 0.21832455694675446, + "learning_rate": 0.0013640000000000002, + "loss": 2.4067, + "step": 6820 + }, + { + "epoch": 0.026402869910779173, + "grad_norm": 0.19482752680778503, + "learning_rate": 0.001366, + "loss": 2.4123, + "step": 6830 + }, + { + "epoch": 0.026441527114162453, + "grad_norm": 0.21157225966453552, + "learning_rate": 0.001368, + "loss": 2.4023, + "step": 6840 + }, + { + "epoch": 0.026480184317545733, + "grad_norm": 0.18475206196308136, + "learning_rate": 0.0013700000000000001, + "loss": 2.4093, + "step": 6850 + }, + { + "epoch": 0.02651884152092901, + "grad_norm": 0.1895562708377838, + "learning_rate": 0.0013720000000000002, + "loss": 2.4012, + "step": 6860 + }, + { + "epoch": 0.02655749872431229, + "grad_norm": 0.20044149458408356, + "learning_rate": 0.0013740000000000002, + "loss": 2.4186, + "step": 6870 + }, + { + "epoch": 0.026596155927695566, + "grad_norm": 0.18344005942344666, + "learning_rate": 0.0013759999999999998, + "loss": 2.401, + "step": 6880 + }, + { + "epoch": 0.026634813131078846, + "grad_norm": 0.2210623025894165, + "learning_rate": 0.0013779999999999999, + "loss": 2.4028, + "step": 6890 + }, + { + "epoch": 0.026673470334462122, + "grad_norm": 0.22009354829788208, + "learning_rate": 0.00138, + "loss": 2.4031, + "step": 6900 + }, + { + "epoch": 0.026712127537845402, + "grad_norm": 0.16893121600151062, + "learning_rate": 0.001382, + "loss": 2.4234, + "step": 6910 + }, + { + "epoch": 0.026750784741228682, + "grad_norm": 0.19680050015449524, + "learning_rate": 0.001384, + "loss": 2.3951, + "step": 6920 + }, + { + "epoch": 0.02678944194461196, + "grad_norm": 0.1974254697561264, + "learning_rate": 0.0013859999999999999, + "loss": 2.3968, + "step": 6930 + }, + { + "epoch": 0.02682809914799524, + "grad_norm": 0.17142683267593384, + "learning_rate": 0.001388, + "loss": 2.4147, + "step": 6940 + }, + { + "epoch": 0.026866756351378515, + "grad_norm": 0.20399171113967896, + "learning_rate": 0.00139, + "loss": 2.4172, + "step": 6950 + }, + { + "epoch": 0.026905413554761795, + "grad_norm": 0.20409271121025085, + "learning_rate": 0.001392, + "loss": 2.4109, + "step": 6960 + }, + { + "epoch": 0.02694407075814507, + "grad_norm": 0.20235559344291687, + "learning_rate": 0.0013939999999999998, + "loss": 2.4143, + "step": 6970 + }, + { + "epoch": 0.02698272796152835, + "grad_norm": 0.1891755312681198, + "learning_rate": 0.0013959999999999999, + "loss": 2.423, + "step": 6980 + }, + { + "epoch": 0.027021385164911628, + "grad_norm": 0.17014022171497345, + "learning_rate": 0.001398, + "loss": 2.419, + "step": 6990 + }, + { + "epoch": 0.027060042368294908, + "grad_norm": 0.25941529870033264, + "learning_rate": 0.0014, + "loss": 2.4007, + "step": 7000 + }, + { + "epoch": 0.027098699571678188, + "grad_norm": 0.16844941675662994, + "learning_rate": 0.001402, + "loss": 2.4125, + "step": 7010 + }, + { + "epoch": 0.027137356775061464, + "grad_norm": 0.16671425104141235, + "learning_rate": 0.0014039999999999999, + "loss": 2.4036, + "step": 7020 + }, + { + "epoch": 0.027176013978444744, + "grad_norm": 0.20527182519435883, + "learning_rate": 0.001406, + "loss": 2.4013, + "step": 7030 + }, + { + "epoch": 0.02721467118182802, + "grad_norm": 0.17263086140155792, + "learning_rate": 0.001408, + "loss": 2.4145, + "step": 7040 + }, + { + "epoch": 0.0272533283852113, + "grad_norm": 0.20254895091056824, + "learning_rate": 0.00141, + "loss": 2.418, + "step": 7050 + }, + { + "epoch": 0.027291985588594577, + "grad_norm": 0.1766177862882614, + "learning_rate": 0.0014119999999999998, + "loss": 2.4029, + "step": 7060 + }, + { + "epoch": 0.027330642791977857, + "grad_norm": 0.2355898916721344, + "learning_rate": 0.001414, + "loss": 2.4092, + "step": 7070 + }, + { + "epoch": 0.027369299995361137, + "grad_norm": 0.1853450983762741, + "learning_rate": 0.001416, + "loss": 2.4102, + "step": 7080 + }, + { + "epoch": 0.027407957198744413, + "grad_norm": 0.1911727339029312, + "learning_rate": 0.001418, + "loss": 2.4095, + "step": 7090 + }, + { + "epoch": 0.027446614402127693, + "grad_norm": 0.19476091861724854, + "learning_rate": 0.00142, + "loss": 2.3967, + "step": 7100 + }, + { + "epoch": 0.02748527160551097, + "grad_norm": 0.1929769665002823, + "learning_rate": 0.0014219999999999999, + "loss": 2.4075, + "step": 7110 + }, + { + "epoch": 0.02752392880889425, + "grad_norm": 0.189010888338089, + "learning_rate": 0.001424, + "loss": 2.4088, + "step": 7120 + }, + { + "epoch": 0.027562586012277526, + "grad_norm": 0.21539878845214844, + "learning_rate": 0.001426, + "loss": 2.4258, + "step": 7130 + }, + { + "epoch": 0.027601243215660806, + "grad_norm": 0.17167668044567108, + "learning_rate": 0.001428, + "loss": 2.4128, + "step": 7140 + }, + { + "epoch": 0.027639900419044086, + "grad_norm": 0.257478266954422, + "learning_rate": 0.00143, + "loss": 2.4038, + "step": 7150 + }, + { + "epoch": 0.027678557622427363, + "grad_norm": 0.1884448379278183, + "learning_rate": 0.001432, + "loss": 2.4186, + "step": 7160 + }, + { + "epoch": 0.027717214825810643, + "grad_norm": 0.18251746892929077, + "learning_rate": 0.001434, + "loss": 2.4233, + "step": 7170 + }, + { + "epoch": 0.02775587202919392, + "grad_norm": 0.1865987330675125, + "learning_rate": 0.001436, + "loss": 2.3951, + "step": 7180 + }, + { + "epoch": 0.0277945292325772, + "grad_norm": 0.17819331586360931, + "learning_rate": 0.001438, + "loss": 2.4152, + "step": 7190 + }, + { + "epoch": 0.027833186435960475, + "grad_norm": 0.16948436200618744, + "learning_rate": 0.0014399999999999999, + "loss": 2.4266, + "step": 7200 + }, + { + "epoch": 0.027871843639343755, + "grad_norm": 0.1923123002052307, + "learning_rate": 0.001442, + "loss": 2.4095, + "step": 7210 + }, + { + "epoch": 0.027910500842727035, + "grad_norm": 0.18973374366760254, + "learning_rate": 0.001444, + "loss": 2.4033, + "step": 7220 + }, + { + "epoch": 0.027949158046110312, + "grad_norm": 0.1747596561908722, + "learning_rate": 0.001446, + "loss": 2.4203, + "step": 7230 + }, + { + "epoch": 0.02798781524949359, + "grad_norm": 0.20812322199344635, + "learning_rate": 0.001448, + "loss": 2.408, + "step": 7240 + }, + { + "epoch": 0.028026472452876868, + "grad_norm": 0.19561204314231873, + "learning_rate": 0.00145, + "loss": 2.4228, + "step": 7250 + }, + { + "epoch": 0.028065129656260148, + "grad_norm": 0.2217772752046585, + "learning_rate": 0.001452, + "loss": 2.404, + "step": 7260 + }, + { + "epoch": 0.028103786859643425, + "grad_norm": 0.16729161143302917, + "learning_rate": 0.001454, + "loss": 2.405, + "step": 7270 + }, + { + "epoch": 0.028142444063026704, + "grad_norm": 0.20534683763980865, + "learning_rate": 0.001456, + "loss": 2.4125, + "step": 7280 + }, + { + "epoch": 0.028181101266409984, + "grad_norm": 0.20710591971874237, + "learning_rate": 0.001458, + "loss": 2.4264, + "step": 7290 + }, + { + "epoch": 0.02821975846979326, + "grad_norm": 0.1754801869392395, + "learning_rate": 0.00146, + "loss": 2.4193, + "step": 7300 + }, + { + "epoch": 0.02825841567317654, + "grad_norm": 0.20404274761676788, + "learning_rate": 0.001462, + "loss": 2.4108, + "step": 7310 + }, + { + "epoch": 0.028297072876559817, + "grad_norm": 0.2507224380970001, + "learning_rate": 0.001464, + "loss": 2.4159, + "step": 7320 + }, + { + "epoch": 0.028335730079943097, + "grad_norm": 0.16564474999904633, + "learning_rate": 0.001466, + "loss": 2.4109, + "step": 7330 + }, + { + "epoch": 0.028374387283326374, + "grad_norm": 0.17911064624786377, + "learning_rate": 0.001468, + "loss": 2.4013, + "step": 7340 + }, + { + "epoch": 0.028413044486709654, + "grad_norm": 0.2049718201160431, + "learning_rate": 0.00147, + "loss": 2.4342, + "step": 7350 + }, + { + "epoch": 0.028451701690092934, + "grad_norm": 0.18454407155513763, + "learning_rate": 0.001472, + "loss": 2.4239, + "step": 7360 + }, + { + "epoch": 0.02849035889347621, + "grad_norm": 0.17931893467903137, + "learning_rate": 0.001474, + "loss": 2.4239, + "step": 7370 + }, + { + "epoch": 0.02852901609685949, + "grad_norm": 0.1730000078678131, + "learning_rate": 0.001476, + "loss": 2.4134, + "step": 7380 + }, + { + "epoch": 0.028567673300242766, + "grad_norm": 0.18235714733600616, + "learning_rate": 0.001478, + "loss": 2.4198, + "step": 7390 + }, + { + "epoch": 0.028606330503626046, + "grad_norm": 0.1695987433195114, + "learning_rate": 0.00148, + "loss": 2.4202, + "step": 7400 + }, + { + "epoch": 0.028644987707009323, + "grad_norm": 0.2011159062385559, + "learning_rate": 0.001482, + "loss": 2.4123, + "step": 7410 + }, + { + "epoch": 0.028683644910392603, + "grad_norm": 0.20711570978164673, + "learning_rate": 0.001484, + "loss": 2.4094, + "step": 7420 + }, + { + "epoch": 0.028722302113775883, + "grad_norm": 0.1724478304386139, + "learning_rate": 0.0014860000000000001, + "loss": 2.4249, + "step": 7430 + }, + { + "epoch": 0.02876095931715916, + "grad_norm": 0.18605752289295197, + "learning_rate": 0.001488, + "loss": 2.4244, + "step": 7440 + }, + { + "epoch": 0.02879961652054244, + "grad_norm": 0.1809733510017395, + "learning_rate": 0.00149, + "loss": 2.4163, + "step": 7450 + }, + { + "epoch": 0.028838273723925716, + "grad_norm": 0.174998477101326, + "learning_rate": 0.001492, + "loss": 2.4125, + "step": 7460 + }, + { + "epoch": 0.028876930927308996, + "grad_norm": 0.2179594784975052, + "learning_rate": 0.001494, + "loss": 2.4089, + "step": 7470 + }, + { + "epoch": 0.028915588130692272, + "grad_norm": 0.1974450945854187, + "learning_rate": 0.001496, + "loss": 2.4168, + "step": 7480 + }, + { + "epoch": 0.028954245334075552, + "grad_norm": 0.18147790431976318, + "learning_rate": 0.001498, + "loss": 2.4196, + "step": 7490 + }, + { + "epoch": 0.02899290253745883, + "grad_norm": 0.24443376064300537, + "learning_rate": 0.0015, + "loss": 2.425, + "step": 7500 + }, + { + "epoch": 0.02903155974084211, + "grad_norm": 0.1688818484544754, + "learning_rate": 0.001502, + "loss": 2.4076, + "step": 7510 + }, + { + "epoch": 0.02907021694422539, + "grad_norm": 0.18681873381137848, + "learning_rate": 0.0015040000000000001, + "loss": 2.4089, + "step": 7520 + }, + { + "epoch": 0.029108874147608665, + "grad_norm": 0.1955171674489975, + "learning_rate": 0.001506, + "loss": 2.4153, + "step": 7530 + }, + { + "epoch": 0.029147531350991945, + "grad_norm": 0.1625387966632843, + "learning_rate": 0.001508, + "loss": 2.4194, + "step": 7540 + }, + { + "epoch": 0.02918618855437522, + "grad_norm": 0.21609428524971008, + "learning_rate": 0.00151, + "loss": 2.4088, + "step": 7550 + }, + { + "epoch": 0.0292248457577585, + "grad_norm": 0.1740817129611969, + "learning_rate": 0.001512, + "loss": 2.4304, + "step": 7560 + }, + { + "epoch": 0.029263502961141778, + "grad_norm": 0.2153731882572174, + "learning_rate": 0.001514, + "loss": 2.4228, + "step": 7570 + }, + { + "epoch": 0.029302160164525057, + "grad_norm": 0.15823526680469513, + "learning_rate": 0.001516, + "loss": 2.408, + "step": 7580 + }, + { + "epoch": 0.029340817367908337, + "grad_norm": 0.23106537759304047, + "learning_rate": 0.001518, + "loss": 2.412, + "step": 7590 + }, + { + "epoch": 0.029379474571291614, + "grad_norm": 0.17345808446407318, + "learning_rate": 0.00152, + "loss": 2.4121, + "step": 7600 + }, + { + "epoch": 0.029418131774674894, + "grad_norm": 0.1867612898349762, + "learning_rate": 0.0015220000000000001, + "loss": 2.4084, + "step": 7610 + }, + { + "epoch": 0.02945678897805817, + "grad_norm": 0.18916818499565125, + "learning_rate": 0.001524, + "loss": 2.419, + "step": 7620 + }, + { + "epoch": 0.02949544618144145, + "grad_norm": 0.18724285066127777, + "learning_rate": 0.001526, + "loss": 2.4043, + "step": 7630 + }, + { + "epoch": 0.029534103384824727, + "grad_norm": 0.18231108784675598, + "learning_rate": 0.001528, + "loss": 2.4129, + "step": 7640 + }, + { + "epoch": 0.029572760588208007, + "grad_norm": 0.21330668032169342, + "learning_rate": 0.0015300000000000001, + "loss": 2.4074, + "step": 7650 + }, + { + "epoch": 0.029611417791591287, + "grad_norm": 0.16480223834514618, + "learning_rate": 0.0015320000000000002, + "loss": 2.4249, + "step": 7660 + }, + { + "epoch": 0.029650074994974563, + "grad_norm": 0.1870049387216568, + "learning_rate": 0.001534, + "loss": 2.4123, + "step": 7670 + }, + { + "epoch": 0.029688732198357843, + "grad_norm": 0.17868660390377045, + "learning_rate": 0.001536, + "loss": 2.427, + "step": 7680 + }, + { + "epoch": 0.02972738940174112, + "grad_norm": 0.2201504111289978, + "learning_rate": 0.001538, + "loss": 2.4225, + "step": 7690 + }, + { + "epoch": 0.0297660466051244, + "grad_norm": 0.18824969232082367, + "learning_rate": 0.0015400000000000001, + "loss": 2.4198, + "step": 7700 + }, + { + "epoch": 0.029804703808507676, + "grad_norm": 0.18889504671096802, + "learning_rate": 0.001542, + "loss": 2.41, + "step": 7710 + }, + { + "epoch": 0.029843361011890956, + "grad_norm": 0.20518645644187927, + "learning_rate": 0.001544, + "loss": 2.4286, + "step": 7720 + }, + { + "epoch": 0.029882018215274236, + "grad_norm": 0.2309216409921646, + "learning_rate": 0.001546, + "loss": 2.4223, + "step": 7730 + }, + { + "epoch": 0.029920675418657512, + "grad_norm": 0.1851509064435959, + "learning_rate": 0.0015480000000000001, + "loss": 2.4122, + "step": 7740 + }, + { + "epoch": 0.029959332622040792, + "grad_norm": 0.16746215522289276, + "learning_rate": 0.0015500000000000002, + "loss": 2.4176, + "step": 7750 + }, + { + "epoch": 0.02999798982542407, + "grad_norm": 0.17244480550289154, + "learning_rate": 0.001552, + "loss": 2.4138, + "step": 7760 + }, + { + "epoch": 0.03003664702880735, + "grad_norm": 0.22287195920944214, + "learning_rate": 0.001554, + "loss": 2.4368, + "step": 7770 + }, + { + "epoch": 0.030075304232190625, + "grad_norm": 0.19575530290603638, + "learning_rate": 0.001556, + "loss": 2.4253, + "step": 7780 + }, + { + "epoch": 0.030113961435573905, + "grad_norm": 0.1770184487104416, + "learning_rate": 0.0015580000000000001, + "loss": 2.4148, + "step": 7790 + }, + { + "epoch": 0.030152618638957185, + "grad_norm": 0.2342929095029831, + "learning_rate": 0.0015600000000000002, + "loss": 2.423, + "step": 7800 + }, + { + "epoch": 0.03019127584234046, + "grad_norm": 0.16442647576332092, + "learning_rate": 0.001562, + "loss": 2.4262, + "step": 7810 + }, + { + "epoch": 0.03022993304572374, + "grad_norm": 0.18403716385364532, + "learning_rate": 0.001564, + "loss": 2.4315, + "step": 7820 + }, + { + "epoch": 0.030268590249107018, + "grad_norm": 0.15624719858169556, + "learning_rate": 0.0015660000000000001, + "loss": 2.4143, + "step": 7830 + }, + { + "epoch": 0.030307247452490298, + "grad_norm": 0.19365200400352478, + "learning_rate": 0.0015680000000000002, + "loss": 2.4198, + "step": 7840 + }, + { + "epoch": 0.030345904655873574, + "grad_norm": 0.19102121889591217, + "learning_rate": 0.00157, + "loss": 2.4338, + "step": 7850 + }, + { + "epoch": 0.030384561859256854, + "grad_norm": 0.17510868608951569, + "learning_rate": 0.001572, + "loss": 2.4047, + "step": 7860 + }, + { + "epoch": 0.030423219062640134, + "grad_norm": 0.18565328419208527, + "learning_rate": 0.001574, + "loss": 2.4435, + "step": 7870 + }, + { + "epoch": 0.03046187626602341, + "grad_norm": 0.18640480935573578, + "learning_rate": 0.0015760000000000001, + "loss": 2.4197, + "step": 7880 + }, + { + "epoch": 0.03050053346940669, + "grad_norm": 0.1854429990053177, + "learning_rate": 0.0015780000000000002, + "loss": 2.4143, + "step": 7890 + }, + { + "epoch": 0.030539190672789967, + "grad_norm": 0.20129820704460144, + "learning_rate": 0.00158, + "loss": 2.4205, + "step": 7900 + }, + { + "epoch": 0.030577847876173247, + "grad_norm": 0.18941105902194977, + "learning_rate": 0.001582, + "loss": 2.4216, + "step": 7910 + }, + { + "epoch": 0.030616505079556523, + "grad_norm": 0.18483154475688934, + "learning_rate": 0.0015840000000000001, + "loss": 2.4178, + "step": 7920 + }, + { + "epoch": 0.030655162282939803, + "grad_norm": 0.18414242565631866, + "learning_rate": 0.0015860000000000002, + "loss": 2.4261, + "step": 7930 + }, + { + "epoch": 0.030693819486323083, + "grad_norm": 0.20815731585025787, + "learning_rate": 0.0015880000000000002, + "loss": 2.4169, + "step": 7940 + }, + { + "epoch": 0.03073247668970636, + "grad_norm": 0.1711905598640442, + "learning_rate": 0.00159, + "loss": 2.4194, + "step": 7950 + }, + { + "epoch": 0.03077113389308964, + "grad_norm": 0.1658889353275299, + "learning_rate": 0.001592, + "loss": 2.4217, + "step": 7960 + }, + { + "epoch": 0.030809791096472916, + "grad_norm": 0.20899216830730438, + "learning_rate": 0.0015940000000000001, + "loss": 2.4159, + "step": 7970 + }, + { + "epoch": 0.030848448299856196, + "grad_norm": 0.18888837099075317, + "learning_rate": 0.0015960000000000002, + "loss": 2.4203, + "step": 7980 + }, + { + "epoch": 0.030887105503239472, + "grad_norm": 0.20210076868534088, + "learning_rate": 0.001598, + "loss": 2.4151, + "step": 7990 + }, + { + "epoch": 0.030925762706622752, + "grad_norm": 0.16781243681907654, + "learning_rate": 0.0016, + "loss": 2.4053, + "step": 8000 + }, + { + "epoch": 0.03096441991000603, + "grad_norm": 0.16579119861125946, + "learning_rate": 0.0016020000000000001, + "loss": 2.4033, + "step": 8010 + }, + { + "epoch": 0.03100307711338931, + "grad_norm": 0.19240018725395203, + "learning_rate": 0.0016040000000000002, + "loss": 2.4245, + "step": 8020 + }, + { + "epoch": 0.03104173431677259, + "grad_norm": 0.19058342278003693, + "learning_rate": 0.0016060000000000002, + "loss": 2.4219, + "step": 8030 + }, + { + "epoch": 0.031080391520155865, + "grad_norm": 0.16241350769996643, + "learning_rate": 0.001608, + "loss": 2.4243, + "step": 8040 + }, + { + "epoch": 0.031119048723539145, + "grad_norm": 0.17071078717708588, + "learning_rate": 0.00161, + "loss": 2.413, + "step": 8050 + }, + { + "epoch": 0.03115770592692242, + "grad_norm": 0.1861797720193863, + "learning_rate": 0.0016120000000000002, + "loss": 2.4404, + "step": 8060 + }, + { + "epoch": 0.0311963631303057, + "grad_norm": 0.1876525580883026, + "learning_rate": 0.0016140000000000002, + "loss": 2.4313, + "step": 8070 + }, + { + "epoch": 0.031235020333688978, + "grad_norm": 0.22800956666469574, + "learning_rate": 0.001616, + "loss": 2.4158, + "step": 8080 + }, + { + "epoch": 0.03127367753707226, + "grad_norm": 0.20102789998054504, + "learning_rate": 0.001618, + "loss": 2.4297, + "step": 8090 + }, + { + "epoch": 0.031312334740455534, + "grad_norm": 0.1998523324728012, + "learning_rate": 0.0016200000000000001, + "loss": 2.4265, + "step": 8100 + }, + { + "epoch": 0.031350991943838814, + "grad_norm": 0.21659059822559357, + "learning_rate": 0.0016220000000000002, + "loss": 2.4388, + "step": 8110 + }, + { + "epoch": 0.031389649147222094, + "grad_norm": 0.20605839788913727, + "learning_rate": 0.0016240000000000002, + "loss": 2.4294, + "step": 8120 + }, + { + "epoch": 0.031428306350605374, + "grad_norm": 0.19702279567718506, + "learning_rate": 0.0016259999999999998, + "loss": 2.4315, + "step": 8130 + }, + { + "epoch": 0.03146696355398865, + "grad_norm": 0.1821749359369278, + "learning_rate": 0.0016279999999999999, + "loss": 2.4176, + "step": 8140 + }, + { + "epoch": 0.03150562075737193, + "grad_norm": 0.22235362231731415, + "learning_rate": 0.00163, + "loss": 2.4267, + "step": 8150 + }, + { + "epoch": 0.03154427796075521, + "grad_norm": 0.16252587735652924, + "learning_rate": 0.001632, + "loss": 2.4193, + "step": 8160 + }, + { + "epoch": 0.03158293516413849, + "grad_norm": 0.3427959680557251, + "learning_rate": 0.001634, + "loss": 2.4284, + "step": 8170 + }, + { + "epoch": 0.03162159236752177, + "grad_norm": 0.16242393851280212, + "learning_rate": 0.0016359999999999999, + "loss": 2.4143, + "step": 8180 + }, + { + "epoch": 0.03166024957090504, + "grad_norm": 0.2046222984790802, + "learning_rate": 0.001638, + "loss": 2.4372, + "step": 8190 + }, + { + "epoch": 0.03169890677428832, + "grad_norm": 0.1593078374862671, + "learning_rate": 0.00164, + "loss": 2.4244, + "step": 8200 + }, + { + "epoch": 0.0317375639776716, + "grad_norm": 0.17311494052410126, + "learning_rate": 0.001642, + "loss": 2.4249, + "step": 8210 + }, + { + "epoch": 0.03177622118105488, + "grad_norm": 0.17711615562438965, + "learning_rate": 0.0016439999999999998, + "loss": 2.4288, + "step": 8220 + }, + { + "epoch": 0.03181487838443815, + "grad_norm": 0.16730278730392456, + "learning_rate": 0.001646, + "loss": 2.4265, + "step": 8230 + }, + { + "epoch": 0.03185353558782143, + "grad_norm": 0.24033501744270325, + "learning_rate": 0.001648, + "loss": 2.4275, + "step": 8240 + }, + { + "epoch": 0.03189219279120471, + "grad_norm": 0.18279847502708435, + "learning_rate": 0.00165, + "loss": 2.4232, + "step": 8250 + }, + { + "epoch": 0.03193084999458799, + "grad_norm": 0.1985992193222046, + "learning_rate": 0.001652, + "loss": 2.4315, + "step": 8260 + }, + { + "epoch": 0.03196950719797127, + "grad_norm": 0.15565842390060425, + "learning_rate": 0.0016539999999999999, + "loss": 2.4191, + "step": 8270 + }, + { + "epoch": 0.032008164401354545, + "grad_norm": 0.21724532544612885, + "learning_rate": 0.001656, + "loss": 2.4069, + "step": 8280 + }, + { + "epoch": 0.032046821604737825, + "grad_norm": 0.16617335379123688, + "learning_rate": 0.001658, + "loss": 2.4075, + "step": 8290 + }, + { + "epoch": 0.032085478808121105, + "grad_norm": 0.15554694831371307, + "learning_rate": 0.00166, + "loss": 2.4068, + "step": 8300 + }, + { + "epoch": 0.032124136011504385, + "grad_norm": 0.2197619378566742, + "learning_rate": 0.0016619999999999998, + "loss": 2.4262, + "step": 8310 + }, + { + "epoch": 0.032162793214887665, + "grad_norm": 0.23088595271110535, + "learning_rate": 0.001664, + "loss": 2.421, + "step": 8320 + }, + { + "epoch": 0.03220145041827094, + "grad_norm": 0.1569271832704544, + "learning_rate": 0.001666, + "loss": 2.4344, + "step": 8330 + }, + { + "epoch": 0.03224010762165422, + "grad_norm": 0.21316871047019958, + "learning_rate": 0.001668, + "loss": 2.424, + "step": 8340 + }, + { + "epoch": 0.0322787648250375, + "grad_norm": 0.199044868350029, + "learning_rate": 0.00167, + "loss": 2.4284, + "step": 8350 + }, + { + "epoch": 0.03231742202842078, + "grad_norm": 0.1673698127269745, + "learning_rate": 0.0016719999999999999, + "loss": 2.4104, + "step": 8360 + }, + { + "epoch": 0.03235607923180405, + "grad_norm": 0.21653950214385986, + "learning_rate": 0.001674, + "loss": 2.4272, + "step": 8370 + }, + { + "epoch": 0.03239473643518733, + "grad_norm": 0.17064034938812256, + "learning_rate": 0.001676, + "loss": 2.427, + "step": 8380 + }, + { + "epoch": 0.03243339363857061, + "grad_norm": 0.18788346648216248, + "learning_rate": 0.001678, + "loss": 2.4276, + "step": 8390 + }, + { + "epoch": 0.03247205084195389, + "grad_norm": 0.24243298172950745, + "learning_rate": 0.00168, + "loss": 2.4277, + "step": 8400 + }, + { + "epoch": 0.03251070804533717, + "grad_norm": 0.21477310359477997, + "learning_rate": 0.001682, + "loss": 2.4281, + "step": 8410 + }, + { + "epoch": 0.032549365248720444, + "grad_norm": 0.15949584543704987, + "learning_rate": 0.001684, + "loss": 2.4278, + "step": 8420 + }, + { + "epoch": 0.032588022452103724, + "grad_norm": 0.1640334576368332, + "learning_rate": 0.001686, + "loss": 2.4266, + "step": 8430 + }, + { + "epoch": 0.032626679655487004, + "grad_norm": 0.21765173971652985, + "learning_rate": 0.001688, + "loss": 2.416, + "step": 8440 + }, + { + "epoch": 0.032665336858870284, + "grad_norm": 0.2018786370754242, + "learning_rate": 0.0016899999999999999, + "loss": 2.4235, + "step": 8450 + }, + { + "epoch": 0.032703994062253564, + "grad_norm": 0.16875681281089783, + "learning_rate": 0.001692, + "loss": 2.4347, + "step": 8460 + }, + { + "epoch": 0.03274265126563684, + "grad_norm": 0.2098286747932434, + "learning_rate": 0.001694, + "loss": 2.4153, + "step": 8470 + }, + { + "epoch": 0.032781308469020116, + "grad_norm": 0.1699836701154709, + "learning_rate": 0.001696, + "loss": 2.431, + "step": 8480 + }, + { + "epoch": 0.032819965672403396, + "grad_norm": 0.16255663335323334, + "learning_rate": 0.001698, + "loss": 2.4396, + "step": 8490 + }, + { + "epoch": 0.032858622875786676, + "grad_norm": 0.1716599017381668, + "learning_rate": 0.0017, + "loss": 2.4215, + "step": 8500 + }, + { + "epoch": 0.03289728007916995, + "grad_norm": 0.16164468228816986, + "learning_rate": 0.001702, + "loss": 2.4358, + "step": 8510 + }, + { + "epoch": 0.03293593728255323, + "grad_norm": 0.23469357192516327, + "learning_rate": 0.001704, + "loss": 2.4325, + "step": 8520 + }, + { + "epoch": 0.03297459448593651, + "grad_norm": 0.19947314262390137, + "learning_rate": 0.001706, + "loss": 2.4306, + "step": 8530 + }, + { + "epoch": 0.03301325168931979, + "grad_norm": 0.16452614963054657, + "learning_rate": 0.001708, + "loss": 2.4251, + "step": 8540 + }, + { + "epoch": 0.03305190889270307, + "grad_norm": 0.16000273823738098, + "learning_rate": 0.00171, + "loss": 2.4243, + "step": 8550 + }, + { + "epoch": 0.03309056609608634, + "grad_norm": 0.19534049928188324, + "learning_rate": 0.001712, + "loss": 2.4204, + "step": 8560 + }, + { + "epoch": 0.03312922329946962, + "grad_norm": 0.17872479557991028, + "learning_rate": 0.001714, + "loss": 2.4244, + "step": 8570 + }, + { + "epoch": 0.0331678805028529, + "grad_norm": 0.17934873700141907, + "learning_rate": 0.001716, + "loss": 2.4344, + "step": 8580 + }, + { + "epoch": 0.03320653770623618, + "grad_norm": 0.14556285738945007, + "learning_rate": 0.001718, + "loss": 2.4166, + "step": 8590 + }, + { + "epoch": 0.03324519490961946, + "grad_norm": 0.17934808135032654, + "learning_rate": 0.00172, + "loss": 2.4329, + "step": 8600 + }, + { + "epoch": 0.033283852113002735, + "grad_norm": 0.1684950292110443, + "learning_rate": 0.001722, + "loss": 2.424, + "step": 8610 + }, + { + "epoch": 0.033322509316386015, + "grad_norm": 0.20048771798610687, + "learning_rate": 0.001724, + "loss": 2.4272, + "step": 8620 + }, + { + "epoch": 0.033361166519769295, + "grad_norm": 0.16629527509212494, + "learning_rate": 0.001726, + "loss": 2.4267, + "step": 8630 + }, + { + "epoch": 0.033399823723152575, + "grad_norm": 0.21294847130775452, + "learning_rate": 0.001728, + "loss": 2.4217, + "step": 8640 + }, + { + "epoch": 0.03343848092653585, + "grad_norm": 0.15192854404449463, + "learning_rate": 0.00173, + "loss": 2.4235, + "step": 8650 + }, + { + "epoch": 0.03347713812991913, + "grad_norm": 0.16855189204216003, + "learning_rate": 0.001732, + "loss": 2.4115, + "step": 8660 + }, + { + "epoch": 0.03351579533330241, + "grad_norm": 0.1933770477771759, + "learning_rate": 0.001734, + "loss": 2.4419, + "step": 8670 + }, + { + "epoch": 0.03355445253668569, + "grad_norm": 0.16225305199623108, + "learning_rate": 0.0017360000000000001, + "loss": 2.435, + "step": 8680 + }, + { + "epoch": 0.03359310974006897, + "grad_norm": 0.1811498999595642, + "learning_rate": 0.001738, + "loss": 2.4223, + "step": 8690 + }, + { + "epoch": 0.03363176694345224, + "grad_norm": 0.18613067269325256, + "learning_rate": 0.00174, + "loss": 2.4329, + "step": 8700 + }, + { + "epoch": 0.03367042414683552, + "grad_norm": 0.2437407523393631, + "learning_rate": 0.001742, + "loss": 2.4193, + "step": 8710 + }, + { + "epoch": 0.0337090813502188, + "grad_norm": 0.18288400769233704, + "learning_rate": 0.001744, + "loss": 2.4332, + "step": 8720 + }, + { + "epoch": 0.03374773855360208, + "grad_norm": 0.18807104229927063, + "learning_rate": 0.001746, + "loss": 2.4134, + "step": 8730 + }, + { + "epoch": 0.03378639575698535, + "grad_norm": 0.9005038738250732, + "learning_rate": 0.001748, + "loss": 2.425, + "step": 8740 + }, + { + "epoch": 0.03382505296036863, + "grad_norm": 0.19479355216026306, + "learning_rate": 0.00175, + "loss": 2.4594, + "step": 8750 + }, + { + "epoch": 0.03386371016375191, + "grad_norm": 0.13804388046264648, + "learning_rate": 0.001752, + "loss": 2.4404, + "step": 8760 + }, + { + "epoch": 0.03390236736713519, + "grad_norm": 0.17309629917144775, + "learning_rate": 0.0017540000000000001, + "loss": 2.425, + "step": 8770 + }, + { + "epoch": 0.03394102457051847, + "grad_norm": 0.15053080022335052, + "learning_rate": 0.001756, + "loss": 2.4196, + "step": 8780 + }, + { + "epoch": 0.033979681773901746, + "grad_norm": 0.16412365436553955, + "learning_rate": 0.001758, + "loss": 2.4119, + "step": 8790 + }, + { + "epoch": 0.034018338977285026, + "grad_norm": 0.16084375977516174, + "learning_rate": 0.00176, + "loss": 2.4199, + "step": 8800 + }, + { + "epoch": 0.034056996180668306, + "grad_norm": 0.19307050108909607, + "learning_rate": 0.0017620000000000001, + "loss": 2.428, + "step": 8810 + }, + { + "epoch": 0.034095653384051586, + "grad_norm": 0.18793387711048126, + "learning_rate": 0.001764, + "loss": 2.4226, + "step": 8820 + }, + { + "epoch": 0.034134310587434866, + "grad_norm": 0.1403844803571701, + "learning_rate": 0.001766, + "loss": 2.4284, + "step": 8830 + }, + { + "epoch": 0.03417296779081814, + "grad_norm": 0.19078412652015686, + "learning_rate": 0.001768, + "loss": 2.4223, + "step": 8840 + }, + { + "epoch": 0.03421162499420142, + "grad_norm": 0.21158762276172638, + "learning_rate": 0.00177, + "loss": 2.4318, + "step": 8850 + }, + { + "epoch": 0.0342502821975847, + "grad_norm": 0.14591366052627563, + "learning_rate": 0.0017720000000000001, + "loss": 2.4221, + "step": 8860 + }, + { + "epoch": 0.03428893940096798, + "grad_norm": 0.15341006219387054, + "learning_rate": 0.001774, + "loss": 2.4106, + "step": 8870 + }, + { + "epoch": 0.03432759660435125, + "grad_norm": 0.20539985597133636, + "learning_rate": 0.001776, + "loss": 2.4242, + "step": 8880 + }, + { + "epoch": 0.03436625380773453, + "grad_norm": 0.20937784016132355, + "learning_rate": 0.001778, + "loss": 2.4348, + "step": 8890 + }, + { + "epoch": 0.03440491101111781, + "grad_norm": 0.16762758791446686, + "learning_rate": 0.0017800000000000001, + "loss": 2.4272, + "step": 8900 + }, + { + "epoch": 0.03444356821450109, + "grad_norm": 0.1493655890226364, + "learning_rate": 0.0017820000000000002, + "loss": 2.4304, + "step": 8910 + }, + { + "epoch": 0.03448222541788437, + "grad_norm": 0.15680846571922302, + "learning_rate": 0.001784, + "loss": 2.4474, + "step": 8920 + }, + { + "epoch": 0.034520882621267644, + "grad_norm": 0.2038678675889969, + "learning_rate": 0.001786, + "loss": 2.4312, + "step": 8930 + }, + { + "epoch": 0.034559539824650924, + "grad_norm": 0.1632731705904007, + "learning_rate": 0.001788, + "loss": 2.4169, + "step": 8940 + }, + { + "epoch": 0.034598197028034204, + "grad_norm": 0.1767711639404297, + "learning_rate": 0.0017900000000000001, + "loss": 2.4342, + "step": 8950 + }, + { + "epoch": 0.034636854231417484, + "grad_norm": 0.19585298001766205, + "learning_rate": 0.001792, + "loss": 2.4278, + "step": 8960 + }, + { + "epoch": 0.034675511434800764, + "grad_norm": 0.1603458970785141, + "learning_rate": 0.001794, + "loss": 2.4413, + "step": 8970 + }, + { + "epoch": 0.03471416863818404, + "grad_norm": 0.20633605122566223, + "learning_rate": 0.001796, + "loss": 2.4298, + "step": 8980 + }, + { + "epoch": 0.03475282584156732, + "grad_norm": 0.2077600210905075, + "learning_rate": 0.0017980000000000001, + "loss": 2.4253, + "step": 8990 + }, + { + "epoch": 0.0347914830449506, + "grad_norm": 0.1785653680562973, + "learning_rate": 0.0018000000000000002, + "loss": 2.4297, + "step": 9000 + }, + { + "epoch": 0.03483014024833388, + "grad_norm": 0.19785434007644653, + "learning_rate": 0.001802, + "loss": 2.4188, + "step": 9010 + }, + { + "epoch": 0.03486879745171715, + "grad_norm": 0.16691158711910248, + "learning_rate": 0.001804, + "loss": 2.4313, + "step": 9020 + }, + { + "epoch": 0.03490745465510043, + "grad_norm": 0.16858600080013275, + "learning_rate": 0.001806, + "loss": 2.4427, + "step": 9030 + }, + { + "epoch": 0.03494611185848371, + "grad_norm": 0.2232050895690918, + "learning_rate": 0.0018080000000000001, + "loss": 2.4238, + "step": 9040 + }, + { + "epoch": 0.03498476906186699, + "grad_norm": 0.1576065868139267, + "learning_rate": 0.0018100000000000002, + "loss": 2.4303, + "step": 9050 + }, + { + "epoch": 0.03502342626525027, + "grad_norm": 0.24189849197864532, + "learning_rate": 0.001812, + "loss": 2.4297, + "step": 9060 + }, + { + "epoch": 0.03506208346863354, + "grad_norm": 0.18903590738773346, + "learning_rate": 0.001814, + "loss": 2.4246, + "step": 9070 + }, + { + "epoch": 0.03510074067201682, + "grad_norm": 0.14904828369617462, + "learning_rate": 0.0018160000000000001, + "loss": 2.4339, + "step": 9080 + }, + { + "epoch": 0.0351393978754001, + "grad_norm": 0.1678786277770996, + "learning_rate": 0.0018180000000000002, + "loss": 2.4337, + "step": 9090 + }, + { + "epoch": 0.03517805507878338, + "grad_norm": 0.16725444793701172, + "learning_rate": 0.00182, + "loss": 2.4289, + "step": 9100 + }, + { + "epoch": 0.03521671228216666, + "grad_norm": 0.19741575419902802, + "learning_rate": 0.001822, + "loss": 2.4519, + "step": 9110 + }, + { + "epoch": 0.035255369485549935, + "grad_norm": 0.188306525349617, + "learning_rate": 0.001824, + "loss": 2.4358, + "step": 9120 + }, + { + "epoch": 0.035294026688933215, + "grad_norm": 0.16728952527046204, + "learning_rate": 0.0018260000000000001, + "loss": 2.4164, + "step": 9130 + }, + { + "epoch": 0.035332683892316495, + "grad_norm": 0.16376613080501556, + "learning_rate": 0.0018280000000000002, + "loss": 2.4224, + "step": 9140 + }, + { + "epoch": 0.035371341095699775, + "grad_norm": 0.16212789714336395, + "learning_rate": 0.00183, + "loss": 2.4404, + "step": 9150 + }, + { + "epoch": 0.03540999829908305, + "grad_norm": 0.21214650571346283, + "learning_rate": 0.001832, + "loss": 2.4279, + "step": 9160 + }, + { + "epoch": 0.03544865550246633, + "grad_norm": 0.1481925994157791, + "learning_rate": 0.0018340000000000001, + "loss": 2.4317, + "step": 9170 + }, + { + "epoch": 0.03548731270584961, + "grad_norm": 0.18432292342185974, + "learning_rate": 0.0018360000000000002, + "loss": 2.4182, + "step": 9180 + }, + { + "epoch": 0.03552596990923289, + "grad_norm": 0.1933586597442627, + "learning_rate": 0.0018380000000000002, + "loss": 2.4391, + "step": 9190 + }, + { + "epoch": 0.03556462711261617, + "grad_norm": 0.15520641207695007, + "learning_rate": 0.00184, + "loss": 2.4261, + "step": 9200 + }, + { + "epoch": 0.03560328431599944, + "grad_norm": 0.1683807224035263, + "learning_rate": 0.001842, + "loss": 2.4161, + "step": 9210 + }, + { + "epoch": 0.03564194151938272, + "grad_norm": 0.23813362419605255, + "learning_rate": 0.0018440000000000002, + "loss": 2.4359, + "step": 9220 + }, + { + "epoch": 0.035680598722766, + "grad_norm": 0.16843190789222717, + "learning_rate": 0.0018460000000000002, + "loss": 2.4372, + "step": 9230 + }, + { + "epoch": 0.03571925592614928, + "grad_norm": 0.15540075302124023, + "learning_rate": 0.001848, + "loss": 2.4411, + "step": 9240 + }, + { + "epoch": 0.035757913129532554, + "grad_norm": 0.17383131384849548, + "learning_rate": 0.00185, + "loss": 2.4345, + "step": 9250 + }, + { + "epoch": 0.035796570332915834, + "grad_norm": 0.4560840129852295, + "learning_rate": 0.0018520000000000001, + "loss": 2.4234, + "step": 9260 + }, + { + "epoch": 0.035835227536299114, + "grad_norm": 0.17813929915428162, + "learning_rate": 0.0018540000000000002, + "loss": 2.4411, + "step": 9270 + }, + { + "epoch": 0.03587388473968239, + "grad_norm": 0.17561641335487366, + "learning_rate": 0.0018560000000000002, + "loss": 2.4331, + "step": 9280 + }, + { + "epoch": 0.03591254194306567, + "grad_norm": 0.17263886332511902, + "learning_rate": 0.001858, + "loss": 2.4285, + "step": 9290 + }, + { + "epoch": 0.035951199146448946, + "grad_norm": 0.1532730907201767, + "learning_rate": 0.00186, + "loss": 2.4331, + "step": 9300 + }, + { + "epoch": 0.035989856349832226, + "grad_norm": 0.15847301483154297, + "learning_rate": 0.0018620000000000002, + "loss": 2.4265, + "step": 9310 + }, + { + "epoch": 0.036028513553215506, + "grad_norm": 0.1659020036458969, + "learning_rate": 0.0018640000000000002, + "loss": 2.4202, + "step": 9320 + }, + { + "epoch": 0.036067170756598786, + "grad_norm": 0.19295507669448853, + "learning_rate": 0.001866, + "loss": 2.4331, + "step": 9330 + }, + { + "epoch": 0.036105827959982066, + "grad_norm": 0.17685222625732422, + "learning_rate": 0.001868, + "loss": 2.4454, + "step": 9340 + }, + { + "epoch": 0.03614448516336534, + "grad_norm": 0.1478830724954605, + "learning_rate": 0.0018700000000000001, + "loss": 2.424, + "step": 9350 + }, + { + "epoch": 0.03618314236674862, + "grad_norm": 0.1698874682188034, + "learning_rate": 0.0018720000000000002, + "loss": 2.4359, + "step": 9360 + }, + { + "epoch": 0.0362217995701319, + "grad_norm": 0.18125659227371216, + "learning_rate": 0.0018740000000000002, + "loss": 2.4275, + "step": 9370 + }, + { + "epoch": 0.03626045677351518, + "grad_norm": 0.18805450201034546, + "learning_rate": 0.0018759999999999998, + "loss": 2.4392, + "step": 9380 + }, + { + "epoch": 0.03629911397689845, + "grad_norm": 0.1469375342130661, + "learning_rate": 0.001878, + "loss": 2.4291, + "step": 9390 + }, + { + "epoch": 0.03633777118028173, + "grad_norm": 0.18483032286167145, + "learning_rate": 0.00188, + "loss": 2.4193, + "step": 9400 + }, + { + "epoch": 0.03637642838366501, + "grad_norm": 0.16580356657505035, + "learning_rate": 0.001882, + "loss": 2.4092, + "step": 9410 + }, + { + "epoch": 0.03641508558704829, + "grad_norm": 0.1719176173210144, + "learning_rate": 0.001884, + "loss": 2.4338, + "step": 9420 + }, + { + "epoch": 0.03645374279043157, + "grad_norm": 0.15128475427627563, + "learning_rate": 0.0018859999999999999, + "loss": 2.4231, + "step": 9430 + }, + { + "epoch": 0.036492399993814845, + "grad_norm": 0.16530759632587433, + "learning_rate": 0.001888, + "loss": 2.4234, + "step": 9440 + }, + { + "epoch": 0.036531057197198125, + "grad_norm": 0.20639896392822266, + "learning_rate": 0.00189, + "loss": 2.423, + "step": 9450 + }, + { + "epoch": 0.036569714400581405, + "grad_norm": 0.1547228842973709, + "learning_rate": 0.001892, + "loss": 2.426, + "step": 9460 + }, + { + "epoch": 0.036608371603964684, + "grad_norm": 0.14532585442066193, + "learning_rate": 0.0018939999999999999, + "loss": 2.4296, + "step": 9470 + }, + { + "epoch": 0.036647028807347964, + "grad_norm": 0.16781659424304962, + "learning_rate": 0.001896, + "loss": 2.4236, + "step": 9480 + }, + { + "epoch": 0.03668568601073124, + "grad_norm": 0.18817219138145447, + "learning_rate": 0.001898, + "loss": 2.4281, + "step": 9490 + }, + { + "epoch": 0.03672434321411452, + "grad_norm": 0.17814214527606964, + "learning_rate": 0.0019, + "loss": 2.4434, + "step": 9500 + }, + { + "epoch": 0.0367630004174978, + "grad_norm": 0.19819213449954987, + "learning_rate": 0.001902, + "loss": 2.4396, + "step": 9510 + }, + { + "epoch": 0.03680165762088108, + "grad_norm": 0.17489992082118988, + "learning_rate": 0.0019039999999999999, + "loss": 2.428, + "step": 9520 + }, + { + "epoch": 0.03684031482426435, + "grad_norm": 0.19796450436115265, + "learning_rate": 0.001906, + "loss": 2.4439, + "step": 9530 + }, + { + "epoch": 0.03687897202764763, + "grad_norm": 0.14627830684185028, + "learning_rate": 0.001908, + "loss": 2.4234, + "step": 9540 + }, + { + "epoch": 0.03691762923103091, + "grad_norm": 0.2152869999408722, + "learning_rate": 0.00191, + "loss": 2.4284, + "step": 9550 + }, + { + "epoch": 0.03695628643441419, + "grad_norm": 0.16523297131061554, + "learning_rate": 0.0019119999999999999, + "loss": 2.4311, + "step": 9560 + }, + { + "epoch": 0.03699494363779747, + "grad_norm": 0.1422155797481537, + "learning_rate": 0.001914, + "loss": 2.4374, + "step": 9570 + }, + { + "epoch": 0.03703360084118074, + "grad_norm": 0.18466395139694214, + "learning_rate": 0.001916, + "loss": 2.4371, + "step": 9580 + }, + { + "epoch": 0.03707225804456402, + "grad_norm": 0.16606661677360535, + "learning_rate": 0.001918, + "loss": 2.4267, + "step": 9590 + }, + { + "epoch": 0.0371109152479473, + "grad_norm": 0.15255603194236755, + "learning_rate": 0.00192, + "loss": 2.4182, + "step": 9600 + }, + { + "epoch": 0.03714957245133058, + "grad_norm": 0.18158304691314697, + "learning_rate": 0.0019219999999999999, + "loss": 2.4277, + "step": 9610 + }, + { + "epoch": 0.03718822965471386, + "grad_norm": 0.1524418443441391, + "learning_rate": 0.001924, + "loss": 2.4341, + "step": 9620 + }, + { + "epoch": 0.037226886858097136, + "grad_norm": 0.16808250546455383, + "learning_rate": 0.001926, + "loss": 2.4383, + "step": 9630 + }, + { + "epoch": 0.037265544061480416, + "grad_norm": 0.229302316904068, + "learning_rate": 0.001928, + "loss": 2.4321, + "step": 9640 + }, + { + "epoch": 0.037304201264863696, + "grad_norm": 0.1693592667579651, + "learning_rate": 0.00193, + "loss": 2.4251, + "step": 9650 + }, + { + "epoch": 0.037342858468246976, + "grad_norm": 0.1731642633676529, + "learning_rate": 0.001932, + "loss": 2.4437, + "step": 9660 + }, + { + "epoch": 0.03738151567163025, + "grad_norm": 0.1623421460390091, + "learning_rate": 0.001934, + "loss": 2.4378, + "step": 9670 + }, + { + "epoch": 0.03742017287501353, + "grad_norm": 0.12941968441009521, + "learning_rate": 0.001936, + "loss": 2.4288, + "step": 9680 + }, + { + "epoch": 0.03745883007839681, + "grad_norm": 0.16926322877407074, + "learning_rate": 0.001938, + "loss": 2.4358, + "step": 9690 + }, + { + "epoch": 0.03749748728178009, + "grad_norm": 0.18344028294086456, + "learning_rate": 0.0019399999999999999, + "loss": 2.4294, + "step": 9700 + }, + { + "epoch": 0.03753614448516337, + "grad_norm": 0.1774645298719406, + "learning_rate": 0.001942, + "loss": 2.4438, + "step": 9710 + }, + { + "epoch": 0.03757480168854664, + "grad_norm": 0.17909649014472961, + "learning_rate": 0.001944, + "loss": 2.4473, + "step": 9720 + }, + { + "epoch": 0.03761345889192992, + "grad_norm": 0.15844593942165375, + "learning_rate": 0.001946, + "loss": 2.4257, + "step": 9730 + }, + { + "epoch": 0.0376521160953132, + "grad_norm": 0.18717212975025177, + "learning_rate": 0.001948, + "loss": 2.4333, + "step": 9740 + }, + { + "epoch": 0.03769077329869648, + "grad_norm": 0.163429856300354, + "learning_rate": 0.00195, + "loss": 2.4375, + "step": 9750 + }, + { + "epoch": 0.037729430502079754, + "grad_norm": 0.15213638544082642, + "learning_rate": 0.001952, + "loss": 2.4264, + "step": 9760 + }, + { + "epoch": 0.037768087705463034, + "grad_norm": 0.18855980038642883, + "learning_rate": 0.001954, + "loss": 2.4371, + "step": 9770 + }, + { + "epoch": 0.037806744908846314, + "grad_norm": 0.1842171996831894, + "learning_rate": 0.001956, + "loss": 2.4326, + "step": 9780 + }, + { + "epoch": 0.037845402112229594, + "grad_norm": 0.18620212376117706, + "learning_rate": 0.001958, + "loss": 2.4259, + "step": 9790 + }, + { + "epoch": 0.037884059315612874, + "grad_norm": 0.15474924445152283, + "learning_rate": 0.00196, + "loss": 2.4241, + "step": 9800 + }, + { + "epoch": 0.03792271651899615, + "grad_norm": 0.15727892518043518, + "learning_rate": 0.001962, + "loss": 2.416, + "step": 9810 + }, + { + "epoch": 0.03796137372237943, + "grad_norm": 0.17438285052776337, + "learning_rate": 0.001964, + "loss": 2.4224, + "step": 9820 + }, + { + "epoch": 0.03800003092576271, + "grad_norm": 0.1774299591779709, + "learning_rate": 0.001966, + "loss": 2.423, + "step": 9830 + }, + { + "epoch": 0.03803868812914599, + "grad_norm": 0.13991932570934296, + "learning_rate": 0.001968, + "loss": 2.4208, + "step": 9840 + }, + { + "epoch": 0.03807734533252927, + "grad_norm": 0.1998598724603653, + "learning_rate": 0.00197, + "loss": 2.4263, + "step": 9850 + }, + { + "epoch": 0.03811600253591254, + "grad_norm": 0.18491345643997192, + "learning_rate": 0.0019720000000000002, + "loss": 2.4354, + "step": 9860 + }, + { + "epoch": 0.03815465973929582, + "grad_norm": 0.16325430572032928, + "learning_rate": 0.001974, + "loss": 2.4436, + "step": 9870 + }, + { + "epoch": 0.0381933169426791, + "grad_norm": 0.209353506565094, + "learning_rate": 0.001976, + "loss": 2.4268, + "step": 9880 + }, + { + "epoch": 0.03823197414606238, + "grad_norm": 0.16570289433002472, + "learning_rate": 0.001978, + "loss": 2.4323, + "step": 9890 + }, + { + "epoch": 0.03827063134944565, + "grad_norm": 0.17465120553970337, + "learning_rate": 0.00198, + "loss": 2.4478, + "step": 9900 + }, + { + "epoch": 0.03830928855282893, + "grad_norm": 0.20848344266414642, + "learning_rate": 0.001982, + "loss": 2.4321, + "step": 9910 + }, + { + "epoch": 0.03834794575621221, + "grad_norm": 0.17331746220588684, + "learning_rate": 0.001984, + "loss": 2.4321, + "step": 9920 + }, + { + "epoch": 0.03838660295959549, + "grad_norm": 0.1472446173429489, + "learning_rate": 0.001986, + "loss": 2.4397, + "step": 9930 + }, + { + "epoch": 0.03842526016297877, + "grad_norm": 0.18850277364253998, + "learning_rate": 0.001988, + "loss": 2.4395, + "step": 9940 + }, + { + "epoch": 0.038463917366362045, + "grad_norm": 0.2426530122756958, + "learning_rate": 0.00199, + "loss": 2.4367, + "step": 9950 + }, + { + "epoch": 0.038502574569745325, + "grad_norm": 0.15660598874092102, + "learning_rate": 0.001992, + "loss": 2.4304, + "step": 9960 + }, + { + "epoch": 0.038541231773128605, + "grad_norm": 0.16251277923583984, + "learning_rate": 0.001994, + "loss": 2.4295, + "step": 9970 + }, + { + "epoch": 0.038579888976511885, + "grad_norm": 0.20397990942001343, + "learning_rate": 0.001996, + "loss": 2.4516, + "step": 9980 + }, + { + "epoch": 0.038618546179895165, + "grad_norm": 0.16767215728759766, + "learning_rate": 0.001998, + "loss": 2.4455, + "step": 9990 + }, + { + "epoch": 0.03865720338327844, + "grad_norm": 0.16770479083061218, + "learning_rate": 0.002, + "loss": 2.4284, + "step": 10000 + }, + { + "epoch": 0.03869586058666172, + "grad_norm": 0.1625438928604126, + "learning_rate": 0.002, + "loss": 2.4331, + "step": 10010 + }, + { + "epoch": 0.038734517790045, + "grad_norm": 0.15517131984233856, + "learning_rate": 0.002, + "loss": 2.4339, + "step": 10020 + }, + { + "epoch": 0.03877317499342828, + "grad_norm": 0.1596555858850479, + "learning_rate": 0.002, + "loss": 2.4216, + "step": 10030 + }, + { + "epoch": 0.03881183219681155, + "grad_norm": 0.12880107760429382, + "learning_rate": 0.002, + "loss": 2.4336, + "step": 10040 + }, + { + "epoch": 0.03885048940019483, + "grad_norm": 0.14824466407299042, + "learning_rate": 0.002, + "loss": 2.4342, + "step": 10050 + }, + { + "epoch": 0.03888914660357811, + "grad_norm": 0.18370555341243744, + "learning_rate": 0.002, + "loss": 2.4277, + "step": 10060 + }, + { + "epoch": 0.03892780380696139, + "grad_norm": 0.17111271619796753, + "learning_rate": 0.002, + "loss": 2.4253, + "step": 10070 + }, + { + "epoch": 0.03896646101034467, + "grad_norm": 0.14722810685634613, + "learning_rate": 0.002, + "loss": 2.4024, + "step": 10080 + }, + { + "epoch": 0.03900511821372794, + "grad_norm": 0.16978606581687927, + "learning_rate": 0.002, + "loss": 2.4232, + "step": 10090 + }, + { + "epoch": 0.03904377541711122, + "grad_norm": 0.18143069744110107, + "learning_rate": 0.002, + "loss": 2.4441, + "step": 10100 + }, + { + "epoch": 0.0390824326204945, + "grad_norm": 0.19893452525138855, + "learning_rate": 0.002, + "loss": 2.4417, + "step": 10110 + }, + { + "epoch": 0.03912108982387778, + "grad_norm": 0.15450480580329895, + "learning_rate": 0.002, + "loss": 2.4324, + "step": 10120 + }, + { + "epoch": 0.03915974702726106, + "grad_norm": 0.153803288936615, + "learning_rate": 0.002, + "loss": 2.4373, + "step": 10130 + }, + { + "epoch": 0.039198404230644336, + "grad_norm": 0.14763586223125458, + "learning_rate": 0.002, + "loss": 2.4316, + "step": 10140 + }, + { + "epoch": 0.039237061434027616, + "grad_norm": 0.15948010981082916, + "learning_rate": 0.002, + "loss": 2.4285, + "step": 10150 + }, + { + "epoch": 0.039275718637410896, + "grad_norm": 0.16599905490875244, + "learning_rate": 0.002, + "loss": 2.4303, + "step": 10160 + }, + { + "epoch": 0.039314375840794176, + "grad_norm": 0.16918151080608368, + "learning_rate": 0.002, + "loss": 2.4334, + "step": 10170 + }, + { + "epoch": 0.03935303304417745, + "grad_norm": 0.1887025535106659, + "learning_rate": 0.002, + "loss": 2.417, + "step": 10180 + }, + { + "epoch": 0.03939169024756073, + "grad_norm": 0.18586301803588867, + "learning_rate": 0.002, + "loss": 2.4424, + "step": 10190 + }, + { + "epoch": 0.03943034745094401, + "grad_norm": 0.12432118505239487, + "learning_rate": 0.002, + "loss": 2.4189, + "step": 10200 + }, + { + "epoch": 0.03946900465432729, + "grad_norm": 0.16509106755256653, + "learning_rate": 0.002, + "loss": 2.4423, + "step": 10210 + }, + { + "epoch": 0.03950766185771057, + "grad_norm": 0.1833307445049286, + "learning_rate": 0.002, + "loss": 2.436, + "step": 10220 + }, + { + "epoch": 0.03954631906109384, + "grad_norm": 0.15685471892356873, + "learning_rate": 0.002, + "loss": 2.4376, + "step": 10230 + }, + { + "epoch": 0.03958497626447712, + "grad_norm": 0.15386788547039032, + "learning_rate": 0.002, + "loss": 2.4248, + "step": 10240 + }, + { + "epoch": 0.0396236334678604, + "grad_norm": 0.16778796911239624, + "learning_rate": 0.002, + "loss": 2.4473, + "step": 10250 + }, + { + "epoch": 0.03966229067124368, + "grad_norm": 0.143438458442688, + "learning_rate": 0.002, + "loss": 2.4363, + "step": 10260 + }, + { + "epoch": 0.03970094787462696, + "grad_norm": 0.1613985002040863, + "learning_rate": 0.002, + "loss": 2.4474, + "step": 10270 + }, + { + "epoch": 0.039739605078010234, + "grad_norm": 0.15061679482460022, + "learning_rate": 0.002, + "loss": 2.437, + "step": 10280 + }, + { + "epoch": 0.039778262281393514, + "grad_norm": 0.16388659179210663, + "learning_rate": 0.002, + "loss": 2.45, + "step": 10290 + }, + { + "epoch": 0.039816919484776794, + "grad_norm": 0.13617081940174103, + "learning_rate": 0.002, + "loss": 2.4473, + "step": 10300 + }, + { + "epoch": 0.039855576688160074, + "grad_norm": 0.20315887033939362, + "learning_rate": 0.002, + "loss": 2.4285, + "step": 10310 + }, + { + "epoch": 0.03989423389154335, + "grad_norm": 0.15514788031578064, + "learning_rate": 0.002, + "loss": 2.4379, + "step": 10320 + }, + { + "epoch": 0.03993289109492663, + "grad_norm": 0.16580229997634888, + "learning_rate": 0.002, + "loss": 2.4383, + "step": 10330 + }, + { + "epoch": 0.03997154829830991, + "grad_norm": 0.16074010729789734, + "learning_rate": 0.002, + "loss": 2.4335, + "step": 10340 + }, + { + "epoch": 0.04001020550169319, + "grad_norm": 0.1457606554031372, + "learning_rate": 0.002, + "loss": 2.4201, + "step": 10350 + }, + { + "epoch": 0.04004886270507647, + "grad_norm": 0.18716587126255035, + "learning_rate": 0.002, + "loss": 2.4333, + "step": 10360 + }, + { + "epoch": 0.04008751990845974, + "grad_norm": 0.13773228228092194, + "learning_rate": 0.002, + "loss": 2.4283, + "step": 10370 + }, + { + "epoch": 0.04012617711184302, + "grad_norm": 0.15122456848621368, + "learning_rate": 0.002, + "loss": 2.433, + "step": 10380 + }, + { + "epoch": 0.0401648343152263, + "grad_norm": 0.16871850192546844, + "learning_rate": 0.002, + "loss": 2.4313, + "step": 10390 + }, + { + "epoch": 0.04020349151860958, + "grad_norm": 0.15135620534420013, + "learning_rate": 0.002, + "loss": 2.4404, + "step": 10400 + }, + { + "epoch": 0.04024214872199285, + "grad_norm": 0.1498553603887558, + "learning_rate": 0.002, + "loss": 2.4224, + "step": 10410 + }, + { + "epoch": 0.04028080592537613, + "grad_norm": 0.18582572042942047, + "learning_rate": 0.002, + "loss": 2.4413, + "step": 10420 + }, + { + "epoch": 0.04031946312875941, + "grad_norm": 0.14045722782611847, + "learning_rate": 0.002, + "loss": 2.429, + "step": 10430 + }, + { + "epoch": 0.04035812033214269, + "grad_norm": 0.1526753455400467, + "learning_rate": 0.002, + "loss": 2.4451, + "step": 10440 + }, + { + "epoch": 0.04039677753552597, + "grad_norm": 0.17722046375274658, + "learning_rate": 0.002, + "loss": 2.4205, + "step": 10450 + }, + { + "epoch": 0.040435434738909246, + "grad_norm": 0.18134362995624542, + "learning_rate": 0.002, + "loss": 2.4421, + "step": 10460 + }, + { + "epoch": 0.040474091942292525, + "grad_norm": 0.15063491463661194, + "learning_rate": 0.002, + "loss": 2.4135, + "step": 10470 + }, + { + "epoch": 0.040512749145675805, + "grad_norm": 0.11587220430374146, + "learning_rate": 0.002, + "loss": 2.4407, + "step": 10480 + }, + { + "epoch": 0.040551406349059085, + "grad_norm": 0.14504306018352509, + "learning_rate": 0.002, + "loss": 2.434, + "step": 10490 + }, + { + "epoch": 0.040590063552442365, + "grad_norm": 0.15471212565898895, + "learning_rate": 0.002, + "loss": 2.4264, + "step": 10500 + }, + { + "epoch": 0.04062872075582564, + "grad_norm": 0.15481720864772797, + "learning_rate": 0.002, + "loss": 2.4233, + "step": 10510 + }, + { + "epoch": 0.04066737795920892, + "grad_norm": 0.14765945076942444, + "learning_rate": 0.002, + "loss": 2.425, + "step": 10520 + }, + { + "epoch": 0.0407060351625922, + "grad_norm": 0.16183441877365112, + "learning_rate": 0.002, + "loss": 2.4253, + "step": 10530 + }, + { + "epoch": 0.04074469236597548, + "grad_norm": 0.14015688002109528, + "learning_rate": 0.002, + "loss": 2.43, + "step": 10540 + }, + { + "epoch": 0.04078334956935875, + "grad_norm": 0.1484346091747284, + "learning_rate": 0.002, + "loss": 2.4191, + "step": 10550 + }, + { + "epoch": 0.04082200677274203, + "grad_norm": 0.14880962669849396, + "learning_rate": 0.002, + "loss": 2.4379, + "step": 10560 + }, + { + "epoch": 0.04086066397612531, + "grad_norm": 0.16797910630702972, + "learning_rate": 0.002, + "loss": 2.4472, + "step": 10570 + }, + { + "epoch": 0.04089932117950859, + "grad_norm": 0.15298014879226685, + "learning_rate": 0.002, + "loss": 2.4088, + "step": 10580 + }, + { + "epoch": 0.04093797838289187, + "grad_norm": 0.13158120214939117, + "learning_rate": 0.002, + "loss": 2.4229, + "step": 10590 + }, + { + "epoch": 0.040976635586275144, + "grad_norm": 0.13972997665405273, + "learning_rate": 0.002, + "loss": 2.4211, + "step": 10600 + }, + { + "epoch": 0.041015292789658424, + "grad_norm": 0.14697618782520294, + "learning_rate": 0.002, + "loss": 2.4384, + "step": 10610 + }, + { + "epoch": 0.041053949993041704, + "grad_norm": 0.20341096818447113, + "learning_rate": 0.002, + "loss": 2.428, + "step": 10620 + }, + { + "epoch": 0.041092607196424984, + "grad_norm": 0.14016160368919373, + "learning_rate": 0.002, + "loss": 2.4391, + "step": 10630 + }, + { + "epoch": 0.041131264399808264, + "grad_norm": 0.12385547906160355, + "learning_rate": 0.002, + "loss": 2.4199, + "step": 10640 + }, + { + "epoch": 0.04116992160319154, + "grad_norm": 0.15503078699111938, + "learning_rate": 0.002, + "loss": 2.4098, + "step": 10650 + }, + { + "epoch": 0.04120857880657482, + "grad_norm": 0.13293243944644928, + "learning_rate": 0.002, + "loss": 2.4147, + "step": 10660 + }, + { + "epoch": 0.041247236009958096, + "grad_norm": 0.14536736905574799, + "learning_rate": 0.002, + "loss": 2.4226, + "step": 10670 + }, + { + "epoch": 0.041285893213341376, + "grad_norm": 0.14836855232715607, + "learning_rate": 0.002, + "loss": 2.4319, + "step": 10680 + }, + { + "epoch": 0.04132455041672465, + "grad_norm": 0.15207520127296448, + "learning_rate": 0.002, + "loss": 2.4189, + "step": 10690 + }, + { + "epoch": 0.04136320762010793, + "grad_norm": 0.15331625938415527, + "learning_rate": 0.002, + "loss": 2.4086, + "step": 10700 + }, + { + "epoch": 0.04140186482349121, + "grad_norm": 0.16277608275413513, + "learning_rate": 0.002, + "loss": 2.4285, + "step": 10710 + }, + { + "epoch": 0.04144052202687449, + "grad_norm": 0.13291038572788239, + "learning_rate": 0.002, + "loss": 2.4092, + "step": 10720 + }, + { + "epoch": 0.04147917923025777, + "grad_norm": 0.13805748522281647, + "learning_rate": 0.002, + "loss": 2.413, + "step": 10730 + }, + { + "epoch": 0.04151783643364104, + "grad_norm": 0.16852214932441711, + "learning_rate": 0.002, + "loss": 2.4142, + "step": 10740 + }, + { + "epoch": 0.04155649363702432, + "grad_norm": 0.1524326652288437, + "learning_rate": 0.002, + "loss": 2.4165, + "step": 10750 + }, + { + "epoch": 0.0415951508404076, + "grad_norm": 0.17319968342781067, + "learning_rate": 0.002, + "loss": 2.4216, + "step": 10760 + }, + { + "epoch": 0.04163380804379088, + "grad_norm": 0.1453074961900711, + "learning_rate": 0.002, + "loss": 2.4352, + "step": 10770 + }, + { + "epoch": 0.04167246524717416, + "grad_norm": 0.1632017195224762, + "learning_rate": 0.002, + "loss": 2.3946, + "step": 10780 + }, + { + "epoch": 0.041711122450557435, + "grad_norm": 0.15487101674079895, + "learning_rate": 0.002, + "loss": 2.4321, + "step": 10790 + }, + { + "epoch": 0.041749779653940715, + "grad_norm": 0.14366376399993896, + "learning_rate": 0.002, + "loss": 2.4209, + "step": 10800 + }, + { + "epoch": 0.041788436857323995, + "grad_norm": 0.1451006531715393, + "learning_rate": 0.002, + "loss": 2.4191, + "step": 10810 + }, + { + "epoch": 0.041827094060707275, + "grad_norm": 0.18275074660778046, + "learning_rate": 0.002, + "loss": 2.4141, + "step": 10820 + }, + { + "epoch": 0.04186575126409055, + "grad_norm": 0.14227454364299774, + "learning_rate": 0.002, + "loss": 2.4291, + "step": 10830 + }, + { + "epoch": 0.04190440846747383, + "grad_norm": 0.14518596231937408, + "learning_rate": 0.002, + "loss": 2.4179, + "step": 10840 + }, + { + "epoch": 0.04194306567085711, + "grad_norm": 0.17375314235687256, + "learning_rate": 0.002, + "loss": 2.4294, + "step": 10850 + }, + { + "epoch": 0.04198172287424039, + "grad_norm": 0.1337478905916214, + "learning_rate": 0.002, + "loss": 2.4362, + "step": 10860 + }, + { + "epoch": 0.04202038007762367, + "grad_norm": 0.16233932971954346, + "learning_rate": 0.002, + "loss": 2.3947, + "step": 10870 + }, + { + "epoch": 0.04205903728100694, + "grad_norm": 0.1352071315050125, + "learning_rate": 0.002, + "loss": 2.4216, + "step": 10880 + }, + { + "epoch": 0.04209769448439022, + "grad_norm": 0.14264950156211853, + "learning_rate": 0.002, + "loss": 2.4258, + "step": 10890 + }, + { + "epoch": 0.0421363516877735, + "grad_norm": 0.14911723136901855, + "learning_rate": 0.002, + "loss": 2.4161, + "step": 10900 + }, + { + "epoch": 0.04217500889115678, + "grad_norm": 0.1300041824579239, + "learning_rate": 0.002, + "loss": 2.4483, + "step": 10910 + }, + { + "epoch": 0.04221366609454005, + "grad_norm": 0.14911404252052307, + "learning_rate": 0.002, + "loss": 2.4284, + "step": 10920 + }, + { + "epoch": 0.04225232329792333, + "grad_norm": 0.14238053560256958, + "learning_rate": 0.002, + "loss": 2.4336, + "step": 10930 + }, + { + "epoch": 0.04229098050130661, + "grad_norm": 0.19582848250865936, + "learning_rate": 0.002, + "loss": 2.4228, + "step": 10940 + }, + { + "epoch": 0.04232963770468989, + "grad_norm": 0.1654808223247528, + "learning_rate": 0.002, + "loss": 2.4154, + "step": 10950 + }, + { + "epoch": 0.04236829490807317, + "grad_norm": 0.15952356159687042, + "learning_rate": 0.002, + "loss": 2.4204, + "step": 10960 + }, + { + "epoch": 0.042406952111456446, + "grad_norm": 0.1382412314414978, + "learning_rate": 0.002, + "loss": 2.4181, + "step": 10970 + }, + { + "epoch": 0.042445609314839726, + "grad_norm": 0.1486886739730835, + "learning_rate": 0.002, + "loss": 2.4178, + "step": 10980 + }, + { + "epoch": 0.042484266518223006, + "grad_norm": 0.1526196449995041, + "learning_rate": 0.002, + "loss": 2.4201, + "step": 10990 + }, + { + "epoch": 0.042522923721606286, + "grad_norm": 0.1615477204322815, + "learning_rate": 0.002, + "loss": 2.4353, + "step": 11000 + }, + { + "epoch": 0.042561580924989566, + "grad_norm": 0.15322604775428772, + "learning_rate": 0.002, + "loss": 2.4149, + "step": 11010 + }, + { + "epoch": 0.04260023812837284, + "grad_norm": 0.1561826914548874, + "learning_rate": 0.002, + "loss": 2.4404, + "step": 11020 + }, + { + "epoch": 0.04263889533175612, + "grad_norm": 0.13902229070663452, + "learning_rate": 0.002, + "loss": 2.4336, + "step": 11030 + }, + { + "epoch": 0.0426775525351394, + "grad_norm": 0.14555348455905914, + "learning_rate": 0.002, + "loss": 2.4139, + "step": 11040 + }, + { + "epoch": 0.04271620973852268, + "grad_norm": 0.13210159540176392, + "learning_rate": 0.002, + "loss": 2.4143, + "step": 11050 + }, + { + "epoch": 0.04275486694190595, + "grad_norm": 0.15930576622486115, + "learning_rate": 0.002, + "loss": 2.4186, + "step": 11060 + }, + { + "epoch": 0.04279352414528923, + "grad_norm": 0.14124396443367004, + "learning_rate": 0.002, + "loss": 2.4213, + "step": 11070 + }, + { + "epoch": 0.04283218134867251, + "grad_norm": 0.15243567526340485, + "learning_rate": 0.002, + "loss": 2.4228, + "step": 11080 + }, + { + "epoch": 0.04287083855205579, + "grad_norm": 0.1674254685640335, + "learning_rate": 0.002, + "loss": 2.4153, + "step": 11090 + }, + { + "epoch": 0.04290949575543907, + "grad_norm": 0.1357090175151825, + "learning_rate": 0.002, + "loss": 2.4278, + "step": 11100 + }, + { + "epoch": 0.042948152958822344, + "grad_norm": 0.16976813971996307, + "learning_rate": 0.002, + "loss": 2.4142, + "step": 11110 + }, + { + "epoch": 0.042986810162205624, + "grad_norm": 0.14292575418949127, + "learning_rate": 0.002, + "loss": 2.4299, + "step": 11120 + }, + { + "epoch": 0.043025467365588904, + "grad_norm": 0.13828745484352112, + "learning_rate": 0.002, + "loss": 2.4205, + "step": 11130 + }, + { + "epoch": 0.043064124568972184, + "grad_norm": 0.1772737056016922, + "learning_rate": 0.002, + "loss": 2.4211, + "step": 11140 + }, + { + "epoch": 0.043102781772355464, + "grad_norm": 0.15409092605113983, + "learning_rate": 0.002, + "loss": 2.4308, + "step": 11150 + }, + { + "epoch": 0.04314143897573874, + "grad_norm": 0.14300651848316193, + "learning_rate": 0.002, + "loss": 2.4007, + "step": 11160 + }, + { + "epoch": 0.04318009617912202, + "grad_norm": 0.15811540186405182, + "learning_rate": 0.002, + "loss": 2.4217, + "step": 11170 + }, + { + "epoch": 0.0432187533825053, + "grad_norm": 0.11724254488945007, + "learning_rate": 0.002, + "loss": 2.4236, + "step": 11180 + }, + { + "epoch": 0.04325741058588858, + "grad_norm": 0.15714186429977417, + "learning_rate": 0.002, + "loss": 2.4145, + "step": 11190 + }, + { + "epoch": 0.04329606778927185, + "grad_norm": 0.13312242925167084, + "learning_rate": 0.002, + "loss": 2.4184, + "step": 11200 + }, + { + "epoch": 0.04333472499265513, + "grad_norm": 0.19631125032901764, + "learning_rate": 0.002, + "loss": 2.4347, + "step": 11210 + }, + { + "epoch": 0.04337338219603841, + "grad_norm": 0.12227917462587357, + "learning_rate": 0.002, + "loss": 2.4227, + "step": 11220 + }, + { + "epoch": 0.04341203939942169, + "grad_norm": 0.1563911885023117, + "learning_rate": 0.002, + "loss": 2.4227, + "step": 11230 + }, + { + "epoch": 0.04345069660280497, + "grad_norm": 0.14006571471691132, + "learning_rate": 0.002, + "loss": 2.4161, + "step": 11240 + }, + { + "epoch": 0.04348935380618824, + "grad_norm": 0.1672613024711609, + "learning_rate": 0.002, + "loss": 2.4058, + "step": 11250 + }, + { + "epoch": 0.04352801100957152, + "grad_norm": 0.16477328538894653, + "learning_rate": 0.002, + "loss": 2.4084, + "step": 11260 + }, + { + "epoch": 0.0435666682129548, + "grad_norm": 0.12407759577035904, + "learning_rate": 0.002, + "loss": 2.4317, + "step": 11270 + }, + { + "epoch": 0.04360532541633808, + "grad_norm": 0.15415887534618378, + "learning_rate": 0.002, + "loss": 2.4296, + "step": 11280 + }, + { + "epoch": 0.04364398261972136, + "grad_norm": 0.12192781269550323, + "learning_rate": 0.002, + "loss": 2.3992, + "step": 11290 + }, + { + "epoch": 0.043682639823104635, + "grad_norm": 0.14742381870746613, + "learning_rate": 0.002, + "loss": 2.4225, + "step": 11300 + }, + { + "epoch": 0.043721297026487915, + "grad_norm": 0.15054261684417725, + "learning_rate": 0.002, + "loss": 2.4257, + "step": 11310 + }, + { + "epoch": 0.043759954229871195, + "grad_norm": 0.1351064294576645, + "learning_rate": 0.002, + "loss": 2.4023, + "step": 11320 + }, + { + "epoch": 0.043798611433254475, + "grad_norm": 0.11836264282464981, + "learning_rate": 0.002, + "loss": 2.4129, + "step": 11330 + }, + { + "epoch": 0.04383726863663775, + "grad_norm": 0.1980191022157669, + "learning_rate": 0.002, + "loss": 2.4123, + "step": 11340 + }, + { + "epoch": 0.04387592584002103, + "grad_norm": 0.16777260601520538, + "learning_rate": 0.002, + "loss": 2.3954, + "step": 11350 + }, + { + "epoch": 0.04391458304340431, + "grad_norm": 0.15461204946041107, + "learning_rate": 0.002, + "loss": 2.4146, + "step": 11360 + }, + { + "epoch": 0.04395324024678759, + "grad_norm": 0.12209227681159973, + "learning_rate": 0.002, + "loss": 2.4198, + "step": 11370 + }, + { + "epoch": 0.04399189745017087, + "grad_norm": 0.14516107738018036, + "learning_rate": 0.002, + "loss": 2.4273, + "step": 11380 + }, + { + "epoch": 0.04403055465355414, + "grad_norm": 0.41329941153526306, + "learning_rate": 0.002, + "loss": 2.4405, + "step": 11390 + }, + { + "epoch": 0.04406921185693742, + "grad_norm": 0.18117031455039978, + "learning_rate": 0.002, + "loss": 2.4214, + "step": 11400 + }, + { + "epoch": 0.0441078690603207, + "grad_norm": 0.14210830628871918, + "learning_rate": 0.002, + "loss": 2.4205, + "step": 11410 + }, + { + "epoch": 0.04414652626370398, + "grad_norm": 0.14302709698677063, + "learning_rate": 0.002, + "loss": 2.4262, + "step": 11420 + }, + { + "epoch": 0.044185183467087254, + "grad_norm": 0.13019420206546783, + "learning_rate": 0.002, + "loss": 2.4177, + "step": 11430 + }, + { + "epoch": 0.044223840670470534, + "grad_norm": 0.1916237622499466, + "learning_rate": 0.002, + "loss": 2.4143, + "step": 11440 + }, + { + "epoch": 0.044262497873853814, + "grad_norm": 0.1374051868915558, + "learning_rate": 0.002, + "loss": 2.4145, + "step": 11450 + }, + { + "epoch": 0.044301155077237094, + "grad_norm": 0.13074587285518646, + "learning_rate": 0.002, + "loss": 2.4126, + "step": 11460 + }, + { + "epoch": 0.04433981228062037, + "grad_norm": 0.15300016105175018, + "learning_rate": 0.002, + "loss": 2.4224, + "step": 11470 + }, + { + "epoch": 0.044378469484003646, + "grad_norm": 0.12504509091377258, + "learning_rate": 0.002, + "loss": 2.4172, + "step": 11480 + }, + { + "epoch": 0.044417126687386926, + "grad_norm": 0.13933181762695312, + "learning_rate": 0.002, + "loss": 2.4227, + "step": 11490 + }, + { + "epoch": 0.044455783890770206, + "grad_norm": 0.12378118932247162, + "learning_rate": 0.002, + "loss": 2.428, + "step": 11500 + }, + { + "epoch": 0.044494441094153486, + "grad_norm": 0.15414609014987946, + "learning_rate": 0.002, + "loss": 2.3966, + "step": 11510 + }, + { + "epoch": 0.044533098297536766, + "grad_norm": 0.11848758161067963, + "learning_rate": 0.002, + "loss": 2.3994, + "step": 11520 + }, + { + "epoch": 0.04457175550092004, + "grad_norm": 0.1424514651298523, + "learning_rate": 0.002, + "loss": 2.4035, + "step": 11530 + }, + { + "epoch": 0.04461041270430332, + "grad_norm": 0.121200330555439, + "learning_rate": 0.002, + "loss": 2.4232, + "step": 11540 + }, + { + "epoch": 0.0446490699076866, + "grad_norm": 0.15000300109386444, + "learning_rate": 0.002, + "loss": 2.4292, + "step": 11550 + }, + { + "epoch": 0.04468772711106988, + "grad_norm": 0.14829818904399872, + "learning_rate": 0.002, + "loss": 2.4114, + "step": 11560 + }, + { + "epoch": 0.04472638431445315, + "grad_norm": 0.12678086757659912, + "learning_rate": 0.002, + "loss": 2.4287, + "step": 11570 + }, + { + "epoch": 0.04476504151783643, + "grad_norm": 0.1372426301240921, + "learning_rate": 0.002, + "loss": 2.4197, + "step": 11580 + }, + { + "epoch": 0.04480369872121971, + "grad_norm": 0.15566550195217133, + "learning_rate": 0.002, + "loss": 2.3989, + "step": 11590 + }, + { + "epoch": 0.04484235592460299, + "grad_norm": 0.13825489580631256, + "learning_rate": 0.002, + "loss": 2.4205, + "step": 11600 + }, + { + "epoch": 0.04488101312798627, + "grad_norm": 0.14873427152633667, + "learning_rate": 0.002, + "loss": 2.4188, + "step": 11610 + }, + { + "epoch": 0.044919670331369545, + "grad_norm": 0.14977797865867615, + "learning_rate": 0.002, + "loss": 2.4181, + "step": 11620 + }, + { + "epoch": 0.044958327534752825, + "grad_norm": 0.13811765611171722, + "learning_rate": 0.002, + "loss": 2.4092, + "step": 11630 + }, + { + "epoch": 0.044996984738136105, + "grad_norm": 0.1411823183298111, + "learning_rate": 0.002, + "loss": 2.4149, + "step": 11640 + }, + { + "epoch": 0.045035641941519385, + "grad_norm": 0.13481740653514862, + "learning_rate": 0.002, + "loss": 2.3933, + "step": 11650 + }, + { + "epoch": 0.045074299144902664, + "grad_norm": 0.12790456414222717, + "learning_rate": 0.002, + "loss": 2.4176, + "step": 11660 + }, + { + "epoch": 0.04511295634828594, + "grad_norm": 0.16515801846981049, + "learning_rate": 0.002, + "loss": 2.4253, + "step": 11670 + }, + { + "epoch": 0.04515161355166922, + "grad_norm": 0.1256054937839508, + "learning_rate": 0.002, + "loss": 2.4085, + "step": 11680 + }, + { + "epoch": 0.0451902707550525, + "grad_norm": 0.14855296909809113, + "learning_rate": 0.002, + "loss": 2.3995, + "step": 11690 + }, + { + "epoch": 0.04522892795843578, + "grad_norm": 0.1553608626127243, + "learning_rate": 0.002, + "loss": 2.4157, + "step": 11700 + }, + { + "epoch": 0.04526758516181905, + "grad_norm": 0.12629066407680511, + "learning_rate": 0.002, + "loss": 2.4115, + "step": 11710 + }, + { + "epoch": 0.04530624236520233, + "grad_norm": 0.17862139642238617, + "learning_rate": 0.002, + "loss": 2.423, + "step": 11720 + }, + { + "epoch": 0.04534489956858561, + "grad_norm": 0.1498710960149765, + "learning_rate": 0.002, + "loss": 2.421, + "step": 11730 + }, + { + "epoch": 0.04538355677196889, + "grad_norm": 0.11899344623088837, + "learning_rate": 0.002, + "loss": 2.4171, + "step": 11740 + }, + { + "epoch": 0.04542221397535217, + "grad_norm": 0.1494816094636917, + "learning_rate": 0.002, + "loss": 2.4111, + "step": 11750 + }, + { + "epoch": 0.04546087117873544, + "grad_norm": 0.13252732157707214, + "learning_rate": 0.002, + "loss": 2.4128, + "step": 11760 + }, + { + "epoch": 0.04549952838211872, + "grad_norm": 0.14148776233196259, + "learning_rate": 0.002, + "loss": 2.4192, + "step": 11770 + }, + { + "epoch": 0.045538185585502, + "grad_norm": 0.15762783586978912, + "learning_rate": 0.002, + "loss": 2.4446, + "step": 11780 + }, + { + "epoch": 0.04557684278888528, + "grad_norm": 0.16230829060077667, + "learning_rate": 0.002, + "loss": 2.4388, + "step": 11790 + }, + { + "epoch": 0.04561549999226856, + "grad_norm": 0.11253784596920013, + "learning_rate": 0.002, + "loss": 2.4006, + "step": 11800 + }, + { + "epoch": 0.045654157195651836, + "grad_norm": 0.1605105847120285, + "learning_rate": 0.002, + "loss": 2.4076, + "step": 11810 + }, + { + "epoch": 0.045692814399035116, + "grad_norm": 0.1744198352098465, + "learning_rate": 0.002, + "loss": 2.3956, + "step": 11820 + }, + { + "epoch": 0.045731471602418396, + "grad_norm": 0.6021363139152527, + "learning_rate": 0.002, + "loss": 2.4181, + "step": 11830 + }, + { + "epoch": 0.045770128805801676, + "grad_norm": 0.13516347110271454, + "learning_rate": 0.002, + "loss": 2.4205, + "step": 11840 + }, + { + "epoch": 0.04580878600918495, + "grad_norm": 0.1612042635679245, + "learning_rate": 0.002, + "loss": 2.4085, + "step": 11850 + }, + { + "epoch": 0.04584744321256823, + "grad_norm": 0.11100103706121445, + "learning_rate": 0.002, + "loss": 2.4116, + "step": 11860 + }, + { + "epoch": 0.04588610041595151, + "grad_norm": 0.1537276953458786, + "learning_rate": 0.002, + "loss": 2.4081, + "step": 11870 + }, + { + "epoch": 0.04592475761933479, + "grad_norm": 0.1528160721063614, + "learning_rate": 0.002, + "loss": 2.4149, + "step": 11880 + }, + { + "epoch": 0.04596341482271807, + "grad_norm": 0.12817005813121796, + "learning_rate": 0.002, + "loss": 2.4131, + "step": 11890 + }, + { + "epoch": 0.04600207202610134, + "grad_norm": 0.15865662693977356, + "learning_rate": 0.002, + "loss": 2.4008, + "step": 11900 + }, + { + "epoch": 0.04604072922948462, + "grad_norm": 0.1338355392217636, + "learning_rate": 0.002, + "loss": 2.4311, + "step": 11910 + }, + { + "epoch": 0.0460793864328679, + "grad_norm": 0.11291113495826721, + "learning_rate": 0.002, + "loss": 2.4127, + "step": 11920 + }, + { + "epoch": 0.04611804363625118, + "grad_norm": 0.1465831995010376, + "learning_rate": 0.002, + "loss": 2.4008, + "step": 11930 + }, + { + "epoch": 0.046156700839634454, + "grad_norm": 0.13816089928150177, + "learning_rate": 0.002, + "loss": 2.4221, + "step": 11940 + }, + { + "epoch": 0.046195358043017734, + "grad_norm": 0.13582506775856018, + "learning_rate": 0.002, + "loss": 2.4248, + "step": 11950 + }, + { + "epoch": 0.046234015246401014, + "grad_norm": 0.15690980851650238, + "learning_rate": 0.002, + "loss": 2.4055, + "step": 11960 + }, + { + "epoch": 0.046272672449784294, + "grad_norm": 0.1282709687948227, + "learning_rate": 0.002, + "loss": 2.4258, + "step": 11970 + }, + { + "epoch": 0.046311329653167574, + "grad_norm": 0.12684884667396545, + "learning_rate": 0.002, + "loss": 2.4119, + "step": 11980 + }, + { + "epoch": 0.04634998685655085, + "grad_norm": 0.17777405679225922, + "learning_rate": 0.002, + "loss": 2.4131, + "step": 11990 + }, + { + "epoch": 0.04638864405993413, + "grad_norm": 0.13052219152450562, + "learning_rate": 0.002, + "loss": 2.3977, + "step": 12000 + }, + { + "epoch": 0.04642730126331741, + "grad_norm": 0.14413100481033325, + "learning_rate": 0.002, + "loss": 2.4178, + "step": 12010 + }, + { + "epoch": 0.04646595846670069, + "grad_norm": 0.14420953392982483, + "learning_rate": 0.002, + "loss": 2.4071, + "step": 12020 + }, + { + "epoch": 0.04650461567008397, + "grad_norm": 0.14926595985889435, + "learning_rate": 0.002, + "loss": 2.4098, + "step": 12030 + }, + { + "epoch": 0.04654327287346724, + "grad_norm": 0.12959520518779755, + "learning_rate": 0.002, + "loss": 2.4039, + "step": 12040 + }, + { + "epoch": 0.04658193007685052, + "grad_norm": 0.1247779130935669, + "learning_rate": 0.002, + "loss": 2.411, + "step": 12050 + }, + { + "epoch": 0.0466205872802338, + "grad_norm": 0.1487448364496231, + "learning_rate": 0.002, + "loss": 2.4128, + "step": 12060 + }, + { + "epoch": 0.04665924448361708, + "grad_norm": 0.1244499534368515, + "learning_rate": 0.002, + "loss": 2.4115, + "step": 12070 + }, + { + "epoch": 0.04669790168700035, + "grad_norm": 0.12959147989749908, + "learning_rate": 0.002, + "loss": 2.4005, + "step": 12080 + }, + { + "epoch": 0.04673655889038363, + "grad_norm": 0.1252502202987671, + "learning_rate": 0.002, + "loss": 2.4081, + "step": 12090 + }, + { + "epoch": 0.04677521609376691, + "grad_norm": 0.11847937107086182, + "learning_rate": 0.002, + "loss": 2.4062, + "step": 12100 + }, + { + "epoch": 0.04681387329715019, + "grad_norm": 0.13584916293621063, + "learning_rate": 0.002, + "loss": 2.402, + "step": 12110 + }, + { + "epoch": 0.04685253050053347, + "grad_norm": 0.1508847028017044, + "learning_rate": 0.002, + "loss": 2.4048, + "step": 12120 + }, + { + "epoch": 0.046891187703916745, + "grad_norm": 0.12016989290714264, + "learning_rate": 0.002, + "loss": 2.4023, + "step": 12130 + }, + { + "epoch": 0.046929844907300025, + "grad_norm": 0.1213717982172966, + "learning_rate": 0.002, + "loss": 2.4051, + "step": 12140 + }, + { + "epoch": 0.046968502110683305, + "grad_norm": 0.14406681060791016, + "learning_rate": 0.002, + "loss": 2.3983, + "step": 12150 + }, + { + "epoch": 0.047007159314066585, + "grad_norm": 0.16102002561092377, + "learning_rate": 0.002, + "loss": 2.3987, + "step": 12160 + }, + { + "epoch": 0.047045816517449865, + "grad_norm": 0.11670637875795364, + "learning_rate": 0.002, + "loss": 2.4169, + "step": 12170 + }, + { + "epoch": 0.04708447372083314, + "grad_norm": 0.1357085108757019, + "learning_rate": 0.002, + "loss": 2.4074, + "step": 12180 + }, + { + "epoch": 0.04712313092421642, + "grad_norm": 0.15811742842197418, + "learning_rate": 0.002, + "loss": 2.3974, + "step": 12190 + }, + { + "epoch": 0.0471617881275997, + "grad_norm": 0.12981584668159485, + "learning_rate": 0.002, + "loss": 2.4265, + "step": 12200 + }, + { + "epoch": 0.04720044533098298, + "grad_norm": 0.1324855387210846, + "learning_rate": 0.002, + "loss": 2.4064, + "step": 12210 + }, + { + "epoch": 0.04723910253436625, + "grad_norm": 0.14152173697948456, + "learning_rate": 0.002, + "loss": 2.4178, + "step": 12220 + }, + { + "epoch": 0.04727775973774953, + "grad_norm": 0.12873715162277222, + "learning_rate": 0.002, + "loss": 2.4109, + "step": 12230 + }, + { + "epoch": 0.04731641694113281, + "grad_norm": 0.12940950691699982, + "learning_rate": 0.002, + "loss": 2.4215, + "step": 12240 + }, + { + "epoch": 0.04735507414451609, + "grad_norm": 0.1321355104446411, + "learning_rate": 0.002, + "loss": 2.4134, + "step": 12250 + }, + { + "epoch": 0.04739373134789937, + "grad_norm": 0.16091695427894592, + "learning_rate": 0.002, + "loss": 2.3976, + "step": 12260 + }, + { + "epoch": 0.047432388551282643, + "grad_norm": 0.14886890351772308, + "learning_rate": 0.002, + "loss": 2.4064, + "step": 12270 + }, + { + "epoch": 0.04747104575466592, + "grad_norm": 0.13141034543514252, + "learning_rate": 0.002, + "loss": 2.3986, + "step": 12280 + }, + { + "epoch": 0.0475097029580492, + "grad_norm": 0.1348421722650528, + "learning_rate": 0.002, + "loss": 2.4091, + "step": 12290 + }, + { + "epoch": 0.04754836016143248, + "grad_norm": 0.1288251429796219, + "learning_rate": 0.002, + "loss": 2.4095, + "step": 12300 + }, + { + "epoch": 0.04758701736481576, + "grad_norm": 0.16734634339809418, + "learning_rate": 0.002, + "loss": 2.4137, + "step": 12310 + }, + { + "epoch": 0.047625674568199036, + "grad_norm": 0.1711134910583496, + "learning_rate": 0.002, + "loss": 2.4112, + "step": 12320 + }, + { + "epoch": 0.047664331771582316, + "grad_norm": 0.12048466503620148, + "learning_rate": 0.002, + "loss": 2.4302, + "step": 12330 + }, + { + "epoch": 0.047702988974965596, + "grad_norm": 0.13013233244419098, + "learning_rate": 0.002, + "loss": 2.4024, + "step": 12340 + }, + { + "epoch": 0.047741646178348876, + "grad_norm": 0.14628617465496063, + "learning_rate": 0.002, + "loss": 2.4207, + "step": 12350 + }, + { + "epoch": 0.04778030338173215, + "grad_norm": 0.11881358176469803, + "learning_rate": 0.002, + "loss": 2.4089, + "step": 12360 + }, + { + "epoch": 0.04781896058511543, + "grad_norm": 0.14722537994384766, + "learning_rate": 0.002, + "loss": 2.4204, + "step": 12370 + }, + { + "epoch": 0.04785761778849871, + "grad_norm": 0.1352567970752716, + "learning_rate": 0.002, + "loss": 2.4002, + "step": 12380 + }, + { + "epoch": 0.04789627499188199, + "grad_norm": 0.13809798657894135, + "learning_rate": 0.002, + "loss": 2.4103, + "step": 12390 + }, + { + "epoch": 0.04793493219526527, + "grad_norm": 0.134647399187088, + "learning_rate": 0.002, + "loss": 2.4146, + "step": 12400 + }, + { + "epoch": 0.04797358939864854, + "grad_norm": 0.13529935479164124, + "learning_rate": 0.002, + "loss": 2.4162, + "step": 12410 + }, + { + "epoch": 0.04801224660203182, + "grad_norm": 0.14426128566265106, + "learning_rate": 0.002, + "loss": 2.4014, + "step": 12420 + }, + { + "epoch": 0.0480509038054151, + "grad_norm": 0.1167021095752716, + "learning_rate": 0.002, + "loss": 2.425, + "step": 12430 + }, + { + "epoch": 0.04808956100879838, + "grad_norm": 0.16464290022850037, + "learning_rate": 0.002, + "loss": 2.4127, + "step": 12440 + }, + { + "epoch": 0.048128218212181655, + "grad_norm": 0.1276850402355194, + "learning_rate": 0.002, + "loss": 2.4103, + "step": 12450 + }, + { + "epoch": 0.048166875415564935, + "grad_norm": 0.14743928611278534, + "learning_rate": 0.002, + "loss": 2.4335, + "step": 12460 + }, + { + "epoch": 0.048205532618948214, + "grad_norm": 0.11716633290052414, + "learning_rate": 0.002, + "loss": 2.4171, + "step": 12470 + }, + { + "epoch": 0.048244189822331494, + "grad_norm": 0.12930072844028473, + "learning_rate": 0.002, + "loss": 2.4085, + "step": 12480 + }, + { + "epoch": 0.048282847025714774, + "grad_norm": 0.16280917823314667, + "learning_rate": 0.002, + "loss": 2.4118, + "step": 12490 + }, + { + "epoch": 0.04832150422909805, + "grad_norm": 0.1280793398618698, + "learning_rate": 0.002, + "loss": 2.3931, + "step": 12500 + }, + { + "epoch": 0.04836016143248133, + "grad_norm": 0.12769965827465057, + "learning_rate": 0.002, + "loss": 2.4027, + "step": 12510 + }, + { + "epoch": 0.04839881863586461, + "grad_norm": 0.21294499933719635, + "learning_rate": 0.002, + "loss": 2.4061, + "step": 12520 + }, + { + "epoch": 0.04843747583924789, + "grad_norm": 0.13654080033302307, + "learning_rate": 0.002, + "loss": 2.4085, + "step": 12530 + }, + { + "epoch": 0.04847613304263117, + "grad_norm": 0.1464715600013733, + "learning_rate": 0.002, + "loss": 2.3967, + "step": 12540 + }, + { + "epoch": 0.04851479024601444, + "grad_norm": 0.17120212316513062, + "learning_rate": 0.002, + "loss": 2.3977, + "step": 12550 + }, + { + "epoch": 0.04855344744939772, + "grad_norm": 0.12013162672519684, + "learning_rate": 0.002, + "loss": 2.4128, + "step": 12560 + }, + { + "epoch": 0.048592104652781, + "grad_norm": 0.12653093039989471, + "learning_rate": 0.002, + "loss": 2.4259, + "step": 12570 + }, + { + "epoch": 0.04863076185616428, + "grad_norm": 0.12446767836809158, + "learning_rate": 0.002, + "loss": 2.4048, + "step": 12580 + }, + { + "epoch": 0.04866941905954755, + "grad_norm": 0.12352242320775986, + "learning_rate": 0.002, + "loss": 2.3932, + "step": 12590 + }, + { + "epoch": 0.04870807626293083, + "grad_norm": 0.1217745840549469, + "learning_rate": 0.002, + "loss": 2.4092, + "step": 12600 + }, + { + "epoch": 0.04874673346631411, + "grad_norm": 0.11448900401592255, + "learning_rate": 0.002, + "loss": 2.4002, + "step": 12610 + }, + { + "epoch": 0.04878539066969739, + "grad_norm": 0.16743631660938263, + "learning_rate": 0.002, + "loss": 2.4037, + "step": 12620 + }, + { + "epoch": 0.04882404787308067, + "grad_norm": 0.1504344940185547, + "learning_rate": 0.002, + "loss": 2.4046, + "step": 12630 + }, + { + "epoch": 0.048862705076463946, + "grad_norm": 0.12713049352169037, + "learning_rate": 0.002, + "loss": 2.4006, + "step": 12640 + }, + { + "epoch": 0.048901362279847226, + "grad_norm": 0.15592017769813538, + "learning_rate": 0.002, + "loss": 2.3979, + "step": 12650 + }, + { + "epoch": 0.048940019483230505, + "grad_norm": 0.13741640746593475, + "learning_rate": 0.002, + "loss": 2.406, + "step": 12660 + }, + { + "epoch": 0.048978676686613785, + "grad_norm": 0.11524946987628937, + "learning_rate": 0.002, + "loss": 2.4036, + "step": 12670 + }, + { + "epoch": 0.049017333889997065, + "grad_norm": 0.15274588763713837, + "learning_rate": 0.002, + "loss": 2.4096, + "step": 12680 + }, + { + "epoch": 0.04905599109338034, + "grad_norm": 0.12144805490970612, + "learning_rate": 0.002, + "loss": 2.4143, + "step": 12690 + }, + { + "epoch": 0.04909464829676362, + "grad_norm": 0.12664386630058289, + "learning_rate": 0.002, + "loss": 2.3967, + "step": 12700 + }, + { + "epoch": 0.0491333055001469, + "grad_norm": 0.15074361860752106, + "learning_rate": 0.002, + "loss": 2.3987, + "step": 12710 + }, + { + "epoch": 0.04917196270353018, + "grad_norm": 0.12998323142528534, + "learning_rate": 0.002, + "loss": 2.4158, + "step": 12720 + }, + { + "epoch": 0.04921061990691345, + "grad_norm": 0.12403959035873413, + "learning_rate": 0.002, + "loss": 2.4115, + "step": 12730 + }, + { + "epoch": 0.04924927711029673, + "grad_norm": 0.12004856020212173, + "learning_rate": 0.002, + "loss": 2.398, + "step": 12740 + }, + { + "epoch": 0.04928793431368001, + "grad_norm": 0.12569855153560638, + "learning_rate": 0.002, + "loss": 2.414, + "step": 12750 + }, + { + "epoch": 0.04932659151706329, + "grad_norm": 0.10845328122377396, + "learning_rate": 0.002, + "loss": 2.4085, + "step": 12760 + }, + { + "epoch": 0.04936524872044657, + "grad_norm": 0.12075639516115189, + "learning_rate": 0.002, + "loss": 2.4115, + "step": 12770 + }, + { + "epoch": 0.049403905923829844, + "grad_norm": 0.1443193107843399, + "learning_rate": 0.002, + "loss": 2.4136, + "step": 12780 + }, + { + "epoch": 0.049442563127213124, + "grad_norm": 0.131768599152565, + "learning_rate": 0.002, + "loss": 2.3755, + "step": 12790 + }, + { + "epoch": 0.049481220330596404, + "grad_norm": 0.12630602717399597, + "learning_rate": 0.002, + "loss": 2.4034, + "step": 12800 + }, + { + "epoch": 0.049519877533979684, + "grad_norm": 0.13021297752857208, + "learning_rate": 0.002, + "loss": 2.3968, + "step": 12810 + }, + { + "epoch": 0.049558534737362964, + "grad_norm": 0.12939077615737915, + "learning_rate": 0.002, + "loss": 2.4204, + "step": 12820 + }, + { + "epoch": 0.04959719194074624, + "grad_norm": 0.13959570229053497, + "learning_rate": 0.002, + "loss": 2.4009, + "step": 12830 + }, + { + "epoch": 0.04963584914412952, + "grad_norm": 0.12185297906398773, + "learning_rate": 0.002, + "loss": 2.4016, + "step": 12840 + }, + { + "epoch": 0.0496745063475128, + "grad_norm": 0.12302763015031815, + "learning_rate": 0.002, + "loss": 2.4199, + "step": 12850 + }, + { + "epoch": 0.049713163550896076, + "grad_norm": 0.13255852460861206, + "learning_rate": 0.002, + "loss": 2.3924, + "step": 12860 + }, + { + "epoch": 0.04975182075427935, + "grad_norm": 0.1301736682653427, + "learning_rate": 0.002, + "loss": 2.4136, + "step": 12870 + }, + { + "epoch": 0.04979047795766263, + "grad_norm": 0.12346645444631577, + "learning_rate": 0.002, + "loss": 2.4028, + "step": 12880 + }, + { + "epoch": 0.04982913516104591, + "grad_norm": 0.12624023854732513, + "learning_rate": 0.002, + "loss": 2.4112, + "step": 12890 + }, + { + "epoch": 0.04986779236442919, + "grad_norm": 0.13940802216529846, + "learning_rate": 0.002, + "loss": 2.4061, + "step": 12900 + }, + { + "epoch": 0.04990644956781247, + "grad_norm": 0.12112889438867569, + "learning_rate": 0.002, + "loss": 2.405, + "step": 12910 + }, + { + "epoch": 0.04994510677119574, + "grad_norm": 0.1299026608467102, + "learning_rate": 0.002, + "loss": 2.3977, + "step": 12920 + }, + { + "epoch": 0.04998376397457902, + "grad_norm": 0.1561487466096878, + "learning_rate": 0.002, + "loss": 2.4104, + "step": 12930 + }, + { + "epoch": 0.0500224211779623, + "grad_norm": 0.11744363605976105, + "learning_rate": 0.002, + "loss": 2.3941, + "step": 12940 + }, + { + "epoch": 0.05006107838134558, + "grad_norm": 0.1262972503900528, + "learning_rate": 0.002, + "loss": 2.4013, + "step": 12950 + }, + { + "epoch": 0.050099735584728855, + "grad_norm": 0.1282545030117035, + "learning_rate": 0.002, + "loss": 2.4071, + "step": 12960 + }, + { + "epoch": 0.050138392788112135, + "grad_norm": 0.1636199951171875, + "learning_rate": 0.002, + "loss": 2.3944, + "step": 12970 + }, + { + "epoch": 0.050177049991495415, + "grad_norm": 0.11423587054014206, + "learning_rate": 0.002, + "loss": 2.4059, + "step": 12980 + }, + { + "epoch": 0.050215707194878695, + "grad_norm": 0.1306290328502655, + "learning_rate": 0.002, + "loss": 2.4006, + "step": 12990 + }, + { + "epoch": 0.050254364398261975, + "grad_norm": 0.1477252095937729, + "learning_rate": 0.002, + "loss": 2.3894, + "step": 13000 + }, + { + "epoch": 0.05029302160164525, + "grad_norm": 0.1320551335811615, + "learning_rate": 0.002, + "loss": 2.4272, + "step": 13010 + }, + { + "epoch": 0.05033167880502853, + "grad_norm": 0.14027470350265503, + "learning_rate": 0.002, + "loss": 2.4151, + "step": 13020 + }, + { + "epoch": 0.05037033600841181, + "grad_norm": 0.1384015679359436, + "learning_rate": 0.002, + "loss": 2.4055, + "step": 13030 + }, + { + "epoch": 0.05040899321179509, + "grad_norm": 0.13307714462280273, + "learning_rate": 0.002, + "loss": 2.3914, + "step": 13040 + }, + { + "epoch": 0.05044765041517837, + "grad_norm": 0.11577140539884567, + "learning_rate": 0.002, + "loss": 2.3969, + "step": 13050 + }, + { + "epoch": 0.05048630761856164, + "grad_norm": 0.12980316579341888, + "learning_rate": 0.002, + "loss": 2.4087, + "step": 13060 + }, + { + "epoch": 0.05052496482194492, + "grad_norm": 0.13999392092227936, + "learning_rate": 0.002, + "loss": 2.3943, + "step": 13070 + }, + { + "epoch": 0.0505636220253282, + "grad_norm": 0.10867933183908463, + "learning_rate": 0.002, + "loss": 2.4152, + "step": 13080 + }, + { + "epoch": 0.05060227922871148, + "grad_norm": 0.11368278414011002, + "learning_rate": 0.002, + "loss": 2.4101, + "step": 13090 + }, + { + "epoch": 0.05064093643209475, + "grad_norm": 0.11560127884149551, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 13100 + }, + { + "epoch": 0.05067959363547803, + "grad_norm": 0.11178620159626007, + "learning_rate": 0.002, + "loss": 2.3975, + "step": 13110 + }, + { + "epoch": 0.05071825083886131, + "grad_norm": 0.13433168828487396, + "learning_rate": 0.002, + "loss": 2.3876, + "step": 13120 + }, + { + "epoch": 0.05075690804224459, + "grad_norm": 0.1378285437822342, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 13130 + }, + { + "epoch": 0.05079556524562787, + "grad_norm": 0.11138379573822021, + "learning_rate": 0.002, + "loss": 2.4058, + "step": 13140 + }, + { + "epoch": 0.050834222449011146, + "grad_norm": 0.1196100190281868, + "learning_rate": 0.002, + "loss": 2.3945, + "step": 13150 + }, + { + "epoch": 0.050872879652394426, + "grad_norm": 0.14321012794971466, + "learning_rate": 0.002, + "loss": 2.3982, + "step": 13160 + }, + { + "epoch": 0.050911536855777706, + "grad_norm": 0.12484076619148254, + "learning_rate": 0.002, + "loss": 2.4067, + "step": 13170 + }, + { + "epoch": 0.050950194059160986, + "grad_norm": 0.14980606734752655, + "learning_rate": 0.002, + "loss": 2.407, + "step": 13180 + }, + { + "epoch": 0.050988851262544266, + "grad_norm": 0.13169430196285248, + "learning_rate": 0.002, + "loss": 2.4015, + "step": 13190 + }, + { + "epoch": 0.05102750846592754, + "grad_norm": 0.1352589726448059, + "learning_rate": 0.002, + "loss": 2.4024, + "step": 13200 + }, + { + "epoch": 0.05106616566931082, + "grad_norm": 0.14622355997562408, + "learning_rate": 0.002, + "loss": 2.4073, + "step": 13210 + }, + { + "epoch": 0.0511048228726941, + "grad_norm": 0.1075468584895134, + "learning_rate": 0.002, + "loss": 2.4132, + "step": 13220 + }, + { + "epoch": 0.05114348007607738, + "grad_norm": 0.11611749976873398, + "learning_rate": 0.002, + "loss": 2.4027, + "step": 13230 + }, + { + "epoch": 0.05118213727946065, + "grad_norm": 0.13044321537017822, + "learning_rate": 0.002, + "loss": 2.4114, + "step": 13240 + }, + { + "epoch": 0.05122079448284393, + "grad_norm": 0.1160300001502037, + "learning_rate": 0.002, + "loss": 2.4056, + "step": 13250 + }, + { + "epoch": 0.05125945168622721, + "grad_norm": 0.12946556508541107, + "learning_rate": 0.002, + "loss": 2.397, + "step": 13260 + }, + { + "epoch": 0.05129810888961049, + "grad_norm": 0.11253096908330917, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 13270 + }, + { + "epoch": 0.05133676609299377, + "grad_norm": 0.11658414453268051, + "learning_rate": 0.002, + "loss": 2.3931, + "step": 13280 + }, + { + "epoch": 0.051375423296377044, + "grad_norm": 0.13752421736717224, + "learning_rate": 0.002, + "loss": 2.4166, + "step": 13290 + }, + { + "epoch": 0.051414080499760324, + "grad_norm": 0.1143462136387825, + "learning_rate": 0.002, + "loss": 2.414, + "step": 13300 + }, + { + "epoch": 0.051452737703143604, + "grad_norm": 0.1488148272037506, + "learning_rate": 0.002, + "loss": 2.3871, + "step": 13310 + }, + { + "epoch": 0.051491394906526884, + "grad_norm": 0.1388746201992035, + "learning_rate": 0.002, + "loss": 2.3956, + "step": 13320 + }, + { + "epoch": 0.051530052109910164, + "grad_norm": 0.1489126980304718, + "learning_rate": 0.002, + "loss": 2.4011, + "step": 13330 + }, + { + "epoch": 0.05156870931329344, + "grad_norm": 0.14923395216464996, + "learning_rate": 0.002, + "loss": 2.4017, + "step": 13340 + }, + { + "epoch": 0.05160736651667672, + "grad_norm": 0.1405162364244461, + "learning_rate": 0.002, + "loss": 2.3972, + "step": 13350 + }, + { + "epoch": 0.05164602372006, + "grad_norm": 0.1152169480919838, + "learning_rate": 0.002, + "loss": 2.3877, + "step": 13360 + }, + { + "epoch": 0.05168468092344328, + "grad_norm": 0.11067835241556168, + "learning_rate": 0.002, + "loss": 2.3946, + "step": 13370 + }, + { + "epoch": 0.05172333812682655, + "grad_norm": 0.13168556988239288, + "learning_rate": 0.002, + "loss": 2.4041, + "step": 13380 + }, + { + "epoch": 0.05176199533020983, + "grad_norm": 0.1383281648159027, + "learning_rate": 0.002, + "loss": 2.3964, + "step": 13390 + }, + { + "epoch": 0.05180065253359311, + "grad_norm": 0.11505181342363358, + "learning_rate": 0.002, + "loss": 2.3998, + "step": 13400 + }, + { + "epoch": 0.05183930973697639, + "grad_norm": 0.13439294695854187, + "learning_rate": 0.002, + "loss": 2.4019, + "step": 13410 + }, + { + "epoch": 0.05187796694035967, + "grad_norm": 0.12931932508945465, + "learning_rate": 0.002, + "loss": 2.4123, + "step": 13420 + }, + { + "epoch": 0.05191662414374294, + "grad_norm": 0.12582558393478394, + "learning_rate": 0.002, + "loss": 2.408, + "step": 13430 + }, + { + "epoch": 0.05195528134712622, + "grad_norm": 0.12109485268592834, + "learning_rate": 0.002, + "loss": 2.4017, + "step": 13440 + }, + { + "epoch": 0.0519939385505095, + "grad_norm": 0.15276485681533813, + "learning_rate": 0.002, + "loss": 2.4023, + "step": 13450 + }, + { + "epoch": 0.05203259575389278, + "grad_norm": 0.12507130205631256, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 13460 + }, + { + "epoch": 0.052071252957276055, + "grad_norm": 0.14027948677539825, + "learning_rate": 0.002, + "loss": 2.4167, + "step": 13470 + }, + { + "epoch": 0.052109910160659335, + "grad_norm": 0.10799466818571091, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 13480 + }, + { + "epoch": 0.052148567364042615, + "grad_norm": 0.12428688257932663, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 13490 + }, + { + "epoch": 0.052187224567425895, + "grad_norm": 0.11244092136621475, + "learning_rate": 0.002, + "loss": 2.3953, + "step": 13500 + }, + { + "epoch": 0.052225881770809175, + "grad_norm": 0.14743168652057648, + "learning_rate": 0.002, + "loss": 2.3916, + "step": 13510 + }, + { + "epoch": 0.05226453897419245, + "grad_norm": 0.11982240527868271, + "learning_rate": 0.002, + "loss": 2.4215, + "step": 13520 + }, + { + "epoch": 0.05230319617757573, + "grad_norm": 0.12308619916439056, + "learning_rate": 0.002, + "loss": 2.3991, + "step": 13530 + }, + { + "epoch": 0.05234185338095901, + "grad_norm": 0.12184516340494156, + "learning_rate": 0.002, + "loss": 2.3979, + "step": 13540 + }, + { + "epoch": 0.05238051058434229, + "grad_norm": 0.13491886854171753, + "learning_rate": 0.002, + "loss": 2.3994, + "step": 13550 + }, + { + "epoch": 0.05241916778772557, + "grad_norm": 0.12219549715518951, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 13560 + }, + { + "epoch": 0.05245782499110884, + "grad_norm": 0.11105125397443771, + "learning_rate": 0.002, + "loss": 2.4074, + "step": 13570 + }, + { + "epoch": 0.05249648219449212, + "grad_norm": 0.16937963664531708, + "learning_rate": 0.002, + "loss": 2.4001, + "step": 13580 + }, + { + "epoch": 0.0525351393978754, + "grad_norm": 0.10928713530302048, + "learning_rate": 0.002, + "loss": 2.4093, + "step": 13590 + }, + { + "epoch": 0.05257379660125868, + "grad_norm": 0.16527491807937622, + "learning_rate": 0.002, + "loss": 2.4012, + "step": 13600 + }, + { + "epoch": 0.052612453804641954, + "grad_norm": 0.13442641496658325, + "learning_rate": 0.002, + "loss": 2.4026, + "step": 13610 + }, + { + "epoch": 0.052651111008025234, + "grad_norm": 0.12295237183570862, + "learning_rate": 0.002, + "loss": 2.3935, + "step": 13620 + }, + { + "epoch": 0.052689768211408514, + "grad_norm": 0.13979977369308472, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 13630 + }, + { + "epoch": 0.052728425414791794, + "grad_norm": 0.12009885162115097, + "learning_rate": 0.002, + "loss": 2.4035, + "step": 13640 + }, + { + "epoch": 0.052767082618175074, + "grad_norm": 0.14299508929252625, + "learning_rate": 0.002, + "loss": 2.4086, + "step": 13650 + }, + { + "epoch": 0.052805739821558347, + "grad_norm": 0.14301536977291107, + "learning_rate": 0.002, + "loss": 2.4084, + "step": 13660 + }, + { + "epoch": 0.052844397024941626, + "grad_norm": 0.12862665951251984, + "learning_rate": 0.002, + "loss": 2.4025, + "step": 13670 + }, + { + "epoch": 0.052883054228324906, + "grad_norm": 0.162658229470253, + "learning_rate": 0.002, + "loss": 2.3974, + "step": 13680 + }, + { + "epoch": 0.052921711431708186, + "grad_norm": 0.10682334005832672, + "learning_rate": 0.002, + "loss": 2.411, + "step": 13690 + }, + { + "epoch": 0.052960368635091466, + "grad_norm": 0.13332685828208923, + "learning_rate": 0.002, + "loss": 2.4163, + "step": 13700 + }, + { + "epoch": 0.05299902583847474, + "grad_norm": 0.11928480863571167, + "learning_rate": 0.002, + "loss": 2.3914, + "step": 13710 + }, + { + "epoch": 0.05303768304185802, + "grad_norm": 0.13431620597839355, + "learning_rate": 0.002, + "loss": 2.4059, + "step": 13720 + }, + { + "epoch": 0.0530763402452413, + "grad_norm": 0.14743177592754364, + "learning_rate": 0.002, + "loss": 2.3915, + "step": 13730 + }, + { + "epoch": 0.05311499744862458, + "grad_norm": 0.14384421706199646, + "learning_rate": 0.002, + "loss": 2.4026, + "step": 13740 + }, + { + "epoch": 0.05315365465200785, + "grad_norm": 0.11944720149040222, + "learning_rate": 0.002, + "loss": 2.3983, + "step": 13750 + }, + { + "epoch": 0.05319231185539113, + "grad_norm": 0.11968901008367538, + "learning_rate": 0.002, + "loss": 2.4023, + "step": 13760 + }, + { + "epoch": 0.05323096905877441, + "grad_norm": 0.1279086172580719, + "learning_rate": 0.002, + "loss": 2.4049, + "step": 13770 + }, + { + "epoch": 0.05326962626215769, + "grad_norm": 0.11430168896913528, + "learning_rate": 0.002, + "loss": 2.3911, + "step": 13780 + }, + { + "epoch": 0.05330828346554097, + "grad_norm": 0.13571108877658844, + "learning_rate": 0.002, + "loss": 2.3965, + "step": 13790 + }, + { + "epoch": 0.053346940668924245, + "grad_norm": 0.1266448199748993, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 13800 + }, + { + "epoch": 0.053385597872307525, + "grad_norm": 0.13343386352062225, + "learning_rate": 0.002, + "loss": 2.4135, + "step": 13810 + }, + { + "epoch": 0.053424255075690805, + "grad_norm": 0.14449051022529602, + "learning_rate": 0.002, + "loss": 2.4074, + "step": 13820 + }, + { + "epoch": 0.053462912279074085, + "grad_norm": 0.12064560502767563, + "learning_rate": 0.002, + "loss": 2.3903, + "step": 13830 + }, + { + "epoch": 0.053501569482457365, + "grad_norm": 0.13178220391273499, + "learning_rate": 0.002, + "loss": 2.387, + "step": 13840 + }, + { + "epoch": 0.05354022668584064, + "grad_norm": 0.13305504620075226, + "learning_rate": 0.002, + "loss": 2.3928, + "step": 13850 + }, + { + "epoch": 0.05357888388922392, + "grad_norm": 0.14914706349372864, + "learning_rate": 0.002, + "loss": 2.3985, + "step": 13860 + }, + { + "epoch": 0.0536175410926072, + "grad_norm": 0.13752660155296326, + "learning_rate": 0.002, + "loss": 2.3925, + "step": 13870 + }, + { + "epoch": 0.05365619829599048, + "grad_norm": 0.1223013773560524, + "learning_rate": 0.002, + "loss": 2.3944, + "step": 13880 + }, + { + "epoch": 0.05369485549937375, + "grad_norm": 0.13957269489765167, + "learning_rate": 0.002, + "loss": 2.3994, + "step": 13890 + }, + { + "epoch": 0.05373351270275703, + "grad_norm": 0.159816175699234, + "learning_rate": 0.002, + "loss": 2.3863, + "step": 13900 + }, + { + "epoch": 0.05377216990614031, + "grad_norm": 0.10616747289896011, + "learning_rate": 0.002, + "loss": 2.389, + "step": 13910 + }, + { + "epoch": 0.05381082710952359, + "grad_norm": 0.1293746680021286, + "learning_rate": 0.002, + "loss": 2.3972, + "step": 13920 + }, + { + "epoch": 0.05384948431290687, + "grad_norm": 0.13460808992385864, + "learning_rate": 0.002, + "loss": 2.3938, + "step": 13930 + }, + { + "epoch": 0.05388814151629014, + "grad_norm": 0.13545729219913483, + "learning_rate": 0.002, + "loss": 2.3951, + "step": 13940 + }, + { + "epoch": 0.05392679871967342, + "grad_norm": 0.11745970696210861, + "learning_rate": 0.002, + "loss": 2.4063, + "step": 13950 + }, + { + "epoch": 0.0539654559230567, + "grad_norm": 0.11512023210525513, + "learning_rate": 0.002, + "loss": 2.3982, + "step": 13960 + }, + { + "epoch": 0.05400411312643998, + "grad_norm": 0.12416357547044754, + "learning_rate": 0.002, + "loss": 2.3919, + "step": 13970 + }, + { + "epoch": 0.054042770329823256, + "grad_norm": 0.12409238517284393, + "learning_rate": 0.002, + "loss": 2.4102, + "step": 13980 + }, + { + "epoch": 0.054081427533206536, + "grad_norm": 0.10481012612581253, + "learning_rate": 0.002, + "loss": 2.4039, + "step": 13990 + }, + { + "epoch": 0.054120084736589816, + "grad_norm": 0.11006207764148712, + "learning_rate": 0.002, + "loss": 2.4024, + "step": 14000 + }, + { + "epoch": 0.054158741939973096, + "grad_norm": 0.13748984038829803, + "learning_rate": 0.002, + "loss": 2.3995, + "step": 14010 + }, + { + "epoch": 0.054197399143356376, + "grad_norm": 0.13295917212963104, + "learning_rate": 0.002, + "loss": 2.3914, + "step": 14020 + }, + { + "epoch": 0.05423605634673965, + "grad_norm": 0.10712581872940063, + "learning_rate": 0.002, + "loss": 2.392, + "step": 14030 + }, + { + "epoch": 0.05427471355012293, + "grad_norm": 0.17704936861991882, + "learning_rate": 0.002, + "loss": 2.3972, + "step": 14040 + }, + { + "epoch": 0.05431337075350621, + "grad_norm": 0.11886173486709595, + "learning_rate": 0.002, + "loss": 2.3952, + "step": 14050 + }, + { + "epoch": 0.05435202795688949, + "grad_norm": 0.12546727061271667, + "learning_rate": 0.002, + "loss": 2.397, + "step": 14060 + }, + { + "epoch": 0.05439068516027277, + "grad_norm": 0.16801774501800537, + "learning_rate": 0.002, + "loss": 2.4181, + "step": 14070 + }, + { + "epoch": 0.05442934236365604, + "grad_norm": 0.10857007652521133, + "learning_rate": 0.002, + "loss": 2.4094, + "step": 14080 + }, + { + "epoch": 0.05446799956703932, + "grad_norm": 0.12261257320642471, + "learning_rate": 0.002, + "loss": 2.4064, + "step": 14090 + }, + { + "epoch": 0.0545066567704226, + "grad_norm": 0.12848199903964996, + "learning_rate": 0.002, + "loss": 2.3937, + "step": 14100 + }, + { + "epoch": 0.05454531397380588, + "grad_norm": 0.10281208902597427, + "learning_rate": 0.002, + "loss": 2.3913, + "step": 14110 + }, + { + "epoch": 0.054583971177189154, + "grad_norm": 0.14198198914527893, + "learning_rate": 0.002, + "loss": 2.4063, + "step": 14120 + }, + { + "epoch": 0.054622628380572434, + "grad_norm": 0.1394672691822052, + "learning_rate": 0.002, + "loss": 2.3922, + "step": 14130 + }, + { + "epoch": 0.054661285583955714, + "grad_norm": 0.13187329471111298, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 14140 + }, + { + "epoch": 0.054699942787338994, + "grad_norm": 0.14645549654960632, + "learning_rate": 0.002, + "loss": 2.4134, + "step": 14150 + }, + { + "epoch": 0.054738599990722274, + "grad_norm": 0.13064813613891602, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 14160 + }, + { + "epoch": 0.05477725719410555, + "grad_norm": 0.12041107565164566, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 14170 + }, + { + "epoch": 0.05481591439748883, + "grad_norm": 0.11944100260734558, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 14180 + }, + { + "epoch": 0.05485457160087211, + "grad_norm": 0.13119396567344666, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 14190 + }, + { + "epoch": 0.05489322880425539, + "grad_norm": 0.12450587749481201, + "learning_rate": 0.002, + "loss": 2.3975, + "step": 14200 + }, + { + "epoch": 0.05493188600763867, + "grad_norm": 0.1268710047006607, + "learning_rate": 0.002, + "loss": 2.3874, + "step": 14210 + }, + { + "epoch": 0.05497054321102194, + "grad_norm": 0.12374629825353622, + "learning_rate": 0.002, + "loss": 2.3886, + "step": 14220 + }, + { + "epoch": 0.05500920041440522, + "grad_norm": 0.14392772316932678, + "learning_rate": 0.002, + "loss": 2.4025, + "step": 14230 + }, + { + "epoch": 0.0550478576177885, + "grad_norm": 0.11412364989519119, + "learning_rate": 0.002, + "loss": 2.3998, + "step": 14240 + }, + { + "epoch": 0.05508651482117178, + "grad_norm": 0.11980044096708298, + "learning_rate": 0.002, + "loss": 2.3938, + "step": 14250 + }, + { + "epoch": 0.05512517202455505, + "grad_norm": 0.13415683805942535, + "learning_rate": 0.002, + "loss": 2.4014, + "step": 14260 + }, + { + "epoch": 0.05516382922793833, + "grad_norm": 0.1138518825173378, + "learning_rate": 0.002, + "loss": 2.3901, + "step": 14270 + }, + { + "epoch": 0.05520248643132161, + "grad_norm": 0.12392842769622803, + "learning_rate": 0.002, + "loss": 2.3895, + "step": 14280 + }, + { + "epoch": 0.05524114363470489, + "grad_norm": 0.1531682163476944, + "learning_rate": 0.002, + "loss": 2.3988, + "step": 14290 + }, + { + "epoch": 0.05527980083808817, + "grad_norm": 0.1261725127696991, + "learning_rate": 0.002, + "loss": 2.3926, + "step": 14300 + }, + { + "epoch": 0.055318458041471445, + "grad_norm": 0.10306981950998306, + "learning_rate": 0.002, + "loss": 2.3912, + "step": 14310 + }, + { + "epoch": 0.055357115244854725, + "grad_norm": 0.13406990468502045, + "learning_rate": 0.002, + "loss": 2.416, + "step": 14320 + }, + { + "epoch": 0.055395772448238005, + "grad_norm": 0.12140738219022751, + "learning_rate": 0.002, + "loss": 2.3945, + "step": 14330 + }, + { + "epoch": 0.055434429651621285, + "grad_norm": 0.12369288504123688, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 14340 + }, + { + "epoch": 0.055473086855004565, + "grad_norm": 0.12816184759140015, + "learning_rate": 0.002, + "loss": 2.3931, + "step": 14350 + }, + { + "epoch": 0.05551174405838784, + "grad_norm": 0.12383025139570236, + "learning_rate": 0.002, + "loss": 2.3911, + "step": 14360 + }, + { + "epoch": 0.05555040126177112, + "grad_norm": 0.2093370109796524, + "learning_rate": 0.002, + "loss": 2.3928, + "step": 14370 + }, + { + "epoch": 0.0555890584651544, + "grad_norm": 0.12548458576202393, + "learning_rate": 0.002, + "loss": 2.4142, + "step": 14380 + }, + { + "epoch": 0.05562771566853768, + "grad_norm": 0.11871566623449326, + "learning_rate": 0.002, + "loss": 2.3986, + "step": 14390 + }, + { + "epoch": 0.05566637287192095, + "grad_norm": 0.12631046772003174, + "learning_rate": 0.002, + "loss": 2.3936, + "step": 14400 + }, + { + "epoch": 0.05570503007530423, + "grad_norm": 0.16620250046253204, + "learning_rate": 0.002, + "loss": 2.4023, + "step": 14410 + }, + { + "epoch": 0.05574368727868751, + "grad_norm": 0.13363073766231537, + "learning_rate": 0.002, + "loss": 2.3915, + "step": 14420 + }, + { + "epoch": 0.05578234448207079, + "grad_norm": 0.1280767321586609, + "learning_rate": 0.002, + "loss": 2.3954, + "step": 14430 + }, + { + "epoch": 0.05582100168545407, + "grad_norm": 0.11046691983938217, + "learning_rate": 0.002, + "loss": 2.3925, + "step": 14440 + }, + { + "epoch": 0.055859658888837344, + "grad_norm": 0.13728322088718414, + "learning_rate": 0.002, + "loss": 2.3992, + "step": 14450 + }, + { + "epoch": 0.055898316092220623, + "grad_norm": 0.1518576294183731, + "learning_rate": 0.002, + "loss": 2.3964, + "step": 14460 + }, + { + "epoch": 0.0559369732956039, + "grad_norm": 0.12583044171333313, + "learning_rate": 0.002, + "loss": 2.3944, + "step": 14470 + }, + { + "epoch": 0.05597563049898718, + "grad_norm": 0.11695846915245056, + "learning_rate": 0.002, + "loss": 2.3858, + "step": 14480 + }, + { + "epoch": 0.056014287702370456, + "grad_norm": 0.11865654587745667, + "learning_rate": 0.002, + "loss": 2.3907, + "step": 14490 + }, + { + "epoch": 0.056052944905753736, + "grad_norm": 0.13754767179489136, + "learning_rate": 0.002, + "loss": 2.3897, + "step": 14500 + }, + { + "epoch": 0.056091602109137016, + "grad_norm": 0.13682836294174194, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 14510 + }, + { + "epoch": 0.056130259312520296, + "grad_norm": 0.11361128836870193, + "learning_rate": 0.002, + "loss": 2.4157, + "step": 14520 + }, + { + "epoch": 0.056168916515903576, + "grad_norm": 0.14468665421009064, + "learning_rate": 0.002, + "loss": 2.3852, + "step": 14530 + }, + { + "epoch": 0.05620757371928685, + "grad_norm": 0.13871458172798157, + "learning_rate": 0.002, + "loss": 2.3996, + "step": 14540 + }, + { + "epoch": 0.05624623092267013, + "grad_norm": 0.11031243205070496, + "learning_rate": 0.002, + "loss": 2.4044, + "step": 14550 + }, + { + "epoch": 0.05628488812605341, + "grad_norm": 0.10833417624235153, + "learning_rate": 0.002, + "loss": 2.3902, + "step": 14560 + }, + { + "epoch": 0.05632354532943669, + "grad_norm": 0.12488801032304764, + "learning_rate": 0.002, + "loss": 2.4009, + "step": 14570 + }, + { + "epoch": 0.05636220253281997, + "grad_norm": 0.11902682483196259, + "learning_rate": 0.002, + "loss": 2.4042, + "step": 14580 + }, + { + "epoch": 0.05640085973620324, + "grad_norm": 0.15277214348316193, + "learning_rate": 0.002, + "loss": 2.3996, + "step": 14590 + }, + { + "epoch": 0.05643951693958652, + "grad_norm": 0.12215857207775116, + "learning_rate": 0.002, + "loss": 2.4015, + "step": 14600 + }, + { + "epoch": 0.0564781741429698, + "grad_norm": 0.10888016223907471, + "learning_rate": 0.002, + "loss": 2.3991, + "step": 14610 + }, + { + "epoch": 0.05651683134635308, + "grad_norm": 0.12086877226829529, + "learning_rate": 0.002, + "loss": 2.3884, + "step": 14620 + }, + { + "epoch": 0.056555488549736355, + "grad_norm": 0.12756405770778656, + "learning_rate": 0.002, + "loss": 2.402, + "step": 14630 + }, + { + "epoch": 0.056594145753119635, + "grad_norm": 0.11681199818849564, + "learning_rate": 0.002, + "loss": 2.3941, + "step": 14640 + }, + { + "epoch": 0.056632802956502915, + "grad_norm": 0.15769197046756744, + "learning_rate": 0.002, + "loss": 2.3885, + "step": 14650 + }, + { + "epoch": 0.056671460159886194, + "grad_norm": 0.14188191294670105, + "learning_rate": 0.002, + "loss": 2.4049, + "step": 14660 + }, + { + "epoch": 0.056710117363269474, + "grad_norm": 0.12859699130058289, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 14670 + }, + { + "epoch": 0.05674877456665275, + "grad_norm": 0.11465740203857422, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 14680 + }, + { + "epoch": 0.05678743177003603, + "grad_norm": 0.15560325980186462, + "learning_rate": 0.002, + "loss": 2.3911, + "step": 14690 + }, + { + "epoch": 0.05682608897341931, + "grad_norm": 0.1308726817369461, + "learning_rate": 0.002, + "loss": 2.3926, + "step": 14700 + }, + { + "epoch": 0.05686474617680259, + "grad_norm": 0.14174123108386993, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 14710 + }, + { + "epoch": 0.05690340338018587, + "grad_norm": 0.12352757155895233, + "learning_rate": 0.002, + "loss": 2.41, + "step": 14720 + }, + { + "epoch": 0.05694206058356914, + "grad_norm": 0.11764993518590927, + "learning_rate": 0.002, + "loss": 2.3907, + "step": 14730 + }, + { + "epoch": 0.05698071778695242, + "grad_norm": 0.12702719867229462, + "learning_rate": 0.002, + "loss": 2.394, + "step": 14740 + }, + { + "epoch": 0.0570193749903357, + "grad_norm": 0.11749805510044098, + "learning_rate": 0.002, + "loss": 2.3948, + "step": 14750 + }, + { + "epoch": 0.05705803219371898, + "grad_norm": 0.10064171999692917, + "learning_rate": 0.002, + "loss": 2.3877, + "step": 14760 + }, + { + "epoch": 0.05709668939710225, + "grad_norm": 0.12813323736190796, + "learning_rate": 0.002, + "loss": 2.3953, + "step": 14770 + }, + { + "epoch": 0.05713534660048553, + "grad_norm": 0.11712568998336792, + "learning_rate": 0.002, + "loss": 2.3969, + "step": 14780 + }, + { + "epoch": 0.05717400380386881, + "grad_norm": 0.1386580765247345, + "learning_rate": 0.002, + "loss": 2.3848, + "step": 14790 + }, + { + "epoch": 0.05721266100725209, + "grad_norm": 0.13500967621803284, + "learning_rate": 0.002, + "loss": 2.3887, + "step": 14800 + }, + { + "epoch": 0.05725131821063537, + "grad_norm": 0.12583929300308228, + "learning_rate": 0.002, + "loss": 2.3996, + "step": 14810 + }, + { + "epoch": 0.057289975414018646, + "grad_norm": 0.10110796242952347, + "learning_rate": 0.002, + "loss": 2.3934, + "step": 14820 + }, + { + "epoch": 0.057328632617401926, + "grad_norm": 0.11224600672721863, + "learning_rate": 0.002, + "loss": 2.397, + "step": 14830 + }, + { + "epoch": 0.057367289820785206, + "grad_norm": 0.1203756108880043, + "learning_rate": 0.002, + "loss": 2.4008, + "step": 14840 + }, + { + "epoch": 0.057405947024168485, + "grad_norm": 0.14332318305969238, + "learning_rate": 0.002, + "loss": 2.4115, + "step": 14850 + }, + { + "epoch": 0.057444604227551765, + "grad_norm": 0.13124069571495056, + "learning_rate": 0.002, + "loss": 2.3937, + "step": 14860 + }, + { + "epoch": 0.05748326143093504, + "grad_norm": 0.11798430979251862, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 14870 + }, + { + "epoch": 0.05752191863431832, + "grad_norm": 0.12214729934930801, + "learning_rate": 0.002, + "loss": 2.3888, + "step": 14880 + }, + { + "epoch": 0.0575605758377016, + "grad_norm": 0.1360270380973816, + "learning_rate": 0.002, + "loss": 2.4036, + "step": 14890 + }, + { + "epoch": 0.05759923304108488, + "grad_norm": 0.12253168225288391, + "learning_rate": 0.002, + "loss": 2.3981, + "step": 14900 + }, + { + "epoch": 0.05763789024446815, + "grad_norm": 0.12843644618988037, + "learning_rate": 0.002, + "loss": 2.4008, + "step": 14910 + }, + { + "epoch": 0.05767654744785143, + "grad_norm": 0.10690762847661972, + "learning_rate": 0.002, + "loss": 2.381, + "step": 14920 + }, + { + "epoch": 0.05771520465123471, + "grad_norm": 0.12708573043346405, + "learning_rate": 0.002, + "loss": 2.3987, + "step": 14930 + }, + { + "epoch": 0.05775386185461799, + "grad_norm": 0.11386797577142715, + "learning_rate": 0.002, + "loss": 2.3873, + "step": 14940 + }, + { + "epoch": 0.05779251905800127, + "grad_norm": 0.1272570788860321, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 14950 + }, + { + "epoch": 0.057831176261384544, + "grad_norm": 0.1215393990278244, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 14960 + }, + { + "epoch": 0.057869833464767824, + "grad_norm": 0.11468160897493362, + "learning_rate": 0.002, + "loss": 2.4048, + "step": 14970 + }, + { + "epoch": 0.057908490668151104, + "grad_norm": 0.12078743427991867, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 14980 + }, + { + "epoch": 0.057947147871534384, + "grad_norm": 0.1239713579416275, + "learning_rate": 0.002, + "loss": 2.406, + "step": 14990 + }, + { + "epoch": 0.05798580507491766, + "grad_norm": 0.14568138122558594, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 15000 + }, + { + "epoch": 0.05802446227830094, + "grad_norm": 0.1350502371788025, + "learning_rate": 0.002, + "loss": 2.4036, + "step": 15010 + }, + { + "epoch": 0.05806311948168422, + "grad_norm": 0.12064608931541443, + "learning_rate": 0.002, + "loss": 2.4013, + "step": 15020 + }, + { + "epoch": 0.0581017766850675, + "grad_norm": 0.13205865025520325, + "learning_rate": 0.002, + "loss": 2.3941, + "step": 15030 + }, + { + "epoch": 0.05814043388845078, + "grad_norm": 0.11407764256000519, + "learning_rate": 0.002, + "loss": 2.3833, + "step": 15040 + }, + { + "epoch": 0.05817909109183405, + "grad_norm": 0.1086033508181572, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 15050 + }, + { + "epoch": 0.05821774829521733, + "grad_norm": 0.11027983576059341, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 15060 + }, + { + "epoch": 0.05825640549860061, + "grad_norm": 0.12719698250293732, + "learning_rate": 0.002, + "loss": 2.387, + "step": 15070 + }, + { + "epoch": 0.05829506270198389, + "grad_norm": 0.12091485410928726, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 15080 + }, + { + "epoch": 0.05833371990536717, + "grad_norm": 0.11346925050020218, + "learning_rate": 0.002, + "loss": 2.3921, + "step": 15090 + }, + { + "epoch": 0.05837237710875044, + "grad_norm": 0.12581966817378998, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 15100 + }, + { + "epoch": 0.05841103431213372, + "grad_norm": 0.12088809907436371, + "learning_rate": 0.002, + "loss": 2.3979, + "step": 15110 + }, + { + "epoch": 0.058449691515517, + "grad_norm": 0.11056520789861679, + "learning_rate": 0.002, + "loss": 2.3911, + "step": 15120 + }, + { + "epoch": 0.05848834871890028, + "grad_norm": 0.11620938032865524, + "learning_rate": 0.002, + "loss": 2.3843, + "step": 15130 + }, + { + "epoch": 0.058527005922283555, + "grad_norm": 0.11881408095359802, + "learning_rate": 0.002, + "loss": 2.4168, + "step": 15140 + }, + { + "epoch": 0.058565663125666835, + "grad_norm": 0.1285400390625, + "learning_rate": 0.002, + "loss": 2.3895, + "step": 15150 + }, + { + "epoch": 0.058604320329050115, + "grad_norm": 0.11662331968545914, + "learning_rate": 0.002, + "loss": 2.399, + "step": 15160 + }, + { + "epoch": 0.058642977532433395, + "grad_norm": 0.15776588022708893, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 15170 + }, + { + "epoch": 0.058681634735816675, + "grad_norm": 0.12372921407222748, + "learning_rate": 0.002, + "loss": 2.4061, + "step": 15180 + }, + { + "epoch": 0.05872029193919995, + "grad_norm": 0.1367715746164322, + "learning_rate": 0.002, + "loss": 2.3926, + "step": 15190 + }, + { + "epoch": 0.05875894914258323, + "grad_norm": 0.11921197175979614, + "learning_rate": 0.002, + "loss": 2.4003, + "step": 15200 + }, + { + "epoch": 0.05879760634596651, + "grad_norm": 0.12707039713859558, + "learning_rate": 0.002, + "loss": 2.3901, + "step": 15210 + }, + { + "epoch": 0.05883626354934979, + "grad_norm": 0.11815090477466583, + "learning_rate": 0.002, + "loss": 2.4102, + "step": 15220 + }, + { + "epoch": 0.05887492075273307, + "grad_norm": 0.1395442932844162, + "learning_rate": 0.002, + "loss": 2.3962, + "step": 15230 + }, + { + "epoch": 0.05891357795611634, + "grad_norm": 0.10509292781352997, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 15240 + }, + { + "epoch": 0.05895223515949962, + "grad_norm": 0.13025058805942535, + "learning_rate": 0.002, + "loss": 2.3921, + "step": 15250 + }, + { + "epoch": 0.0589908923628829, + "grad_norm": 0.17052233219146729, + "learning_rate": 0.002, + "loss": 2.381, + "step": 15260 + }, + { + "epoch": 0.05902954956626618, + "grad_norm": 0.1250266581773758, + "learning_rate": 0.002, + "loss": 2.3949, + "step": 15270 + }, + { + "epoch": 0.05906820676964945, + "grad_norm": 0.11990693211555481, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 15280 + }, + { + "epoch": 0.05910686397303273, + "grad_norm": 0.11827023327350616, + "learning_rate": 0.002, + "loss": 2.4032, + "step": 15290 + }, + { + "epoch": 0.05914552117641601, + "grad_norm": 0.12932279706001282, + "learning_rate": 0.002, + "loss": 2.4075, + "step": 15300 + }, + { + "epoch": 0.05918417837979929, + "grad_norm": 0.14034594595432281, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 15310 + }, + { + "epoch": 0.05922283558318257, + "grad_norm": 0.1353245973587036, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 15320 + }, + { + "epoch": 0.059261492786565846, + "grad_norm": 0.11280287802219391, + "learning_rate": 0.002, + "loss": 2.3859, + "step": 15330 + }, + { + "epoch": 0.059300149989949126, + "grad_norm": 0.10668610036373138, + "learning_rate": 0.002, + "loss": 2.3887, + "step": 15340 + }, + { + "epoch": 0.059338807193332406, + "grad_norm": 0.16798001527786255, + "learning_rate": 0.002, + "loss": 2.3912, + "step": 15350 + }, + { + "epoch": 0.059377464396715686, + "grad_norm": 0.1263061910867691, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 15360 + }, + { + "epoch": 0.059416121600098966, + "grad_norm": 0.11349482089281082, + "learning_rate": 0.002, + "loss": 2.4078, + "step": 15370 + }, + { + "epoch": 0.05945477880348224, + "grad_norm": 0.142970010638237, + "learning_rate": 0.002, + "loss": 2.4013, + "step": 15380 + }, + { + "epoch": 0.05949343600686552, + "grad_norm": 0.1031302735209465, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 15390 + }, + { + "epoch": 0.0595320932102488, + "grad_norm": 0.11524147540330887, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 15400 + }, + { + "epoch": 0.05957075041363208, + "grad_norm": 0.12222982197999954, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 15410 + }, + { + "epoch": 0.05960940761701535, + "grad_norm": 0.10952379554510117, + "learning_rate": 0.002, + "loss": 2.3993, + "step": 15420 + }, + { + "epoch": 0.05964806482039863, + "grad_norm": 0.1340419203042984, + "learning_rate": 0.002, + "loss": 2.3869, + "step": 15430 + }, + { + "epoch": 0.05968672202378191, + "grad_norm": 0.17222769558429718, + "learning_rate": 0.002, + "loss": 2.3963, + "step": 15440 + }, + { + "epoch": 0.05972537922716519, + "grad_norm": 0.11056956648826599, + "learning_rate": 0.002, + "loss": 2.39, + "step": 15450 + }, + { + "epoch": 0.05976403643054847, + "grad_norm": 0.11266839504241943, + "learning_rate": 0.002, + "loss": 2.391, + "step": 15460 + }, + { + "epoch": 0.059802693633931744, + "grad_norm": 0.14877283573150635, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 15470 + }, + { + "epoch": 0.059841350837315024, + "grad_norm": 0.1140676960349083, + "learning_rate": 0.002, + "loss": 2.4032, + "step": 15480 + }, + { + "epoch": 0.059880008040698304, + "grad_norm": 0.11659525334835052, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 15490 + }, + { + "epoch": 0.059918665244081584, + "grad_norm": 0.0978536531329155, + "learning_rate": 0.002, + "loss": 2.3965, + "step": 15500 + }, + { + "epoch": 0.05995732244746486, + "grad_norm": 0.13969437777996063, + "learning_rate": 0.002, + "loss": 2.4046, + "step": 15510 + }, + { + "epoch": 0.05999597965084814, + "grad_norm": 0.1132916733622551, + "learning_rate": 0.002, + "loss": 2.3867, + "step": 15520 + }, + { + "epoch": 0.06003463685423142, + "grad_norm": 0.12317940592765808, + "learning_rate": 0.002, + "loss": 2.3947, + "step": 15530 + }, + { + "epoch": 0.0600732940576147, + "grad_norm": 0.14708033204078674, + "learning_rate": 0.002, + "loss": 2.3983, + "step": 15540 + }, + { + "epoch": 0.06011195126099798, + "grad_norm": 0.12788653373718262, + "learning_rate": 0.002, + "loss": 2.3984, + "step": 15550 + }, + { + "epoch": 0.06015060846438125, + "grad_norm": 0.12532587349414825, + "learning_rate": 0.002, + "loss": 2.3927, + "step": 15560 + }, + { + "epoch": 0.06018926566776453, + "grad_norm": 0.10975198447704315, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 15570 + }, + { + "epoch": 0.06022792287114781, + "grad_norm": 0.13159969449043274, + "learning_rate": 0.002, + "loss": 2.3946, + "step": 15580 + }, + { + "epoch": 0.06026658007453109, + "grad_norm": 0.13560065627098083, + "learning_rate": 0.002, + "loss": 2.3945, + "step": 15590 + }, + { + "epoch": 0.06030523727791437, + "grad_norm": 0.11611117422580719, + "learning_rate": 0.002, + "loss": 2.3913, + "step": 15600 + }, + { + "epoch": 0.06034389448129764, + "grad_norm": 0.1417783945798874, + "learning_rate": 0.002, + "loss": 2.391, + "step": 15610 + }, + { + "epoch": 0.06038255168468092, + "grad_norm": 0.0967651829123497, + "learning_rate": 0.002, + "loss": 2.3956, + "step": 15620 + }, + { + "epoch": 0.0604212088880642, + "grad_norm": 0.1179899126291275, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 15630 + }, + { + "epoch": 0.06045986609144748, + "grad_norm": 0.10877932608127594, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 15640 + }, + { + "epoch": 0.060498523294830756, + "grad_norm": 0.11503571271896362, + "learning_rate": 0.002, + "loss": 2.3812, + "step": 15650 + }, + { + "epoch": 0.060537180498214035, + "grad_norm": 0.1177992895245552, + "learning_rate": 0.002, + "loss": 2.4062, + "step": 15660 + }, + { + "epoch": 0.060575837701597315, + "grad_norm": 0.14880454540252686, + "learning_rate": 0.002, + "loss": 2.3968, + "step": 15670 + }, + { + "epoch": 0.060614494904980595, + "grad_norm": 0.1386214941740036, + "learning_rate": 0.002, + "loss": 2.384, + "step": 15680 + }, + { + "epoch": 0.060653152108363875, + "grad_norm": 0.1116911992430687, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 15690 + }, + { + "epoch": 0.06069180931174715, + "grad_norm": 0.12330832332372665, + "learning_rate": 0.002, + "loss": 2.41, + "step": 15700 + }, + { + "epoch": 0.06073046651513043, + "grad_norm": 0.11281420290470123, + "learning_rate": 0.002, + "loss": 2.3865, + "step": 15710 + }, + { + "epoch": 0.06076912371851371, + "grad_norm": 0.18309475481510162, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 15720 + }, + { + "epoch": 0.06080778092189699, + "grad_norm": 0.10819264501333237, + "learning_rate": 0.002, + "loss": 2.3907, + "step": 15730 + }, + { + "epoch": 0.06084643812528027, + "grad_norm": 0.10897762328386307, + "learning_rate": 0.002, + "loss": 2.3887, + "step": 15740 + }, + { + "epoch": 0.06088509532866354, + "grad_norm": 0.12542861700057983, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 15750 + }, + { + "epoch": 0.06092375253204682, + "grad_norm": 0.11837367713451385, + "learning_rate": 0.002, + "loss": 2.3838, + "step": 15760 + }, + { + "epoch": 0.0609624097354301, + "grad_norm": 0.11848590523004532, + "learning_rate": 0.002, + "loss": 2.3833, + "step": 15770 + }, + { + "epoch": 0.06100106693881338, + "grad_norm": 0.1487286239862442, + "learning_rate": 0.002, + "loss": 2.3879, + "step": 15780 + }, + { + "epoch": 0.061039724142196654, + "grad_norm": 0.12716227769851685, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 15790 + }, + { + "epoch": 0.061078381345579934, + "grad_norm": 0.1302156001329422, + "learning_rate": 0.002, + "loss": 2.3881, + "step": 15800 + }, + { + "epoch": 0.061117038548963214, + "grad_norm": 0.11784037947654724, + "learning_rate": 0.002, + "loss": 2.402, + "step": 15810 + }, + { + "epoch": 0.061155695752346494, + "grad_norm": 0.11263030022382736, + "learning_rate": 0.002, + "loss": 2.3859, + "step": 15820 + }, + { + "epoch": 0.061194352955729774, + "grad_norm": 0.1392696052789688, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 15830 + }, + { + "epoch": 0.06123301015911305, + "grad_norm": 0.22238633036613464, + "learning_rate": 0.002, + "loss": 2.4209, + "step": 15840 + }, + { + "epoch": 0.061271667362496327, + "grad_norm": 0.1388339102268219, + "learning_rate": 0.002, + "loss": 2.3937, + "step": 15850 + }, + { + "epoch": 0.061310324565879606, + "grad_norm": 0.13323858380317688, + "learning_rate": 0.002, + "loss": 2.3897, + "step": 15860 + }, + { + "epoch": 0.061348981769262886, + "grad_norm": 0.15302272140979767, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 15870 + }, + { + "epoch": 0.061387638972646166, + "grad_norm": 0.17193780839443207, + "learning_rate": 0.002, + "loss": 2.3962, + "step": 15880 + }, + { + "epoch": 0.06142629617602944, + "grad_norm": 0.11914423853158951, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 15890 + }, + { + "epoch": 0.06146495337941272, + "grad_norm": 0.12326352298259735, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 15900 + }, + { + "epoch": 0.061503610582796, + "grad_norm": 0.10487114638090134, + "learning_rate": 0.002, + "loss": 2.3965, + "step": 15910 + }, + { + "epoch": 0.06154226778617928, + "grad_norm": 0.11379309743642807, + "learning_rate": 0.002, + "loss": 2.4036, + "step": 15920 + }, + { + "epoch": 0.06158092498956255, + "grad_norm": 0.1775919795036316, + "learning_rate": 0.002, + "loss": 2.4072, + "step": 15930 + }, + { + "epoch": 0.06161958219294583, + "grad_norm": 0.13293004035949707, + "learning_rate": 0.002, + "loss": 2.3827, + "step": 15940 + }, + { + "epoch": 0.06165823939632911, + "grad_norm": 0.11617787182331085, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 15950 + }, + { + "epoch": 0.06169689659971239, + "grad_norm": 0.1525639146566391, + "learning_rate": 0.002, + "loss": 2.3895, + "step": 15960 + }, + { + "epoch": 0.06173555380309567, + "grad_norm": 0.114244744181633, + "learning_rate": 0.002, + "loss": 2.389, + "step": 15970 + }, + { + "epoch": 0.061774211006478945, + "grad_norm": 0.10915686190128326, + "learning_rate": 0.002, + "loss": 2.3901, + "step": 15980 + }, + { + "epoch": 0.061812868209862225, + "grad_norm": 0.11189400404691696, + "learning_rate": 0.002, + "loss": 2.391, + "step": 15990 + }, + { + "epoch": 0.061851525413245505, + "grad_norm": 0.13927581906318665, + "learning_rate": 0.002, + "loss": 2.3952, + "step": 16000 + }, + { + "epoch": 0.061890182616628785, + "grad_norm": 0.12263130396604538, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 16010 + }, + { + "epoch": 0.06192883982001206, + "grad_norm": 0.14536434412002563, + "learning_rate": 0.002, + "loss": 2.3894, + "step": 16020 + }, + { + "epoch": 0.06196749702339534, + "grad_norm": 0.12297393381595612, + "learning_rate": 0.002, + "loss": 2.4019, + "step": 16030 + }, + { + "epoch": 0.06200615422677862, + "grad_norm": 0.11197999864816666, + "learning_rate": 0.002, + "loss": 2.4026, + "step": 16040 + }, + { + "epoch": 0.0620448114301619, + "grad_norm": 0.0987369641661644, + "learning_rate": 0.002, + "loss": 2.3922, + "step": 16050 + }, + { + "epoch": 0.06208346863354518, + "grad_norm": 0.12647844851016998, + "learning_rate": 0.002, + "loss": 2.3969, + "step": 16060 + }, + { + "epoch": 0.06212212583692845, + "grad_norm": 0.1253211498260498, + "learning_rate": 0.002, + "loss": 2.3797, + "step": 16070 + }, + { + "epoch": 0.06216078304031173, + "grad_norm": 0.17184260487556458, + "learning_rate": 0.002, + "loss": 2.3922, + "step": 16080 + }, + { + "epoch": 0.06219944024369501, + "grad_norm": 0.12006668746471405, + "learning_rate": 0.002, + "loss": 2.3838, + "step": 16090 + }, + { + "epoch": 0.06223809744707829, + "grad_norm": 0.1277729719877243, + "learning_rate": 0.002, + "loss": 2.391, + "step": 16100 + }, + { + "epoch": 0.06227675465046157, + "grad_norm": 0.13591617345809937, + "learning_rate": 0.002, + "loss": 2.3843, + "step": 16110 + }, + { + "epoch": 0.06231541185384484, + "grad_norm": 0.0973956510424614, + "learning_rate": 0.002, + "loss": 2.392, + "step": 16120 + }, + { + "epoch": 0.06235406905722812, + "grad_norm": 0.11103206872940063, + "learning_rate": 0.002, + "loss": 2.3901, + "step": 16130 + }, + { + "epoch": 0.0623927262606114, + "grad_norm": 0.1389506459236145, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 16140 + }, + { + "epoch": 0.06243138346399468, + "grad_norm": 0.12428136169910431, + "learning_rate": 0.002, + "loss": 2.3939, + "step": 16150 + }, + { + "epoch": 0.062470040667377956, + "grad_norm": 0.11365336179733276, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 16160 + }, + { + "epoch": 0.06250869787076124, + "grad_norm": 0.1254616379737854, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 16170 + }, + { + "epoch": 0.06254735507414452, + "grad_norm": 0.1262783706188202, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 16180 + }, + { + "epoch": 0.06258601227752779, + "grad_norm": 0.10713041573762894, + "learning_rate": 0.002, + "loss": 2.3823, + "step": 16190 + }, + { + "epoch": 0.06262466948091107, + "grad_norm": 0.10575401782989502, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 16200 + }, + { + "epoch": 0.06266332668429435, + "grad_norm": 0.11362715810537338, + "learning_rate": 0.002, + "loss": 2.3933, + "step": 16210 + }, + { + "epoch": 0.06270198388767763, + "grad_norm": 0.11229918152093887, + "learning_rate": 0.002, + "loss": 2.3845, + "step": 16220 + }, + { + "epoch": 0.06274064109106091, + "grad_norm": 0.1316540241241455, + "learning_rate": 0.002, + "loss": 2.3891, + "step": 16230 + }, + { + "epoch": 0.06277929829444419, + "grad_norm": 0.10296753793954849, + "learning_rate": 0.002, + "loss": 2.3797, + "step": 16240 + }, + { + "epoch": 0.06281795549782747, + "grad_norm": 0.12756431102752686, + "learning_rate": 0.002, + "loss": 2.3934, + "step": 16250 + }, + { + "epoch": 0.06285661270121075, + "grad_norm": 0.11889172345399857, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 16260 + }, + { + "epoch": 0.06289526990459403, + "grad_norm": 0.1506144106388092, + "learning_rate": 0.002, + "loss": 2.3856, + "step": 16270 + }, + { + "epoch": 0.0629339271079773, + "grad_norm": 0.11867709457874298, + "learning_rate": 0.002, + "loss": 2.3827, + "step": 16280 + }, + { + "epoch": 0.06297258431136057, + "grad_norm": 0.10856199264526367, + "learning_rate": 0.002, + "loss": 2.389, + "step": 16290 + }, + { + "epoch": 0.06301124151474385, + "grad_norm": 0.1227448359131813, + "learning_rate": 0.002, + "loss": 2.399, + "step": 16300 + }, + { + "epoch": 0.06304989871812713, + "grad_norm": 0.11154075711965561, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 16310 + }, + { + "epoch": 0.06308855592151041, + "grad_norm": 0.13388383388519287, + "learning_rate": 0.002, + "loss": 2.3954, + "step": 16320 + }, + { + "epoch": 0.0631272131248937, + "grad_norm": 0.14490358531475067, + "learning_rate": 0.002, + "loss": 2.3833, + "step": 16330 + }, + { + "epoch": 0.06316587032827697, + "grad_norm": 0.12403707951307297, + "learning_rate": 0.002, + "loss": 2.3907, + "step": 16340 + }, + { + "epoch": 0.06320452753166025, + "grad_norm": 0.6374202370643616, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 16350 + }, + { + "epoch": 0.06324318473504353, + "grad_norm": 0.1515141725540161, + "learning_rate": 0.002, + "loss": 2.3872, + "step": 16360 + }, + { + "epoch": 0.0632818419384268, + "grad_norm": 0.12579821050167084, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 16370 + }, + { + "epoch": 0.06332049914181008, + "grad_norm": 0.11145832389593124, + "learning_rate": 0.002, + "loss": 2.4088, + "step": 16380 + }, + { + "epoch": 0.06335915634519336, + "grad_norm": 0.11568121612071991, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 16390 + }, + { + "epoch": 0.06339781354857664, + "grad_norm": 0.1275532841682434, + "learning_rate": 0.002, + "loss": 2.3909, + "step": 16400 + }, + { + "epoch": 0.06343647075195992, + "grad_norm": 0.1336016207933426, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 16410 + }, + { + "epoch": 0.0634751279553432, + "grad_norm": 0.1019514799118042, + "learning_rate": 0.002, + "loss": 2.393, + "step": 16420 + }, + { + "epoch": 0.06351378515872648, + "grad_norm": 0.12241934984922409, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 16430 + }, + { + "epoch": 0.06355244236210976, + "grad_norm": 0.13257557153701782, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 16440 + }, + { + "epoch": 0.06359109956549304, + "grad_norm": 0.12866371870040894, + "learning_rate": 0.002, + "loss": 2.3911, + "step": 16450 + }, + { + "epoch": 0.0636297567688763, + "grad_norm": 0.10742007941007614, + "learning_rate": 0.002, + "loss": 2.3838, + "step": 16460 + }, + { + "epoch": 0.06366841397225959, + "grad_norm": 0.11636551469564438, + "learning_rate": 0.002, + "loss": 2.3968, + "step": 16470 + }, + { + "epoch": 0.06370707117564287, + "grad_norm": 0.11096464097499847, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 16480 + }, + { + "epoch": 0.06374572837902615, + "grad_norm": 0.10752718150615692, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 16490 + }, + { + "epoch": 0.06378438558240943, + "grad_norm": 0.11533330380916595, + "learning_rate": 0.002, + "loss": 2.3954, + "step": 16500 + }, + { + "epoch": 0.0638230427857927, + "grad_norm": 0.14629951119422913, + "learning_rate": 0.002, + "loss": 2.4096, + "step": 16510 + }, + { + "epoch": 0.06386169998917599, + "grad_norm": 0.11980853974819183, + "learning_rate": 0.002, + "loss": 2.3902, + "step": 16520 + }, + { + "epoch": 0.06390035719255927, + "grad_norm": 0.14841502904891968, + "learning_rate": 0.002, + "loss": 2.397, + "step": 16530 + }, + { + "epoch": 0.06393901439594254, + "grad_norm": 0.11659996956586838, + "learning_rate": 0.002, + "loss": 2.3851, + "step": 16540 + }, + { + "epoch": 0.06397767159932582, + "grad_norm": 0.1393616944551468, + "learning_rate": 0.002, + "loss": 2.3913, + "step": 16550 + }, + { + "epoch": 0.06401632880270909, + "grad_norm": 0.12829765677452087, + "learning_rate": 0.002, + "loss": 2.3959, + "step": 16560 + }, + { + "epoch": 0.06405498600609237, + "grad_norm": 0.10121653974056244, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 16570 + }, + { + "epoch": 0.06409364320947565, + "grad_norm": 0.130237877368927, + "learning_rate": 0.002, + "loss": 2.4068, + "step": 16580 + }, + { + "epoch": 0.06413230041285893, + "grad_norm": 0.12805424630641937, + "learning_rate": 0.002, + "loss": 2.4003, + "step": 16590 + }, + { + "epoch": 0.06417095761624221, + "grad_norm": 0.13839778304100037, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 16600 + }, + { + "epoch": 0.06420961481962549, + "grad_norm": 0.10394293069839478, + "learning_rate": 0.002, + "loss": 2.4008, + "step": 16610 + }, + { + "epoch": 0.06424827202300877, + "grad_norm": 0.1352817714214325, + "learning_rate": 0.002, + "loss": 2.3922, + "step": 16620 + }, + { + "epoch": 0.06428692922639205, + "grad_norm": 0.12022780627012253, + "learning_rate": 0.002, + "loss": 2.397, + "step": 16630 + }, + { + "epoch": 0.06432558642977533, + "grad_norm": 0.15451140701770782, + "learning_rate": 0.002, + "loss": 2.391, + "step": 16640 + }, + { + "epoch": 0.0643642436331586, + "grad_norm": 0.12435924261808395, + "learning_rate": 0.002, + "loss": 2.3897, + "step": 16650 + }, + { + "epoch": 0.06440290083654188, + "grad_norm": 0.09402734041213989, + "learning_rate": 0.002, + "loss": 2.3853, + "step": 16660 + }, + { + "epoch": 0.06444155803992516, + "grad_norm": 0.15301144123077393, + "learning_rate": 0.002, + "loss": 2.3899, + "step": 16670 + }, + { + "epoch": 0.06448021524330844, + "grad_norm": 0.12951329350471497, + "learning_rate": 0.002, + "loss": 2.3867, + "step": 16680 + }, + { + "epoch": 0.06451887244669172, + "grad_norm": 0.10923568904399872, + "learning_rate": 0.002, + "loss": 2.3841, + "step": 16690 + }, + { + "epoch": 0.064557529650075, + "grad_norm": 0.10874821990728378, + "learning_rate": 0.002, + "loss": 2.392, + "step": 16700 + }, + { + "epoch": 0.06459618685345828, + "grad_norm": 0.12163813412189484, + "learning_rate": 0.002, + "loss": 2.3847, + "step": 16710 + }, + { + "epoch": 0.06463484405684156, + "grad_norm": 0.11804139614105225, + "learning_rate": 0.002, + "loss": 2.3988, + "step": 16720 + }, + { + "epoch": 0.06467350126022484, + "grad_norm": 0.11225691437721252, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 16730 + }, + { + "epoch": 0.0647121584636081, + "grad_norm": 0.12440955638885498, + "learning_rate": 0.002, + "loss": 2.4036, + "step": 16740 + }, + { + "epoch": 0.06475081566699138, + "grad_norm": 0.11335624009370804, + "learning_rate": 0.002, + "loss": 2.3812, + "step": 16750 + }, + { + "epoch": 0.06478947287037466, + "grad_norm": 0.13928361237049103, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 16760 + }, + { + "epoch": 0.06482813007375794, + "grad_norm": 0.12787578999996185, + "learning_rate": 0.002, + "loss": 2.3784, + "step": 16770 + }, + { + "epoch": 0.06486678727714122, + "grad_norm": 0.12436560541391373, + "learning_rate": 0.002, + "loss": 2.3998, + "step": 16780 + }, + { + "epoch": 0.0649054444805245, + "grad_norm": 0.12330484390258789, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 16790 + }, + { + "epoch": 0.06494410168390778, + "grad_norm": 0.12958766520023346, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 16800 + }, + { + "epoch": 0.06498275888729106, + "grad_norm": 0.12021058052778244, + "learning_rate": 0.002, + "loss": 2.4026, + "step": 16810 + }, + { + "epoch": 0.06502141609067434, + "grad_norm": 0.11794283241033554, + "learning_rate": 0.002, + "loss": 2.373, + "step": 16820 + }, + { + "epoch": 0.06506007329405762, + "grad_norm": 0.1235567033290863, + "learning_rate": 0.002, + "loss": 2.4002, + "step": 16830 + }, + { + "epoch": 0.06509873049744089, + "grad_norm": 0.11046295613050461, + "learning_rate": 0.002, + "loss": 2.397, + "step": 16840 + }, + { + "epoch": 0.06513738770082417, + "grad_norm": 0.10009758919477463, + "learning_rate": 0.002, + "loss": 2.3961, + "step": 16850 + }, + { + "epoch": 0.06517604490420745, + "grad_norm": 0.13078537583351135, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 16860 + }, + { + "epoch": 0.06521470210759073, + "grad_norm": 0.11758499592542648, + "learning_rate": 0.002, + "loss": 2.3928, + "step": 16870 + }, + { + "epoch": 0.06525335931097401, + "grad_norm": 0.12084563821554184, + "learning_rate": 0.002, + "loss": 2.3944, + "step": 16880 + }, + { + "epoch": 0.06529201651435729, + "grad_norm": 0.1334090530872345, + "learning_rate": 0.002, + "loss": 2.402, + "step": 16890 + }, + { + "epoch": 0.06533067371774057, + "grad_norm": 0.1250433474779129, + "learning_rate": 0.002, + "loss": 2.387, + "step": 16900 + }, + { + "epoch": 0.06536933092112385, + "grad_norm": 0.1300637125968933, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 16910 + }, + { + "epoch": 0.06540798812450713, + "grad_norm": 0.09776023030281067, + "learning_rate": 0.002, + "loss": 2.3909, + "step": 16920 + }, + { + "epoch": 0.0654466453278904, + "grad_norm": 0.11161220073699951, + "learning_rate": 0.002, + "loss": 2.387, + "step": 16930 + }, + { + "epoch": 0.06548530253127367, + "grad_norm": 0.16109618544578552, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 16940 + }, + { + "epoch": 0.06552395973465695, + "grad_norm": 0.10100391507148743, + "learning_rate": 0.002, + "loss": 2.3986, + "step": 16950 + }, + { + "epoch": 0.06556261693804023, + "grad_norm": 0.12305068969726562, + "learning_rate": 0.002, + "loss": 2.3967, + "step": 16960 + }, + { + "epoch": 0.06560127414142351, + "grad_norm": 0.1368272602558136, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 16970 + }, + { + "epoch": 0.06563993134480679, + "grad_norm": 0.11573801189661026, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 16980 + }, + { + "epoch": 0.06567858854819007, + "grad_norm": 0.15364046394824982, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 16990 + }, + { + "epoch": 0.06571724575157335, + "grad_norm": 0.1141384094953537, + "learning_rate": 0.002, + "loss": 2.4019, + "step": 17000 + }, + { + "epoch": 0.06575590295495663, + "grad_norm": 0.12331130355596542, + "learning_rate": 0.002, + "loss": 2.392, + "step": 17010 + }, + { + "epoch": 0.0657945601583399, + "grad_norm": 0.10936921834945679, + "learning_rate": 0.002, + "loss": 2.4014, + "step": 17020 + }, + { + "epoch": 0.06583321736172318, + "grad_norm": 0.112908273935318, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 17030 + }, + { + "epoch": 0.06587187456510646, + "grad_norm": 0.10883111506700516, + "learning_rate": 0.002, + "loss": 2.3939, + "step": 17040 + }, + { + "epoch": 0.06591053176848974, + "grad_norm": 0.1455990970134735, + "learning_rate": 0.002, + "loss": 2.3862, + "step": 17050 + }, + { + "epoch": 0.06594918897187302, + "grad_norm": 0.11590580642223358, + "learning_rate": 0.002, + "loss": 2.4026, + "step": 17060 + }, + { + "epoch": 0.0659878461752563, + "grad_norm": 0.10473395138978958, + "learning_rate": 0.002, + "loss": 2.3899, + "step": 17070 + }, + { + "epoch": 0.06602650337863958, + "grad_norm": 0.13969959318637848, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 17080 + }, + { + "epoch": 0.06606516058202286, + "grad_norm": 0.10769690573215485, + "learning_rate": 0.002, + "loss": 2.3938, + "step": 17090 + }, + { + "epoch": 0.06610381778540614, + "grad_norm": 0.10526862740516663, + "learning_rate": 0.002, + "loss": 2.4033, + "step": 17100 + }, + { + "epoch": 0.0661424749887894, + "grad_norm": 0.10747144371271133, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 17110 + }, + { + "epoch": 0.06618113219217268, + "grad_norm": 0.1294855922460556, + "learning_rate": 0.002, + "loss": 2.3993, + "step": 17120 + }, + { + "epoch": 0.06621978939555596, + "grad_norm": 0.13381676375865936, + "learning_rate": 0.002, + "loss": 2.3977, + "step": 17130 + }, + { + "epoch": 0.06625844659893924, + "grad_norm": 0.0924403965473175, + "learning_rate": 0.002, + "loss": 2.3917, + "step": 17140 + }, + { + "epoch": 0.06629710380232252, + "grad_norm": 0.12378328293561935, + "learning_rate": 0.002, + "loss": 2.4003, + "step": 17150 + }, + { + "epoch": 0.0663357610057058, + "grad_norm": 0.11626161634922028, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 17160 + }, + { + "epoch": 0.06637441820908908, + "grad_norm": 0.11259350925683975, + "learning_rate": 0.002, + "loss": 2.3858, + "step": 17170 + }, + { + "epoch": 0.06641307541247236, + "grad_norm": 0.10405412316322327, + "learning_rate": 0.002, + "loss": 2.3879, + "step": 17180 + }, + { + "epoch": 0.06645173261585564, + "grad_norm": 0.11657468974590302, + "learning_rate": 0.002, + "loss": 2.3949, + "step": 17190 + }, + { + "epoch": 0.06649038981923892, + "grad_norm": 0.1102895587682724, + "learning_rate": 0.002, + "loss": 2.3933, + "step": 17200 + }, + { + "epoch": 0.06652904702262219, + "grad_norm": 0.12945479154586792, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 17210 + }, + { + "epoch": 0.06656770422600547, + "grad_norm": 0.1273641437292099, + "learning_rate": 0.002, + "loss": 2.3812, + "step": 17220 + }, + { + "epoch": 0.06660636142938875, + "grad_norm": 0.1088409423828125, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 17230 + }, + { + "epoch": 0.06664501863277203, + "grad_norm": 0.10263818502426147, + "learning_rate": 0.002, + "loss": 2.3848, + "step": 17240 + }, + { + "epoch": 0.06668367583615531, + "grad_norm": 0.12657175958156586, + "learning_rate": 0.002, + "loss": 2.4051, + "step": 17250 + }, + { + "epoch": 0.06672233303953859, + "grad_norm": 0.13320571184158325, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 17260 + }, + { + "epoch": 0.06676099024292187, + "grad_norm": 0.10921880602836609, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 17270 + }, + { + "epoch": 0.06679964744630515, + "grad_norm": 0.1231456771492958, + "learning_rate": 0.002, + "loss": 2.3794, + "step": 17280 + }, + { + "epoch": 0.06683830464968843, + "grad_norm": 0.1348007470369339, + "learning_rate": 0.002, + "loss": 2.4037, + "step": 17290 + }, + { + "epoch": 0.0668769618530717, + "grad_norm": 0.10821889340877533, + "learning_rate": 0.002, + "loss": 2.3881, + "step": 17300 + }, + { + "epoch": 0.06691561905645498, + "grad_norm": 0.1109113097190857, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 17310 + }, + { + "epoch": 0.06695427625983826, + "grad_norm": 0.1150231882929802, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 17320 + }, + { + "epoch": 0.06699293346322154, + "grad_norm": 0.10427354276180267, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 17330 + }, + { + "epoch": 0.06703159066660482, + "grad_norm": 0.10868360102176666, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 17340 + }, + { + "epoch": 0.0670702478699881, + "grad_norm": 0.10528600960969925, + "learning_rate": 0.002, + "loss": 2.3925, + "step": 17350 + }, + { + "epoch": 0.06710890507337137, + "grad_norm": 0.12943026423454285, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 17360 + }, + { + "epoch": 0.06714756227675465, + "grad_norm": 0.11128256469964981, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 17370 + }, + { + "epoch": 0.06718621948013793, + "grad_norm": 0.13703569769859314, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 17380 + }, + { + "epoch": 0.0672248766835212, + "grad_norm": 0.11831612139940262, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 17390 + }, + { + "epoch": 0.06726353388690448, + "grad_norm": 0.13398098945617676, + "learning_rate": 0.002, + "loss": 2.3936, + "step": 17400 + }, + { + "epoch": 0.06730219109028776, + "grad_norm": 0.10794021934270859, + "learning_rate": 0.002, + "loss": 2.3853, + "step": 17410 + }, + { + "epoch": 0.06734084829367104, + "grad_norm": 0.14635983109474182, + "learning_rate": 0.002, + "loss": 2.3847, + "step": 17420 + }, + { + "epoch": 0.06737950549705432, + "grad_norm": 0.12673452496528625, + "learning_rate": 0.002, + "loss": 2.3811, + "step": 17430 + }, + { + "epoch": 0.0674181627004376, + "grad_norm": 0.10255026817321777, + "learning_rate": 0.002, + "loss": 2.3835, + "step": 17440 + }, + { + "epoch": 0.06745681990382088, + "grad_norm": 0.12468329817056656, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 17450 + }, + { + "epoch": 0.06749547710720416, + "grad_norm": 0.11799022555351257, + "learning_rate": 0.002, + "loss": 2.389, + "step": 17460 + }, + { + "epoch": 0.06753413431058744, + "grad_norm": 0.1194017082452774, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 17470 + }, + { + "epoch": 0.0675727915139707, + "grad_norm": 0.11861254274845123, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 17480 + }, + { + "epoch": 0.06761144871735399, + "grad_norm": 0.12071160227060318, + "learning_rate": 0.002, + "loss": 2.3997, + "step": 17490 + }, + { + "epoch": 0.06765010592073727, + "grad_norm": 0.12513267993927002, + "learning_rate": 0.002, + "loss": 2.3934, + "step": 17500 + }, + { + "epoch": 0.06768876312412055, + "grad_norm": 0.13575059175491333, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 17510 + }, + { + "epoch": 0.06772742032750383, + "grad_norm": 0.10630622506141663, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 17520 + }, + { + "epoch": 0.0677660775308871, + "grad_norm": 0.10673921555280685, + "learning_rate": 0.002, + "loss": 2.3989, + "step": 17530 + }, + { + "epoch": 0.06780473473427039, + "grad_norm": 0.14430269598960876, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 17540 + }, + { + "epoch": 0.06784339193765367, + "grad_norm": 0.144783154129982, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 17550 + }, + { + "epoch": 0.06788204914103695, + "grad_norm": 0.1043168380856514, + "learning_rate": 0.002, + "loss": 2.3838, + "step": 17560 + }, + { + "epoch": 0.06792070634442023, + "grad_norm": 0.12369021773338318, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 17570 + }, + { + "epoch": 0.06795936354780349, + "grad_norm": 0.10070551931858063, + "learning_rate": 0.002, + "loss": 2.388, + "step": 17580 + }, + { + "epoch": 0.06799802075118677, + "grad_norm": 0.13216231763362885, + "learning_rate": 0.002, + "loss": 2.3996, + "step": 17590 + }, + { + "epoch": 0.06803667795457005, + "grad_norm": 0.10784224420785904, + "learning_rate": 0.002, + "loss": 2.3898, + "step": 17600 + }, + { + "epoch": 0.06807533515795333, + "grad_norm": 0.1283050924539566, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 17610 + }, + { + "epoch": 0.06811399236133661, + "grad_norm": 0.12371443212032318, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 17620 + }, + { + "epoch": 0.06815264956471989, + "grad_norm": 0.13803911209106445, + "learning_rate": 0.002, + "loss": 2.3898, + "step": 17630 + }, + { + "epoch": 0.06819130676810317, + "grad_norm": 0.10237491130828857, + "learning_rate": 0.002, + "loss": 2.3932, + "step": 17640 + }, + { + "epoch": 0.06822996397148645, + "grad_norm": 0.11532513797283173, + "learning_rate": 0.002, + "loss": 2.3939, + "step": 17650 + }, + { + "epoch": 0.06826862117486973, + "grad_norm": 0.14253586530685425, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 17660 + }, + { + "epoch": 0.068307278378253, + "grad_norm": 0.14113810658454895, + "learning_rate": 0.002, + "loss": 2.395, + "step": 17670 + }, + { + "epoch": 0.06834593558163628, + "grad_norm": 0.1139352023601532, + "learning_rate": 0.002, + "loss": 2.3939, + "step": 17680 + }, + { + "epoch": 0.06838459278501956, + "grad_norm": 0.09490272402763367, + "learning_rate": 0.002, + "loss": 2.3967, + "step": 17690 + }, + { + "epoch": 0.06842324998840284, + "grad_norm": 0.1430560201406479, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 17700 + }, + { + "epoch": 0.06846190719178612, + "grad_norm": 0.12731243669986725, + "learning_rate": 0.002, + "loss": 2.3915, + "step": 17710 + }, + { + "epoch": 0.0685005643951694, + "grad_norm": 0.12364219129085541, + "learning_rate": 0.002, + "loss": 2.3925, + "step": 17720 + }, + { + "epoch": 0.06853922159855268, + "grad_norm": 0.12770256400108337, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 17730 + }, + { + "epoch": 0.06857787880193596, + "grad_norm": 0.13035711646080017, + "learning_rate": 0.002, + "loss": 2.3817, + "step": 17740 + }, + { + "epoch": 0.06861653600531924, + "grad_norm": 0.1317276805639267, + "learning_rate": 0.002, + "loss": 2.3914, + "step": 17750 + }, + { + "epoch": 0.0686551932087025, + "grad_norm": 0.11247614026069641, + "learning_rate": 0.002, + "loss": 2.376, + "step": 17760 + }, + { + "epoch": 0.06869385041208578, + "grad_norm": 0.10378215461969376, + "learning_rate": 0.002, + "loss": 2.3852, + "step": 17770 + }, + { + "epoch": 0.06873250761546906, + "grad_norm": 0.1200520247220993, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 17780 + }, + { + "epoch": 0.06877116481885234, + "grad_norm": 0.11229659616947174, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 17790 + }, + { + "epoch": 0.06880982202223562, + "grad_norm": 0.11006788909435272, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 17800 + }, + { + "epoch": 0.0688484792256189, + "grad_norm": 0.11270550638437271, + "learning_rate": 0.002, + "loss": 2.3864, + "step": 17810 + }, + { + "epoch": 0.06888713642900218, + "grad_norm": 0.11977479606866837, + "learning_rate": 0.002, + "loss": 2.3838, + "step": 17820 + }, + { + "epoch": 0.06892579363238546, + "grad_norm": 0.11547745764255524, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 17830 + }, + { + "epoch": 0.06896445083576874, + "grad_norm": 0.13601486384868622, + "learning_rate": 0.002, + "loss": 2.3968, + "step": 17840 + }, + { + "epoch": 0.06900310803915202, + "grad_norm": 0.10460959374904633, + "learning_rate": 0.002, + "loss": 2.371, + "step": 17850 + }, + { + "epoch": 0.06904176524253529, + "grad_norm": 0.10961294919252396, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 17860 + }, + { + "epoch": 0.06908042244591857, + "grad_norm": 0.1438174694776535, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 17870 + }, + { + "epoch": 0.06911907964930185, + "grad_norm": 0.1140781119465828, + "learning_rate": 0.002, + "loss": 2.3883, + "step": 17880 + }, + { + "epoch": 0.06915773685268513, + "grad_norm": 0.12019556760787964, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 17890 + }, + { + "epoch": 0.06919639405606841, + "grad_norm": 0.11578171700239182, + "learning_rate": 0.002, + "loss": 2.392, + "step": 17900 + }, + { + "epoch": 0.06923505125945169, + "grad_norm": 0.10989246517419815, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 17910 + }, + { + "epoch": 0.06927370846283497, + "grad_norm": 0.12332355231046677, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 17920 + }, + { + "epoch": 0.06931236566621825, + "grad_norm": 0.1192823126912117, + "learning_rate": 0.002, + "loss": 2.3796, + "step": 17930 + }, + { + "epoch": 0.06935102286960153, + "grad_norm": 0.14072348177433014, + "learning_rate": 0.002, + "loss": 2.3855, + "step": 17940 + }, + { + "epoch": 0.0693896800729848, + "grad_norm": 0.13811370730400085, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 17950 + }, + { + "epoch": 0.06942833727636807, + "grad_norm": 0.09993165731430054, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 17960 + }, + { + "epoch": 0.06946699447975135, + "grad_norm": 0.11219470202922821, + "learning_rate": 0.002, + "loss": 2.3887, + "step": 17970 + }, + { + "epoch": 0.06950565168313463, + "grad_norm": 0.30075761675834656, + "learning_rate": 0.002, + "loss": 2.395, + "step": 17980 + }, + { + "epoch": 0.06954430888651791, + "grad_norm": 0.1219414696097374, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 17990 + }, + { + "epoch": 0.0695829660899012, + "grad_norm": 0.1216287612915039, + "learning_rate": 0.002, + "loss": 2.3913, + "step": 18000 + }, + { + "epoch": 0.06962162329328447, + "grad_norm": 0.12025120854377747, + "learning_rate": 0.002, + "loss": 2.3864, + "step": 18010 + }, + { + "epoch": 0.06966028049666775, + "grad_norm": 0.12074564397335052, + "learning_rate": 0.002, + "loss": 2.3916, + "step": 18020 + }, + { + "epoch": 0.06969893770005103, + "grad_norm": 0.097334124147892, + "learning_rate": 0.002, + "loss": 2.3883, + "step": 18030 + }, + { + "epoch": 0.0697375949034343, + "grad_norm": 0.12103715538978577, + "learning_rate": 0.002, + "loss": 2.3791, + "step": 18040 + }, + { + "epoch": 0.06977625210681758, + "grad_norm": 0.12247490882873535, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 18050 + }, + { + "epoch": 0.06981490931020086, + "grad_norm": 0.12782634794712067, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 18060 + }, + { + "epoch": 0.06985356651358414, + "grad_norm": 0.12588289380073547, + "learning_rate": 0.002, + "loss": 2.3981, + "step": 18070 + }, + { + "epoch": 0.06989222371696742, + "grad_norm": 0.11344539374113083, + "learning_rate": 0.002, + "loss": 2.3856, + "step": 18080 + }, + { + "epoch": 0.0699308809203507, + "grad_norm": 0.11065513640642166, + "learning_rate": 0.002, + "loss": 2.3883, + "step": 18090 + }, + { + "epoch": 0.06996953812373398, + "grad_norm": 0.11105602234601974, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 18100 + }, + { + "epoch": 0.07000819532711726, + "grad_norm": 0.12619774043560028, + "learning_rate": 0.002, + "loss": 2.3977, + "step": 18110 + }, + { + "epoch": 0.07004685253050054, + "grad_norm": 0.1004246398806572, + "learning_rate": 0.002, + "loss": 2.3947, + "step": 18120 + }, + { + "epoch": 0.0700855097338838, + "grad_norm": 0.12445876747369766, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 18130 + }, + { + "epoch": 0.07012416693726709, + "grad_norm": 0.11129860579967499, + "learning_rate": 0.002, + "loss": 2.3959, + "step": 18140 + }, + { + "epoch": 0.07016282414065036, + "grad_norm": 0.11270508915185928, + "learning_rate": 0.002, + "loss": 2.37, + "step": 18150 + }, + { + "epoch": 0.07020148134403364, + "grad_norm": 0.10359988361597061, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 18160 + }, + { + "epoch": 0.07024013854741692, + "grad_norm": 0.11541220545768738, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 18170 + }, + { + "epoch": 0.0702787957508002, + "grad_norm": 0.1319669783115387, + "learning_rate": 0.002, + "loss": 2.3914, + "step": 18180 + }, + { + "epoch": 0.07031745295418348, + "grad_norm": 0.12286441773176193, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 18190 + }, + { + "epoch": 0.07035611015756676, + "grad_norm": 0.12172552198171616, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 18200 + }, + { + "epoch": 0.07039476736095004, + "grad_norm": 0.11214584857225418, + "learning_rate": 0.002, + "loss": 2.3964, + "step": 18210 + }, + { + "epoch": 0.07043342456433332, + "grad_norm": 0.14144858717918396, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 18220 + }, + { + "epoch": 0.07047208176771659, + "grad_norm": 0.1375395655632019, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 18230 + }, + { + "epoch": 0.07051073897109987, + "grad_norm": 0.1334933638572693, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 18240 + }, + { + "epoch": 0.07054939617448315, + "grad_norm": 0.11723175644874573, + "learning_rate": 0.002, + "loss": 2.3907, + "step": 18250 + }, + { + "epoch": 0.07058805337786643, + "grad_norm": 0.1289069652557373, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 18260 + }, + { + "epoch": 0.07062671058124971, + "grad_norm": 0.10577051341533661, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 18270 + }, + { + "epoch": 0.07066536778463299, + "grad_norm": 0.09948733448982239, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 18280 + }, + { + "epoch": 0.07070402498801627, + "grad_norm": 0.12849846482276917, + "learning_rate": 0.002, + "loss": 2.3949, + "step": 18290 + }, + { + "epoch": 0.07074268219139955, + "grad_norm": 0.10710543394088745, + "learning_rate": 0.002, + "loss": 2.391, + "step": 18300 + }, + { + "epoch": 0.07078133939478283, + "grad_norm": 0.12671582400798798, + "learning_rate": 0.002, + "loss": 2.3914, + "step": 18310 + }, + { + "epoch": 0.0708199965981661, + "grad_norm": 0.12102266401052475, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 18320 + }, + { + "epoch": 0.07085865380154938, + "grad_norm": 0.12042495608329773, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 18330 + }, + { + "epoch": 0.07089731100493266, + "grad_norm": 0.10737396776676178, + "learning_rate": 0.002, + "loss": 2.3856, + "step": 18340 + }, + { + "epoch": 0.07093596820831594, + "grad_norm": 0.11897099018096924, + "learning_rate": 0.002, + "loss": 2.394, + "step": 18350 + }, + { + "epoch": 0.07097462541169922, + "grad_norm": 0.10696249455213547, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 18360 + }, + { + "epoch": 0.0710132826150825, + "grad_norm": 0.12204062193632126, + "learning_rate": 0.002, + "loss": 2.3878, + "step": 18370 + }, + { + "epoch": 0.07105193981846578, + "grad_norm": 0.09256469458341599, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 18380 + }, + { + "epoch": 0.07109059702184906, + "grad_norm": 0.12206672877073288, + "learning_rate": 0.002, + "loss": 2.3939, + "step": 18390 + }, + { + "epoch": 0.07112925422523234, + "grad_norm": 0.10235872119665146, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 18400 + }, + { + "epoch": 0.0711679114286156, + "grad_norm": 0.11231094598770142, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 18410 + }, + { + "epoch": 0.07120656863199888, + "grad_norm": 0.15380600094795227, + "learning_rate": 0.002, + "loss": 2.3886, + "step": 18420 + }, + { + "epoch": 0.07124522583538216, + "grad_norm": 0.11261408776044846, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 18430 + }, + { + "epoch": 0.07128388303876544, + "grad_norm": 0.11684536933898926, + "learning_rate": 0.002, + "loss": 2.3979, + "step": 18440 + }, + { + "epoch": 0.07132254024214872, + "grad_norm": 0.12610866129398346, + "learning_rate": 0.002, + "loss": 2.3922, + "step": 18450 + }, + { + "epoch": 0.071361197445532, + "grad_norm": 0.11578212678432465, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 18460 + }, + { + "epoch": 0.07139985464891528, + "grad_norm": 0.12588560581207275, + "learning_rate": 0.002, + "loss": 2.3944, + "step": 18470 + }, + { + "epoch": 0.07143851185229856, + "grad_norm": 0.10358493030071259, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 18480 + }, + { + "epoch": 0.07147716905568184, + "grad_norm": 0.10771401971578598, + "learning_rate": 0.002, + "loss": 2.3879, + "step": 18490 + }, + { + "epoch": 0.07151582625906511, + "grad_norm": 0.5038573145866394, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 18500 + }, + { + "epoch": 0.07155448346244839, + "grad_norm": 0.11553716659545898, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 18510 + }, + { + "epoch": 0.07159314066583167, + "grad_norm": 0.10773370414972305, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 18520 + }, + { + "epoch": 0.07163179786921495, + "grad_norm": 0.11228738725185394, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 18530 + }, + { + "epoch": 0.07167045507259823, + "grad_norm": 0.12033947557210922, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 18540 + }, + { + "epoch": 0.07170911227598151, + "grad_norm": 0.10592275112867355, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 18550 + }, + { + "epoch": 0.07174776947936479, + "grad_norm": 0.11138555407524109, + "learning_rate": 0.002, + "loss": 2.3874, + "step": 18560 + }, + { + "epoch": 0.07178642668274807, + "grad_norm": 0.1372644156217575, + "learning_rate": 0.002, + "loss": 2.3902, + "step": 18570 + }, + { + "epoch": 0.07182508388613135, + "grad_norm": 0.5497710704803467, + "learning_rate": 0.002, + "loss": 2.3934, + "step": 18580 + }, + { + "epoch": 0.07186374108951463, + "grad_norm": 0.1122540831565857, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 18590 + }, + { + "epoch": 0.07190239829289789, + "grad_norm": 0.12664495408535004, + "learning_rate": 0.002, + "loss": 2.3852, + "step": 18600 + }, + { + "epoch": 0.07194105549628117, + "grad_norm": 0.10843063145875931, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 18610 + }, + { + "epoch": 0.07197971269966445, + "grad_norm": 0.11812781542539597, + "learning_rate": 0.002, + "loss": 2.3912, + "step": 18620 + }, + { + "epoch": 0.07201836990304773, + "grad_norm": 0.10957538336515427, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 18630 + }, + { + "epoch": 0.07205702710643101, + "grad_norm": 0.11784140020608902, + "learning_rate": 0.002, + "loss": 2.378, + "step": 18640 + }, + { + "epoch": 0.07209568430981429, + "grad_norm": 0.12863163650035858, + "learning_rate": 0.002, + "loss": 2.3956, + "step": 18650 + }, + { + "epoch": 0.07213434151319757, + "grad_norm": 0.1179456114768982, + "learning_rate": 0.002, + "loss": 2.377, + "step": 18660 + }, + { + "epoch": 0.07217299871658085, + "grad_norm": 0.12140975892543793, + "learning_rate": 0.002, + "loss": 2.3956, + "step": 18670 + }, + { + "epoch": 0.07221165591996413, + "grad_norm": 0.10543189942836761, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 18680 + }, + { + "epoch": 0.0722503131233474, + "grad_norm": 0.1302126944065094, + "learning_rate": 0.002, + "loss": 2.4016, + "step": 18690 + }, + { + "epoch": 0.07228897032673068, + "grad_norm": 0.10657652467489243, + "learning_rate": 0.002, + "loss": 2.3917, + "step": 18700 + }, + { + "epoch": 0.07232762753011396, + "grad_norm": 0.13003182411193848, + "learning_rate": 0.002, + "loss": 2.368, + "step": 18710 + }, + { + "epoch": 0.07236628473349724, + "grad_norm": 0.11850715428590775, + "learning_rate": 0.002, + "loss": 2.3906, + "step": 18720 + }, + { + "epoch": 0.07240494193688052, + "grad_norm": 0.1252976953983307, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 18730 + }, + { + "epoch": 0.0724435991402638, + "grad_norm": 0.10448279976844788, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 18740 + }, + { + "epoch": 0.07248225634364708, + "grad_norm": 0.0973781943321228, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 18750 + }, + { + "epoch": 0.07252091354703036, + "grad_norm": 0.11515215784311295, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 18760 + }, + { + "epoch": 0.07255957075041364, + "grad_norm": 0.12835046648979187, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 18770 + }, + { + "epoch": 0.0725982279537969, + "grad_norm": 0.09265491366386414, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 18780 + }, + { + "epoch": 0.07263688515718018, + "grad_norm": 0.12700672447681427, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 18790 + }, + { + "epoch": 0.07267554236056346, + "grad_norm": 0.10078644007444382, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 18800 + }, + { + "epoch": 0.07271419956394674, + "grad_norm": 0.1234968900680542, + "learning_rate": 0.002, + "loss": 2.3886, + "step": 18810 + }, + { + "epoch": 0.07275285676733002, + "grad_norm": 0.10350227355957031, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 18820 + }, + { + "epoch": 0.0727915139707133, + "grad_norm": 0.10174798220396042, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 18830 + }, + { + "epoch": 0.07283017117409658, + "grad_norm": 0.12602569162845612, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 18840 + }, + { + "epoch": 0.07286882837747986, + "grad_norm": 0.14451903104782104, + "learning_rate": 0.002, + "loss": 2.3904, + "step": 18850 + }, + { + "epoch": 0.07290748558086314, + "grad_norm": 0.10017625987529755, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 18860 + }, + { + "epoch": 0.07294614278424642, + "grad_norm": 0.10255925357341766, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 18870 + }, + { + "epoch": 0.07298479998762969, + "grad_norm": 0.11854524165391922, + "learning_rate": 0.002, + "loss": 2.3797, + "step": 18880 + }, + { + "epoch": 0.07302345719101297, + "grad_norm": 0.0942547544836998, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 18890 + }, + { + "epoch": 0.07306211439439625, + "grad_norm": 0.12876085937023163, + "learning_rate": 0.002, + "loss": 2.378, + "step": 18900 + }, + { + "epoch": 0.07310077159777953, + "grad_norm": 0.1679118126630783, + "learning_rate": 0.002, + "loss": 2.3923, + "step": 18910 + }, + { + "epoch": 0.07313942880116281, + "grad_norm": 0.12198128551244736, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 18920 + }, + { + "epoch": 0.07317808600454609, + "grad_norm": 0.10201894491910934, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 18930 + }, + { + "epoch": 0.07321674320792937, + "grad_norm": 0.11405906826257706, + "learning_rate": 0.002, + "loss": 2.3974, + "step": 18940 + }, + { + "epoch": 0.07325540041131265, + "grad_norm": 0.11819473654031754, + "learning_rate": 0.002, + "loss": 2.3838, + "step": 18950 + }, + { + "epoch": 0.07329405761469593, + "grad_norm": 0.09402453154325485, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 18960 + }, + { + "epoch": 0.0733327148180792, + "grad_norm": 0.13503125309944153, + "learning_rate": 0.002, + "loss": 2.3855, + "step": 18970 + }, + { + "epoch": 0.07337137202146247, + "grad_norm": 0.11238619685173035, + "learning_rate": 0.002, + "loss": 2.3873, + "step": 18980 + }, + { + "epoch": 0.07341002922484575, + "grad_norm": 0.15545395016670227, + "learning_rate": 0.002, + "loss": 2.392, + "step": 18990 + }, + { + "epoch": 0.07344868642822903, + "grad_norm": 0.13186931610107422, + "learning_rate": 0.002, + "loss": 2.3852, + "step": 19000 + }, + { + "epoch": 0.07348734363161231, + "grad_norm": 0.10326071828603745, + "learning_rate": 0.002, + "loss": 2.3857, + "step": 19010 + }, + { + "epoch": 0.0735260008349956, + "grad_norm": 0.11343759298324585, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 19020 + }, + { + "epoch": 0.07356465803837887, + "grad_norm": 0.13040369749069214, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 19030 + }, + { + "epoch": 0.07360331524176215, + "grad_norm": 0.1187443807721138, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 19040 + }, + { + "epoch": 0.07364197244514543, + "grad_norm": 0.09433488547801971, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 19050 + }, + { + "epoch": 0.0736806296485287, + "grad_norm": 0.12784084677696228, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 19060 + }, + { + "epoch": 0.07371928685191198, + "grad_norm": 0.14179980754852295, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 19070 + }, + { + "epoch": 0.07375794405529526, + "grad_norm": 0.11444813013076782, + "learning_rate": 0.002, + "loss": 2.383, + "step": 19080 + }, + { + "epoch": 0.07379660125867854, + "grad_norm": 0.11803031712770462, + "learning_rate": 0.002, + "loss": 2.3851, + "step": 19090 + }, + { + "epoch": 0.07383525846206182, + "grad_norm": 0.13434816896915436, + "learning_rate": 0.002, + "loss": 2.3878, + "step": 19100 + }, + { + "epoch": 0.0738739156654451, + "grad_norm": 0.11736548691987991, + "learning_rate": 0.002, + "loss": 2.4008, + "step": 19110 + }, + { + "epoch": 0.07391257286882838, + "grad_norm": 0.10972438007593155, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 19120 + }, + { + "epoch": 0.07395123007221166, + "grad_norm": 0.09769081324338913, + "learning_rate": 0.002, + "loss": 2.3922, + "step": 19130 + }, + { + "epoch": 0.07398988727559494, + "grad_norm": 0.16008608043193817, + "learning_rate": 0.002, + "loss": 2.384, + "step": 19140 + }, + { + "epoch": 0.0740285444789782, + "grad_norm": 0.21446815133094788, + "learning_rate": 0.002, + "loss": 2.379, + "step": 19150 + }, + { + "epoch": 0.07406720168236149, + "grad_norm": 0.10447430610656738, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 19160 + }, + { + "epoch": 0.07410585888574477, + "grad_norm": 0.12838858366012573, + "learning_rate": 0.002, + "loss": 2.3784, + "step": 19170 + }, + { + "epoch": 0.07414451608912805, + "grad_norm": 0.11898154765367508, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 19180 + }, + { + "epoch": 0.07418317329251133, + "grad_norm": 0.12039216607809067, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 19190 + }, + { + "epoch": 0.0742218304958946, + "grad_norm": 0.10423491895198822, + "learning_rate": 0.002, + "loss": 2.3833, + "step": 19200 + }, + { + "epoch": 0.07426048769927789, + "grad_norm": 0.11539363116025925, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 19210 + }, + { + "epoch": 0.07429914490266117, + "grad_norm": 0.12009965628385544, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 19220 + }, + { + "epoch": 0.07433780210604445, + "grad_norm": 0.09702429175376892, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 19230 + }, + { + "epoch": 0.07437645930942773, + "grad_norm": 0.11230161041021347, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 19240 + }, + { + "epoch": 0.07441511651281099, + "grad_norm": 0.11779018491506577, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 19250 + }, + { + "epoch": 0.07445377371619427, + "grad_norm": 0.10018932819366455, + "learning_rate": 0.002, + "loss": 2.3866, + "step": 19260 + }, + { + "epoch": 0.07449243091957755, + "grad_norm": 0.10820461064577103, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 19270 + }, + { + "epoch": 0.07453108812296083, + "grad_norm": 0.10591787844896317, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 19280 + }, + { + "epoch": 0.07456974532634411, + "grad_norm": 0.10695263743400574, + "learning_rate": 0.002, + "loss": 2.386, + "step": 19290 + }, + { + "epoch": 0.07460840252972739, + "grad_norm": 0.10405397415161133, + "learning_rate": 0.002, + "loss": 2.3945, + "step": 19300 + }, + { + "epoch": 0.07464705973311067, + "grad_norm": 0.121544249355793, + "learning_rate": 0.002, + "loss": 2.3863, + "step": 19310 + }, + { + "epoch": 0.07468571693649395, + "grad_norm": 0.1174372136592865, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 19320 + }, + { + "epoch": 0.07472437413987723, + "grad_norm": 0.11053747683763504, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 19330 + }, + { + "epoch": 0.0747630313432605, + "grad_norm": 0.10836398601531982, + "learning_rate": 0.002, + "loss": 2.4037, + "step": 19340 + }, + { + "epoch": 0.07480168854664378, + "grad_norm": 0.11245734244585037, + "learning_rate": 0.002, + "loss": 2.4008, + "step": 19350 + }, + { + "epoch": 0.07484034575002706, + "grad_norm": 0.12424355745315552, + "learning_rate": 0.002, + "loss": 2.3923, + "step": 19360 + }, + { + "epoch": 0.07487900295341034, + "grad_norm": 0.12254510074853897, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 19370 + }, + { + "epoch": 0.07491766015679362, + "grad_norm": 0.10089680552482605, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 19380 + }, + { + "epoch": 0.0749563173601769, + "grad_norm": 0.11235611885786057, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 19390 + }, + { + "epoch": 0.07499497456356018, + "grad_norm": 0.12315353006124496, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 19400 + }, + { + "epoch": 0.07503363176694346, + "grad_norm": 0.10509892553091049, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 19410 + }, + { + "epoch": 0.07507228897032674, + "grad_norm": 0.12031447142362595, + "learning_rate": 0.002, + "loss": 2.3917, + "step": 19420 + }, + { + "epoch": 0.07511094617371, + "grad_norm": 0.15881817042827606, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 19430 + }, + { + "epoch": 0.07514960337709328, + "grad_norm": 0.11184466630220413, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 19440 + }, + { + "epoch": 0.07518826058047656, + "grad_norm": 0.12158004194498062, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 19450 + }, + { + "epoch": 0.07522691778385984, + "grad_norm": 0.12418025732040405, + "learning_rate": 0.002, + "loss": 2.3916, + "step": 19460 + }, + { + "epoch": 0.07526557498724312, + "grad_norm": 0.16225135326385498, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 19470 + }, + { + "epoch": 0.0753042321906264, + "grad_norm": 0.11569126695394516, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 19480 + }, + { + "epoch": 0.07534288939400968, + "grad_norm": 0.10566077381372452, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 19490 + }, + { + "epoch": 0.07538154659739296, + "grad_norm": 0.1229267567396164, + "learning_rate": 0.002, + "loss": 2.3944, + "step": 19500 + }, + { + "epoch": 0.07542020380077624, + "grad_norm": 0.13145491480827332, + "learning_rate": 0.002, + "loss": 2.3895, + "step": 19510 + }, + { + "epoch": 0.07545886100415951, + "grad_norm": 0.10700027644634247, + "learning_rate": 0.002, + "loss": 2.357, + "step": 19520 + }, + { + "epoch": 0.07549751820754279, + "grad_norm": 0.09952887147665024, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 19530 + }, + { + "epoch": 0.07553617541092607, + "grad_norm": 0.1233973428606987, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 19540 + }, + { + "epoch": 0.07557483261430935, + "grad_norm": 0.13303498923778534, + "learning_rate": 0.002, + "loss": 2.3857, + "step": 19550 + }, + { + "epoch": 0.07561348981769263, + "grad_norm": 0.12933650612831116, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 19560 + }, + { + "epoch": 0.07565214702107591, + "grad_norm": 0.10663676261901855, + "learning_rate": 0.002, + "loss": 2.3876, + "step": 19570 + }, + { + "epoch": 0.07569080422445919, + "grad_norm": 0.10642411559820175, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 19580 + }, + { + "epoch": 0.07572946142784247, + "grad_norm": 0.11113075911998749, + "learning_rate": 0.002, + "loss": 2.3881, + "step": 19590 + }, + { + "epoch": 0.07576811863122575, + "grad_norm": 0.11284121870994568, + "learning_rate": 0.002, + "loss": 2.3904, + "step": 19600 + }, + { + "epoch": 0.07580677583460903, + "grad_norm": 0.14630155265331268, + "learning_rate": 0.002, + "loss": 2.3907, + "step": 19610 + }, + { + "epoch": 0.0758454330379923, + "grad_norm": 0.12654368579387665, + "learning_rate": 0.002, + "loss": 2.3823, + "step": 19620 + }, + { + "epoch": 0.07588409024137557, + "grad_norm": 0.13922430574893951, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 19630 + }, + { + "epoch": 0.07592274744475885, + "grad_norm": 0.12410923093557358, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 19640 + }, + { + "epoch": 0.07596140464814213, + "grad_norm": 0.11142056435346603, + "learning_rate": 0.002, + "loss": 2.3881, + "step": 19650 + }, + { + "epoch": 0.07600006185152541, + "grad_norm": 0.11221200227737427, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 19660 + }, + { + "epoch": 0.0760387190549087, + "grad_norm": 0.12378339469432831, + "learning_rate": 0.002, + "loss": 2.377, + "step": 19670 + }, + { + "epoch": 0.07607737625829197, + "grad_norm": 0.11832631379365921, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 19680 + }, + { + "epoch": 0.07611603346167525, + "grad_norm": 0.1267072558403015, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 19690 + }, + { + "epoch": 0.07615469066505853, + "grad_norm": 0.14015980064868927, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 19700 + }, + { + "epoch": 0.0761933478684418, + "grad_norm": 0.11285578459501266, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 19710 + }, + { + "epoch": 0.07623200507182508, + "grad_norm": 0.09963081032037735, + "learning_rate": 0.002, + "loss": 2.3887, + "step": 19720 + }, + { + "epoch": 0.07627066227520836, + "grad_norm": 0.13140037655830383, + "learning_rate": 0.002, + "loss": 2.3891, + "step": 19730 + }, + { + "epoch": 0.07630931947859164, + "grad_norm": 0.11250842362642288, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 19740 + }, + { + "epoch": 0.07634797668197492, + "grad_norm": 0.12486717104911804, + "learning_rate": 0.002, + "loss": 2.3925, + "step": 19750 + }, + { + "epoch": 0.0763866338853582, + "grad_norm": 0.12867963314056396, + "learning_rate": 0.002, + "loss": 2.3791, + "step": 19760 + }, + { + "epoch": 0.07642529108874148, + "grad_norm": 0.10435399413108826, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 19770 + }, + { + "epoch": 0.07646394829212476, + "grad_norm": 0.12695525586605072, + "learning_rate": 0.002, + "loss": 2.3815, + "step": 19780 + }, + { + "epoch": 0.07650260549550804, + "grad_norm": 0.1107652485370636, + "learning_rate": 0.002, + "loss": 2.3819, + "step": 19790 + }, + { + "epoch": 0.0765412626988913, + "grad_norm": 0.14867202937602997, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 19800 + }, + { + "epoch": 0.07657991990227458, + "grad_norm": 0.10924647003412247, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 19810 + }, + { + "epoch": 0.07661857710565786, + "grad_norm": 0.1168089285492897, + "learning_rate": 0.002, + "loss": 2.3811, + "step": 19820 + }, + { + "epoch": 0.07665723430904114, + "grad_norm": 0.12944529950618744, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 19830 + }, + { + "epoch": 0.07669589151242442, + "grad_norm": 0.12026828527450562, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 19840 + }, + { + "epoch": 0.0767345487158077, + "grad_norm": 0.10217534750699997, + "learning_rate": 0.002, + "loss": 2.39, + "step": 19850 + }, + { + "epoch": 0.07677320591919098, + "grad_norm": 0.11586353927850723, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 19860 + }, + { + "epoch": 0.07681186312257426, + "grad_norm": 0.09830108284950256, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 19870 + }, + { + "epoch": 0.07685052032595754, + "grad_norm": 0.12570495903491974, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 19880 + }, + { + "epoch": 0.07688917752934082, + "grad_norm": 0.11000153422355652, + "learning_rate": 0.002, + "loss": 2.3871, + "step": 19890 + }, + { + "epoch": 0.07692783473272409, + "grad_norm": 0.10306507349014282, + "learning_rate": 0.002, + "loss": 2.3868, + "step": 19900 + }, + { + "epoch": 0.07696649193610737, + "grad_norm": 0.11710715293884277, + "learning_rate": 0.002, + "loss": 2.386, + "step": 19910 + }, + { + "epoch": 0.07700514913949065, + "grad_norm": 0.12766055762767792, + "learning_rate": 0.002, + "loss": 2.4044, + "step": 19920 + }, + { + "epoch": 0.07704380634287393, + "grad_norm": 0.10362893342971802, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 19930 + }, + { + "epoch": 0.07708246354625721, + "grad_norm": 0.11141230165958405, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 19940 + }, + { + "epoch": 0.07712112074964049, + "grad_norm": 0.11341533064842224, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 19950 + }, + { + "epoch": 0.07715977795302377, + "grad_norm": 0.10786847025156021, + "learning_rate": 0.002, + "loss": 2.3796, + "step": 19960 + }, + { + "epoch": 0.07719843515640705, + "grad_norm": 0.09927070885896683, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 19970 + }, + { + "epoch": 0.07723709235979033, + "grad_norm": 0.11835852265357971, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 19980 + }, + { + "epoch": 0.0772757495631736, + "grad_norm": 0.1367740035057068, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 19990 + }, + { + "epoch": 0.07731440676655688, + "grad_norm": 0.10808567702770233, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 20000 + }, + { + "epoch": 0.07735306396994016, + "grad_norm": 0.11161024123430252, + "learning_rate": 0.002, + "loss": 2.3827, + "step": 20010 + }, + { + "epoch": 0.07739172117332344, + "grad_norm": 0.10708248615264893, + "learning_rate": 0.002, + "loss": 2.3821, + "step": 20020 + }, + { + "epoch": 0.07743037837670672, + "grad_norm": 0.1220446527004242, + "learning_rate": 0.002, + "loss": 2.359, + "step": 20030 + }, + { + "epoch": 0.07746903558009, + "grad_norm": 0.11131221801042557, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 20040 + }, + { + "epoch": 0.07750769278347328, + "grad_norm": 0.14677201211452484, + "learning_rate": 0.002, + "loss": 2.3867, + "step": 20050 + }, + { + "epoch": 0.07754634998685656, + "grad_norm": 0.11652742326259613, + "learning_rate": 0.002, + "loss": 2.3848, + "step": 20060 + }, + { + "epoch": 0.07758500719023984, + "grad_norm": 0.1017276793718338, + "learning_rate": 0.002, + "loss": 2.3834, + "step": 20070 + }, + { + "epoch": 0.0776236643936231, + "grad_norm": 0.11347658932209015, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 20080 + }, + { + "epoch": 0.07766232159700638, + "grad_norm": 0.15333813428878784, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 20090 + }, + { + "epoch": 0.07770097880038966, + "grad_norm": 0.10666406154632568, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 20100 + }, + { + "epoch": 0.07773963600377294, + "grad_norm": 0.115127794444561, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 20110 + }, + { + "epoch": 0.07777829320715622, + "grad_norm": 0.10911829769611359, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 20120 + }, + { + "epoch": 0.0778169504105395, + "grad_norm": 0.12019343674182892, + "learning_rate": 0.002, + "loss": 2.3975, + "step": 20130 + }, + { + "epoch": 0.07785560761392278, + "grad_norm": 0.12885767221450806, + "learning_rate": 0.002, + "loss": 2.3825, + "step": 20140 + }, + { + "epoch": 0.07789426481730606, + "grad_norm": 0.11365063488483429, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 20150 + }, + { + "epoch": 0.07793292202068934, + "grad_norm": 0.12837842106819153, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 20160 + }, + { + "epoch": 0.0779715792240726, + "grad_norm": 0.10112041234970093, + "learning_rate": 0.002, + "loss": 2.382, + "step": 20170 + }, + { + "epoch": 0.07801023642745589, + "grad_norm": 0.11926894634962082, + "learning_rate": 0.002, + "loss": 2.3904, + "step": 20180 + }, + { + "epoch": 0.07804889363083917, + "grad_norm": 0.13643495738506317, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 20190 + }, + { + "epoch": 0.07808755083422245, + "grad_norm": 0.12283652275800705, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 20200 + }, + { + "epoch": 0.07812620803760573, + "grad_norm": 0.1476060301065445, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 20210 + }, + { + "epoch": 0.078164865240989, + "grad_norm": 0.12512938678264618, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 20220 + }, + { + "epoch": 0.07820352244437229, + "grad_norm": 0.10858415067195892, + "learning_rate": 0.002, + "loss": 2.3908, + "step": 20230 + }, + { + "epoch": 0.07824217964775557, + "grad_norm": 0.09735766798257828, + "learning_rate": 0.002, + "loss": 2.374, + "step": 20240 + }, + { + "epoch": 0.07828083685113885, + "grad_norm": 0.13043081760406494, + "learning_rate": 0.002, + "loss": 2.3928, + "step": 20250 + }, + { + "epoch": 0.07831949405452213, + "grad_norm": 0.1227329671382904, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 20260 + }, + { + "epoch": 0.07835815125790539, + "grad_norm": 0.1388980597257614, + "learning_rate": 0.002, + "loss": 2.38, + "step": 20270 + }, + { + "epoch": 0.07839680846128867, + "grad_norm": 0.10539897531270981, + "learning_rate": 0.002, + "loss": 2.388, + "step": 20280 + }, + { + "epoch": 0.07843546566467195, + "grad_norm": 0.10891108214855194, + "learning_rate": 0.002, + "loss": 2.3833, + "step": 20290 + }, + { + "epoch": 0.07847412286805523, + "grad_norm": 0.11661384254693985, + "learning_rate": 0.002, + "loss": 2.3859, + "step": 20300 + }, + { + "epoch": 0.07851278007143851, + "grad_norm": 0.10258234292268753, + "learning_rate": 0.002, + "loss": 2.3994, + "step": 20310 + }, + { + "epoch": 0.07855143727482179, + "grad_norm": 0.11731996387243271, + "learning_rate": 0.002, + "loss": 2.3898, + "step": 20320 + }, + { + "epoch": 0.07859009447820507, + "grad_norm": 0.1236349493265152, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 20330 + }, + { + "epoch": 0.07862875168158835, + "grad_norm": 0.11339427530765533, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 20340 + }, + { + "epoch": 0.07866740888497163, + "grad_norm": 0.10862452536821365, + "learning_rate": 0.002, + "loss": 2.3856, + "step": 20350 + }, + { + "epoch": 0.0787060660883549, + "grad_norm": 0.14031213521957397, + "learning_rate": 0.002, + "loss": 2.373, + "step": 20360 + }, + { + "epoch": 0.07874472329173818, + "grad_norm": 0.1003158912062645, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 20370 + }, + { + "epoch": 0.07878338049512146, + "grad_norm": 0.14314481616020203, + "learning_rate": 0.002, + "loss": 2.3869, + "step": 20380 + }, + { + "epoch": 0.07882203769850474, + "grad_norm": 0.1109648272395134, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 20390 + }, + { + "epoch": 0.07886069490188802, + "grad_norm": 0.1257660835981369, + "learning_rate": 0.002, + "loss": 2.3901, + "step": 20400 + }, + { + "epoch": 0.0788993521052713, + "grad_norm": 0.10887817293405533, + "learning_rate": 0.002, + "loss": 2.3842, + "step": 20410 + }, + { + "epoch": 0.07893800930865458, + "grad_norm": 0.13619078695774078, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 20420 + }, + { + "epoch": 0.07897666651203786, + "grad_norm": 0.10961946099996567, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 20430 + }, + { + "epoch": 0.07901532371542114, + "grad_norm": 0.10585474222898483, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 20440 + }, + { + "epoch": 0.0790539809188044, + "grad_norm": 0.10117260366678238, + "learning_rate": 0.002, + "loss": 2.3979, + "step": 20450 + }, + { + "epoch": 0.07909263812218768, + "grad_norm": 0.2224205732345581, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 20460 + }, + { + "epoch": 0.07913129532557096, + "grad_norm": 0.1441737562417984, + "learning_rate": 0.002, + "loss": 2.3879, + "step": 20470 + }, + { + "epoch": 0.07916995252895424, + "grad_norm": 0.09196843951940536, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 20480 + }, + { + "epoch": 0.07920860973233752, + "grad_norm": 0.11055919528007507, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 20490 + }, + { + "epoch": 0.0792472669357208, + "grad_norm": 0.10769325494766235, + "learning_rate": 0.002, + "loss": 2.374, + "step": 20500 + }, + { + "epoch": 0.07928592413910408, + "grad_norm": 0.10402721911668777, + "learning_rate": 0.002, + "loss": 2.362, + "step": 20510 + }, + { + "epoch": 0.07932458134248736, + "grad_norm": 0.11635389924049377, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 20520 + }, + { + "epoch": 0.07936323854587064, + "grad_norm": 0.10108437389135361, + "learning_rate": 0.002, + "loss": 2.3873, + "step": 20530 + }, + { + "epoch": 0.07940189574925392, + "grad_norm": 0.1175009086728096, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 20540 + }, + { + "epoch": 0.07944055295263719, + "grad_norm": 0.10868023335933685, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 20550 + }, + { + "epoch": 0.07947921015602047, + "grad_norm": 0.11248388141393661, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 20560 + }, + { + "epoch": 0.07951786735940375, + "grad_norm": 0.10098931193351746, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 20570 + }, + { + "epoch": 0.07955652456278703, + "grad_norm": 0.1326024979352951, + "learning_rate": 0.002, + "loss": 2.388, + "step": 20580 + }, + { + "epoch": 0.07959518176617031, + "grad_norm": 0.1198132261633873, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 20590 + }, + { + "epoch": 0.07963383896955359, + "grad_norm": 0.10643836855888367, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 20600 + }, + { + "epoch": 0.07967249617293687, + "grad_norm": 0.11332546919584274, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 20610 + }, + { + "epoch": 0.07971115337632015, + "grad_norm": 0.11369810998439789, + "learning_rate": 0.002, + "loss": 2.3858, + "step": 20620 + }, + { + "epoch": 0.07974981057970343, + "grad_norm": 0.1119384691119194, + "learning_rate": 0.002, + "loss": 2.3825, + "step": 20630 + }, + { + "epoch": 0.0797884677830867, + "grad_norm": 0.1512312889099121, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 20640 + }, + { + "epoch": 0.07982712498646997, + "grad_norm": 0.10424868762493134, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 20650 + }, + { + "epoch": 0.07986578218985325, + "grad_norm": 0.4727657437324524, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 20660 + }, + { + "epoch": 0.07990443939323653, + "grad_norm": 0.10331464558839798, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 20670 + }, + { + "epoch": 0.07994309659661981, + "grad_norm": 0.10814424604177475, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 20680 + }, + { + "epoch": 0.0799817538000031, + "grad_norm": 0.10030341893434525, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 20690 + }, + { + "epoch": 0.08002041100338637, + "grad_norm": 0.1187674030661583, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 20700 + }, + { + "epoch": 0.08005906820676965, + "grad_norm": 0.12968982756137848, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 20710 + }, + { + "epoch": 0.08009772541015293, + "grad_norm": 0.11431638896465302, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 20720 + }, + { + "epoch": 0.0801363826135362, + "grad_norm": 0.10570517927408218, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 20730 + }, + { + "epoch": 0.08017503981691948, + "grad_norm": 0.10601756721735, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 20740 + }, + { + "epoch": 0.08021369702030276, + "grad_norm": 0.12536196410655975, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 20750 + }, + { + "epoch": 0.08025235422368604, + "grad_norm": 0.11333110183477402, + "learning_rate": 0.002, + "loss": 2.376, + "step": 20760 + }, + { + "epoch": 0.08029101142706932, + "grad_norm": 0.11019251495599747, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 20770 + }, + { + "epoch": 0.0803296686304526, + "grad_norm": 0.10505245625972748, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 20780 + }, + { + "epoch": 0.08036832583383588, + "grad_norm": 0.12179173529148102, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 20790 + }, + { + "epoch": 0.08040698303721916, + "grad_norm": 0.11226214468479156, + "learning_rate": 0.002, + "loss": 2.3851, + "step": 20800 + }, + { + "epoch": 0.08044564024060244, + "grad_norm": 0.11181282997131348, + "learning_rate": 0.002, + "loss": 2.3841, + "step": 20810 + }, + { + "epoch": 0.0804842974439857, + "grad_norm": 0.11984371393918991, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 20820 + }, + { + "epoch": 0.08052295464736899, + "grad_norm": 0.12782707810401917, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 20830 + }, + { + "epoch": 0.08056161185075227, + "grad_norm": 0.10775158554315567, + "learning_rate": 0.002, + "loss": 2.376, + "step": 20840 + }, + { + "epoch": 0.08060026905413555, + "grad_norm": 0.11608025431632996, + "learning_rate": 0.002, + "loss": 2.3896, + "step": 20850 + }, + { + "epoch": 0.08063892625751883, + "grad_norm": 0.12389634549617767, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 20860 + }, + { + "epoch": 0.0806775834609021, + "grad_norm": 0.10256687551736832, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 20870 + }, + { + "epoch": 0.08071624066428539, + "grad_norm": 0.12604756653308868, + "learning_rate": 0.002, + "loss": 2.3946, + "step": 20880 + }, + { + "epoch": 0.08075489786766867, + "grad_norm": 0.12273551523685455, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 20890 + }, + { + "epoch": 0.08079355507105195, + "grad_norm": 0.11249291896820068, + "learning_rate": 0.002, + "loss": 2.3885, + "step": 20900 + }, + { + "epoch": 0.08083221227443523, + "grad_norm": 0.10646382719278336, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 20910 + }, + { + "epoch": 0.08087086947781849, + "grad_norm": 0.11489584296941757, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 20920 + }, + { + "epoch": 0.08090952668120177, + "grad_norm": 0.11377881467342377, + "learning_rate": 0.002, + "loss": 2.3819, + "step": 20930 + }, + { + "epoch": 0.08094818388458505, + "grad_norm": 0.10648036748170853, + "learning_rate": 0.002, + "loss": 2.387, + "step": 20940 + }, + { + "epoch": 0.08098684108796833, + "grad_norm": 0.11882845312356949, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 20950 + }, + { + "epoch": 0.08102549829135161, + "grad_norm": 0.1182943731546402, + "learning_rate": 0.002, + "loss": 2.371, + "step": 20960 + }, + { + "epoch": 0.08106415549473489, + "grad_norm": 0.10753022134304047, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 20970 + }, + { + "epoch": 0.08110281269811817, + "grad_norm": 0.11233070492744446, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 20980 + }, + { + "epoch": 0.08114146990150145, + "grad_norm": 0.1118980348110199, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 20990 + }, + { + "epoch": 0.08118012710488473, + "grad_norm": 0.11435777693986893, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 21000 + }, + { + "epoch": 0.081218784308268, + "grad_norm": 0.11617391556501389, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 21010 + }, + { + "epoch": 0.08125744151165128, + "grad_norm": 0.09662947803735733, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 21020 + }, + { + "epoch": 0.08129609871503456, + "grad_norm": 0.1029861569404602, + "learning_rate": 0.002, + "loss": 2.395, + "step": 21030 + }, + { + "epoch": 0.08133475591841784, + "grad_norm": 0.10826099663972855, + "learning_rate": 0.002, + "loss": 2.3811, + "step": 21040 + }, + { + "epoch": 0.08137341312180112, + "grad_norm": 0.14398784935474396, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 21050 + }, + { + "epoch": 0.0814120703251844, + "grad_norm": 0.11851483583450317, + "learning_rate": 0.002, + "loss": 2.3787, + "step": 21060 + }, + { + "epoch": 0.08145072752856768, + "grad_norm": 0.10036487132310867, + "learning_rate": 0.002, + "loss": 2.3835, + "step": 21070 + }, + { + "epoch": 0.08148938473195096, + "grad_norm": 0.11577463150024414, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 21080 + }, + { + "epoch": 0.08152804193533424, + "grad_norm": 0.10647506266832352, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 21090 + }, + { + "epoch": 0.0815666991387175, + "grad_norm": 0.10589282959699631, + "learning_rate": 0.002, + "loss": 2.3823, + "step": 21100 + }, + { + "epoch": 0.08160535634210078, + "grad_norm": 0.12870502471923828, + "learning_rate": 0.002, + "loss": 2.3835, + "step": 21110 + }, + { + "epoch": 0.08164401354548406, + "grad_norm": 0.14981471002101898, + "learning_rate": 0.002, + "loss": 2.3903, + "step": 21120 + }, + { + "epoch": 0.08168267074886734, + "grad_norm": 0.12884055078029633, + "learning_rate": 0.002, + "loss": 2.375, + "step": 21130 + }, + { + "epoch": 0.08172132795225062, + "grad_norm": 0.1172378733754158, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 21140 + }, + { + "epoch": 0.0817599851556339, + "grad_norm": 0.11848177760839462, + "learning_rate": 0.002, + "loss": 2.3984, + "step": 21150 + }, + { + "epoch": 0.08179864235901718, + "grad_norm": 0.35388875007629395, + "learning_rate": 0.002, + "loss": 2.3874, + "step": 21160 + }, + { + "epoch": 0.08183729956240046, + "grad_norm": 0.09712839871644974, + "learning_rate": 0.002, + "loss": 2.3827, + "step": 21170 + }, + { + "epoch": 0.08187595676578374, + "grad_norm": 0.0974545031785965, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 21180 + }, + { + "epoch": 0.08191461396916701, + "grad_norm": 0.1533958464860916, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 21190 + }, + { + "epoch": 0.08195327117255029, + "grad_norm": 0.1025494858622551, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 21200 + }, + { + "epoch": 0.08199192837593357, + "grad_norm": 0.10949676483869553, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 21210 + }, + { + "epoch": 0.08203058557931685, + "grad_norm": 0.10791637003421783, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 21220 + }, + { + "epoch": 0.08206924278270013, + "grad_norm": 0.10763223469257355, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 21230 + }, + { + "epoch": 0.08210789998608341, + "grad_norm": 0.12494704872369766, + "learning_rate": 0.002, + "loss": 2.372, + "step": 21240 + }, + { + "epoch": 0.08214655718946669, + "grad_norm": 0.11371438950300217, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 21250 + }, + { + "epoch": 0.08218521439284997, + "grad_norm": 0.11540467292070389, + "learning_rate": 0.002, + "loss": 2.383, + "step": 21260 + }, + { + "epoch": 0.08222387159623325, + "grad_norm": 0.1142238900065422, + "learning_rate": 0.002, + "loss": 2.378, + "step": 21270 + }, + { + "epoch": 0.08226252879961653, + "grad_norm": 0.13156285881996155, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 21280 + }, + { + "epoch": 0.0823011860029998, + "grad_norm": 0.11045181006193161, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 21290 + }, + { + "epoch": 0.08233984320638307, + "grad_norm": 0.10513743758201599, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 21300 + }, + { + "epoch": 0.08237850040976635, + "grad_norm": 0.08865063637495041, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 21310 + }, + { + "epoch": 0.08241715761314963, + "grad_norm": 0.13477951288223267, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 21320 + }, + { + "epoch": 0.08245581481653291, + "grad_norm": 0.1116044670343399, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 21330 + }, + { + "epoch": 0.08249447201991619, + "grad_norm": 0.10824161022901535, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 21340 + }, + { + "epoch": 0.08253312922329947, + "grad_norm": 0.10181602835655212, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 21350 + }, + { + "epoch": 0.08257178642668275, + "grad_norm": 0.11037593334913254, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 21360 + }, + { + "epoch": 0.08261044363006603, + "grad_norm": 0.1049826517701149, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 21370 + }, + { + "epoch": 0.0826491008334493, + "grad_norm": 0.11907021701335907, + "learning_rate": 0.002, + "loss": 2.3867, + "step": 21380 + }, + { + "epoch": 0.08268775803683258, + "grad_norm": 0.10949509590864182, + "learning_rate": 0.002, + "loss": 2.3784, + "step": 21390 + }, + { + "epoch": 0.08272641524021586, + "grad_norm": 0.16441144049167633, + "learning_rate": 0.002, + "loss": 2.3817, + "step": 21400 + }, + { + "epoch": 0.08276507244359914, + "grad_norm": 0.10310792177915573, + "learning_rate": 0.002, + "loss": 2.3903, + "step": 21410 + }, + { + "epoch": 0.08280372964698242, + "grad_norm": 0.11294974386692047, + "learning_rate": 0.002, + "loss": 2.373, + "step": 21420 + }, + { + "epoch": 0.0828423868503657, + "grad_norm": 0.09527469426393509, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 21430 + }, + { + "epoch": 0.08288104405374898, + "grad_norm": 0.10629330575466156, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 21440 + }, + { + "epoch": 0.08291970125713226, + "grad_norm": 0.11222105473279953, + "learning_rate": 0.002, + "loss": 2.3844, + "step": 21450 + }, + { + "epoch": 0.08295835846051554, + "grad_norm": 0.11182720214128494, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 21460 + }, + { + "epoch": 0.0829970156638988, + "grad_norm": 0.10755111277103424, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 21470 + }, + { + "epoch": 0.08303567286728208, + "grad_norm": 0.10462916642427444, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 21480 + }, + { + "epoch": 0.08307433007066536, + "grad_norm": 0.10938968509435654, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 21490 + }, + { + "epoch": 0.08311298727404864, + "grad_norm": 0.19250869750976562, + "learning_rate": 0.002, + "loss": 2.39, + "step": 21500 + }, + { + "epoch": 0.08315164447743192, + "grad_norm": 0.10719188302755356, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 21510 + }, + { + "epoch": 0.0831903016808152, + "grad_norm": 0.1125466376543045, + "learning_rate": 0.002, + "loss": 2.361, + "step": 21520 + }, + { + "epoch": 0.08322895888419848, + "grad_norm": 0.10615231096744537, + "learning_rate": 0.002, + "loss": 2.3969, + "step": 21530 + }, + { + "epoch": 0.08326761608758176, + "grad_norm": 0.1436234563589096, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 21540 + }, + { + "epoch": 0.08330627329096504, + "grad_norm": 0.1111617162823677, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 21550 + }, + { + "epoch": 0.08334493049434832, + "grad_norm": 0.11619032174348831, + "learning_rate": 0.002, + "loss": 2.3845, + "step": 21560 + }, + { + "epoch": 0.08338358769773159, + "grad_norm": 0.15463846921920776, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 21570 + }, + { + "epoch": 0.08342224490111487, + "grad_norm": 0.11931682378053665, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 21580 + }, + { + "epoch": 0.08346090210449815, + "grad_norm": 0.12608924508094788, + "learning_rate": 0.002, + "loss": 2.3916, + "step": 21590 + }, + { + "epoch": 0.08349955930788143, + "grad_norm": 0.11588284373283386, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 21600 + }, + { + "epoch": 0.08353821651126471, + "grad_norm": 0.1295454353094101, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 21610 + }, + { + "epoch": 0.08357687371464799, + "grad_norm": 0.10524514317512512, + "learning_rate": 0.002, + "loss": 2.372, + "step": 21620 + }, + { + "epoch": 0.08361553091803127, + "grad_norm": 0.10964474081993103, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 21630 + }, + { + "epoch": 0.08365418812141455, + "grad_norm": 0.11840105056762695, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 21640 + }, + { + "epoch": 0.08369284532479783, + "grad_norm": 0.12307754904031754, + "learning_rate": 0.002, + "loss": 2.3889, + "step": 21650 + }, + { + "epoch": 0.0837315025281811, + "grad_norm": 0.09594234824180603, + "learning_rate": 0.002, + "loss": 2.3894, + "step": 21660 + }, + { + "epoch": 0.08377015973156438, + "grad_norm": 0.1343754529953003, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 21670 + }, + { + "epoch": 0.08380881693494766, + "grad_norm": 0.10107145458459854, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 21680 + }, + { + "epoch": 0.08384747413833094, + "grad_norm": 0.10919588059186935, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 21690 + }, + { + "epoch": 0.08388613134171422, + "grad_norm": 0.10154515504837036, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 21700 + }, + { + "epoch": 0.0839247885450975, + "grad_norm": 0.11246238648891449, + "learning_rate": 0.002, + "loss": 2.3888, + "step": 21710 + }, + { + "epoch": 0.08396344574848078, + "grad_norm": 0.10734483599662781, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 21720 + }, + { + "epoch": 0.08400210295186405, + "grad_norm": 0.11789402365684509, + "learning_rate": 0.002, + "loss": 2.3939, + "step": 21730 + }, + { + "epoch": 0.08404076015524733, + "grad_norm": 0.10355214029550552, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 21740 + }, + { + "epoch": 0.0840794173586306, + "grad_norm": 0.10136032849550247, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 21750 + }, + { + "epoch": 0.08411807456201388, + "grad_norm": 0.1416572630405426, + "learning_rate": 0.002, + "loss": 2.3941, + "step": 21760 + }, + { + "epoch": 0.08415673176539716, + "grad_norm": 0.11044862866401672, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 21770 + }, + { + "epoch": 0.08419538896878044, + "grad_norm": 0.12113585323095322, + "learning_rate": 0.002, + "loss": 2.3937, + "step": 21780 + }, + { + "epoch": 0.08423404617216372, + "grad_norm": 0.09596196562051773, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 21790 + }, + { + "epoch": 0.084272703375547, + "grad_norm": 0.11969374120235443, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 21800 + }, + { + "epoch": 0.08431136057893028, + "grad_norm": 0.11508044600486755, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 21810 + }, + { + "epoch": 0.08435001778231356, + "grad_norm": 0.10925393551588058, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 21820 + }, + { + "epoch": 0.08438867498569684, + "grad_norm": 0.12044740468263626, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 21830 + }, + { + "epoch": 0.0844273321890801, + "grad_norm": 0.10578937828540802, + "learning_rate": 0.002, + "loss": 2.3847, + "step": 21840 + }, + { + "epoch": 0.08446598939246339, + "grad_norm": 0.1018751934170723, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 21850 + }, + { + "epoch": 0.08450464659584667, + "grad_norm": 0.11004742980003357, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 21860 + }, + { + "epoch": 0.08454330379922995, + "grad_norm": 0.12035630643367767, + "learning_rate": 0.002, + "loss": 2.3819, + "step": 21870 + }, + { + "epoch": 0.08458196100261323, + "grad_norm": 0.14227500557899475, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 21880 + }, + { + "epoch": 0.0846206182059965, + "grad_norm": 0.1038215234875679, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 21890 + }, + { + "epoch": 0.08465927540937979, + "grad_norm": 0.10981255024671555, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 21900 + }, + { + "epoch": 0.08469793261276307, + "grad_norm": 0.13036499917507172, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 21910 + }, + { + "epoch": 0.08473658981614635, + "grad_norm": 0.10847773402929306, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 21920 + }, + { + "epoch": 0.08477524701952963, + "grad_norm": 0.11355654150247574, + "learning_rate": 0.002, + "loss": 2.4061, + "step": 21930 + }, + { + "epoch": 0.08481390422291289, + "grad_norm": 0.10847753286361694, + "learning_rate": 0.002, + "loss": 2.3797, + "step": 21940 + }, + { + "epoch": 0.08485256142629617, + "grad_norm": 0.1272539645433426, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 21950 + }, + { + "epoch": 0.08489121862967945, + "grad_norm": 0.10342312604188919, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 21960 + }, + { + "epoch": 0.08492987583306273, + "grad_norm": 0.12353398650884628, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 21970 + }, + { + "epoch": 0.08496853303644601, + "grad_norm": 0.1458020955324173, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 21980 + }, + { + "epoch": 0.08500719023982929, + "grad_norm": 0.11380862444639206, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 21990 + }, + { + "epoch": 0.08504584744321257, + "grad_norm": 0.12858135998249054, + "learning_rate": 0.002, + "loss": 2.391, + "step": 22000 + }, + { + "epoch": 0.08508450464659585, + "grad_norm": 0.10241194814443588, + "learning_rate": 0.002, + "loss": 2.3877, + "step": 22010 + }, + { + "epoch": 0.08512316184997913, + "grad_norm": 0.10861173272132874, + "learning_rate": 0.002, + "loss": 2.3798, + "step": 22020 + }, + { + "epoch": 0.0851618190533624, + "grad_norm": 0.09653986990451813, + "learning_rate": 0.002, + "loss": 2.3825, + "step": 22030 + }, + { + "epoch": 0.08520047625674568, + "grad_norm": 0.1258213073015213, + "learning_rate": 0.002, + "loss": 2.379, + "step": 22040 + }, + { + "epoch": 0.08523913346012896, + "grad_norm": 0.11620525270700455, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 22050 + }, + { + "epoch": 0.08527779066351224, + "grad_norm": 0.10526707023382187, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 22060 + }, + { + "epoch": 0.08531644786689552, + "grad_norm": 0.11238943785429001, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 22070 + }, + { + "epoch": 0.0853551050702788, + "grad_norm": 0.1280938982963562, + "learning_rate": 0.002, + "loss": 2.3755, + "step": 22080 + }, + { + "epoch": 0.08539376227366208, + "grad_norm": 0.09797575324773788, + "learning_rate": 0.002, + "loss": 2.3991, + "step": 22090 + }, + { + "epoch": 0.08543241947704536, + "grad_norm": 0.1478355973958969, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 22100 + }, + { + "epoch": 0.08547107668042864, + "grad_norm": 0.11716704070568085, + "learning_rate": 0.002, + "loss": 2.3911, + "step": 22110 + }, + { + "epoch": 0.0855097338838119, + "grad_norm": 0.1012004092335701, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 22120 + }, + { + "epoch": 0.08554839108719518, + "grad_norm": 0.09938769787549973, + "learning_rate": 0.002, + "loss": 2.3866, + "step": 22130 + }, + { + "epoch": 0.08558704829057846, + "grad_norm": 0.12744750082492828, + "learning_rate": 0.002, + "loss": 2.3817, + "step": 22140 + }, + { + "epoch": 0.08562570549396174, + "grad_norm": 0.14632226526737213, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 22150 + }, + { + "epoch": 0.08566436269734502, + "grad_norm": 0.10066097229719162, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 22160 + }, + { + "epoch": 0.0857030199007283, + "grad_norm": 0.10367601364850998, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 22170 + }, + { + "epoch": 0.08574167710411158, + "grad_norm": 0.11595974117517471, + "learning_rate": 0.002, + "loss": 2.383, + "step": 22180 + }, + { + "epoch": 0.08578033430749486, + "grad_norm": 0.13266971707344055, + "learning_rate": 0.002, + "loss": 2.377, + "step": 22190 + }, + { + "epoch": 0.08581899151087814, + "grad_norm": 0.10529126226902008, + "learning_rate": 0.002, + "loss": 2.3815, + "step": 22200 + }, + { + "epoch": 0.08585764871426141, + "grad_norm": 0.10715759545564651, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 22210 + }, + { + "epoch": 0.08589630591764469, + "grad_norm": 0.11852872371673584, + "learning_rate": 0.002, + "loss": 2.3844, + "step": 22220 + }, + { + "epoch": 0.08593496312102797, + "grad_norm": 0.11074218899011612, + "learning_rate": 0.002, + "loss": 2.3853, + "step": 22230 + }, + { + "epoch": 0.08597362032441125, + "grad_norm": 0.1398768573999405, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 22240 + }, + { + "epoch": 0.08601227752779453, + "grad_norm": 0.12655308842658997, + "learning_rate": 0.002, + "loss": 2.3886, + "step": 22250 + }, + { + "epoch": 0.08605093473117781, + "grad_norm": 0.13221947848796844, + "learning_rate": 0.002, + "loss": 2.365, + "step": 22260 + }, + { + "epoch": 0.08608959193456109, + "grad_norm": 0.12071418017148972, + "learning_rate": 0.002, + "loss": 2.393, + "step": 22270 + }, + { + "epoch": 0.08612824913794437, + "grad_norm": 0.09518468379974365, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 22280 + }, + { + "epoch": 0.08616690634132765, + "grad_norm": 0.11835268884897232, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 22290 + }, + { + "epoch": 0.08620556354471093, + "grad_norm": 0.11249658465385437, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 22300 + }, + { + "epoch": 0.0862442207480942, + "grad_norm": 0.12210391461849213, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 22310 + }, + { + "epoch": 0.08628287795147747, + "grad_norm": 0.10461321473121643, + "learning_rate": 0.002, + "loss": 2.3923, + "step": 22320 + }, + { + "epoch": 0.08632153515486075, + "grad_norm": 0.10224251449108124, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 22330 + }, + { + "epoch": 0.08636019235824403, + "grad_norm": 0.12130418419837952, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 22340 + }, + { + "epoch": 0.08639884956162731, + "grad_norm": 0.1214376837015152, + "learning_rate": 0.002, + "loss": 2.3913, + "step": 22350 + }, + { + "epoch": 0.0864375067650106, + "grad_norm": 0.11018189042806625, + "learning_rate": 0.002, + "loss": 2.376, + "step": 22360 + }, + { + "epoch": 0.08647616396839387, + "grad_norm": 0.11936061084270477, + "learning_rate": 0.002, + "loss": 2.39, + "step": 22370 + }, + { + "epoch": 0.08651482117177715, + "grad_norm": 0.10982014983892441, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 22380 + }, + { + "epoch": 0.08655347837516043, + "grad_norm": 0.11572841554880142, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 22390 + }, + { + "epoch": 0.0865921355785437, + "grad_norm": 0.11071977764368057, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 22400 + }, + { + "epoch": 0.08663079278192698, + "grad_norm": 0.1090395525097847, + "learning_rate": 0.002, + "loss": 2.3885, + "step": 22410 + }, + { + "epoch": 0.08666944998531026, + "grad_norm": 0.11444620043039322, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 22420 + }, + { + "epoch": 0.08670810718869354, + "grad_norm": 0.11480669677257538, + "learning_rate": 0.002, + "loss": 2.3794, + "step": 22430 + }, + { + "epoch": 0.08674676439207682, + "grad_norm": 0.11158749461174011, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 22440 + }, + { + "epoch": 0.0867854215954601, + "grad_norm": 0.10097053647041321, + "learning_rate": 0.002, + "loss": 2.3953, + "step": 22450 + }, + { + "epoch": 0.08682407879884338, + "grad_norm": 0.12639379501342773, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 22460 + }, + { + "epoch": 0.08686273600222666, + "grad_norm": 0.12102729082107544, + "learning_rate": 0.002, + "loss": 2.3895, + "step": 22470 + }, + { + "epoch": 0.08690139320560994, + "grad_norm": 0.1355907917022705, + "learning_rate": 0.002, + "loss": 2.3871, + "step": 22480 + }, + { + "epoch": 0.0869400504089932, + "grad_norm": 0.11741629987955093, + "learning_rate": 0.002, + "loss": 2.3796, + "step": 22490 + }, + { + "epoch": 0.08697870761237649, + "grad_norm": 0.11757150292396545, + "learning_rate": 0.002, + "loss": 2.3884, + "step": 22500 + }, + { + "epoch": 0.08701736481575977, + "grad_norm": 0.12271060794591904, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 22510 + }, + { + "epoch": 0.08705602201914305, + "grad_norm": 0.10545120388269424, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 22520 + }, + { + "epoch": 0.08709467922252632, + "grad_norm": 0.1205563023686409, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 22530 + }, + { + "epoch": 0.0871333364259096, + "grad_norm": 0.10538642108440399, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 22540 + }, + { + "epoch": 0.08717199362929288, + "grad_norm": 0.13118794560432434, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 22550 + }, + { + "epoch": 0.08721065083267616, + "grad_norm": 0.11891195923089981, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 22560 + }, + { + "epoch": 0.08724930803605944, + "grad_norm": 0.12343083322048187, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 22570 + }, + { + "epoch": 0.08728796523944272, + "grad_norm": 0.10861959308385849, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 22580 + }, + { + "epoch": 0.08732662244282599, + "grad_norm": 0.11839080601930618, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 22590 + }, + { + "epoch": 0.08736527964620927, + "grad_norm": 0.1273038387298584, + "learning_rate": 0.002, + "loss": 2.398, + "step": 22600 + }, + { + "epoch": 0.08740393684959255, + "grad_norm": 0.1179809421300888, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 22610 + }, + { + "epoch": 0.08744259405297583, + "grad_norm": 0.27098774909973145, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 22620 + }, + { + "epoch": 0.08748125125635911, + "grad_norm": 0.12713086605072021, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 22630 + }, + { + "epoch": 0.08751990845974239, + "grad_norm": 0.11043170839548111, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 22640 + }, + { + "epoch": 0.08755856566312567, + "grad_norm": 0.11204589903354645, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 22650 + }, + { + "epoch": 0.08759722286650895, + "grad_norm": 0.11137638986110687, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 22660 + }, + { + "epoch": 0.08763588006989223, + "grad_norm": 0.12163740396499634, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 22670 + }, + { + "epoch": 0.0876745372732755, + "grad_norm": 0.14378522336483002, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 22680 + }, + { + "epoch": 0.08771319447665878, + "grad_norm": 0.14936842024326324, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 22690 + }, + { + "epoch": 0.08775185168004206, + "grad_norm": 0.1017768532037735, + "learning_rate": 0.002, + "loss": 2.387, + "step": 22700 + }, + { + "epoch": 0.08779050888342534, + "grad_norm": 0.12708184123039246, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 22710 + }, + { + "epoch": 0.08782916608680862, + "grad_norm": 0.1156398355960846, + "learning_rate": 0.002, + "loss": 2.3878, + "step": 22720 + }, + { + "epoch": 0.0878678232901919, + "grad_norm": 0.12609833478927612, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 22730 + }, + { + "epoch": 0.08790648049357518, + "grad_norm": 0.09571592509746552, + "learning_rate": 0.002, + "loss": 2.378, + "step": 22740 + }, + { + "epoch": 0.08794513769695846, + "grad_norm": 0.12027204781770706, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 22750 + }, + { + "epoch": 0.08798379490034174, + "grad_norm": 0.11196243017911911, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 22760 + }, + { + "epoch": 0.088022452103725, + "grad_norm": 0.10924676060676575, + "learning_rate": 0.002, + "loss": 2.3787, + "step": 22770 + }, + { + "epoch": 0.08806110930710828, + "grad_norm": 0.11749155819416046, + "learning_rate": 0.002, + "loss": 2.4005, + "step": 22780 + }, + { + "epoch": 0.08809976651049156, + "grad_norm": 0.11545772105455399, + "learning_rate": 0.002, + "loss": 2.3803, + "step": 22790 + }, + { + "epoch": 0.08813842371387484, + "grad_norm": 0.11182691156864166, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 22800 + }, + { + "epoch": 0.08817708091725812, + "grad_norm": 0.13504658639431, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 22810 + }, + { + "epoch": 0.0882157381206414, + "grad_norm": 0.11489430069923401, + "learning_rate": 0.002, + "loss": 2.3796, + "step": 22820 + }, + { + "epoch": 0.08825439532402468, + "grad_norm": 0.11193528026342392, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 22830 + }, + { + "epoch": 0.08829305252740796, + "grad_norm": 0.13250324130058289, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 22840 + }, + { + "epoch": 0.08833170973079124, + "grad_norm": 0.11712247878313065, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 22850 + }, + { + "epoch": 0.08837036693417451, + "grad_norm": 0.11358913779258728, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 22860 + }, + { + "epoch": 0.08840902413755779, + "grad_norm": 0.10116658359766006, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 22870 + }, + { + "epoch": 0.08844768134094107, + "grad_norm": 0.12980881333351135, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 22880 + }, + { + "epoch": 0.08848633854432435, + "grad_norm": 0.1002582311630249, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 22890 + }, + { + "epoch": 0.08852499574770763, + "grad_norm": 0.09856297075748444, + "learning_rate": 0.002, + "loss": 2.3962, + "step": 22900 + }, + { + "epoch": 0.08856365295109091, + "grad_norm": 0.10269007831811905, + "learning_rate": 0.002, + "loss": 2.3878, + "step": 22910 + }, + { + "epoch": 0.08860231015447419, + "grad_norm": 0.11187140643596649, + "learning_rate": 0.002, + "loss": 2.3951, + "step": 22920 + }, + { + "epoch": 0.08864096735785747, + "grad_norm": 0.11259777843952179, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 22930 + }, + { + "epoch": 0.08867962456124075, + "grad_norm": 0.09621060639619827, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 22940 + }, + { + "epoch": 0.08871828176462403, + "grad_norm": 0.11501479893922806, + "learning_rate": 0.002, + "loss": 2.3937, + "step": 22950 + }, + { + "epoch": 0.08875693896800729, + "grad_norm": 0.17122884094715118, + "learning_rate": 0.002, + "loss": 2.367, + "step": 22960 + }, + { + "epoch": 0.08879559617139057, + "grad_norm": 0.10345969349145889, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 22970 + }, + { + "epoch": 0.08883425337477385, + "grad_norm": 0.11186736822128296, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 22980 + }, + { + "epoch": 0.08887291057815713, + "grad_norm": 0.11556321382522583, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 22990 + }, + { + "epoch": 0.08891156778154041, + "grad_norm": 0.10784627497196198, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 23000 + }, + { + "epoch": 0.08895022498492369, + "grad_norm": 0.12641456723213196, + "learning_rate": 0.002, + "loss": 2.3868, + "step": 23010 + }, + { + "epoch": 0.08898888218830697, + "grad_norm": 0.14297804236412048, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 23020 + }, + { + "epoch": 0.08902753939169025, + "grad_norm": 0.12131819874048233, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 23030 + }, + { + "epoch": 0.08906619659507353, + "grad_norm": 0.11721863597631454, + "learning_rate": 0.002, + "loss": 2.3866, + "step": 23040 + }, + { + "epoch": 0.0891048537984568, + "grad_norm": 0.13657094538211823, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 23050 + }, + { + "epoch": 0.08914351100184008, + "grad_norm": 0.15033309161663055, + "learning_rate": 0.002, + "loss": 2.3971, + "step": 23060 + }, + { + "epoch": 0.08918216820522336, + "grad_norm": 0.10266692191362381, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 23070 + }, + { + "epoch": 0.08922082540860664, + "grad_norm": 0.1215285062789917, + "learning_rate": 0.002, + "loss": 2.3915, + "step": 23080 + }, + { + "epoch": 0.08925948261198992, + "grad_norm": 0.12149213999509811, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 23090 + }, + { + "epoch": 0.0892981398153732, + "grad_norm": 0.2718287408351898, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 23100 + }, + { + "epoch": 0.08933679701875648, + "grad_norm": 0.12585672736167908, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 23110 + }, + { + "epoch": 0.08937545422213976, + "grad_norm": 0.1085464283823967, + "learning_rate": 0.002, + "loss": 2.357, + "step": 23120 + }, + { + "epoch": 0.08941411142552304, + "grad_norm": 0.12199816852807999, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 23130 + }, + { + "epoch": 0.0894527686289063, + "grad_norm": 0.12557195127010345, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 23140 + }, + { + "epoch": 0.08949142583228958, + "grad_norm": 0.11516721546649933, + "learning_rate": 0.002, + "loss": 2.3791, + "step": 23150 + }, + { + "epoch": 0.08953008303567286, + "grad_norm": 0.11621605604887009, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 23160 + }, + { + "epoch": 0.08956874023905614, + "grad_norm": 0.13228261470794678, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 23170 + }, + { + "epoch": 0.08960739744243942, + "grad_norm": 0.111338771879673, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 23180 + }, + { + "epoch": 0.0896460546458227, + "grad_norm": 0.12004300951957703, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 23190 + }, + { + "epoch": 0.08968471184920598, + "grad_norm": 0.1402856856584549, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 23200 + }, + { + "epoch": 0.08972336905258926, + "grad_norm": 0.09876136481761932, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 23210 + }, + { + "epoch": 0.08976202625597254, + "grad_norm": 0.10694080591201782, + "learning_rate": 0.002, + "loss": 2.3848, + "step": 23220 + }, + { + "epoch": 0.08980068345935581, + "grad_norm": 0.1302351951599121, + "learning_rate": 0.002, + "loss": 2.3882, + "step": 23230 + }, + { + "epoch": 0.08983934066273909, + "grad_norm": 0.10748574882745743, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 23240 + }, + { + "epoch": 0.08987799786612237, + "grad_norm": 0.1744678020477295, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 23250 + }, + { + "epoch": 0.08991665506950565, + "grad_norm": 0.09777440130710602, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 23260 + }, + { + "epoch": 0.08995531227288893, + "grad_norm": 0.11797336488962173, + "learning_rate": 0.002, + "loss": 2.3902, + "step": 23270 + }, + { + "epoch": 0.08999396947627221, + "grad_norm": 0.10277258604764938, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 23280 + }, + { + "epoch": 0.09003262667965549, + "grad_norm": 0.12030167877674103, + "learning_rate": 0.002, + "loss": 2.3812, + "step": 23290 + }, + { + "epoch": 0.09007128388303877, + "grad_norm": 0.10009963065385818, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 23300 + }, + { + "epoch": 0.09010994108642205, + "grad_norm": 0.13436256349086761, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 23310 + }, + { + "epoch": 0.09014859828980533, + "grad_norm": 0.11342042684555054, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 23320 + }, + { + "epoch": 0.0901872554931886, + "grad_norm": 0.14033156633377075, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 23330 + }, + { + "epoch": 0.09022591269657187, + "grad_norm": 0.11601479351520538, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 23340 + }, + { + "epoch": 0.09026456989995515, + "grad_norm": 0.1295476257801056, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 23350 + }, + { + "epoch": 0.09030322710333843, + "grad_norm": 0.1155814528465271, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 23360 + }, + { + "epoch": 0.09034188430672171, + "grad_norm": 0.11913534253835678, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 23370 + }, + { + "epoch": 0.090380541510105, + "grad_norm": 0.13233350217342377, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 23380 + }, + { + "epoch": 0.09041919871348827, + "grad_norm": 0.1004096195101738, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 23390 + }, + { + "epoch": 0.09045785591687155, + "grad_norm": 0.12615826725959778, + "learning_rate": 0.002, + "loss": 2.3881, + "step": 23400 + }, + { + "epoch": 0.09049651312025483, + "grad_norm": 0.11457603424787521, + "learning_rate": 0.002, + "loss": 2.3899, + "step": 23410 + }, + { + "epoch": 0.0905351703236381, + "grad_norm": 0.11348090320825577, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 23420 + }, + { + "epoch": 0.09057382752702138, + "grad_norm": 0.10704642534255981, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 23430 + }, + { + "epoch": 0.09061248473040466, + "grad_norm": 0.11089534312486649, + "learning_rate": 0.002, + "loss": 2.3991, + "step": 23440 + }, + { + "epoch": 0.09065114193378794, + "grad_norm": 0.1276397705078125, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 23450 + }, + { + "epoch": 0.09068979913717122, + "grad_norm": 0.11047758162021637, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 23460 + }, + { + "epoch": 0.0907284563405545, + "grad_norm": 0.1099272072315216, + "learning_rate": 0.002, + "loss": 2.379, + "step": 23470 + }, + { + "epoch": 0.09076711354393778, + "grad_norm": 0.13830797374248505, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 23480 + }, + { + "epoch": 0.09080577074732106, + "grad_norm": 0.1121407300233841, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 23490 + }, + { + "epoch": 0.09084442795070434, + "grad_norm": 0.12352314591407776, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 23500 + }, + { + "epoch": 0.0908830851540876, + "grad_norm": 0.11088527739048004, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 23510 + }, + { + "epoch": 0.09092174235747089, + "grad_norm": 0.11665134131908417, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 23520 + }, + { + "epoch": 0.09096039956085417, + "grad_norm": 0.1182350143790245, + "learning_rate": 0.002, + "loss": 2.3898, + "step": 23530 + }, + { + "epoch": 0.09099905676423745, + "grad_norm": 0.1312289983034134, + "learning_rate": 0.002, + "loss": 2.3797, + "step": 23540 + }, + { + "epoch": 0.09103771396762073, + "grad_norm": 0.0952557697892189, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 23550 + }, + { + "epoch": 0.091076371171004, + "grad_norm": 0.11382191628217697, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 23560 + }, + { + "epoch": 0.09111502837438729, + "grad_norm": 0.11281711608171463, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 23570 + }, + { + "epoch": 0.09115368557777057, + "grad_norm": 0.13775767385959625, + "learning_rate": 0.002, + "loss": 2.3964, + "step": 23580 + }, + { + "epoch": 0.09119234278115385, + "grad_norm": 0.11854385584592819, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 23590 + }, + { + "epoch": 0.09123099998453713, + "grad_norm": 0.11661964654922485, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 23600 + }, + { + "epoch": 0.09126965718792039, + "grad_norm": 0.11538533866405487, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 23610 + }, + { + "epoch": 0.09130831439130367, + "grad_norm": 0.12258029729127884, + "learning_rate": 0.002, + "loss": 2.37, + "step": 23620 + }, + { + "epoch": 0.09134697159468695, + "grad_norm": 0.10069010406732559, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 23630 + }, + { + "epoch": 0.09138562879807023, + "grad_norm": 0.0946861207485199, + "learning_rate": 0.002, + "loss": 2.383, + "step": 23640 + }, + { + "epoch": 0.09142428600145351, + "grad_norm": 0.11013893038034439, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 23650 + }, + { + "epoch": 0.09146294320483679, + "grad_norm": 0.13029184937477112, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 23660 + }, + { + "epoch": 0.09150160040822007, + "grad_norm": 0.11333515495061874, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 23670 + }, + { + "epoch": 0.09154025761160335, + "grad_norm": 0.09361569583415985, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 23680 + }, + { + "epoch": 0.09157891481498663, + "grad_norm": 0.14618952572345734, + "learning_rate": 0.002, + "loss": 2.389, + "step": 23690 + }, + { + "epoch": 0.0916175720183699, + "grad_norm": 0.12504111230373383, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 23700 + }, + { + "epoch": 0.09165622922175318, + "grad_norm": 0.11918000131845474, + "learning_rate": 0.002, + "loss": 2.3891, + "step": 23710 + }, + { + "epoch": 0.09169488642513646, + "grad_norm": 0.21253357827663422, + "learning_rate": 0.002, + "loss": 2.379, + "step": 23720 + }, + { + "epoch": 0.09173354362851974, + "grad_norm": 0.1254817694425583, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 23730 + }, + { + "epoch": 0.09177220083190302, + "grad_norm": 0.1153513491153717, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 23740 + }, + { + "epoch": 0.0918108580352863, + "grad_norm": 0.11657676100730896, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 23750 + }, + { + "epoch": 0.09184951523866958, + "grad_norm": 0.13089793920516968, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 23760 + }, + { + "epoch": 0.09188817244205286, + "grad_norm": 0.10609673708677292, + "learning_rate": 0.002, + "loss": 2.376, + "step": 23770 + }, + { + "epoch": 0.09192682964543614, + "grad_norm": 0.13091377913951874, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 23780 + }, + { + "epoch": 0.0919654868488194, + "grad_norm": 0.10436306893825531, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 23790 + }, + { + "epoch": 0.09200414405220268, + "grad_norm": 0.10807617008686066, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 23800 + }, + { + "epoch": 0.09204280125558596, + "grad_norm": 0.10745303332805634, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 23810 + }, + { + "epoch": 0.09208145845896924, + "grad_norm": 0.11694838106632233, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 23820 + }, + { + "epoch": 0.09212011566235252, + "grad_norm": 0.1124061644077301, + "learning_rate": 0.002, + "loss": 2.3869, + "step": 23830 + }, + { + "epoch": 0.0921587728657358, + "grad_norm": 0.1196490079164505, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 23840 + }, + { + "epoch": 0.09219743006911908, + "grad_norm": 0.10835790634155273, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 23850 + }, + { + "epoch": 0.09223608727250236, + "grad_norm": 0.13237883150577545, + "learning_rate": 0.002, + "loss": 2.3794, + "step": 23860 + }, + { + "epoch": 0.09227474447588564, + "grad_norm": 0.1156863123178482, + "learning_rate": 0.002, + "loss": 2.3916, + "step": 23870 + }, + { + "epoch": 0.09231340167926891, + "grad_norm": 0.10456960648298264, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 23880 + }, + { + "epoch": 0.09235205888265219, + "grad_norm": 0.09652217477560043, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 23890 + }, + { + "epoch": 0.09239071608603547, + "grad_norm": 0.12134707719087601, + "learning_rate": 0.002, + "loss": 2.3895, + "step": 23900 + }, + { + "epoch": 0.09242937328941875, + "grad_norm": 0.10908094793558121, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 23910 + }, + { + "epoch": 0.09246803049280203, + "grad_norm": 0.13318543136119843, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 23920 + }, + { + "epoch": 0.09250668769618531, + "grad_norm": 0.10980760306119919, + "learning_rate": 0.002, + "loss": 2.3926, + "step": 23930 + }, + { + "epoch": 0.09254534489956859, + "grad_norm": 0.10727297514677048, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 23940 + }, + { + "epoch": 0.09258400210295187, + "grad_norm": 0.11008038371801376, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 23950 + }, + { + "epoch": 0.09262265930633515, + "grad_norm": 0.1304430365562439, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 23960 + }, + { + "epoch": 0.09266131650971843, + "grad_norm": 0.10171637684106827, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 23970 + }, + { + "epoch": 0.0926999737131017, + "grad_norm": 0.11464967578649521, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 23980 + }, + { + "epoch": 0.09273863091648497, + "grad_norm": 0.10553108900785446, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 23990 + }, + { + "epoch": 0.09277728811986825, + "grad_norm": 0.12580671906471252, + "learning_rate": 0.002, + "loss": 2.3844, + "step": 24000 + }, + { + "epoch": 0.09281594532325153, + "grad_norm": 0.11327177286148071, + "learning_rate": 0.002, + "loss": 2.38, + "step": 24010 + }, + { + "epoch": 0.09285460252663481, + "grad_norm": 0.10099143534898758, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 24020 + }, + { + "epoch": 0.0928932597300181, + "grad_norm": 0.13738584518432617, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 24030 + }, + { + "epoch": 0.09293191693340137, + "grad_norm": 0.11237277090549469, + "learning_rate": 0.002, + "loss": 2.37, + "step": 24040 + }, + { + "epoch": 0.09297057413678465, + "grad_norm": 0.11633989959955215, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 24050 + }, + { + "epoch": 0.09300923134016793, + "grad_norm": 0.13667155802249908, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 24060 + }, + { + "epoch": 0.0930478885435512, + "grad_norm": 0.10400626808404922, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 24070 + }, + { + "epoch": 0.09308654574693448, + "grad_norm": 0.12100454419851303, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 24080 + }, + { + "epoch": 0.09312520295031776, + "grad_norm": 0.1407627910375595, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 24090 + }, + { + "epoch": 0.09316386015370104, + "grad_norm": 0.116483174264431, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 24100 + }, + { + "epoch": 0.09320251735708432, + "grad_norm": 0.1100977286696434, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 24110 + }, + { + "epoch": 0.0932411745604676, + "grad_norm": 0.10602124035358429, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 24120 + }, + { + "epoch": 0.09327983176385088, + "grad_norm": 0.09690314531326294, + "learning_rate": 0.002, + "loss": 2.3803, + "step": 24130 + }, + { + "epoch": 0.09331848896723416, + "grad_norm": 0.10744574666023254, + "learning_rate": 0.002, + "loss": 2.374, + "step": 24140 + }, + { + "epoch": 0.09335714617061744, + "grad_norm": 0.10048481076955795, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 24150 + }, + { + "epoch": 0.0933958033740007, + "grad_norm": 0.13025830686092377, + "learning_rate": 0.002, + "loss": 2.3869, + "step": 24160 + }, + { + "epoch": 0.09343446057738398, + "grad_norm": 0.09571056067943573, + "learning_rate": 0.002, + "loss": 2.3916, + "step": 24170 + }, + { + "epoch": 0.09347311778076726, + "grad_norm": 0.13225597143173218, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 24180 + }, + { + "epoch": 0.09351177498415054, + "grad_norm": 0.11436488479375839, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 24190 + }, + { + "epoch": 0.09355043218753382, + "grad_norm": 0.13618864119052887, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 24200 + }, + { + "epoch": 0.0935890893909171, + "grad_norm": 0.1187516301870346, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 24210 + }, + { + "epoch": 0.09362774659430038, + "grad_norm": 0.09682077914476395, + "learning_rate": 0.002, + "loss": 2.353, + "step": 24220 + }, + { + "epoch": 0.09366640379768366, + "grad_norm": 0.10571873933076859, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 24230 + }, + { + "epoch": 0.09370506100106694, + "grad_norm": 0.10611064732074738, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 24240 + }, + { + "epoch": 0.09374371820445021, + "grad_norm": 0.12139905989170074, + "learning_rate": 0.002, + "loss": 2.3842, + "step": 24250 + }, + { + "epoch": 0.09378237540783349, + "grad_norm": 0.1359192132949829, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 24260 + }, + { + "epoch": 0.09382103261121677, + "grad_norm": 0.11392809450626373, + "learning_rate": 0.002, + "loss": 2.383, + "step": 24270 + }, + { + "epoch": 0.09385968981460005, + "grad_norm": 0.12583954632282257, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 24280 + }, + { + "epoch": 0.09389834701798333, + "grad_norm": 0.10504510998725891, + "learning_rate": 0.002, + "loss": 2.3955, + "step": 24290 + }, + { + "epoch": 0.09393700422136661, + "grad_norm": 0.12139500677585602, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 24300 + }, + { + "epoch": 0.09397566142474989, + "grad_norm": 0.12326785922050476, + "learning_rate": 0.002, + "loss": 2.3815, + "step": 24310 + }, + { + "epoch": 0.09401431862813317, + "grad_norm": 0.10977497696876526, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 24320 + }, + { + "epoch": 0.09405297583151645, + "grad_norm": 0.10636772215366364, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 24330 + }, + { + "epoch": 0.09409163303489973, + "grad_norm": 0.10684899240732193, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 24340 + }, + { + "epoch": 0.094130290238283, + "grad_norm": 0.09462485462427139, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 24350 + }, + { + "epoch": 0.09416894744166628, + "grad_norm": 0.1221616268157959, + "learning_rate": 0.002, + "loss": 2.3796, + "step": 24360 + }, + { + "epoch": 0.09420760464504956, + "grad_norm": 0.10672114044427872, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 24370 + }, + { + "epoch": 0.09424626184843284, + "grad_norm": 0.11299077421426773, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 24380 + }, + { + "epoch": 0.09428491905181612, + "grad_norm": 0.1013621836900711, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 24390 + }, + { + "epoch": 0.0943235762551994, + "grad_norm": 0.10756544023752213, + "learning_rate": 0.002, + "loss": 2.3845, + "step": 24400 + }, + { + "epoch": 0.09436223345858268, + "grad_norm": 0.12545613944530487, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 24410 + }, + { + "epoch": 0.09440089066196596, + "grad_norm": 0.11735380440950394, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 24420 + }, + { + "epoch": 0.09443954786534924, + "grad_norm": 0.11142757534980774, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 24430 + }, + { + "epoch": 0.0944782050687325, + "grad_norm": 0.11514252424240112, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 24440 + }, + { + "epoch": 0.09451686227211578, + "grad_norm": 0.13839270174503326, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 24450 + }, + { + "epoch": 0.09455551947549906, + "grad_norm": 0.12157618999481201, + "learning_rate": 0.002, + "loss": 2.3797, + "step": 24460 + }, + { + "epoch": 0.09459417667888234, + "grad_norm": 0.10932943224906921, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 24470 + }, + { + "epoch": 0.09463283388226562, + "grad_norm": 0.08984825760126114, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 24480 + }, + { + "epoch": 0.0946714910856489, + "grad_norm": 0.11526817828416824, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 24490 + }, + { + "epoch": 0.09471014828903218, + "grad_norm": 0.10593192279338837, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 24500 + }, + { + "epoch": 0.09474880549241546, + "grad_norm": 0.11164966970682144, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 24510 + }, + { + "epoch": 0.09478746269579874, + "grad_norm": 0.17076915502548218, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 24520 + }, + { + "epoch": 0.09482611989918201, + "grad_norm": 0.11258591711521149, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 24530 + }, + { + "epoch": 0.09486477710256529, + "grad_norm": 0.1002669557929039, + "learning_rate": 0.002, + "loss": 2.3865, + "step": 24540 + }, + { + "epoch": 0.09490343430594857, + "grad_norm": 0.11262197047472, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 24550 + }, + { + "epoch": 0.09494209150933185, + "grad_norm": 0.10771956294775009, + "learning_rate": 0.002, + "loss": 2.3797, + "step": 24560 + }, + { + "epoch": 0.09498074871271513, + "grad_norm": 0.1211184710264206, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 24570 + }, + { + "epoch": 0.0950194059160984, + "grad_norm": 0.11493786424398422, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 24580 + }, + { + "epoch": 0.09505806311948169, + "grad_norm": 0.11493559926748276, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 24590 + }, + { + "epoch": 0.09509672032286497, + "grad_norm": 0.11849837005138397, + "learning_rate": 0.002, + "loss": 2.3848, + "step": 24600 + }, + { + "epoch": 0.09513537752624825, + "grad_norm": 0.10788939148187637, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 24610 + }, + { + "epoch": 0.09517403472963153, + "grad_norm": 0.11454930901527405, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 24620 + }, + { + "epoch": 0.09521269193301479, + "grad_norm": 0.1146981343626976, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 24630 + }, + { + "epoch": 0.09525134913639807, + "grad_norm": 0.12724652886390686, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 24640 + }, + { + "epoch": 0.09529000633978135, + "grad_norm": 0.10866862535476685, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 24650 + }, + { + "epoch": 0.09532866354316463, + "grad_norm": 0.14221100509166718, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 24660 + }, + { + "epoch": 0.09536732074654791, + "grad_norm": 0.1002168357372284, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 24670 + }, + { + "epoch": 0.09540597794993119, + "grad_norm": 0.1249079555273056, + "learning_rate": 0.002, + "loss": 2.3883, + "step": 24680 + }, + { + "epoch": 0.09544463515331447, + "grad_norm": 0.10911814868450165, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 24690 + }, + { + "epoch": 0.09548329235669775, + "grad_norm": 0.11465930193662643, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 24700 + }, + { + "epoch": 0.09552194956008103, + "grad_norm": 0.09612670540809631, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 24710 + }, + { + "epoch": 0.0955606067634643, + "grad_norm": 0.11884750425815582, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 24720 + }, + { + "epoch": 0.09559926396684758, + "grad_norm": 0.10053402930498123, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 24730 + }, + { + "epoch": 0.09563792117023086, + "grad_norm": 0.11036361008882523, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 24740 + }, + { + "epoch": 0.09567657837361414, + "grad_norm": 0.09313686192035675, + "learning_rate": 0.002, + "loss": 2.3787, + "step": 24750 + }, + { + "epoch": 0.09571523557699742, + "grad_norm": 0.12321452051401138, + "learning_rate": 0.002, + "loss": 2.374, + "step": 24760 + }, + { + "epoch": 0.0957538927803807, + "grad_norm": 0.10040561854839325, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 24770 + }, + { + "epoch": 0.09579254998376398, + "grad_norm": 0.1582271158695221, + "learning_rate": 0.002, + "loss": 2.372, + "step": 24780 + }, + { + "epoch": 0.09583120718714726, + "grad_norm": 0.09811810404062271, + "learning_rate": 0.002, + "loss": 2.3874, + "step": 24790 + }, + { + "epoch": 0.09586986439053054, + "grad_norm": 0.12261014431715012, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 24800 + }, + { + "epoch": 0.0959085215939138, + "grad_norm": 0.09658876806497574, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 24810 + }, + { + "epoch": 0.09594717879729708, + "grad_norm": 0.13847355544567108, + "learning_rate": 0.002, + "loss": 2.3866, + "step": 24820 + }, + { + "epoch": 0.09598583600068036, + "grad_norm": 0.09949313849210739, + "learning_rate": 0.002, + "loss": 2.3887, + "step": 24830 + }, + { + "epoch": 0.09602449320406364, + "grad_norm": 0.11247383058071136, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 24840 + }, + { + "epoch": 0.09606315040744692, + "grad_norm": 0.11727588623762131, + "learning_rate": 0.002, + "loss": 2.3899, + "step": 24850 + }, + { + "epoch": 0.0961018076108302, + "grad_norm": 0.12999024987220764, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 24860 + }, + { + "epoch": 0.09614046481421348, + "grad_norm": 0.10106495022773743, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 24870 + }, + { + "epoch": 0.09617912201759676, + "grad_norm": 0.1225874051451683, + "learning_rate": 0.002, + "loss": 2.3991, + "step": 24880 + }, + { + "epoch": 0.09621777922098004, + "grad_norm": 0.11529222130775452, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 24890 + }, + { + "epoch": 0.09625643642436331, + "grad_norm": 0.10541075468063354, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 24900 + }, + { + "epoch": 0.09629509362774659, + "grad_norm": 0.11571445316076279, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 24910 + }, + { + "epoch": 0.09633375083112987, + "grad_norm": 0.10362482815980911, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 24920 + }, + { + "epoch": 0.09637240803451315, + "grad_norm": 0.12240533530712128, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 24930 + }, + { + "epoch": 0.09641106523789643, + "grad_norm": 0.09573902934789658, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 24940 + }, + { + "epoch": 0.09644972244127971, + "grad_norm": 0.098006471991539, + "learning_rate": 0.002, + "loss": 2.379, + "step": 24950 + }, + { + "epoch": 0.09648837964466299, + "grad_norm": 0.10664179921150208, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 24960 + }, + { + "epoch": 0.09652703684804627, + "grad_norm": 0.12383517622947693, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 24970 + }, + { + "epoch": 0.09656569405142955, + "grad_norm": 0.10212542116641998, + "learning_rate": 0.002, + "loss": 2.37, + "step": 24980 + }, + { + "epoch": 0.09660435125481283, + "grad_norm": 0.1323486864566803, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 24990 + }, + { + "epoch": 0.0966430084581961, + "grad_norm": 0.11106313019990921, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 25000 + }, + { + "epoch": 0.09668166566157937, + "grad_norm": 0.10711284726858139, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 25010 + }, + { + "epoch": 0.09672032286496265, + "grad_norm": 0.10144255310297012, + "learning_rate": 0.002, + "loss": 2.365, + "step": 25020 + }, + { + "epoch": 0.09675898006834593, + "grad_norm": 0.10904534161090851, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 25030 + }, + { + "epoch": 0.09679763727172921, + "grad_norm": 0.11617907136678696, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 25040 + }, + { + "epoch": 0.0968362944751125, + "grad_norm": 0.11424647271633148, + "learning_rate": 0.002, + "loss": 2.3798, + "step": 25050 + }, + { + "epoch": 0.09687495167849577, + "grad_norm": 0.12575869262218475, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 25060 + }, + { + "epoch": 0.09691360888187905, + "grad_norm": 0.10736224800348282, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 25070 + }, + { + "epoch": 0.09695226608526233, + "grad_norm": 0.1244237944483757, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 25080 + }, + { + "epoch": 0.0969909232886456, + "grad_norm": 0.10764366388320923, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 25090 + }, + { + "epoch": 0.09702958049202888, + "grad_norm": 0.10259444266557693, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 25100 + }, + { + "epoch": 0.09706823769541216, + "grad_norm": 0.11127594113349915, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 25110 + }, + { + "epoch": 0.09710689489879544, + "grad_norm": 0.14849691092967987, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 25120 + }, + { + "epoch": 0.09714555210217872, + "grad_norm": 0.10462760180234909, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 25130 + }, + { + "epoch": 0.097184209305562, + "grad_norm": 0.1071339026093483, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 25140 + }, + { + "epoch": 0.09722286650894528, + "grad_norm": 0.10815966129302979, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 25150 + }, + { + "epoch": 0.09726152371232856, + "grad_norm": 0.10802339017391205, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 25160 + }, + { + "epoch": 0.09730018091571184, + "grad_norm": 0.1159505546092987, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 25170 + }, + { + "epoch": 0.0973388381190951, + "grad_norm": 0.10223262757062912, + "learning_rate": 0.002, + "loss": 2.3825, + "step": 25180 + }, + { + "epoch": 0.09737749532247839, + "grad_norm": 0.11454407870769501, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 25190 + }, + { + "epoch": 0.09741615252586167, + "grad_norm": 0.10623133182525635, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 25200 + }, + { + "epoch": 0.09745480972924495, + "grad_norm": 0.10946430265903473, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 25210 + }, + { + "epoch": 0.09749346693262823, + "grad_norm": 0.12587489187717438, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 25220 + }, + { + "epoch": 0.0975321241360115, + "grad_norm": 0.11556122452020645, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 25230 + }, + { + "epoch": 0.09757078133939479, + "grad_norm": 0.11611664295196533, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 25240 + }, + { + "epoch": 0.09760943854277807, + "grad_norm": 0.09125930815935135, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 25250 + }, + { + "epoch": 0.09764809574616135, + "grad_norm": 0.11490960419178009, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 25260 + }, + { + "epoch": 0.09768675294954461, + "grad_norm": 0.11289970576763153, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 25270 + }, + { + "epoch": 0.09772541015292789, + "grad_norm": 0.13276606798171997, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 25280 + }, + { + "epoch": 0.09776406735631117, + "grad_norm": 0.11637672036886215, + "learning_rate": 0.002, + "loss": 2.374, + "step": 25290 + }, + { + "epoch": 0.09780272455969445, + "grad_norm": 0.1237371638417244, + "learning_rate": 0.002, + "loss": 2.366, + "step": 25300 + }, + { + "epoch": 0.09784138176307773, + "grad_norm": 0.12679412961006165, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 25310 + }, + { + "epoch": 0.09788003896646101, + "grad_norm": 0.12102102488279343, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 25320 + }, + { + "epoch": 0.09791869616984429, + "grad_norm": 0.12702231109142303, + "learning_rate": 0.002, + "loss": 2.3844, + "step": 25330 + }, + { + "epoch": 0.09795735337322757, + "grad_norm": 0.11291328072547913, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 25340 + }, + { + "epoch": 0.09799601057661085, + "grad_norm": 0.1038038358092308, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 25350 + }, + { + "epoch": 0.09803466777999413, + "grad_norm": 0.12257646024227142, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 25360 + }, + { + "epoch": 0.0980733249833774, + "grad_norm": 0.15314073860645294, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 25370 + }, + { + "epoch": 0.09811198218676068, + "grad_norm": 0.104371577501297, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 25380 + }, + { + "epoch": 0.09815063939014396, + "grad_norm": 0.10793410986661911, + "learning_rate": 0.002, + "loss": 2.3812, + "step": 25390 + }, + { + "epoch": 0.09818929659352724, + "grad_norm": 0.10416117310523987, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 25400 + }, + { + "epoch": 0.09822795379691052, + "grad_norm": 0.11453581601381302, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 25410 + }, + { + "epoch": 0.0982666110002938, + "grad_norm": 0.12008222192525864, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 25420 + }, + { + "epoch": 0.09830526820367708, + "grad_norm": 0.10592380166053772, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 25430 + }, + { + "epoch": 0.09834392540706036, + "grad_norm": 0.122405044734478, + "learning_rate": 0.002, + "loss": 2.375, + "step": 25440 + }, + { + "epoch": 0.09838258261044364, + "grad_norm": 0.10416276752948761, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 25450 + }, + { + "epoch": 0.0984212398138269, + "grad_norm": 0.12326578050851822, + "learning_rate": 0.002, + "loss": 2.372, + "step": 25460 + }, + { + "epoch": 0.09845989701721018, + "grad_norm": 0.10835543274879456, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 25470 + }, + { + "epoch": 0.09849855422059346, + "grad_norm": 0.10613939166069031, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 25480 + }, + { + "epoch": 0.09853721142397674, + "grad_norm": 0.13459204137325287, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 25490 + }, + { + "epoch": 0.09857586862736002, + "grad_norm": 0.11468186229467392, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 25500 + }, + { + "epoch": 0.0986145258307433, + "grad_norm": 0.10123708844184875, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 25510 + }, + { + "epoch": 0.09865318303412658, + "grad_norm": 0.11035001277923584, + "learning_rate": 0.002, + "loss": 2.3923, + "step": 25520 + }, + { + "epoch": 0.09869184023750986, + "grad_norm": 0.11449532955884933, + "learning_rate": 0.002, + "loss": 2.3939, + "step": 25530 + }, + { + "epoch": 0.09873049744089314, + "grad_norm": 0.1010097786784172, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 25540 + }, + { + "epoch": 0.09876915464427641, + "grad_norm": 0.11513739079236984, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 25550 + }, + { + "epoch": 0.09880781184765969, + "grad_norm": 0.11579716205596924, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 25560 + }, + { + "epoch": 0.09884646905104297, + "grad_norm": 0.11263803392648697, + "learning_rate": 0.002, + "loss": 2.3841, + "step": 25570 + }, + { + "epoch": 0.09888512625442625, + "grad_norm": 0.1053340956568718, + "learning_rate": 0.002, + "loss": 2.368, + "step": 25580 + }, + { + "epoch": 0.09892378345780953, + "grad_norm": 0.10061566531658173, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 25590 + }, + { + "epoch": 0.09896244066119281, + "grad_norm": 0.13848647475242615, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 25600 + }, + { + "epoch": 0.09900109786457609, + "grad_norm": 0.10833652317523956, + "learning_rate": 0.002, + "loss": 2.3827, + "step": 25610 + }, + { + "epoch": 0.09903975506795937, + "grad_norm": 0.11265638470649719, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 25620 + }, + { + "epoch": 0.09907841227134265, + "grad_norm": 0.13258330523967743, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 25630 + }, + { + "epoch": 0.09911706947472593, + "grad_norm": 0.1301572173833847, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 25640 + }, + { + "epoch": 0.0991557266781092, + "grad_norm": 0.11950825154781342, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 25650 + }, + { + "epoch": 0.09919438388149247, + "grad_norm": 0.23389548063278198, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 25660 + }, + { + "epoch": 0.09923304108487575, + "grad_norm": 0.12364401668310165, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 25670 + }, + { + "epoch": 0.09927169828825903, + "grad_norm": 0.10952173918485641, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 25680 + }, + { + "epoch": 0.09931035549164231, + "grad_norm": 0.13647736608982086, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 25690 + }, + { + "epoch": 0.0993490126950256, + "grad_norm": 0.11075005680322647, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 25700 + }, + { + "epoch": 0.09938766989840887, + "grad_norm": 0.11662223935127258, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 25710 + }, + { + "epoch": 0.09942632710179215, + "grad_norm": 0.10770941525697708, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 25720 + }, + { + "epoch": 0.09946498430517543, + "grad_norm": 0.13047263026237488, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 25730 + }, + { + "epoch": 0.0995036415085587, + "grad_norm": 0.1101449728012085, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 25740 + }, + { + "epoch": 0.09954229871194198, + "grad_norm": 0.11350306868553162, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 25750 + }, + { + "epoch": 0.09958095591532526, + "grad_norm": 0.12219146639108658, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 25760 + }, + { + "epoch": 0.09961961311870854, + "grad_norm": 0.12308717519044876, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 25770 + }, + { + "epoch": 0.09965827032209182, + "grad_norm": 0.09611739218235016, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 25780 + }, + { + "epoch": 0.0996969275254751, + "grad_norm": 0.1195925772190094, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 25790 + }, + { + "epoch": 0.09973558472885838, + "grad_norm": 0.09814034402370453, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 25800 + }, + { + "epoch": 0.09977424193224166, + "grad_norm": 0.12110476940870285, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 25810 + }, + { + "epoch": 0.09981289913562494, + "grad_norm": 0.11635403335094452, + "learning_rate": 0.002, + "loss": 2.3859, + "step": 25820 + }, + { + "epoch": 0.0998515563390082, + "grad_norm": 0.1121845617890358, + "learning_rate": 0.002, + "loss": 2.3898, + "step": 25830 + }, + { + "epoch": 0.09989021354239148, + "grad_norm": 0.12140703201293945, + "learning_rate": 0.002, + "loss": 2.362, + "step": 25840 + }, + { + "epoch": 0.09992887074577476, + "grad_norm": 0.13646343350410461, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 25850 + }, + { + "epoch": 0.09996752794915804, + "grad_norm": 0.12495137006044388, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 25860 + }, + { + "epoch": 0.10000618515254132, + "grad_norm": 0.10688120126724243, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 25870 + }, + { + "epoch": 0.1000448423559246, + "grad_norm": 0.09174524247646332, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 25880 + }, + { + "epoch": 0.10008349955930788, + "grad_norm": 0.11917345225811005, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 25890 + }, + { + "epoch": 0.10012215676269116, + "grad_norm": 0.11813360452651978, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 25900 + }, + { + "epoch": 0.10016081396607444, + "grad_norm": 0.13693365454673767, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 25910 + }, + { + "epoch": 0.10019947116945771, + "grad_norm": 0.12706634402275085, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 25920 + }, + { + "epoch": 0.10023812837284099, + "grad_norm": 0.10601233690977097, + "learning_rate": 0.002, + "loss": 2.382, + "step": 25930 + }, + { + "epoch": 0.10027678557622427, + "grad_norm": 0.12050879746675491, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 25940 + }, + { + "epoch": 0.10031544277960755, + "grad_norm": 0.1077536791563034, + "learning_rate": 0.002, + "loss": 2.3864, + "step": 25950 + }, + { + "epoch": 0.10035409998299083, + "grad_norm": 0.11017072945833206, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 25960 + }, + { + "epoch": 0.10039275718637411, + "grad_norm": 0.37830984592437744, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 25970 + }, + { + "epoch": 0.10043141438975739, + "grad_norm": 0.10002478212118149, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 25980 + }, + { + "epoch": 0.10047007159314067, + "grad_norm": 0.12313816696405411, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 25990 + }, + { + "epoch": 0.10050872879652395, + "grad_norm": 0.09661564230918884, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 26000 + }, + { + "epoch": 0.10054738599990723, + "grad_norm": 0.10133011639118195, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 26010 + }, + { + "epoch": 0.1005860432032905, + "grad_norm": 0.14927417039871216, + "learning_rate": 0.002, + "loss": 2.3835, + "step": 26020 + }, + { + "epoch": 0.10062470040667378, + "grad_norm": 0.10663340240716934, + "learning_rate": 0.002, + "loss": 2.3878, + "step": 26030 + }, + { + "epoch": 0.10066335761005706, + "grad_norm": 0.10607342422008514, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 26040 + }, + { + "epoch": 0.10070201481344034, + "grad_norm": 0.09412727504968643, + "learning_rate": 0.002, + "loss": 2.37, + "step": 26050 + }, + { + "epoch": 0.10074067201682362, + "grad_norm": 0.11209404468536377, + "learning_rate": 0.002, + "loss": 2.366, + "step": 26060 + }, + { + "epoch": 0.1007793292202069, + "grad_norm": 0.1119234636425972, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 26070 + }, + { + "epoch": 0.10081798642359018, + "grad_norm": 0.13079524040222168, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 26080 + }, + { + "epoch": 0.10085664362697346, + "grad_norm": 0.11952626705169678, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 26090 + }, + { + "epoch": 0.10089530083035674, + "grad_norm": 0.10386581718921661, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 26100 + }, + { + "epoch": 0.10093395803374, + "grad_norm": 0.12058830261230469, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 26110 + }, + { + "epoch": 0.10097261523712328, + "grad_norm": 0.1013653352856636, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 26120 + }, + { + "epoch": 0.10101127244050656, + "grad_norm": 0.11395049095153809, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 26130 + }, + { + "epoch": 0.10104992964388984, + "grad_norm": 0.10870377719402313, + "learning_rate": 0.002, + "loss": 2.368, + "step": 26140 + }, + { + "epoch": 0.10108858684727312, + "grad_norm": 0.1162482276558876, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 26150 + }, + { + "epoch": 0.1011272440506564, + "grad_norm": 0.10904961824417114, + "learning_rate": 0.002, + "loss": 2.376, + "step": 26160 + }, + { + "epoch": 0.10116590125403968, + "grad_norm": 0.0989343523979187, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 26170 + }, + { + "epoch": 0.10120455845742296, + "grad_norm": 0.12322376668453217, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 26180 + }, + { + "epoch": 0.10124321566080624, + "grad_norm": 0.13838641345500946, + "learning_rate": 0.002, + "loss": 2.367, + "step": 26190 + }, + { + "epoch": 0.1012818728641895, + "grad_norm": 0.11140194535255432, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 26200 + }, + { + "epoch": 0.10132053006757279, + "grad_norm": 0.17812778055667877, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 26210 + }, + { + "epoch": 0.10135918727095607, + "grad_norm": 0.12174686044454575, + "learning_rate": 0.002, + "loss": 2.3904, + "step": 26220 + }, + { + "epoch": 0.10139784447433935, + "grad_norm": 0.11605304479598999, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 26230 + }, + { + "epoch": 0.10143650167772263, + "grad_norm": 0.28635290265083313, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 26240 + }, + { + "epoch": 0.1014751588811059, + "grad_norm": 0.11292878538370132, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 26250 + }, + { + "epoch": 0.10151381608448919, + "grad_norm": 0.10103033483028412, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 26260 + }, + { + "epoch": 0.10155247328787247, + "grad_norm": 0.09425285458564758, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 26270 + }, + { + "epoch": 0.10159113049125575, + "grad_norm": 0.11024901270866394, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 26280 + }, + { + "epoch": 0.10162978769463901, + "grad_norm": 0.16107779741287231, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 26290 + }, + { + "epoch": 0.10166844489802229, + "grad_norm": 0.111606165766716, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 26300 + }, + { + "epoch": 0.10170710210140557, + "grad_norm": 0.11568954586982727, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 26310 + }, + { + "epoch": 0.10174575930478885, + "grad_norm": 0.12137076258659363, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 26320 + }, + { + "epoch": 0.10178441650817213, + "grad_norm": 0.12343282252550125, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 26330 + }, + { + "epoch": 0.10182307371155541, + "grad_norm": 0.09838341176509857, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 26340 + }, + { + "epoch": 0.10186173091493869, + "grad_norm": 0.11724685877561569, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 26350 + }, + { + "epoch": 0.10190038811832197, + "grad_norm": 0.10881741344928741, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 26360 + }, + { + "epoch": 0.10193904532170525, + "grad_norm": 0.11824636906385422, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 26370 + }, + { + "epoch": 0.10197770252508853, + "grad_norm": 0.09991855919361115, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 26380 + }, + { + "epoch": 0.1020163597284718, + "grad_norm": 0.13345035910606384, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 26390 + }, + { + "epoch": 0.10205501693185508, + "grad_norm": 0.12668660283088684, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 26400 + }, + { + "epoch": 0.10209367413523836, + "grad_norm": 0.10875190794467926, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 26410 + }, + { + "epoch": 0.10213233133862164, + "grad_norm": 0.10290392488241196, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 26420 + }, + { + "epoch": 0.10217098854200492, + "grad_norm": 0.1253070831298828, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 26430 + }, + { + "epoch": 0.1022096457453882, + "grad_norm": 0.13730914890766144, + "learning_rate": 0.002, + "loss": 2.378, + "step": 26440 + }, + { + "epoch": 0.10224830294877148, + "grad_norm": 0.12453850358724594, + "learning_rate": 0.002, + "loss": 2.368, + "step": 26450 + }, + { + "epoch": 0.10228696015215476, + "grad_norm": 0.11111441254615784, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 26460 + }, + { + "epoch": 0.10232561735553804, + "grad_norm": 0.11621275544166565, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 26470 + }, + { + "epoch": 0.1023642745589213, + "grad_norm": 0.09777642786502838, + "learning_rate": 0.002, + "loss": 2.388, + "step": 26480 + }, + { + "epoch": 0.10240293176230458, + "grad_norm": 0.1324297934770584, + "learning_rate": 0.002, + "loss": 2.386, + "step": 26490 + }, + { + "epoch": 0.10244158896568786, + "grad_norm": 0.0918188989162445, + "learning_rate": 0.002, + "loss": 2.386, + "step": 26500 + }, + { + "epoch": 0.10248024616907114, + "grad_norm": 0.10434520244598389, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 26510 + }, + { + "epoch": 0.10251890337245442, + "grad_norm": 0.10555671900510788, + "learning_rate": 0.002, + "loss": 2.372, + "step": 26520 + }, + { + "epoch": 0.1025575605758377, + "grad_norm": 0.11064916849136353, + "learning_rate": 0.002, + "loss": 2.383, + "step": 26530 + }, + { + "epoch": 0.10259621777922098, + "grad_norm": 0.13481812179088593, + "learning_rate": 0.002, + "loss": 2.362, + "step": 26540 + }, + { + "epoch": 0.10263487498260426, + "grad_norm": 0.10597804188728333, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 26550 + }, + { + "epoch": 0.10267353218598754, + "grad_norm": 0.11946997791528702, + "learning_rate": 0.002, + "loss": 2.3842, + "step": 26560 + }, + { + "epoch": 0.10271218938937081, + "grad_norm": 0.0989639163017273, + "learning_rate": 0.002, + "loss": 2.367, + "step": 26570 + }, + { + "epoch": 0.10275084659275409, + "grad_norm": 0.13578683137893677, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 26580 + }, + { + "epoch": 0.10278950379613737, + "grad_norm": 0.12367697060108185, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 26590 + }, + { + "epoch": 0.10282816099952065, + "grad_norm": 0.14630058407783508, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 26600 + }, + { + "epoch": 0.10286681820290393, + "grad_norm": 0.10778837651014328, + "learning_rate": 0.002, + "loss": 2.3966, + "step": 26610 + }, + { + "epoch": 0.10290547540628721, + "grad_norm": 0.11188562214374542, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 26620 + }, + { + "epoch": 0.10294413260967049, + "grad_norm": 0.1135137602686882, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 26630 + }, + { + "epoch": 0.10298278981305377, + "grad_norm": 0.12481187283992767, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 26640 + }, + { + "epoch": 0.10302144701643705, + "grad_norm": 0.10805981606245041, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 26650 + }, + { + "epoch": 0.10306010421982033, + "grad_norm": 0.11806239187717438, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 26660 + }, + { + "epoch": 0.1030987614232036, + "grad_norm": 0.13640879094600677, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 26670 + }, + { + "epoch": 0.10313741862658687, + "grad_norm": 0.11290092021226883, + "learning_rate": 0.002, + "loss": 2.3911, + "step": 26680 + }, + { + "epoch": 0.10317607582997015, + "grad_norm": 0.11578276753425598, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 26690 + }, + { + "epoch": 0.10321473303335343, + "grad_norm": 0.12642863392829895, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 26700 + }, + { + "epoch": 0.10325339023673671, + "grad_norm": 0.10927240550518036, + "learning_rate": 0.002, + "loss": 2.3863, + "step": 26710 + }, + { + "epoch": 0.10329204744012, + "grad_norm": 0.10512517392635345, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 26720 + }, + { + "epoch": 0.10333070464350327, + "grad_norm": 0.11986581981182098, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 26730 + }, + { + "epoch": 0.10336936184688655, + "grad_norm": 0.12444537878036499, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 26740 + }, + { + "epoch": 0.10340801905026983, + "grad_norm": 0.10115274786949158, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 26750 + }, + { + "epoch": 0.1034466762536531, + "grad_norm": 0.10158008337020874, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 26760 + }, + { + "epoch": 0.10348533345703638, + "grad_norm": 0.10689114779233932, + "learning_rate": 0.002, + "loss": 2.368, + "step": 26770 + }, + { + "epoch": 0.10352399066041966, + "grad_norm": 0.11266842484474182, + "learning_rate": 0.002, + "loss": 2.373, + "step": 26780 + }, + { + "epoch": 0.10356264786380294, + "grad_norm": 0.1356581598520279, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 26790 + }, + { + "epoch": 0.10360130506718622, + "grad_norm": 0.1164923831820488, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 26800 + }, + { + "epoch": 0.1036399622705695, + "grad_norm": 0.09714135527610779, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 26810 + }, + { + "epoch": 0.10367861947395278, + "grad_norm": 0.11546872556209564, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 26820 + }, + { + "epoch": 0.10371727667733606, + "grad_norm": 0.1366276890039444, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 26830 + }, + { + "epoch": 0.10375593388071934, + "grad_norm": 0.10508016496896744, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 26840 + }, + { + "epoch": 0.1037945910841026, + "grad_norm": 0.10757789015769958, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 26850 + }, + { + "epoch": 0.10383324828748589, + "grad_norm": 0.12697778642177582, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 26860 + }, + { + "epoch": 0.10387190549086917, + "grad_norm": 0.12644273042678833, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 26870 + }, + { + "epoch": 0.10391056269425245, + "grad_norm": 0.12781940400600433, + "learning_rate": 0.002, + "loss": 2.38, + "step": 26880 + }, + { + "epoch": 0.10394921989763573, + "grad_norm": 0.13271625339984894, + "learning_rate": 0.002, + "loss": 2.3843, + "step": 26890 + }, + { + "epoch": 0.103987877101019, + "grad_norm": 0.11471915245056152, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 26900 + }, + { + "epoch": 0.10402653430440228, + "grad_norm": 0.10376786440610886, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 26910 + }, + { + "epoch": 0.10406519150778556, + "grad_norm": 0.15959547460079193, + "learning_rate": 0.002, + "loss": 2.3843, + "step": 26920 + }, + { + "epoch": 0.10410384871116884, + "grad_norm": 0.09334205090999603, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 26930 + }, + { + "epoch": 0.10414250591455211, + "grad_norm": 0.10745527595281601, + "learning_rate": 0.002, + "loss": 2.379, + "step": 26940 + }, + { + "epoch": 0.10418116311793539, + "grad_norm": 0.12216676771640778, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 26950 + }, + { + "epoch": 0.10421982032131867, + "grad_norm": 0.10139105468988419, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 26960 + }, + { + "epoch": 0.10425847752470195, + "grad_norm": 0.1080985888838768, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 26970 + }, + { + "epoch": 0.10429713472808523, + "grad_norm": 0.12080413848161697, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 26980 + }, + { + "epoch": 0.10433579193146851, + "grad_norm": 0.1179196834564209, + "learning_rate": 0.002, + "loss": 2.3848, + "step": 26990 + }, + { + "epoch": 0.10437444913485179, + "grad_norm": 0.09805526584386826, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 27000 + }, + { + "epoch": 0.10441310633823507, + "grad_norm": 0.11344782263040543, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 27010 + }, + { + "epoch": 0.10445176354161835, + "grad_norm": 0.10693656653165817, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 27020 + }, + { + "epoch": 0.10449042074500163, + "grad_norm": 0.14084559679031372, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 27030 + }, + { + "epoch": 0.1045290779483849, + "grad_norm": 0.10055924206972122, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 27040 + }, + { + "epoch": 0.10456773515176818, + "grad_norm": 0.11677692085504532, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 27050 + }, + { + "epoch": 0.10460639235515146, + "grad_norm": 0.13112536072731018, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 27060 + }, + { + "epoch": 0.10464504955853474, + "grad_norm": 0.09956295788288116, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 27070 + }, + { + "epoch": 0.10468370676191802, + "grad_norm": 0.10391475260257721, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 27080 + }, + { + "epoch": 0.1047223639653013, + "grad_norm": 0.12148088961839676, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 27090 + }, + { + "epoch": 0.10476102116868458, + "grad_norm": 0.1088813841342926, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 27100 + }, + { + "epoch": 0.10479967837206786, + "grad_norm": 0.10357842594385147, + "learning_rate": 0.002, + "loss": 2.382, + "step": 27110 + }, + { + "epoch": 0.10483833557545114, + "grad_norm": 0.12168553471565247, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 27120 + }, + { + "epoch": 0.1048769927788344, + "grad_norm": 0.10601655393838882, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 27130 + }, + { + "epoch": 0.10491564998221768, + "grad_norm": 0.12945091724395752, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 27140 + }, + { + "epoch": 0.10495430718560096, + "grad_norm": 0.11099984496831894, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 27150 + }, + { + "epoch": 0.10499296438898424, + "grad_norm": 0.11713527143001556, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 27160 + }, + { + "epoch": 0.10503162159236752, + "grad_norm": 0.12044794857501984, + "learning_rate": 0.002, + "loss": 2.3965, + "step": 27170 + }, + { + "epoch": 0.1050702787957508, + "grad_norm": 0.14438650012016296, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 27180 + }, + { + "epoch": 0.10510893599913408, + "grad_norm": 0.1254081428050995, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 27190 + }, + { + "epoch": 0.10514759320251736, + "grad_norm": 0.10578880459070206, + "learning_rate": 0.002, + "loss": 2.38, + "step": 27200 + }, + { + "epoch": 0.10518625040590064, + "grad_norm": 0.10591401904821396, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 27210 + }, + { + "epoch": 0.10522490760928391, + "grad_norm": 0.11413241922855377, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 27220 + }, + { + "epoch": 0.10526356481266719, + "grad_norm": 0.12489752471446991, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 27230 + }, + { + "epoch": 0.10530222201605047, + "grad_norm": 0.09213671833276749, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 27240 + }, + { + "epoch": 0.10534087921943375, + "grad_norm": 0.11025281995534897, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 27250 + }, + { + "epoch": 0.10537953642281703, + "grad_norm": 0.10654711723327637, + "learning_rate": 0.002, + "loss": 2.382, + "step": 27260 + }, + { + "epoch": 0.10541819362620031, + "grad_norm": 0.10936938971281052, + "learning_rate": 0.002, + "loss": 2.361, + "step": 27270 + }, + { + "epoch": 0.10545685082958359, + "grad_norm": 0.11113861203193665, + "learning_rate": 0.002, + "loss": 2.3931, + "step": 27280 + }, + { + "epoch": 0.10549550803296687, + "grad_norm": 0.12228459864854813, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 27290 + }, + { + "epoch": 0.10553416523635015, + "grad_norm": 0.10813633352518082, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 27300 + }, + { + "epoch": 0.10557282243973341, + "grad_norm": 0.1035778671503067, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 27310 + }, + { + "epoch": 0.10561147964311669, + "grad_norm": 0.12268638610839844, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 27320 + }, + { + "epoch": 0.10565013684649997, + "grad_norm": 0.11177417635917664, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 27330 + }, + { + "epoch": 0.10568879404988325, + "grad_norm": 0.11250213533639908, + "learning_rate": 0.002, + "loss": 2.376, + "step": 27340 + }, + { + "epoch": 0.10572745125326653, + "grad_norm": 0.12354373186826706, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 27350 + }, + { + "epoch": 0.10576610845664981, + "grad_norm": 0.11750753968954086, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 27360 + }, + { + "epoch": 0.10580476566003309, + "grad_norm": 0.11276818066835403, + "learning_rate": 0.002, + "loss": 2.354, + "step": 27370 + }, + { + "epoch": 0.10584342286341637, + "grad_norm": 0.1077154353260994, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 27380 + }, + { + "epoch": 0.10588208006679965, + "grad_norm": 0.10436894744634628, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 27390 + }, + { + "epoch": 0.10592073727018293, + "grad_norm": 0.09669612348079681, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 27400 + }, + { + "epoch": 0.1059593944735662, + "grad_norm": 0.12027565389871597, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 27410 + }, + { + "epoch": 0.10599805167694948, + "grad_norm": 0.10789991915225983, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 27420 + }, + { + "epoch": 0.10603670888033276, + "grad_norm": 0.12353754043579102, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 27430 + }, + { + "epoch": 0.10607536608371604, + "grad_norm": 0.1370828002691269, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 27440 + }, + { + "epoch": 0.10611402328709932, + "grad_norm": 0.11758144944906235, + "learning_rate": 0.002, + "loss": 2.3845, + "step": 27450 + }, + { + "epoch": 0.1061526804904826, + "grad_norm": 0.11821454018354416, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 27460 + }, + { + "epoch": 0.10619133769386588, + "grad_norm": 0.1114441379904747, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 27470 + }, + { + "epoch": 0.10622999489724916, + "grad_norm": 0.11358706653118134, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 27480 + }, + { + "epoch": 0.10626865210063244, + "grad_norm": 0.12629422545433044, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 27490 + }, + { + "epoch": 0.1063073093040157, + "grad_norm": 0.11764193326234818, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 27500 + }, + { + "epoch": 0.10634596650739898, + "grad_norm": 0.11269880831241608, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 27510 + }, + { + "epoch": 0.10638462371078226, + "grad_norm": 0.12829801440238953, + "learning_rate": 0.002, + "loss": 2.3846, + "step": 27520 + }, + { + "epoch": 0.10642328091416554, + "grad_norm": 0.11060722172260284, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 27530 + }, + { + "epoch": 0.10646193811754882, + "grad_norm": 0.10507179796695709, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 27540 + }, + { + "epoch": 0.1065005953209321, + "grad_norm": 0.12616103887557983, + "learning_rate": 0.002, + "loss": 2.3844, + "step": 27550 + }, + { + "epoch": 0.10653925252431538, + "grad_norm": 0.10349361598491669, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 27560 + }, + { + "epoch": 0.10657790972769866, + "grad_norm": 0.10966850072145462, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 27570 + }, + { + "epoch": 0.10661656693108194, + "grad_norm": 0.11329416185617447, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 27580 + }, + { + "epoch": 0.10665522413446521, + "grad_norm": 0.11381129920482635, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 27590 + }, + { + "epoch": 0.10669388133784849, + "grad_norm": 0.11109792441129684, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 27600 + }, + { + "epoch": 0.10673253854123177, + "grad_norm": 0.09967318177223206, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 27610 + }, + { + "epoch": 0.10677119574461505, + "grad_norm": 0.10462494939565659, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 27620 + }, + { + "epoch": 0.10680985294799833, + "grad_norm": 0.10698872059583664, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 27630 + }, + { + "epoch": 0.10684851015138161, + "grad_norm": 0.12059954553842545, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 27640 + }, + { + "epoch": 0.10688716735476489, + "grad_norm": 0.10810644924640656, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 27650 + }, + { + "epoch": 0.10692582455814817, + "grad_norm": 0.10074674338102341, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 27660 + }, + { + "epoch": 0.10696448176153145, + "grad_norm": 0.13629195094108582, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 27670 + }, + { + "epoch": 0.10700313896491473, + "grad_norm": 0.13592801988124847, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 27680 + }, + { + "epoch": 0.107041796168298, + "grad_norm": 0.11054662615060806, + "learning_rate": 0.002, + "loss": 2.3869, + "step": 27690 + }, + { + "epoch": 0.10708045337168128, + "grad_norm": 0.10843642801046371, + "learning_rate": 0.002, + "loss": 2.378, + "step": 27700 + }, + { + "epoch": 0.10711911057506456, + "grad_norm": 0.13924673199653625, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 27710 + }, + { + "epoch": 0.10715776777844783, + "grad_norm": 0.11734243482351303, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 27720 + }, + { + "epoch": 0.10719642498183111, + "grad_norm": 0.1108909323811531, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 27730 + }, + { + "epoch": 0.1072350821852144, + "grad_norm": 0.13820059597492218, + "learning_rate": 0.002, + "loss": 2.385, + "step": 27740 + }, + { + "epoch": 0.10727373938859767, + "grad_norm": 0.11467090249061584, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 27750 + }, + { + "epoch": 0.10731239659198095, + "grad_norm": 0.11033518612384796, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 27760 + }, + { + "epoch": 0.10735105379536423, + "grad_norm": 0.11089988052845001, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 27770 + }, + { + "epoch": 0.1073897109987475, + "grad_norm": 0.10706806927919388, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 27780 + }, + { + "epoch": 0.10742836820213078, + "grad_norm": 0.11792809516191483, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 27790 + }, + { + "epoch": 0.10746702540551406, + "grad_norm": 0.11145688593387604, + "learning_rate": 0.002, + "loss": 2.36, + "step": 27800 + }, + { + "epoch": 0.10750568260889734, + "grad_norm": 0.11388550698757172, + "learning_rate": 0.002, + "loss": 2.3835, + "step": 27810 + }, + { + "epoch": 0.10754433981228062, + "grad_norm": 0.11008848994970322, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 27820 + }, + { + "epoch": 0.1075829970156639, + "grad_norm": 0.12791645526885986, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 27830 + }, + { + "epoch": 0.10762165421904718, + "grad_norm": 0.13758353888988495, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 27840 + }, + { + "epoch": 0.10766031142243046, + "grad_norm": 0.10738374292850494, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 27850 + }, + { + "epoch": 0.10769896862581374, + "grad_norm": 0.10648205131292343, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 27860 + }, + { + "epoch": 0.107737625829197, + "grad_norm": 0.10689187794923782, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 27870 + }, + { + "epoch": 0.10777628303258029, + "grad_norm": 0.10008414834737778, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 27880 + }, + { + "epoch": 0.10781494023596357, + "grad_norm": 0.10352285206317902, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 27890 + }, + { + "epoch": 0.10785359743934685, + "grad_norm": 0.10048746317625046, + "learning_rate": 0.002, + "loss": 2.363, + "step": 27900 + }, + { + "epoch": 0.10789225464273013, + "grad_norm": 0.13023824989795685, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 27910 + }, + { + "epoch": 0.1079309118461134, + "grad_norm": 0.1087852418422699, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 27920 + }, + { + "epoch": 0.10796956904949669, + "grad_norm": 0.12118271738290787, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 27930 + }, + { + "epoch": 0.10800822625287997, + "grad_norm": 0.11113785207271576, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 27940 + }, + { + "epoch": 0.10804688345626325, + "grad_norm": 0.12866459786891937, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 27950 + }, + { + "epoch": 0.10808554065964651, + "grad_norm": 0.11148317903280258, + "learning_rate": 0.002, + "loss": 2.381, + "step": 27960 + }, + { + "epoch": 0.10812419786302979, + "grad_norm": 0.10880941152572632, + "learning_rate": 0.002, + "loss": 2.3874, + "step": 27970 + }, + { + "epoch": 0.10816285506641307, + "grad_norm": 0.3135621249675751, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 27980 + }, + { + "epoch": 0.10820151226979635, + "grad_norm": 0.11539420485496521, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 27990 + }, + { + "epoch": 0.10824016947317963, + "grad_norm": 0.11317495256662369, + "learning_rate": 0.002, + "loss": 2.3817, + "step": 28000 + }, + { + "epoch": 0.10827882667656291, + "grad_norm": 0.10599172115325928, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 28010 + }, + { + "epoch": 0.10831748387994619, + "grad_norm": 0.11403112858533859, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 28020 + }, + { + "epoch": 0.10835614108332947, + "grad_norm": 0.10704049468040466, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 28030 + }, + { + "epoch": 0.10839479828671275, + "grad_norm": 0.21458998322486877, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 28040 + }, + { + "epoch": 0.10843345549009603, + "grad_norm": 0.1518183797597885, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 28050 + }, + { + "epoch": 0.1084721126934793, + "grad_norm": 0.10209496319293976, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 28060 + }, + { + "epoch": 0.10851076989686258, + "grad_norm": 0.10443715751171112, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 28070 + }, + { + "epoch": 0.10854942710024586, + "grad_norm": 0.12445182353258133, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 28080 + }, + { + "epoch": 0.10858808430362914, + "grad_norm": 0.10870563238859177, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 28090 + }, + { + "epoch": 0.10862674150701242, + "grad_norm": 0.11430470645427704, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 28100 + }, + { + "epoch": 0.1086653987103957, + "grad_norm": 0.1253480315208435, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 28110 + }, + { + "epoch": 0.10870405591377898, + "grad_norm": 0.1110760048031807, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 28120 + }, + { + "epoch": 0.10874271311716226, + "grad_norm": 0.11550119519233704, + "learning_rate": 0.002, + "loss": 2.3852, + "step": 28130 + }, + { + "epoch": 0.10878137032054554, + "grad_norm": 0.1076694130897522, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 28140 + }, + { + "epoch": 0.1088200275239288, + "grad_norm": 0.11724784970283508, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 28150 + }, + { + "epoch": 0.10885868472731208, + "grad_norm": 0.12491951882839203, + "learning_rate": 0.002, + "loss": 2.3857, + "step": 28160 + }, + { + "epoch": 0.10889734193069536, + "grad_norm": 0.11169461160898209, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 28170 + }, + { + "epoch": 0.10893599913407864, + "grad_norm": 0.11813009530305862, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 28180 + }, + { + "epoch": 0.10897465633746192, + "grad_norm": 0.09784800559282303, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 28190 + }, + { + "epoch": 0.1090133135408452, + "grad_norm": 0.12332207709550858, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 28200 + }, + { + "epoch": 0.10905197074422848, + "grad_norm": 0.09766773879528046, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 28210 + }, + { + "epoch": 0.10909062794761176, + "grad_norm": 0.09964337944984436, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 28220 + }, + { + "epoch": 0.10912928515099504, + "grad_norm": 0.09515334665775299, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 28230 + }, + { + "epoch": 0.10916794235437831, + "grad_norm": 0.10735496878623962, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 28240 + }, + { + "epoch": 0.10920659955776159, + "grad_norm": 0.12152384966611862, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 28250 + }, + { + "epoch": 0.10924525676114487, + "grad_norm": 0.1107824295759201, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 28260 + }, + { + "epoch": 0.10928391396452815, + "grad_norm": 0.13151098787784576, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 28270 + }, + { + "epoch": 0.10932257116791143, + "grad_norm": 0.11157234758138657, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 28280 + }, + { + "epoch": 0.10936122837129471, + "grad_norm": 0.10893750935792923, + "learning_rate": 0.002, + "loss": 2.3833, + "step": 28290 + }, + { + "epoch": 0.10939988557467799, + "grad_norm": 0.10100287944078445, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 28300 + }, + { + "epoch": 0.10943854277806127, + "grad_norm": 0.15186738967895508, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 28310 + }, + { + "epoch": 0.10947719998144455, + "grad_norm": 0.09146854281425476, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 28320 + }, + { + "epoch": 0.10951585718482781, + "grad_norm": 0.1130962148308754, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 28330 + }, + { + "epoch": 0.1095545143882111, + "grad_norm": 0.1127144992351532, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 28340 + }, + { + "epoch": 0.10959317159159437, + "grad_norm": 0.12409462779760361, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 28350 + }, + { + "epoch": 0.10963182879497765, + "grad_norm": 0.13971257209777832, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 28360 + }, + { + "epoch": 0.10967048599836093, + "grad_norm": 0.11502938717603683, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 28370 + }, + { + "epoch": 0.10970914320174421, + "grad_norm": 0.12327464669942856, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 28380 + }, + { + "epoch": 0.1097478004051275, + "grad_norm": 0.10914456099271774, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 28390 + }, + { + "epoch": 0.10978645760851077, + "grad_norm": 0.1268835812807083, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 28400 + }, + { + "epoch": 0.10982511481189405, + "grad_norm": 0.11616487801074982, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 28410 + }, + { + "epoch": 0.10986377201527733, + "grad_norm": 0.11308058351278305, + "learning_rate": 0.002, + "loss": 2.3819, + "step": 28420 + }, + { + "epoch": 0.1099024292186606, + "grad_norm": 0.13024914264678955, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 28430 + }, + { + "epoch": 0.10994108642204388, + "grad_norm": 0.11701653152704239, + "learning_rate": 0.002, + "loss": 2.391, + "step": 28440 + }, + { + "epoch": 0.10997974362542716, + "grad_norm": 0.1038404330611229, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 28450 + }, + { + "epoch": 0.11001840082881044, + "grad_norm": 0.11018650978803635, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 28460 + }, + { + "epoch": 0.11005705803219372, + "grad_norm": 0.09936435520648956, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 28470 + }, + { + "epoch": 0.110095715235577, + "grad_norm": 0.09147805720567703, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 28480 + }, + { + "epoch": 0.11013437243896028, + "grad_norm": 0.11586008965969086, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 28490 + }, + { + "epoch": 0.11017302964234356, + "grad_norm": 0.10951730608940125, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 28500 + }, + { + "epoch": 0.11021168684572684, + "grad_norm": 0.12234780192375183, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 28510 + }, + { + "epoch": 0.1102503440491101, + "grad_norm": 0.11470252275466919, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 28520 + }, + { + "epoch": 0.11028900125249338, + "grad_norm": 0.1343831568956375, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 28530 + }, + { + "epoch": 0.11032765845587666, + "grad_norm": 0.12148990482091904, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 28540 + }, + { + "epoch": 0.11036631565925994, + "grad_norm": 0.09863020479679108, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 28550 + }, + { + "epoch": 0.11040497286264322, + "grad_norm": 0.18454931676387787, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 28560 + }, + { + "epoch": 0.1104436300660265, + "grad_norm": 0.1118006780743599, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 28570 + }, + { + "epoch": 0.11048228726940978, + "grad_norm": 0.09556985646486282, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 28580 + }, + { + "epoch": 0.11052094447279306, + "grad_norm": 0.09216835349798203, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 28590 + }, + { + "epoch": 0.11055960167617634, + "grad_norm": 0.11700079590082169, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 28600 + }, + { + "epoch": 0.11059825887955961, + "grad_norm": 0.11173650622367859, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 28610 + }, + { + "epoch": 0.11063691608294289, + "grad_norm": 0.1257830411195755, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 28620 + }, + { + "epoch": 0.11067557328632617, + "grad_norm": 0.1310737282037735, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 28630 + }, + { + "epoch": 0.11071423048970945, + "grad_norm": 0.12189605087041855, + "learning_rate": 0.002, + "loss": 2.3852, + "step": 28640 + }, + { + "epoch": 0.11075288769309273, + "grad_norm": 0.10539402067661285, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 28650 + }, + { + "epoch": 0.11079154489647601, + "grad_norm": 0.11254836618900299, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 28660 + }, + { + "epoch": 0.11083020209985929, + "grad_norm": 0.11399146914482117, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 28670 + }, + { + "epoch": 0.11086885930324257, + "grad_norm": 0.10469246655702591, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 28680 + }, + { + "epoch": 0.11090751650662585, + "grad_norm": 0.11052166670560837, + "learning_rate": 0.002, + "loss": 2.3847, + "step": 28690 + }, + { + "epoch": 0.11094617371000913, + "grad_norm": 0.10565482825040817, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 28700 + }, + { + "epoch": 0.1109848309133924, + "grad_norm": 0.09518107026815414, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 28710 + }, + { + "epoch": 0.11102348811677568, + "grad_norm": 0.10598381608724594, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 28720 + }, + { + "epoch": 0.11106214532015896, + "grad_norm": 0.095307856798172, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 28730 + }, + { + "epoch": 0.11110080252354224, + "grad_norm": 0.12068048864603043, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 28740 + }, + { + "epoch": 0.11113945972692552, + "grad_norm": 0.11202985793352127, + "learning_rate": 0.002, + "loss": 2.3873, + "step": 28750 + }, + { + "epoch": 0.1111781169303088, + "grad_norm": 0.09940610826015472, + "learning_rate": 0.002, + "loss": 2.367, + "step": 28760 + }, + { + "epoch": 0.11121677413369208, + "grad_norm": 0.10500725358724594, + "learning_rate": 0.002, + "loss": 2.379, + "step": 28770 + }, + { + "epoch": 0.11125543133707536, + "grad_norm": 0.1390557736158371, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 28780 + }, + { + "epoch": 0.11129408854045864, + "grad_norm": 0.11617904156446457, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 28790 + }, + { + "epoch": 0.1113327457438419, + "grad_norm": 0.1266857385635376, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 28800 + }, + { + "epoch": 0.11137140294722518, + "grad_norm": 0.1094893291592598, + "learning_rate": 0.002, + "loss": 2.364, + "step": 28810 + }, + { + "epoch": 0.11141006015060846, + "grad_norm": 0.10328162461519241, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 28820 + }, + { + "epoch": 0.11144871735399174, + "grad_norm": 0.13057942688465118, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 28830 + }, + { + "epoch": 0.11148737455737502, + "grad_norm": 0.1053711324930191, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 28840 + }, + { + "epoch": 0.1115260317607583, + "grad_norm": 0.10615862160921097, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 28850 + }, + { + "epoch": 0.11156468896414158, + "grad_norm": 0.11354444175958633, + "learning_rate": 0.002, + "loss": 2.373, + "step": 28860 + }, + { + "epoch": 0.11160334616752486, + "grad_norm": 0.10664892941713333, + "learning_rate": 0.002, + "loss": 2.391, + "step": 28870 + }, + { + "epoch": 0.11164200337090814, + "grad_norm": 0.12627890706062317, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 28880 + }, + { + "epoch": 0.11168066057429141, + "grad_norm": 0.2308432012796402, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 28890 + }, + { + "epoch": 0.11171931777767469, + "grad_norm": 0.14682862162590027, + "learning_rate": 0.002, + "loss": 2.391, + "step": 28900 + }, + { + "epoch": 0.11175797498105797, + "grad_norm": 0.10037046670913696, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 28910 + }, + { + "epoch": 0.11179663218444125, + "grad_norm": 0.11632565408945084, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 28920 + }, + { + "epoch": 0.11183528938782453, + "grad_norm": 0.12636606395244598, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 28930 + }, + { + "epoch": 0.1118739465912078, + "grad_norm": 0.147725448012352, + "learning_rate": 0.002, + "loss": 2.367, + "step": 28940 + }, + { + "epoch": 0.11191260379459109, + "grad_norm": 0.11857064813375473, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 28950 + }, + { + "epoch": 0.11195126099797437, + "grad_norm": 0.11522777378559113, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 28960 + }, + { + "epoch": 0.11198991820135765, + "grad_norm": 0.12320879846811295, + "learning_rate": 0.002, + "loss": 2.3909, + "step": 28970 + }, + { + "epoch": 0.11202857540474091, + "grad_norm": 0.11825113743543625, + "learning_rate": 0.002, + "loss": 2.3854, + "step": 28980 + }, + { + "epoch": 0.11206723260812419, + "grad_norm": 0.11741521209478378, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 28990 + }, + { + "epoch": 0.11210588981150747, + "grad_norm": 0.10914985835552216, + "learning_rate": 0.002, + "loss": 2.365, + "step": 29000 + }, + { + "epoch": 0.11214454701489075, + "grad_norm": 0.10588382929563522, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 29010 + }, + { + "epoch": 0.11218320421827403, + "grad_norm": 0.14614279568195343, + "learning_rate": 0.002, + "loss": 2.3819, + "step": 29020 + }, + { + "epoch": 0.11222186142165731, + "grad_norm": 0.1200341135263443, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 29030 + }, + { + "epoch": 0.11226051862504059, + "grad_norm": 0.11353737115859985, + "learning_rate": 0.002, + "loss": 2.3842, + "step": 29040 + }, + { + "epoch": 0.11229917582842387, + "grad_norm": 0.10931562632322311, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 29050 + }, + { + "epoch": 0.11233783303180715, + "grad_norm": 0.09870153665542603, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 29060 + }, + { + "epoch": 0.11237649023519043, + "grad_norm": 0.10087848454713821, + "learning_rate": 0.002, + "loss": 2.366, + "step": 29070 + }, + { + "epoch": 0.1124151474385737, + "grad_norm": 0.14312563836574554, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 29080 + }, + { + "epoch": 0.11245380464195698, + "grad_norm": 0.10859497636556625, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 29090 + }, + { + "epoch": 0.11249246184534026, + "grad_norm": 0.1190008744597435, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 29100 + }, + { + "epoch": 0.11253111904872354, + "grad_norm": 0.10914407670497894, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 29110 + }, + { + "epoch": 0.11256977625210682, + "grad_norm": 0.12408732622861862, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 29120 + }, + { + "epoch": 0.1126084334554901, + "grad_norm": 0.09813961386680603, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 29130 + }, + { + "epoch": 0.11264709065887338, + "grad_norm": 0.13027751445770264, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 29140 + }, + { + "epoch": 0.11268574786225666, + "grad_norm": 0.10990479588508606, + "learning_rate": 0.002, + "loss": 2.3887, + "step": 29150 + }, + { + "epoch": 0.11272440506563994, + "grad_norm": 0.11599687486886978, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 29160 + }, + { + "epoch": 0.1127630622690232, + "grad_norm": 0.1316378116607666, + "learning_rate": 0.002, + "loss": 2.371, + "step": 29170 + }, + { + "epoch": 0.11280171947240648, + "grad_norm": 0.1244787871837616, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 29180 + }, + { + "epoch": 0.11284037667578976, + "grad_norm": 0.1331619769334793, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 29190 + }, + { + "epoch": 0.11287903387917304, + "grad_norm": 0.09210868179798126, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 29200 + }, + { + "epoch": 0.11291769108255632, + "grad_norm": 0.5537592172622681, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 29210 + }, + { + "epoch": 0.1129563482859396, + "grad_norm": 0.10940191894769669, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 29220 + }, + { + "epoch": 0.11299500548932288, + "grad_norm": 0.11873272061347961, + "learning_rate": 0.002, + "loss": 2.3884, + "step": 29230 + }, + { + "epoch": 0.11303366269270616, + "grad_norm": 0.12138811498880386, + "learning_rate": 0.002, + "loss": 2.3839, + "step": 29240 + }, + { + "epoch": 0.11307231989608944, + "grad_norm": 0.1103893592953682, + "learning_rate": 0.002, + "loss": 2.3853, + "step": 29250 + }, + { + "epoch": 0.11311097709947271, + "grad_norm": 0.11036432534456253, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 29260 + }, + { + "epoch": 0.11314963430285599, + "grad_norm": 0.2721116244792938, + "learning_rate": 0.002, + "loss": 2.383, + "step": 29270 + }, + { + "epoch": 0.11318829150623927, + "grad_norm": 0.1114020049571991, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 29280 + }, + { + "epoch": 0.11322694870962255, + "grad_norm": 0.11401335150003433, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 29290 + }, + { + "epoch": 0.11326560591300583, + "grad_norm": 0.1049218401312828, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 29300 + }, + { + "epoch": 0.11330426311638911, + "grad_norm": 0.16691258549690247, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 29310 + }, + { + "epoch": 0.11334292031977239, + "grad_norm": 0.11298374086618423, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 29320 + }, + { + "epoch": 0.11338157752315567, + "grad_norm": 0.09407994896173477, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 29330 + }, + { + "epoch": 0.11342023472653895, + "grad_norm": 0.12208922952413559, + "learning_rate": 0.002, + "loss": 2.361, + "step": 29340 + }, + { + "epoch": 0.11345889192992221, + "grad_norm": 0.1252242773771286, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 29350 + }, + { + "epoch": 0.1134975491333055, + "grad_norm": 0.11096751689910889, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 29360 + }, + { + "epoch": 0.11353620633668877, + "grad_norm": 0.10881470143795013, + "learning_rate": 0.002, + "loss": 2.3857, + "step": 29370 + }, + { + "epoch": 0.11357486354007205, + "grad_norm": 0.1198098361492157, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 29380 + }, + { + "epoch": 0.11361352074345533, + "grad_norm": 0.11969012767076492, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 29390 + }, + { + "epoch": 0.11365217794683861, + "grad_norm": 0.11216343939304352, + "learning_rate": 0.002, + "loss": 2.3811, + "step": 29400 + }, + { + "epoch": 0.1136908351502219, + "grad_norm": 0.09968668967485428, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 29410 + }, + { + "epoch": 0.11372949235360517, + "grad_norm": 0.11205536127090454, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 29420 + }, + { + "epoch": 0.11376814955698845, + "grad_norm": 0.12463561445474625, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 29430 + }, + { + "epoch": 0.11380680676037173, + "grad_norm": 0.10464975237846375, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 29440 + }, + { + "epoch": 0.113845463963755, + "grad_norm": 0.109347403049469, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 29450 + }, + { + "epoch": 0.11388412116713828, + "grad_norm": 0.11343058943748474, + "learning_rate": 0.002, + "loss": 2.373, + "step": 29460 + }, + { + "epoch": 0.11392277837052156, + "grad_norm": 0.1124456450343132, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 29470 + }, + { + "epoch": 0.11396143557390484, + "grad_norm": 0.10418668389320374, + "learning_rate": 0.002, + "loss": 2.375, + "step": 29480 + }, + { + "epoch": 0.11400009277728812, + "grad_norm": 0.10243808478116989, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 29490 + }, + { + "epoch": 0.1140387499806714, + "grad_norm": 0.160277858376503, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 29500 + }, + { + "epoch": 0.11407740718405468, + "grad_norm": 0.10653091967105865, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 29510 + }, + { + "epoch": 0.11411606438743796, + "grad_norm": 0.19174635410308838, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 29520 + }, + { + "epoch": 0.11415472159082124, + "grad_norm": 0.11096177250146866, + "learning_rate": 0.002, + "loss": 2.387, + "step": 29530 + }, + { + "epoch": 0.1141933787942045, + "grad_norm": 0.12363119423389435, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 29540 + }, + { + "epoch": 0.11423203599758779, + "grad_norm": 0.10024431347846985, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 29550 + }, + { + "epoch": 0.11427069320097107, + "grad_norm": 0.10744503140449524, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 29560 + }, + { + "epoch": 0.11430935040435435, + "grad_norm": 0.1058807298541069, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 29570 + }, + { + "epoch": 0.11434800760773763, + "grad_norm": 0.12204091995954514, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 29580 + }, + { + "epoch": 0.1143866648111209, + "grad_norm": 0.10262423008680344, + "learning_rate": 0.002, + "loss": 2.3861, + "step": 29590 + }, + { + "epoch": 0.11442532201450419, + "grad_norm": 0.12699533998966217, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 29600 + }, + { + "epoch": 0.11446397921788747, + "grad_norm": 0.13344010710716248, + "learning_rate": 0.002, + "loss": 2.382, + "step": 29610 + }, + { + "epoch": 0.11450263642127075, + "grad_norm": 0.12079144269227982, + "learning_rate": 0.002, + "loss": 2.37, + "step": 29620 + }, + { + "epoch": 0.11454129362465401, + "grad_norm": 0.11895615607500076, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 29630 + }, + { + "epoch": 0.11457995082803729, + "grad_norm": 0.1387789100408554, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 29640 + }, + { + "epoch": 0.11461860803142057, + "grad_norm": 0.10361293703317642, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 29650 + }, + { + "epoch": 0.11465726523480385, + "grad_norm": 0.11020094901323318, + "learning_rate": 0.002, + "loss": 2.3903, + "step": 29660 + }, + { + "epoch": 0.11469592243818713, + "grad_norm": 0.09268821775913239, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 29670 + }, + { + "epoch": 0.11473457964157041, + "grad_norm": 0.1092313826084137, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 29680 + }, + { + "epoch": 0.11477323684495369, + "grad_norm": 0.09730253368616104, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 29690 + }, + { + "epoch": 0.11481189404833697, + "grad_norm": 0.12316158413887024, + "learning_rate": 0.002, + "loss": 2.3865, + "step": 29700 + }, + { + "epoch": 0.11485055125172025, + "grad_norm": 0.1004888042807579, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 29710 + }, + { + "epoch": 0.11488920845510353, + "grad_norm": 0.12836909294128418, + "learning_rate": 0.002, + "loss": 2.3858, + "step": 29720 + }, + { + "epoch": 0.1149278656584868, + "grad_norm": 0.10969601571559906, + "learning_rate": 0.002, + "loss": 2.37, + "step": 29730 + }, + { + "epoch": 0.11496652286187008, + "grad_norm": 0.11120554059743881, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 29740 + }, + { + "epoch": 0.11500518006525336, + "grad_norm": 0.10580164194107056, + "learning_rate": 0.002, + "loss": 2.378, + "step": 29750 + }, + { + "epoch": 0.11504383726863664, + "grad_norm": 0.11593388020992279, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 29760 + }, + { + "epoch": 0.11508249447201992, + "grad_norm": 0.11328724771738052, + "learning_rate": 0.002, + "loss": 2.3909, + "step": 29770 + }, + { + "epoch": 0.1151211516754032, + "grad_norm": 0.10201551020145416, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 29780 + }, + { + "epoch": 0.11515980887878648, + "grad_norm": 0.1188369020819664, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 29790 + }, + { + "epoch": 0.11519846608216976, + "grad_norm": 0.12928543984889984, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 29800 + }, + { + "epoch": 0.11523712328555304, + "grad_norm": 0.10411644726991653, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 29810 + }, + { + "epoch": 0.1152757804889363, + "grad_norm": 0.10747519880533218, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 29820 + }, + { + "epoch": 0.11531443769231958, + "grad_norm": 0.10192860662937164, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 29830 + }, + { + "epoch": 0.11535309489570286, + "grad_norm": 0.10935201495885849, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 29840 + }, + { + "epoch": 0.11539175209908614, + "grad_norm": 0.12911826372146606, + "learning_rate": 0.002, + "loss": 2.364, + "step": 29850 + }, + { + "epoch": 0.11543040930246942, + "grad_norm": 0.11535267531871796, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 29860 + }, + { + "epoch": 0.1154690665058527, + "grad_norm": 0.11060041189193726, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 29870 + }, + { + "epoch": 0.11550772370923598, + "grad_norm": 0.10553035140037537, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 29880 + }, + { + "epoch": 0.11554638091261926, + "grad_norm": 0.11950163543224335, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 29890 + }, + { + "epoch": 0.11558503811600254, + "grad_norm": 0.13742852210998535, + "learning_rate": 0.002, + "loss": 2.3869, + "step": 29900 + }, + { + "epoch": 0.11562369531938581, + "grad_norm": 0.10131851583719254, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 29910 + }, + { + "epoch": 0.11566235252276909, + "grad_norm": 0.10164907574653625, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 29920 + }, + { + "epoch": 0.11570100972615237, + "grad_norm": 0.1058017909526825, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 29930 + }, + { + "epoch": 0.11573966692953565, + "grad_norm": 0.13190488517284393, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 29940 + }, + { + "epoch": 0.11577832413291893, + "grad_norm": 0.09957809001207352, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 29950 + }, + { + "epoch": 0.11581698133630221, + "grad_norm": 0.1056378036737442, + "learning_rate": 0.002, + "loss": 2.3866, + "step": 29960 + }, + { + "epoch": 0.11585563853968549, + "grad_norm": 0.10000760108232498, + "learning_rate": 0.002, + "loss": 2.362, + "step": 29970 + }, + { + "epoch": 0.11589429574306877, + "grad_norm": 0.10699297487735748, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 29980 + }, + { + "epoch": 0.11593295294645205, + "grad_norm": 0.09690152853727341, + "learning_rate": 0.002, + "loss": 2.3894, + "step": 29990 + }, + { + "epoch": 0.11597161014983531, + "grad_norm": 0.10324429720640182, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 30000 + }, + { + "epoch": 0.1160102673532186, + "grad_norm": 0.10979462414979935, + "learning_rate": 0.002, + "loss": 2.374, + "step": 30010 + }, + { + "epoch": 0.11604892455660187, + "grad_norm": 0.10407369583845139, + "learning_rate": 0.002, + "loss": 2.379, + "step": 30020 + }, + { + "epoch": 0.11608758175998515, + "grad_norm": 0.10763532668352127, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 30030 + }, + { + "epoch": 0.11612623896336843, + "grad_norm": 0.11464933305978775, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 30040 + }, + { + "epoch": 0.11616489616675171, + "grad_norm": 0.14590147137641907, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 30050 + }, + { + "epoch": 0.116203553370135, + "grad_norm": 0.1058560386300087, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 30060 + }, + { + "epoch": 0.11624221057351827, + "grad_norm": 0.11187131702899933, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 30070 + }, + { + "epoch": 0.11628086777690155, + "grad_norm": 0.11111967265605927, + "learning_rate": 0.002, + "loss": 2.357, + "step": 30080 + }, + { + "epoch": 0.11631952498028483, + "grad_norm": 0.10049404203891754, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 30090 + }, + { + "epoch": 0.1163581821836681, + "grad_norm": 0.15781576931476593, + "learning_rate": 0.002, + "loss": 2.3811, + "step": 30100 + }, + { + "epoch": 0.11639683938705138, + "grad_norm": 0.13108819723129272, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 30110 + }, + { + "epoch": 0.11643549659043466, + "grad_norm": 0.10871430486440659, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 30120 + }, + { + "epoch": 0.11647415379381794, + "grad_norm": 0.1290903091430664, + "learning_rate": 0.002, + "loss": 2.3918, + "step": 30130 + }, + { + "epoch": 0.11651281099720122, + "grad_norm": 0.11009565740823746, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 30140 + }, + { + "epoch": 0.1165514682005845, + "grad_norm": 0.10931471735239029, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 30150 + }, + { + "epoch": 0.11659012540396778, + "grad_norm": 0.09931996464729309, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 30160 + }, + { + "epoch": 0.11662878260735106, + "grad_norm": 0.1256752461194992, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 30170 + }, + { + "epoch": 0.11666743981073434, + "grad_norm": 0.11275876313447952, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 30180 + }, + { + "epoch": 0.1167060970141176, + "grad_norm": 0.10061401128768921, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 30190 + }, + { + "epoch": 0.11674475421750088, + "grad_norm": 0.09852159768342972, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 30200 + }, + { + "epoch": 0.11678341142088416, + "grad_norm": 0.11242441833019257, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 30210 + }, + { + "epoch": 0.11682206862426744, + "grad_norm": 0.10587003827095032, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 30220 + }, + { + "epoch": 0.11686072582765072, + "grad_norm": 0.10544271022081375, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 30230 + }, + { + "epoch": 0.116899383031034, + "grad_norm": 0.09737107902765274, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 30240 + }, + { + "epoch": 0.11693804023441728, + "grad_norm": 0.1411416381597519, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 30250 + }, + { + "epoch": 0.11697669743780056, + "grad_norm": 0.10318762063980103, + "learning_rate": 0.002, + "loss": 2.3812, + "step": 30260 + }, + { + "epoch": 0.11701535464118384, + "grad_norm": 0.12529000639915466, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 30270 + }, + { + "epoch": 0.11705401184456711, + "grad_norm": 0.10274723917245865, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 30280 + }, + { + "epoch": 0.11709266904795039, + "grad_norm": 0.1322697550058365, + "learning_rate": 0.002, + "loss": 2.3847, + "step": 30290 + }, + { + "epoch": 0.11713132625133367, + "grad_norm": 0.11907245963811874, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 30300 + }, + { + "epoch": 0.11716998345471695, + "grad_norm": 0.13744261860847473, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 30310 + }, + { + "epoch": 0.11720864065810023, + "grad_norm": 0.09685148298740387, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 30320 + }, + { + "epoch": 0.11724729786148351, + "grad_norm": 0.11613011360168457, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 30330 + }, + { + "epoch": 0.11728595506486679, + "grad_norm": 0.11653503775596619, + "learning_rate": 0.002, + "loss": 2.3803, + "step": 30340 + }, + { + "epoch": 0.11732461226825007, + "grad_norm": 0.10787040740251541, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 30350 + }, + { + "epoch": 0.11736326947163335, + "grad_norm": 0.10704168677330017, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 30360 + }, + { + "epoch": 0.11740192667501662, + "grad_norm": 0.12299351394176483, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 30370 + }, + { + "epoch": 0.1174405838783999, + "grad_norm": 0.13366322219371796, + "learning_rate": 0.002, + "loss": 2.3872, + "step": 30380 + }, + { + "epoch": 0.11747924108178318, + "grad_norm": 0.11831143498420715, + "learning_rate": 0.002, + "loss": 2.37, + "step": 30390 + }, + { + "epoch": 0.11751789828516646, + "grad_norm": 0.11491730809211731, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 30400 + }, + { + "epoch": 0.11755655548854974, + "grad_norm": 0.1049700379371643, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 30410 + }, + { + "epoch": 0.11759521269193302, + "grad_norm": 0.10995157808065414, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 30420 + }, + { + "epoch": 0.1176338698953163, + "grad_norm": 0.09854548424482346, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 30430 + }, + { + "epoch": 0.11767252709869958, + "grad_norm": 0.12066702544689178, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 30440 + }, + { + "epoch": 0.11771118430208286, + "grad_norm": 0.11020725965499878, + "learning_rate": 0.002, + "loss": 2.3864, + "step": 30450 + }, + { + "epoch": 0.11774984150546614, + "grad_norm": 0.09980931878089905, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 30460 + }, + { + "epoch": 0.1177884987088494, + "grad_norm": 0.13707521557807922, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 30470 + }, + { + "epoch": 0.11782715591223268, + "grad_norm": 0.11138369143009186, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 30480 + }, + { + "epoch": 0.11786581311561596, + "grad_norm": 0.10805569589138031, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 30490 + }, + { + "epoch": 0.11790447031899924, + "grad_norm": 0.1061464250087738, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 30500 + }, + { + "epoch": 0.11794312752238252, + "grad_norm": 0.0994805321097374, + "learning_rate": 0.002, + "loss": 2.3755, + "step": 30510 + }, + { + "epoch": 0.1179817847257658, + "grad_norm": 0.3877389430999756, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 30520 + }, + { + "epoch": 0.11802044192914908, + "grad_norm": 0.101154625415802, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 30530 + }, + { + "epoch": 0.11805909913253236, + "grad_norm": 0.12650534510612488, + "learning_rate": 0.002, + "loss": 2.3865, + "step": 30540 + }, + { + "epoch": 0.11809775633591564, + "grad_norm": 0.1112612634897232, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 30550 + }, + { + "epoch": 0.1181364135392989, + "grad_norm": 0.10789719223976135, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 30560 + }, + { + "epoch": 0.11817507074268219, + "grad_norm": 0.11468793451786041, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 30570 + }, + { + "epoch": 0.11821372794606547, + "grad_norm": 0.1100701093673706, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 30580 + }, + { + "epoch": 0.11825238514944875, + "grad_norm": 0.09143993258476257, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 30590 + }, + { + "epoch": 0.11829104235283203, + "grad_norm": 0.10562839359045029, + "learning_rate": 0.002, + "loss": 2.3901, + "step": 30600 + }, + { + "epoch": 0.1183296995562153, + "grad_norm": 0.10740886628627777, + "learning_rate": 0.002, + "loss": 2.3798, + "step": 30610 + }, + { + "epoch": 0.11836835675959859, + "grad_norm": 0.15221528708934784, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 30620 + }, + { + "epoch": 0.11840701396298187, + "grad_norm": 0.11569618433713913, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 30630 + }, + { + "epoch": 0.11844567116636515, + "grad_norm": 0.10287598520517349, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 30640 + }, + { + "epoch": 0.11848432836974841, + "grad_norm": 0.10523258149623871, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 30650 + }, + { + "epoch": 0.11852298557313169, + "grad_norm": 0.11168524622917175, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 30660 + }, + { + "epoch": 0.11856164277651497, + "grad_norm": 0.10336096584796906, + "learning_rate": 0.002, + "loss": 2.353, + "step": 30670 + }, + { + "epoch": 0.11860029997989825, + "grad_norm": 0.10352528840303421, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 30680 + }, + { + "epoch": 0.11863895718328153, + "grad_norm": 0.14516034722328186, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 30690 + }, + { + "epoch": 0.11867761438666481, + "grad_norm": 0.10678558051586151, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 30700 + }, + { + "epoch": 0.11871627159004809, + "grad_norm": 0.08876782655715942, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 30710 + }, + { + "epoch": 0.11875492879343137, + "grad_norm": 0.11112259328365326, + "learning_rate": 0.002, + "loss": 2.3879, + "step": 30720 + }, + { + "epoch": 0.11879358599681465, + "grad_norm": 0.0998547375202179, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 30730 + }, + { + "epoch": 0.11883224320019793, + "grad_norm": 0.11167199909687042, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 30740 + }, + { + "epoch": 0.1188709004035812, + "grad_norm": 0.09049560129642487, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 30750 + }, + { + "epoch": 0.11890955760696448, + "grad_norm": 0.12534171342849731, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 30760 + }, + { + "epoch": 0.11894821481034776, + "grad_norm": 0.12534017860889435, + "learning_rate": 0.002, + "loss": 2.361, + "step": 30770 + }, + { + "epoch": 0.11898687201373104, + "grad_norm": 0.1101115494966507, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 30780 + }, + { + "epoch": 0.11902552921711432, + "grad_norm": 0.11233854293823242, + "learning_rate": 0.002, + "loss": 2.375, + "step": 30790 + }, + { + "epoch": 0.1190641864204976, + "grad_norm": 0.09808002412319183, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 30800 + }, + { + "epoch": 0.11910284362388088, + "grad_norm": 0.1032966896891594, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 30810 + }, + { + "epoch": 0.11914150082726416, + "grad_norm": 0.10848798602819443, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 30820 + }, + { + "epoch": 0.11918015803064744, + "grad_norm": 0.13478338718414307, + "learning_rate": 0.002, + "loss": 2.3874, + "step": 30830 + }, + { + "epoch": 0.1192188152340307, + "grad_norm": 0.10117039084434509, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 30840 + }, + { + "epoch": 0.11925747243741398, + "grad_norm": 0.11757628619670868, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 30850 + }, + { + "epoch": 0.11929612964079726, + "grad_norm": 0.11381714046001434, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 30860 + }, + { + "epoch": 0.11933478684418054, + "grad_norm": 0.10066360980272293, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 30870 + }, + { + "epoch": 0.11937344404756382, + "grad_norm": 0.09848344326019287, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 30880 + }, + { + "epoch": 0.1194121012509471, + "grad_norm": 0.13607080280780792, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 30890 + }, + { + "epoch": 0.11945075845433038, + "grad_norm": 0.13393841683864594, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 30900 + }, + { + "epoch": 0.11948941565771366, + "grad_norm": 0.11103633046150208, + "learning_rate": 0.002, + "loss": 2.3972, + "step": 30910 + }, + { + "epoch": 0.11952807286109694, + "grad_norm": 0.13245902955532074, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 30920 + }, + { + "epoch": 0.11956673006448021, + "grad_norm": 0.10202698409557343, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 30930 + }, + { + "epoch": 0.11960538726786349, + "grad_norm": 0.10218498855829239, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 30940 + }, + { + "epoch": 0.11964404447124677, + "grad_norm": 0.09530593454837799, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 30950 + }, + { + "epoch": 0.11968270167463005, + "grad_norm": 0.11549419164657593, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 30960 + }, + { + "epoch": 0.11972135887801333, + "grad_norm": 0.10871879756450653, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 30970 + }, + { + "epoch": 0.11976001608139661, + "grad_norm": 0.12116476148366928, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 30980 + }, + { + "epoch": 0.11979867328477989, + "grad_norm": 0.1303841769695282, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 30990 + }, + { + "epoch": 0.11983733048816317, + "grad_norm": 0.11049704253673553, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 31000 + }, + { + "epoch": 0.11987598769154645, + "grad_norm": 0.10392177104949951, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 31010 + }, + { + "epoch": 0.11991464489492971, + "grad_norm": 0.11160852760076523, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 31020 + }, + { + "epoch": 0.119953302098313, + "grad_norm": 0.10968372225761414, + "learning_rate": 0.002, + "loss": 2.377, + "step": 31030 + }, + { + "epoch": 0.11999195930169627, + "grad_norm": 0.10463325679302216, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 31040 + }, + { + "epoch": 0.12003061650507955, + "grad_norm": 0.10578692704439163, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 31050 + }, + { + "epoch": 0.12006927370846283, + "grad_norm": 0.13343378901481628, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 31060 + }, + { + "epoch": 0.12010793091184611, + "grad_norm": 0.11525391787290573, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 31070 + }, + { + "epoch": 0.1201465881152294, + "grad_norm": 0.10206209868192673, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 31080 + }, + { + "epoch": 0.12018524531861267, + "grad_norm": 0.11907844245433807, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 31090 + }, + { + "epoch": 0.12022390252199595, + "grad_norm": 0.1089697778224945, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 31100 + }, + { + "epoch": 0.12026255972537923, + "grad_norm": 0.11410657316446304, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 31110 + }, + { + "epoch": 0.1203012169287625, + "grad_norm": 0.12164189666509628, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 31120 + }, + { + "epoch": 0.12033987413214578, + "grad_norm": 0.11357486248016357, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 31130 + }, + { + "epoch": 0.12037853133552906, + "grad_norm": 0.12656593322753906, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 31140 + }, + { + "epoch": 0.12041718853891234, + "grad_norm": 0.11332329362630844, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 31150 + }, + { + "epoch": 0.12045584574229562, + "grad_norm": 0.10493875294923782, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 31160 + }, + { + "epoch": 0.1204945029456789, + "grad_norm": 0.10366437584161758, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 31170 + }, + { + "epoch": 0.12053316014906218, + "grad_norm": 0.12637628614902496, + "learning_rate": 0.002, + "loss": 2.3794, + "step": 31180 + }, + { + "epoch": 0.12057181735244546, + "grad_norm": 0.12367497384548187, + "learning_rate": 0.002, + "loss": 2.361, + "step": 31190 + }, + { + "epoch": 0.12061047455582874, + "grad_norm": 0.1017698347568512, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 31200 + }, + { + "epoch": 0.120649131759212, + "grad_norm": 0.1321551352739334, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 31210 + }, + { + "epoch": 0.12068778896259529, + "grad_norm": 0.10494833439588547, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 31220 + }, + { + "epoch": 0.12072644616597857, + "grad_norm": 0.11836351454257965, + "learning_rate": 0.002, + "loss": 2.384, + "step": 31230 + }, + { + "epoch": 0.12076510336936185, + "grad_norm": 0.12958987057209015, + "learning_rate": 0.002, + "loss": 2.368, + "step": 31240 + }, + { + "epoch": 0.12080376057274513, + "grad_norm": 0.10293237864971161, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 31250 + }, + { + "epoch": 0.1208424177761284, + "grad_norm": 0.11553335934877396, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 31260 + }, + { + "epoch": 0.12088107497951169, + "grad_norm": 0.12096146494150162, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 31270 + }, + { + "epoch": 0.12091973218289497, + "grad_norm": 0.11854767799377441, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 31280 + }, + { + "epoch": 0.12095838938627824, + "grad_norm": 0.10580070316791534, + "learning_rate": 0.002, + "loss": 2.3798, + "step": 31290 + }, + { + "epoch": 0.12099704658966151, + "grad_norm": 0.11417514830827713, + "learning_rate": 0.002, + "loss": 2.3862, + "step": 31300 + }, + { + "epoch": 0.12103570379304479, + "grad_norm": 0.11195675283670425, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 31310 + }, + { + "epoch": 0.12107436099642807, + "grad_norm": 0.09545913338661194, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 31320 + }, + { + "epoch": 0.12111301819981135, + "grad_norm": 0.10411213338375092, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 31330 + }, + { + "epoch": 0.12115167540319463, + "grad_norm": 0.11142470687627792, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 31340 + }, + { + "epoch": 0.12119033260657791, + "grad_norm": 0.09210135042667389, + "learning_rate": 0.002, + "loss": 2.359, + "step": 31350 + }, + { + "epoch": 0.12122898980996119, + "grad_norm": 0.10986145585775375, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 31360 + }, + { + "epoch": 0.12126764701334447, + "grad_norm": 0.10226043313741684, + "learning_rate": 0.002, + "loss": 2.377, + "step": 31370 + }, + { + "epoch": 0.12130630421672775, + "grad_norm": 0.12476920336484909, + "learning_rate": 0.002, + "loss": 2.3847, + "step": 31380 + }, + { + "epoch": 0.12134496142011102, + "grad_norm": 0.10898647457361221, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 31390 + }, + { + "epoch": 0.1213836186234943, + "grad_norm": 0.10633208602666855, + "learning_rate": 0.002, + "loss": 2.362, + "step": 31400 + }, + { + "epoch": 0.12142227582687758, + "grad_norm": 0.09353228658437729, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 31410 + }, + { + "epoch": 0.12146093303026086, + "grad_norm": 0.13745516538619995, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 31420 + }, + { + "epoch": 0.12149959023364414, + "grad_norm": 0.11073607206344604, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 31430 + }, + { + "epoch": 0.12153824743702742, + "grad_norm": 0.11021778732538223, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 31440 + }, + { + "epoch": 0.1215769046404107, + "grad_norm": 0.10833906382322311, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 31450 + }, + { + "epoch": 0.12161556184379398, + "grad_norm": 0.1117563247680664, + "learning_rate": 0.002, + "loss": 2.3796, + "step": 31460 + }, + { + "epoch": 0.12165421904717726, + "grad_norm": 0.12070424854755402, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 31470 + }, + { + "epoch": 0.12169287625056054, + "grad_norm": 0.11928492784500122, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 31480 + }, + { + "epoch": 0.1217315334539438, + "grad_norm": 0.10886865854263306, + "learning_rate": 0.002, + "loss": 2.368, + "step": 31490 + }, + { + "epoch": 0.12177019065732708, + "grad_norm": 0.09866543859243393, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 31500 + }, + { + "epoch": 0.12180884786071036, + "grad_norm": 0.1067083477973938, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 31510 + }, + { + "epoch": 0.12184750506409364, + "grad_norm": 0.09977913647890091, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 31520 + }, + { + "epoch": 0.12188616226747692, + "grad_norm": 0.11578541994094849, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 31530 + }, + { + "epoch": 0.1219248194708602, + "grad_norm": 0.10357668250799179, + "learning_rate": 0.002, + "loss": 2.3815, + "step": 31540 + }, + { + "epoch": 0.12196347667424348, + "grad_norm": 0.1520976573228836, + "learning_rate": 0.002, + "loss": 2.3823, + "step": 31550 + }, + { + "epoch": 0.12200213387762676, + "grad_norm": 0.0963573008775711, + "learning_rate": 0.002, + "loss": 2.373, + "step": 31560 + }, + { + "epoch": 0.12204079108101004, + "grad_norm": 0.20803941786289215, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 31570 + }, + { + "epoch": 0.12207944828439331, + "grad_norm": 0.10182123631238937, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 31580 + }, + { + "epoch": 0.12211810548777659, + "grad_norm": 0.10604804754257202, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 31590 + }, + { + "epoch": 0.12215676269115987, + "grad_norm": 0.11439476162195206, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 31600 + }, + { + "epoch": 0.12219541989454315, + "grad_norm": 0.1221204400062561, + "learning_rate": 0.002, + "loss": 2.3819, + "step": 31610 + }, + { + "epoch": 0.12223407709792643, + "grad_norm": 0.10415617376565933, + "learning_rate": 0.002, + "loss": 2.389, + "step": 31620 + }, + { + "epoch": 0.12227273430130971, + "grad_norm": 0.10428808629512787, + "learning_rate": 0.002, + "loss": 2.363, + "step": 31630 + }, + { + "epoch": 0.12231139150469299, + "grad_norm": 0.1038336530327797, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 31640 + }, + { + "epoch": 0.12235004870807627, + "grad_norm": 0.10959843546152115, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 31650 + }, + { + "epoch": 0.12238870591145955, + "grad_norm": 0.10477408766746521, + "learning_rate": 0.002, + "loss": 2.3858, + "step": 31660 + }, + { + "epoch": 0.12242736311484281, + "grad_norm": 0.11104969680309296, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 31670 + }, + { + "epoch": 0.1224660203182261, + "grad_norm": 0.2157076746225357, + "learning_rate": 0.002, + "loss": 2.367, + "step": 31680 + }, + { + "epoch": 0.12250467752160937, + "grad_norm": 0.10109077394008636, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 31690 + }, + { + "epoch": 0.12254333472499265, + "grad_norm": 0.1090427115559578, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 31700 + }, + { + "epoch": 0.12258199192837593, + "grad_norm": 0.12032198160886765, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 31710 + }, + { + "epoch": 0.12262064913175921, + "grad_norm": 0.1247207447886467, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 31720 + }, + { + "epoch": 0.12265930633514249, + "grad_norm": 0.10226906836032867, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 31730 + }, + { + "epoch": 0.12269796353852577, + "grad_norm": 0.11415991932153702, + "learning_rate": 0.002, + "loss": 2.3876, + "step": 31740 + }, + { + "epoch": 0.12273662074190905, + "grad_norm": 0.14343005418777466, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 31750 + }, + { + "epoch": 0.12277527794529233, + "grad_norm": 0.10491285473108292, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 31760 + }, + { + "epoch": 0.1228139351486756, + "grad_norm": 0.12787176668643951, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 31770 + }, + { + "epoch": 0.12285259235205888, + "grad_norm": 0.1200731173157692, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 31780 + }, + { + "epoch": 0.12289124955544216, + "grad_norm": 0.09315181523561478, + "learning_rate": 0.002, + "loss": 2.371, + "step": 31790 + }, + { + "epoch": 0.12292990675882544, + "grad_norm": 0.10988342016935349, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 31800 + }, + { + "epoch": 0.12296856396220872, + "grad_norm": 0.1178271472454071, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 31810 + }, + { + "epoch": 0.123007221165592, + "grad_norm": 0.12532812356948853, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 31820 + }, + { + "epoch": 0.12304587836897528, + "grad_norm": 0.1229834333062172, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 31830 + }, + { + "epoch": 0.12308453557235856, + "grad_norm": 0.12011890858411789, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 31840 + }, + { + "epoch": 0.12312319277574184, + "grad_norm": 0.11288615316152573, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 31850 + }, + { + "epoch": 0.1231618499791251, + "grad_norm": 0.13161183893680573, + "learning_rate": 0.002, + "loss": 2.3883, + "step": 31860 + }, + { + "epoch": 0.12320050718250838, + "grad_norm": 0.11495167762041092, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 31870 + }, + { + "epoch": 0.12323916438589166, + "grad_norm": 0.10141012817621231, + "learning_rate": 0.002, + "loss": 2.3842, + "step": 31880 + }, + { + "epoch": 0.12327782158927494, + "grad_norm": 0.13126742839813232, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 31890 + }, + { + "epoch": 0.12331647879265822, + "grad_norm": 0.09751483798027039, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 31900 + }, + { + "epoch": 0.1233551359960415, + "grad_norm": 0.11303214728832245, + "learning_rate": 0.002, + "loss": 2.3934, + "step": 31910 + }, + { + "epoch": 0.12339379319942478, + "grad_norm": 0.10097888857126236, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 31920 + }, + { + "epoch": 0.12343245040280806, + "grad_norm": 0.09599561244249344, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 31930 + }, + { + "epoch": 0.12347110760619134, + "grad_norm": 0.10423211753368378, + "learning_rate": 0.002, + "loss": 2.3843, + "step": 31940 + }, + { + "epoch": 0.12350976480957461, + "grad_norm": 0.09624336659908295, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 31950 + }, + { + "epoch": 0.12354842201295789, + "grad_norm": 0.09582757949829102, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 31960 + }, + { + "epoch": 0.12358707921634117, + "grad_norm": 0.1117819994688034, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 31970 + }, + { + "epoch": 0.12362573641972445, + "grad_norm": 0.10983041673898697, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 31980 + }, + { + "epoch": 0.12366439362310773, + "grad_norm": 0.11222923547029495, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 31990 + }, + { + "epoch": 0.12370305082649101, + "grad_norm": 0.1040363609790802, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 32000 + }, + { + "epoch": 0.12374170802987429, + "grad_norm": 0.12385562807321548, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 32010 + }, + { + "epoch": 0.12378036523325757, + "grad_norm": 0.10607069730758667, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 32020 + }, + { + "epoch": 0.12381902243664085, + "grad_norm": 0.09980407357215881, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 32030 + }, + { + "epoch": 0.12385767964002412, + "grad_norm": 0.12108637392520905, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 32040 + }, + { + "epoch": 0.1238963368434074, + "grad_norm": 0.10419555008411407, + "learning_rate": 0.002, + "loss": 2.374, + "step": 32050 + }, + { + "epoch": 0.12393499404679068, + "grad_norm": 0.1171611100435257, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 32060 + }, + { + "epoch": 0.12397365125017396, + "grad_norm": 0.10272298008203506, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 32070 + }, + { + "epoch": 0.12401230845355724, + "grad_norm": 0.11056289076805115, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 32080 + }, + { + "epoch": 0.12405096565694052, + "grad_norm": 0.11453638225793839, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 32090 + }, + { + "epoch": 0.1240896228603238, + "grad_norm": 0.11036056280136108, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 32100 + }, + { + "epoch": 0.12412828006370707, + "grad_norm": 0.11167748272418976, + "learning_rate": 0.002, + "loss": 2.3914, + "step": 32110 + }, + { + "epoch": 0.12416693726709035, + "grad_norm": 0.10620572417974472, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 32120 + }, + { + "epoch": 0.12420559447047363, + "grad_norm": 0.10564051568508148, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 32130 + }, + { + "epoch": 0.1242442516738569, + "grad_norm": 0.1290002316236496, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 32140 + }, + { + "epoch": 0.12428290887724018, + "grad_norm": 0.12541048228740692, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 32150 + }, + { + "epoch": 0.12432156608062346, + "grad_norm": 0.13069938123226166, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 32160 + }, + { + "epoch": 0.12436022328400674, + "grad_norm": 0.10358031839132309, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 32170 + }, + { + "epoch": 0.12439888048739002, + "grad_norm": 0.10525479167699814, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 32180 + }, + { + "epoch": 0.1244375376907733, + "grad_norm": 0.10335072129964828, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 32190 + }, + { + "epoch": 0.12447619489415658, + "grad_norm": 0.10958348959684372, + "learning_rate": 0.002, + "loss": 2.352, + "step": 32200 + }, + { + "epoch": 0.12451485209753986, + "grad_norm": 0.1179596409201622, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 32210 + }, + { + "epoch": 0.12455350930092314, + "grad_norm": 0.09787681698799133, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 32220 + }, + { + "epoch": 0.1245921665043064, + "grad_norm": 0.11009865999221802, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 32230 + }, + { + "epoch": 0.12463082370768969, + "grad_norm": 0.12366067618131638, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 32240 + }, + { + "epoch": 0.12466948091107297, + "grad_norm": 0.1004662960767746, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 32250 + }, + { + "epoch": 0.12470813811445625, + "grad_norm": 0.10903781652450562, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 32260 + }, + { + "epoch": 0.12474679531783953, + "grad_norm": 0.11716281622648239, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 32270 + }, + { + "epoch": 0.1247854525212228, + "grad_norm": 0.10709843039512634, + "learning_rate": 0.002, + "loss": 2.36, + "step": 32280 + }, + { + "epoch": 0.12482410972460609, + "grad_norm": 0.10901437699794769, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 32290 + }, + { + "epoch": 0.12486276692798937, + "grad_norm": 0.10774769634008408, + "learning_rate": 0.002, + "loss": 2.3926, + "step": 32300 + }, + { + "epoch": 0.12490142413137265, + "grad_norm": 0.10092653334140778, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 32310 + }, + { + "epoch": 0.12494008133475591, + "grad_norm": 0.10283713787794113, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 32320 + }, + { + "epoch": 0.12497873853813919, + "grad_norm": 0.09721902012825012, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 32330 + }, + { + "epoch": 0.12501739574152249, + "grad_norm": 0.11538095027208328, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 32340 + }, + { + "epoch": 0.12505605294490577, + "grad_norm": 0.1069326102733612, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 32350 + }, + { + "epoch": 0.12509471014828905, + "grad_norm": 0.11422941833734512, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 32360 + }, + { + "epoch": 0.1251333673516723, + "grad_norm": 0.10990530997514725, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 32370 + }, + { + "epoch": 0.12517202455505558, + "grad_norm": 0.13378532230854034, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 32380 + }, + { + "epoch": 0.12521068175843886, + "grad_norm": 0.112189382314682, + "learning_rate": 0.002, + "loss": 2.38, + "step": 32390 + }, + { + "epoch": 0.12524933896182214, + "grad_norm": 0.11024999618530273, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 32400 + }, + { + "epoch": 0.12528799616520542, + "grad_norm": 0.11159463226795197, + "learning_rate": 0.002, + "loss": 2.369, + "step": 32410 + }, + { + "epoch": 0.1253266533685887, + "grad_norm": 0.11366341263055801, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 32420 + }, + { + "epoch": 0.12536531057197198, + "grad_norm": 0.10603636503219604, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 32430 + }, + { + "epoch": 0.12540396777535526, + "grad_norm": 0.11254012584686279, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 32440 + }, + { + "epoch": 0.12544262497873854, + "grad_norm": 0.1268596053123474, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 32450 + }, + { + "epoch": 0.12548128218212182, + "grad_norm": 0.1108691468834877, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 32460 + }, + { + "epoch": 0.1255199393855051, + "grad_norm": 0.12088431417942047, + "learning_rate": 0.002, + "loss": 2.3811, + "step": 32470 + }, + { + "epoch": 0.12555859658888838, + "grad_norm": 0.24373848736286163, + "learning_rate": 0.002, + "loss": 2.3821, + "step": 32480 + }, + { + "epoch": 0.12559725379227166, + "grad_norm": 0.11183463037014008, + "learning_rate": 0.002, + "loss": 2.3791, + "step": 32490 + }, + { + "epoch": 0.12563591099565494, + "grad_norm": 0.12124831974506378, + "learning_rate": 0.002, + "loss": 2.3888, + "step": 32500 + }, + { + "epoch": 0.12567456819903822, + "grad_norm": 0.10651678591966629, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 32510 + }, + { + "epoch": 0.1257132254024215, + "grad_norm": 0.12588591873645782, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 32520 + }, + { + "epoch": 0.12575188260580478, + "grad_norm": 0.09625239670276642, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 32530 + }, + { + "epoch": 0.12579053980918806, + "grad_norm": 0.11514429748058319, + "learning_rate": 0.002, + "loss": 2.375, + "step": 32540 + }, + { + "epoch": 0.1258291970125713, + "grad_norm": 0.10993642359972, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 32550 + }, + { + "epoch": 0.1258678542159546, + "grad_norm": 0.10194283723831177, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 32560 + }, + { + "epoch": 0.12590651141933787, + "grad_norm": 0.09949901700019836, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 32570 + }, + { + "epoch": 0.12594516862272115, + "grad_norm": 0.12141376733779907, + "learning_rate": 0.002, + "loss": 2.3835, + "step": 32580 + }, + { + "epoch": 0.12598382582610443, + "grad_norm": 0.12764514982700348, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 32590 + }, + { + "epoch": 0.1260224830294877, + "grad_norm": 0.11071311682462692, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 32600 + }, + { + "epoch": 0.126061140232871, + "grad_norm": 0.10684597492218018, + "learning_rate": 0.002, + "loss": 2.3815, + "step": 32610 + }, + { + "epoch": 0.12609979743625427, + "grad_norm": 0.11873281747102737, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 32620 + }, + { + "epoch": 0.12613845463963755, + "grad_norm": 0.08808255940675735, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 32630 + }, + { + "epoch": 0.12617711184302083, + "grad_norm": 0.10667643696069717, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 32640 + }, + { + "epoch": 0.1262157690464041, + "grad_norm": 0.12549149990081787, + "learning_rate": 0.002, + "loss": 2.351, + "step": 32650 + }, + { + "epoch": 0.1262544262497874, + "grad_norm": 0.09748613834381104, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 32660 + }, + { + "epoch": 0.12629308345317067, + "grad_norm": 0.11375278234481812, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 32670 + }, + { + "epoch": 0.12633174065655395, + "grad_norm": 0.10890177637338638, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 32680 + }, + { + "epoch": 0.12637039785993723, + "grad_norm": 0.09607157856225967, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 32690 + }, + { + "epoch": 0.1264090550633205, + "grad_norm": 0.11426623165607452, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 32700 + }, + { + "epoch": 0.1264477122667038, + "grad_norm": 0.09992794692516327, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 32710 + }, + { + "epoch": 0.12648636947008707, + "grad_norm": 0.10609929263591766, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 32720 + }, + { + "epoch": 0.12652502667347035, + "grad_norm": 0.09984554350376129, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 32730 + }, + { + "epoch": 0.1265636838768536, + "grad_norm": 0.09323658049106598, + "learning_rate": 0.002, + "loss": 2.3834, + "step": 32740 + }, + { + "epoch": 0.12660234108023688, + "grad_norm": 0.11080362647771835, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 32750 + }, + { + "epoch": 0.12664099828362016, + "grad_norm": 0.11429428309202194, + "learning_rate": 0.002, + "loss": 2.383, + "step": 32760 + }, + { + "epoch": 0.12667965548700344, + "grad_norm": 0.13485601544380188, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 32770 + }, + { + "epoch": 0.12671831269038672, + "grad_norm": 0.10016801953315735, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 32780 + }, + { + "epoch": 0.12675696989377, + "grad_norm": 0.11193803697824478, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 32790 + }, + { + "epoch": 0.12679562709715328, + "grad_norm": 0.10117737203836441, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 32800 + }, + { + "epoch": 0.12683428430053656, + "grad_norm": 0.11539480090141296, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 32810 + }, + { + "epoch": 0.12687294150391984, + "grad_norm": 0.1457090973854065, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 32820 + }, + { + "epoch": 0.12691159870730312, + "grad_norm": 0.10625103861093521, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 32830 + }, + { + "epoch": 0.1269502559106864, + "grad_norm": 0.10272681713104248, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 32840 + }, + { + "epoch": 0.12698891311406968, + "grad_norm": 0.13046136498451233, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 32850 + }, + { + "epoch": 0.12702757031745296, + "grad_norm": 0.10971887409687042, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 32860 + }, + { + "epoch": 0.12706622752083624, + "grad_norm": 0.09908357262611389, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 32870 + }, + { + "epoch": 0.12710488472421952, + "grad_norm": 0.09171049296855927, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 32880 + }, + { + "epoch": 0.1271435419276028, + "grad_norm": 0.11479821056127548, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 32890 + }, + { + "epoch": 0.12718219913098608, + "grad_norm": 0.11469388753175735, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 32900 + }, + { + "epoch": 0.12722085633436936, + "grad_norm": 0.10828559100627899, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 32910 + }, + { + "epoch": 0.1272595135377526, + "grad_norm": 0.12178273499011993, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 32920 + }, + { + "epoch": 0.1272981707411359, + "grad_norm": 0.12015478312969208, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 32930 + }, + { + "epoch": 0.12733682794451917, + "grad_norm": 0.1151486188173294, + "learning_rate": 0.002, + "loss": 2.377, + "step": 32940 + }, + { + "epoch": 0.12737548514790245, + "grad_norm": 0.10822568833827972, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 32950 + }, + { + "epoch": 0.12741414235128573, + "grad_norm": 0.12262659519910812, + "learning_rate": 0.002, + "loss": 2.3843, + "step": 32960 + }, + { + "epoch": 0.127452799554669, + "grad_norm": 0.09884728491306305, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 32970 + }, + { + "epoch": 0.1274914567580523, + "grad_norm": 0.10737740248441696, + "learning_rate": 0.002, + "loss": 2.375, + "step": 32980 + }, + { + "epoch": 0.12753011396143557, + "grad_norm": 0.12268875539302826, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 32990 + }, + { + "epoch": 0.12756877116481885, + "grad_norm": 0.108036570250988, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 33000 + }, + { + "epoch": 0.12760742836820213, + "grad_norm": 0.12710832059383392, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 33010 + }, + { + "epoch": 0.1276460855715854, + "grad_norm": 0.11616093665361404, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 33020 + }, + { + "epoch": 0.1276847427749687, + "grad_norm": 0.11192166805267334, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 33030 + }, + { + "epoch": 0.12772339997835197, + "grad_norm": 0.09995438903570175, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 33040 + }, + { + "epoch": 0.12776205718173525, + "grad_norm": 0.10356901586055756, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 33050 + }, + { + "epoch": 0.12780071438511853, + "grad_norm": 0.1217799261212349, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 33060 + }, + { + "epoch": 0.1278393715885018, + "grad_norm": 0.14946649968624115, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 33070 + }, + { + "epoch": 0.1278780287918851, + "grad_norm": 0.10571971535682678, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 33080 + }, + { + "epoch": 0.12791668599526837, + "grad_norm": 0.10848978906869888, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 33090 + }, + { + "epoch": 0.12795534319865165, + "grad_norm": 0.10227275639772415, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 33100 + }, + { + "epoch": 0.1279940004020349, + "grad_norm": 0.1201995238661766, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 33110 + }, + { + "epoch": 0.12803265760541818, + "grad_norm": 0.1080368310213089, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 33120 + }, + { + "epoch": 0.12807131480880146, + "grad_norm": 0.10840100049972534, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 33130 + }, + { + "epoch": 0.12810997201218474, + "grad_norm": 0.11384178698062897, + "learning_rate": 0.002, + "loss": 2.372, + "step": 33140 + }, + { + "epoch": 0.12814862921556802, + "grad_norm": 0.11699331551790237, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 33150 + }, + { + "epoch": 0.1281872864189513, + "grad_norm": 0.11681430786848068, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 33160 + }, + { + "epoch": 0.12822594362233458, + "grad_norm": 0.1251811385154724, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 33170 + }, + { + "epoch": 0.12826460082571786, + "grad_norm": 0.11562751978635788, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 33180 + }, + { + "epoch": 0.12830325802910114, + "grad_norm": 0.1195770800113678, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 33190 + }, + { + "epoch": 0.12834191523248442, + "grad_norm": 0.09677097201347351, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 33200 + }, + { + "epoch": 0.1283805724358677, + "grad_norm": 0.12000081688165665, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 33210 + }, + { + "epoch": 0.12841922963925098, + "grad_norm": 0.10882232338190079, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 33220 + }, + { + "epoch": 0.12845788684263426, + "grad_norm": 0.11827144026756287, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 33230 + }, + { + "epoch": 0.12849654404601754, + "grad_norm": 0.10499045997858047, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 33240 + }, + { + "epoch": 0.12853520124940082, + "grad_norm": 0.09814994782209396, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 33250 + }, + { + "epoch": 0.1285738584527841, + "grad_norm": 0.09377431869506836, + "learning_rate": 0.002, + "loss": 2.379, + "step": 33260 + }, + { + "epoch": 0.12861251565616738, + "grad_norm": 0.09934505075216293, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 33270 + }, + { + "epoch": 0.12865117285955066, + "grad_norm": 0.10614591091871262, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 33280 + }, + { + "epoch": 0.12868983006293394, + "grad_norm": 0.11912450194358826, + "learning_rate": 0.002, + "loss": 2.3853, + "step": 33290 + }, + { + "epoch": 0.1287284872663172, + "grad_norm": 0.10692890733480453, + "learning_rate": 0.002, + "loss": 2.372, + "step": 33300 + }, + { + "epoch": 0.12876714446970047, + "grad_norm": 0.10942743718624115, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 33310 + }, + { + "epoch": 0.12880580167308375, + "grad_norm": 0.10537812858819962, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 33320 + }, + { + "epoch": 0.12884445887646703, + "grad_norm": 0.11302319914102554, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 33330 + }, + { + "epoch": 0.1288831160798503, + "grad_norm": 0.13061635196208954, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 33340 + }, + { + "epoch": 0.1289217732832336, + "grad_norm": 0.18177761137485504, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 33350 + }, + { + "epoch": 0.12896043048661687, + "grad_norm": 0.11147965490818024, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 33360 + }, + { + "epoch": 0.12899908769000015, + "grad_norm": 0.10751602053642273, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 33370 + }, + { + "epoch": 0.12903774489338343, + "grad_norm": 0.10788480192422867, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 33380 + }, + { + "epoch": 0.1290764020967667, + "grad_norm": 0.10402106493711472, + "learning_rate": 0.002, + "loss": 2.385, + "step": 33390 + }, + { + "epoch": 0.12911505930015, + "grad_norm": 0.14801332354545593, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 33400 + }, + { + "epoch": 0.12915371650353327, + "grad_norm": 0.10262597352266312, + "learning_rate": 0.002, + "loss": 2.3839, + "step": 33410 + }, + { + "epoch": 0.12919237370691655, + "grad_norm": 0.11831195652484894, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 33420 + }, + { + "epoch": 0.12923103091029983, + "grad_norm": 0.10673588514328003, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 33430 + }, + { + "epoch": 0.1292696881136831, + "grad_norm": 0.11378040909767151, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 33440 + }, + { + "epoch": 0.1293083453170664, + "grad_norm": 0.10400614142417908, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 33450 + }, + { + "epoch": 0.12934700252044967, + "grad_norm": 0.10201553255319595, + "learning_rate": 0.002, + "loss": 2.368, + "step": 33460 + }, + { + "epoch": 0.12938565972383295, + "grad_norm": 0.10591879487037659, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 33470 + }, + { + "epoch": 0.1294243169272162, + "grad_norm": 0.09862728416919708, + "learning_rate": 0.002, + "loss": 2.369, + "step": 33480 + }, + { + "epoch": 0.12946297413059948, + "grad_norm": 0.1223021075129509, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 33490 + }, + { + "epoch": 0.12950163133398276, + "grad_norm": 0.11414515972137451, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 33500 + }, + { + "epoch": 0.12954028853736604, + "grad_norm": 0.10432834178209305, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 33510 + }, + { + "epoch": 0.12957894574074932, + "grad_norm": 0.09742403030395508, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 33520 + }, + { + "epoch": 0.1296176029441326, + "grad_norm": 0.13793131709098816, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 33530 + }, + { + "epoch": 0.12965626014751588, + "grad_norm": 0.10405918955802917, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 33540 + }, + { + "epoch": 0.12969491735089916, + "grad_norm": 0.12017746269702911, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 33550 + }, + { + "epoch": 0.12973357455428244, + "grad_norm": 0.16131430864334106, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 33560 + }, + { + "epoch": 0.12977223175766572, + "grad_norm": 0.09843146055936813, + "learning_rate": 0.002, + "loss": 2.366, + "step": 33570 + }, + { + "epoch": 0.129810888961049, + "grad_norm": 0.09624658524990082, + "learning_rate": 0.002, + "loss": 2.374, + "step": 33580 + }, + { + "epoch": 0.12984954616443228, + "grad_norm": 0.14317472279071808, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 33590 + }, + { + "epoch": 0.12988820336781556, + "grad_norm": 0.11247939616441727, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 33600 + }, + { + "epoch": 0.12992686057119884, + "grad_norm": 0.11236079782247543, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 33610 + }, + { + "epoch": 0.12996551777458212, + "grad_norm": 0.09830132871866226, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 33620 + }, + { + "epoch": 0.1300041749779654, + "grad_norm": 0.12697303295135498, + "learning_rate": 0.002, + "loss": 2.368, + "step": 33630 + }, + { + "epoch": 0.13004283218134868, + "grad_norm": 0.11531970649957657, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 33640 + }, + { + "epoch": 0.13008148938473196, + "grad_norm": 0.1151520311832428, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 33650 + }, + { + "epoch": 0.13012014658811524, + "grad_norm": 0.11021476984024048, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 33660 + }, + { + "epoch": 0.1301588037914985, + "grad_norm": 0.1186911091208458, + "learning_rate": 0.002, + "loss": 2.3853, + "step": 33670 + }, + { + "epoch": 0.13019746099488178, + "grad_norm": 0.12309000641107559, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 33680 + }, + { + "epoch": 0.13023611819826506, + "grad_norm": 0.11252123862504959, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 33690 + }, + { + "epoch": 0.13027477540164834, + "grad_norm": 0.10669244825839996, + "learning_rate": 0.002, + "loss": 2.3853, + "step": 33700 + }, + { + "epoch": 0.13031343260503161, + "grad_norm": 0.10406588017940521, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 33710 + }, + { + "epoch": 0.1303520898084149, + "grad_norm": 0.11617938429117203, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 33720 + }, + { + "epoch": 0.13039074701179817, + "grad_norm": 0.1132294237613678, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 33730 + }, + { + "epoch": 0.13042940421518145, + "grad_norm": 0.10331351310014725, + "learning_rate": 0.002, + "loss": 2.376, + "step": 33740 + }, + { + "epoch": 0.13046806141856473, + "grad_norm": 0.11811483651399612, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 33750 + }, + { + "epoch": 0.13050671862194801, + "grad_norm": 0.11243601888418198, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 33760 + }, + { + "epoch": 0.1305453758253313, + "grad_norm": 0.10086555033922195, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 33770 + }, + { + "epoch": 0.13058403302871457, + "grad_norm": 0.09739705175161362, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 33780 + }, + { + "epoch": 0.13062269023209785, + "grad_norm": 0.11868289113044739, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 33790 + }, + { + "epoch": 0.13066134743548113, + "grad_norm": 0.12014929950237274, + "learning_rate": 0.002, + "loss": 2.368, + "step": 33800 + }, + { + "epoch": 0.13070000463886441, + "grad_norm": 0.11841581016778946, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 33810 + }, + { + "epoch": 0.1307386618422477, + "grad_norm": 0.10554622858762741, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 33820 + }, + { + "epoch": 0.13077731904563097, + "grad_norm": 0.11373893916606903, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 33830 + }, + { + "epoch": 0.13081597624901425, + "grad_norm": 0.10629983991384506, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 33840 + }, + { + "epoch": 0.1308546334523975, + "grad_norm": 0.12039709091186523, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 33850 + }, + { + "epoch": 0.1308932906557808, + "grad_norm": 0.10280530154705048, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 33860 + }, + { + "epoch": 0.13093194785916407, + "grad_norm": 0.11763978004455566, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 33870 + }, + { + "epoch": 0.13097060506254735, + "grad_norm": 0.10091401636600494, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 33880 + }, + { + "epoch": 0.13100926226593063, + "grad_norm": 0.09946057200431824, + "learning_rate": 0.002, + "loss": 2.366, + "step": 33890 + }, + { + "epoch": 0.1310479194693139, + "grad_norm": 0.09984345734119415, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 33900 + }, + { + "epoch": 0.13108657667269719, + "grad_norm": 0.10241945832967758, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 33910 + }, + { + "epoch": 0.13112523387608047, + "grad_norm": 0.12130527943372726, + "learning_rate": 0.002, + "loss": 2.379, + "step": 33920 + }, + { + "epoch": 0.13116389107946375, + "grad_norm": 0.1324186772108078, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 33930 + }, + { + "epoch": 0.13120254828284703, + "grad_norm": 0.11365784704685211, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 33940 + }, + { + "epoch": 0.1312412054862303, + "grad_norm": 0.1000702902674675, + "learning_rate": 0.002, + "loss": 2.361, + "step": 33950 + }, + { + "epoch": 0.13127986268961359, + "grad_norm": 0.12024486064910889, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 33960 + }, + { + "epoch": 0.13131851989299687, + "grad_norm": 0.11599510163068771, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 33970 + }, + { + "epoch": 0.13135717709638015, + "grad_norm": 0.11001361906528473, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 33980 + }, + { + "epoch": 0.13139583429976343, + "grad_norm": 0.12406077235937119, + "learning_rate": 0.002, + "loss": 2.3865, + "step": 33990 + }, + { + "epoch": 0.1314344915031467, + "grad_norm": 0.1052245944738388, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 34000 + }, + { + "epoch": 0.13147314870652999, + "grad_norm": 0.10168890655040741, + "learning_rate": 0.002, + "loss": 2.368, + "step": 34010 + }, + { + "epoch": 0.13151180590991327, + "grad_norm": 0.107086680829525, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 34020 + }, + { + "epoch": 0.13155046311329655, + "grad_norm": 0.12155095487833023, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 34030 + }, + { + "epoch": 0.1315891203166798, + "grad_norm": 0.14013329148292542, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 34040 + }, + { + "epoch": 0.13162777752006308, + "grad_norm": 0.0939648449420929, + "learning_rate": 0.002, + "loss": 2.3815, + "step": 34050 + }, + { + "epoch": 0.13166643472344636, + "grad_norm": 0.10857540369033813, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 34060 + }, + { + "epoch": 0.13170509192682964, + "grad_norm": 0.12482918798923492, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 34070 + }, + { + "epoch": 0.13174374913021292, + "grad_norm": 0.11149606853723526, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 34080 + }, + { + "epoch": 0.1317824063335962, + "grad_norm": 0.09495566040277481, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 34090 + }, + { + "epoch": 0.13182106353697948, + "grad_norm": 0.11346203833818436, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 34100 + }, + { + "epoch": 0.13185972074036276, + "grad_norm": 0.09801855683326721, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 34110 + }, + { + "epoch": 0.13189837794374604, + "grad_norm": 0.11547388881444931, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 34120 + }, + { + "epoch": 0.13193703514712932, + "grad_norm": 0.09938013553619385, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 34130 + }, + { + "epoch": 0.1319756923505126, + "grad_norm": 0.11353269219398499, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 34140 + }, + { + "epoch": 0.13201434955389588, + "grad_norm": 0.10928058624267578, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 34150 + }, + { + "epoch": 0.13205300675727916, + "grad_norm": 0.09228232502937317, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 34160 + }, + { + "epoch": 0.13209166396066244, + "grad_norm": 0.10695263743400574, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 34170 + }, + { + "epoch": 0.13213032116404572, + "grad_norm": 0.10757897049188614, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 34180 + }, + { + "epoch": 0.132168978367429, + "grad_norm": 0.11652503907680511, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 34190 + }, + { + "epoch": 0.13220763557081228, + "grad_norm": 0.11906265467405319, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 34200 + }, + { + "epoch": 0.13224629277419556, + "grad_norm": 0.10782443732023239, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 34210 + }, + { + "epoch": 0.1322849499775788, + "grad_norm": 0.11037446558475494, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 34220 + }, + { + "epoch": 0.1323236071809621, + "grad_norm": 0.10838919878005981, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 34230 + }, + { + "epoch": 0.13236226438434537, + "grad_norm": 0.11310845613479614, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 34240 + }, + { + "epoch": 0.13240092158772865, + "grad_norm": 0.10031761229038239, + "learning_rate": 0.002, + "loss": 2.385, + "step": 34250 + }, + { + "epoch": 0.13243957879111193, + "grad_norm": 0.11770909279584885, + "learning_rate": 0.002, + "loss": 2.3791, + "step": 34260 + }, + { + "epoch": 0.1324782359944952, + "grad_norm": 0.12400855869054794, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 34270 + }, + { + "epoch": 0.1325168931978785, + "grad_norm": 0.10364599525928497, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 34280 + }, + { + "epoch": 0.13255555040126177, + "grad_norm": 0.11015373468399048, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 34290 + }, + { + "epoch": 0.13259420760464505, + "grad_norm": 0.09881162643432617, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 34300 + }, + { + "epoch": 0.13263286480802833, + "grad_norm": 0.11332736909389496, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 34310 + }, + { + "epoch": 0.1326715220114116, + "grad_norm": 0.11150939762592316, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 34320 + }, + { + "epoch": 0.1327101792147949, + "grad_norm": 0.10195937752723694, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 34330 + }, + { + "epoch": 0.13274883641817817, + "grad_norm": 0.12010291963815689, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 34340 + }, + { + "epoch": 0.13278749362156145, + "grad_norm": 0.09696830064058304, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 34350 + }, + { + "epoch": 0.13282615082494473, + "grad_norm": 0.14391209185123444, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 34360 + }, + { + "epoch": 0.132864808028328, + "grad_norm": 0.11253627389669418, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 34370 + }, + { + "epoch": 0.1329034652317113, + "grad_norm": 0.10166656225919724, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 34380 + }, + { + "epoch": 0.13294212243509457, + "grad_norm": 0.0988810807466507, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 34390 + }, + { + "epoch": 0.13298077963847785, + "grad_norm": 0.10384784638881683, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 34400 + }, + { + "epoch": 0.1330194368418611, + "grad_norm": 0.14408060908317566, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 34410 + }, + { + "epoch": 0.13305809404524438, + "grad_norm": 0.11995476484298706, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 34420 + }, + { + "epoch": 0.13309675124862766, + "grad_norm": 0.10418880730867386, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 34430 + }, + { + "epoch": 0.13313540845201094, + "grad_norm": 0.10377342253923416, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 34440 + }, + { + "epoch": 0.13317406565539422, + "grad_norm": 0.09623515605926514, + "learning_rate": 0.002, + "loss": 2.369, + "step": 34450 + }, + { + "epoch": 0.1332127228587775, + "grad_norm": 0.11711203306913376, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 34460 + }, + { + "epoch": 0.13325138006216078, + "grad_norm": 0.12866175174713135, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 34470 + }, + { + "epoch": 0.13329003726554406, + "grad_norm": 0.10368845611810684, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 34480 + }, + { + "epoch": 0.13332869446892734, + "grad_norm": 0.0972469374537468, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 34490 + }, + { + "epoch": 0.13336735167231062, + "grad_norm": 0.09679017961025238, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 34500 + }, + { + "epoch": 0.1334060088756939, + "grad_norm": 0.11398512870073318, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 34510 + }, + { + "epoch": 0.13344466607907718, + "grad_norm": 0.10731250047683716, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 34520 + }, + { + "epoch": 0.13348332328246046, + "grad_norm": 0.12601740658283234, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 34530 + }, + { + "epoch": 0.13352198048584374, + "grad_norm": 0.10348492115736008, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 34540 + }, + { + "epoch": 0.13356063768922702, + "grad_norm": 0.09467864781618118, + "learning_rate": 0.002, + "loss": 2.383, + "step": 34550 + }, + { + "epoch": 0.1335992948926103, + "grad_norm": 0.1319250464439392, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 34560 + }, + { + "epoch": 0.13363795209599358, + "grad_norm": 0.09204111993312836, + "learning_rate": 0.002, + "loss": 2.3801, + "step": 34570 + }, + { + "epoch": 0.13367660929937686, + "grad_norm": 0.09467501938343048, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 34580 + }, + { + "epoch": 0.1337152665027601, + "grad_norm": 0.1087702140212059, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 34590 + }, + { + "epoch": 0.1337539237061434, + "grad_norm": 0.10382219403982162, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 34600 + }, + { + "epoch": 0.13379258090952667, + "grad_norm": 0.14096559584140778, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 34610 + }, + { + "epoch": 0.13383123811290995, + "grad_norm": 0.10706502944231033, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 34620 + }, + { + "epoch": 0.13386989531629323, + "grad_norm": 0.10187694430351257, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 34630 + }, + { + "epoch": 0.1339085525196765, + "grad_norm": 0.10945390164852142, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 34640 + }, + { + "epoch": 0.1339472097230598, + "grad_norm": 0.10661990195512772, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 34650 + }, + { + "epoch": 0.13398586692644307, + "grad_norm": 0.12462542206048965, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 34660 + }, + { + "epoch": 0.13402452412982635, + "grad_norm": 0.13518981635570526, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 34670 + }, + { + "epoch": 0.13406318133320963, + "grad_norm": 0.11695235967636108, + "learning_rate": 0.002, + "loss": 2.374, + "step": 34680 + }, + { + "epoch": 0.1341018385365929, + "grad_norm": 0.10137923061847687, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 34690 + }, + { + "epoch": 0.1341404957399762, + "grad_norm": 0.09696846455335617, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 34700 + }, + { + "epoch": 0.13417915294335947, + "grad_norm": 0.10143442451953888, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 34710 + }, + { + "epoch": 0.13421781014674275, + "grad_norm": 0.11306238174438477, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 34720 + }, + { + "epoch": 0.13425646735012603, + "grad_norm": 0.10919847339391708, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 34730 + }, + { + "epoch": 0.1342951245535093, + "grad_norm": 0.09910539537668228, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 34740 + }, + { + "epoch": 0.1343337817568926, + "grad_norm": 0.1374046355485916, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 34750 + }, + { + "epoch": 0.13437243896027587, + "grad_norm": 0.10045044869184494, + "learning_rate": 0.002, + "loss": 2.3878, + "step": 34760 + }, + { + "epoch": 0.13441109616365915, + "grad_norm": 0.11504543572664261, + "learning_rate": 0.002, + "loss": 2.38, + "step": 34770 + }, + { + "epoch": 0.1344497533670424, + "grad_norm": 0.10948486626148224, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 34780 + }, + { + "epoch": 0.13448841057042568, + "grad_norm": 0.10025618225336075, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 34790 + }, + { + "epoch": 0.13452706777380896, + "grad_norm": 0.0966828241944313, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 34800 + }, + { + "epoch": 0.13456572497719224, + "grad_norm": 0.11460334807634354, + "learning_rate": 0.002, + "loss": 2.371, + "step": 34810 + }, + { + "epoch": 0.13460438218057552, + "grad_norm": 0.11981458216905594, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 34820 + }, + { + "epoch": 0.1346430393839588, + "grad_norm": 0.09813759475946426, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 34830 + }, + { + "epoch": 0.13468169658734208, + "grad_norm": 0.14167159795761108, + "learning_rate": 0.002, + "loss": 2.365, + "step": 34840 + }, + { + "epoch": 0.13472035379072536, + "grad_norm": 0.11069276183843613, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 34850 + }, + { + "epoch": 0.13475901099410864, + "grad_norm": 0.11259116232395172, + "learning_rate": 0.002, + "loss": 2.382, + "step": 34860 + }, + { + "epoch": 0.13479766819749192, + "grad_norm": 0.10852546244859695, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 34870 + }, + { + "epoch": 0.1348363254008752, + "grad_norm": 0.11632886528968811, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 34880 + }, + { + "epoch": 0.13487498260425848, + "grad_norm": 0.09457679092884064, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 34890 + }, + { + "epoch": 0.13491363980764176, + "grad_norm": 0.09845416992902756, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 34900 + }, + { + "epoch": 0.13495229701102504, + "grad_norm": 0.11023177206516266, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 34910 + }, + { + "epoch": 0.13499095421440832, + "grad_norm": 0.09931118786334991, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 34920 + }, + { + "epoch": 0.1350296114177916, + "grad_norm": 0.10401184856891632, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 34930 + }, + { + "epoch": 0.13506826862117488, + "grad_norm": 0.1214630976319313, + "learning_rate": 0.002, + "loss": 2.3821, + "step": 34940 + }, + { + "epoch": 0.13510692582455816, + "grad_norm": 0.10948102176189423, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 34950 + }, + { + "epoch": 0.1351455830279414, + "grad_norm": 0.10995184630155563, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 34960 + }, + { + "epoch": 0.1351842402313247, + "grad_norm": 0.10805307328701019, + "learning_rate": 0.002, + "loss": 2.369, + "step": 34970 + }, + { + "epoch": 0.13522289743470797, + "grad_norm": 0.10571695864200592, + "learning_rate": 0.002, + "loss": 2.365, + "step": 34980 + }, + { + "epoch": 0.13526155463809125, + "grad_norm": 0.11283328384160995, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 34990 + }, + { + "epoch": 0.13530021184147453, + "grad_norm": 0.12748633325099945, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 35000 + }, + { + "epoch": 0.1353388690448578, + "grad_norm": 0.11866314709186554, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 35010 + }, + { + "epoch": 0.1353775262482411, + "grad_norm": 0.11615738272666931, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 35020 + }, + { + "epoch": 0.13541618345162437, + "grad_norm": 0.09199300408363342, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 35030 + }, + { + "epoch": 0.13545484065500765, + "grad_norm": 0.10658486187458038, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 35040 + }, + { + "epoch": 0.13549349785839093, + "grad_norm": 0.1143282800912857, + "learning_rate": 0.002, + "loss": 2.352, + "step": 35050 + }, + { + "epoch": 0.1355321550617742, + "grad_norm": 0.11966723948717117, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 35060 + }, + { + "epoch": 0.1355708122651575, + "grad_norm": 0.10665853321552277, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 35070 + }, + { + "epoch": 0.13560946946854077, + "grad_norm": 0.10373278707265854, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 35080 + }, + { + "epoch": 0.13564812667192405, + "grad_norm": 0.12308533489704132, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 35090 + }, + { + "epoch": 0.13568678387530733, + "grad_norm": 0.09434260427951813, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 35100 + }, + { + "epoch": 0.1357254410786906, + "grad_norm": 0.108939990401268, + "learning_rate": 0.002, + "loss": 2.3862, + "step": 35110 + }, + { + "epoch": 0.1357640982820739, + "grad_norm": 0.09926486015319824, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 35120 + }, + { + "epoch": 0.13580275548545717, + "grad_norm": 0.10864213854074478, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 35130 + }, + { + "epoch": 0.13584141268884045, + "grad_norm": 0.10281189531087875, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 35140 + }, + { + "epoch": 0.1358800698922237, + "grad_norm": 0.11445096880197525, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 35150 + }, + { + "epoch": 0.13591872709560698, + "grad_norm": 0.10199040919542313, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 35160 + }, + { + "epoch": 0.13595738429899026, + "grad_norm": 0.10905351489782333, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 35170 + }, + { + "epoch": 0.13599604150237354, + "grad_norm": 0.09866225719451904, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 35180 + }, + { + "epoch": 0.13603469870575682, + "grad_norm": 0.12357515841722488, + "learning_rate": 0.002, + "loss": 2.363, + "step": 35190 + }, + { + "epoch": 0.1360733559091401, + "grad_norm": 0.1316716969013214, + "learning_rate": 0.002, + "loss": 2.366, + "step": 35200 + }, + { + "epoch": 0.13611201311252338, + "grad_norm": 0.11268094182014465, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 35210 + }, + { + "epoch": 0.13615067031590666, + "grad_norm": 0.1085466742515564, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 35220 + }, + { + "epoch": 0.13618932751928994, + "grad_norm": 0.11580248922109604, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 35230 + }, + { + "epoch": 0.13622798472267322, + "grad_norm": 0.09953868389129639, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 35240 + }, + { + "epoch": 0.1362666419260565, + "grad_norm": 0.10986287146806717, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 35250 + }, + { + "epoch": 0.13630529912943978, + "grad_norm": 0.13953445851802826, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 35260 + }, + { + "epoch": 0.13634395633282306, + "grad_norm": 0.10694142431020737, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 35270 + }, + { + "epoch": 0.13638261353620634, + "grad_norm": 0.1019204631447792, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 35280 + }, + { + "epoch": 0.13642127073958962, + "grad_norm": 0.11643867939710617, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 35290 + }, + { + "epoch": 0.1364599279429729, + "grad_norm": 0.10843092203140259, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 35300 + }, + { + "epoch": 0.13649858514635618, + "grad_norm": 0.11738748848438263, + "learning_rate": 0.002, + "loss": 2.365, + "step": 35310 + }, + { + "epoch": 0.13653724234973946, + "grad_norm": 0.10633944720029831, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 35320 + }, + { + "epoch": 0.13657589955312274, + "grad_norm": 0.11410286277532578, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 35330 + }, + { + "epoch": 0.136614556756506, + "grad_norm": 0.11511071771383286, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 35340 + }, + { + "epoch": 0.13665321395988927, + "grad_norm": 0.10633667558431625, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 35350 + }, + { + "epoch": 0.13669187116327255, + "grad_norm": 0.12423171103000641, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 35360 + }, + { + "epoch": 0.13673052836665583, + "grad_norm": 0.10752137005329132, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 35370 + }, + { + "epoch": 0.13676918557003911, + "grad_norm": 0.1084589809179306, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 35380 + }, + { + "epoch": 0.1368078427734224, + "grad_norm": 0.0935010313987732, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 35390 + }, + { + "epoch": 0.13684649997680567, + "grad_norm": 0.1256430596113205, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 35400 + }, + { + "epoch": 0.13688515718018895, + "grad_norm": 0.11605731397867203, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 35410 + }, + { + "epoch": 0.13692381438357223, + "grad_norm": 0.12058790028095245, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 35420 + }, + { + "epoch": 0.13696247158695551, + "grad_norm": 0.12661702930927277, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 35430 + }, + { + "epoch": 0.1370011287903388, + "grad_norm": 0.09687843918800354, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 35440 + }, + { + "epoch": 0.13703978599372207, + "grad_norm": 0.11837822943925858, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 35450 + }, + { + "epoch": 0.13707844319710535, + "grad_norm": 0.11290821433067322, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 35460 + }, + { + "epoch": 0.13711710040048863, + "grad_norm": 0.11151868104934692, + "learning_rate": 0.002, + "loss": 2.3849, + "step": 35470 + }, + { + "epoch": 0.13715575760387191, + "grad_norm": 0.13255830109119415, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 35480 + }, + { + "epoch": 0.1371944148072552, + "grad_norm": 0.09891264140605927, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 35490 + }, + { + "epoch": 0.13723307201063847, + "grad_norm": 0.11972101032733917, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 35500 + }, + { + "epoch": 0.13727172921402175, + "grad_norm": 0.11530592292547226, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 35510 + }, + { + "epoch": 0.137310386417405, + "grad_norm": 0.10842709243297577, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 35520 + }, + { + "epoch": 0.13734904362078829, + "grad_norm": 0.11317627876996994, + "learning_rate": 0.002, + "loss": 2.379, + "step": 35530 + }, + { + "epoch": 0.13738770082417157, + "grad_norm": 0.13239452242851257, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 35540 + }, + { + "epoch": 0.13742635802755485, + "grad_norm": 0.12214444577693939, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 35550 + }, + { + "epoch": 0.13746501523093813, + "grad_norm": 0.12387856096029282, + "learning_rate": 0.002, + "loss": 2.39, + "step": 35560 + }, + { + "epoch": 0.1375036724343214, + "grad_norm": 0.10341054946184158, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 35570 + }, + { + "epoch": 0.13754232963770469, + "grad_norm": 0.1387908160686493, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 35580 + }, + { + "epoch": 0.13758098684108797, + "grad_norm": 0.11028929054737091, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 35590 + }, + { + "epoch": 0.13761964404447125, + "grad_norm": 0.10659206658601761, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 35600 + }, + { + "epoch": 0.13765830124785453, + "grad_norm": 0.1167743057012558, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 35610 + }, + { + "epoch": 0.1376969584512378, + "grad_norm": 0.1232951208949089, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 35620 + }, + { + "epoch": 0.13773561565462109, + "grad_norm": 0.118013396859169, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 35630 + }, + { + "epoch": 0.13777427285800437, + "grad_norm": 0.1103489100933075, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 35640 + }, + { + "epoch": 0.13781293006138765, + "grad_norm": 0.10421431809663773, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 35650 + }, + { + "epoch": 0.13785158726477093, + "grad_norm": 0.11698539555072784, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 35660 + }, + { + "epoch": 0.1378902444681542, + "grad_norm": 0.08978178352117538, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 35670 + }, + { + "epoch": 0.13792890167153748, + "grad_norm": 0.12872745096683502, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 35680 + }, + { + "epoch": 0.13796755887492076, + "grad_norm": 0.10976526886224747, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 35690 + }, + { + "epoch": 0.13800621607830404, + "grad_norm": 0.13008564710617065, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 35700 + }, + { + "epoch": 0.1380448732816873, + "grad_norm": 0.10386268049478531, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 35710 + }, + { + "epoch": 0.13808353048507058, + "grad_norm": 0.11616958677768707, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 35720 + }, + { + "epoch": 0.13812218768845386, + "grad_norm": 0.13968725502490997, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 35730 + }, + { + "epoch": 0.13816084489183714, + "grad_norm": 0.11268781870603561, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 35740 + }, + { + "epoch": 0.13819950209522042, + "grad_norm": 0.1011965274810791, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 35750 + }, + { + "epoch": 0.1382381592986037, + "grad_norm": 0.09758837521076202, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 35760 + }, + { + "epoch": 0.13827681650198698, + "grad_norm": 0.11418548226356506, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 35770 + }, + { + "epoch": 0.13831547370537026, + "grad_norm": 0.0981953963637352, + "learning_rate": 0.002, + "loss": 2.348, + "step": 35780 + }, + { + "epoch": 0.13835413090875354, + "grad_norm": 0.10302402824163437, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 35790 + }, + { + "epoch": 0.13839278811213682, + "grad_norm": 0.14570669829845428, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 35800 + }, + { + "epoch": 0.1384314453155201, + "grad_norm": 0.1252729445695877, + "learning_rate": 0.002, + "loss": 2.3825, + "step": 35810 + }, + { + "epoch": 0.13847010251890338, + "grad_norm": 0.10477358847856522, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 35820 + }, + { + "epoch": 0.13850875972228666, + "grad_norm": 0.10391885787248611, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 35830 + }, + { + "epoch": 0.13854741692566994, + "grad_norm": 0.11140663921833038, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 35840 + }, + { + "epoch": 0.13858607412905322, + "grad_norm": 0.11339595913887024, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 35850 + }, + { + "epoch": 0.1386247313324365, + "grad_norm": 0.11354203522205353, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 35860 + }, + { + "epoch": 0.13866338853581978, + "grad_norm": 0.10377524793148041, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 35870 + }, + { + "epoch": 0.13870204573920306, + "grad_norm": 0.11121828854084015, + "learning_rate": 0.002, + "loss": 2.394, + "step": 35880 + }, + { + "epoch": 0.1387407029425863, + "grad_norm": 0.107803113758564, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 35890 + }, + { + "epoch": 0.1387793601459696, + "grad_norm": 0.12699300050735474, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 35900 + }, + { + "epoch": 0.13881801734935287, + "grad_norm": 0.11092286556959152, + "learning_rate": 0.002, + "loss": 2.376, + "step": 35910 + }, + { + "epoch": 0.13885667455273615, + "grad_norm": 0.10627099871635437, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 35920 + }, + { + "epoch": 0.13889533175611943, + "grad_norm": 0.09759150445461273, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 35930 + }, + { + "epoch": 0.1389339889595027, + "grad_norm": 0.134559765458107, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 35940 + }, + { + "epoch": 0.138972646162886, + "grad_norm": 0.10734544694423676, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 35950 + }, + { + "epoch": 0.13901130336626927, + "grad_norm": 0.09406106173992157, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 35960 + }, + { + "epoch": 0.13904996056965255, + "grad_norm": 0.09491714090108871, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 35970 + }, + { + "epoch": 0.13908861777303583, + "grad_norm": 0.1088794469833374, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 35980 + }, + { + "epoch": 0.1391272749764191, + "grad_norm": 0.11808019876480103, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 35990 + }, + { + "epoch": 0.1391659321798024, + "grad_norm": 0.11798539012670517, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 36000 + }, + { + "epoch": 0.13920458938318567, + "grad_norm": 0.11974812299013138, + "learning_rate": 0.002, + "loss": 2.373, + "step": 36010 + }, + { + "epoch": 0.13924324658656895, + "grad_norm": 0.10560175031423569, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 36020 + }, + { + "epoch": 0.13928190378995223, + "grad_norm": 0.10788323730230331, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 36030 + }, + { + "epoch": 0.1393205609933355, + "grad_norm": 0.10671960562467575, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 36040 + }, + { + "epoch": 0.1393592181967188, + "grad_norm": 0.10810962319374084, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 36050 + }, + { + "epoch": 0.13939787540010207, + "grad_norm": 0.14038975536823273, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 36060 + }, + { + "epoch": 0.13943653260348535, + "grad_norm": 0.11312098801136017, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 36070 + }, + { + "epoch": 0.1394751898068686, + "grad_norm": 0.11241593956947327, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 36080 + }, + { + "epoch": 0.13951384701025188, + "grad_norm": 0.11558341234922409, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 36090 + }, + { + "epoch": 0.13955250421363516, + "grad_norm": 0.11133956909179688, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 36100 + }, + { + "epoch": 0.13959116141701844, + "grad_norm": 0.10750514268875122, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 36110 + }, + { + "epoch": 0.13962981862040172, + "grad_norm": 0.1164507046341896, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 36120 + }, + { + "epoch": 0.139668475823785, + "grad_norm": 0.10103576630353928, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 36130 + }, + { + "epoch": 0.13970713302716828, + "grad_norm": 0.12037768959999084, + "learning_rate": 0.002, + "loss": 2.372, + "step": 36140 + }, + { + "epoch": 0.13974579023055156, + "grad_norm": 0.0958688035607338, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 36150 + }, + { + "epoch": 0.13978444743393484, + "grad_norm": 0.1011674776673317, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 36160 + }, + { + "epoch": 0.13982310463731812, + "grad_norm": 0.12649409472942352, + "learning_rate": 0.002, + "loss": 2.366, + "step": 36170 + }, + { + "epoch": 0.1398617618407014, + "grad_norm": 0.10558871924877167, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 36180 + }, + { + "epoch": 0.13990041904408468, + "grad_norm": 0.1151675283908844, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 36190 + }, + { + "epoch": 0.13993907624746796, + "grad_norm": 0.09600713849067688, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 36200 + }, + { + "epoch": 0.13997773345085124, + "grad_norm": 0.1276940405368805, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 36210 + }, + { + "epoch": 0.14001639065423452, + "grad_norm": 0.11435941606760025, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 36220 + }, + { + "epoch": 0.1400550478576178, + "grad_norm": 0.11217609792947769, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 36230 + }, + { + "epoch": 0.14009370506100108, + "grad_norm": 0.09375467151403427, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 36240 + }, + { + "epoch": 0.14013236226438436, + "grad_norm": 0.1384739875793457, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 36250 + }, + { + "epoch": 0.1401710194677676, + "grad_norm": 0.11851988732814789, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 36260 + }, + { + "epoch": 0.1402096766711509, + "grad_norm": 0.1143161803483963, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 36270 + }, + { + "epoch": 0.14024833387453417, + "grad_norm": 0.09635128825902939, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 36280 + }, + { + "epoch": 0.14028699107791745, + "grad_norm": 0.092675119638443, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 36290 + }, + { + "epoch": 0.14032564828130073, + "grad_norm": 0.10740847885608673, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 36300 + }, + { + "epoch": 0.140364305484684, + "grad_norm": 0.1300441473722458, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 36310 + }, + { + "epoch": 0.1404029626880673, + "grad_norm": 0.11544227600097656, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 36320 + }, + { + "epoch": 0.14044161989145057, + "grad_norm": 0.12911774218082428, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 36330 + }, + { + "epoch": 0.14048027709483385, + "grad_norm": 0.10501087456941605, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 36340 + }, + { + "epoch": 0.14051893429821713, + "grad_norm": 0.11758385598659515, + "learning_rate": 0.002, + "loss": 2.363, + "step": 36350 + }, + { + "epoch": 0.1405575915016004, + "grad_norm": 0.11279705911874771, + "learning_rate": 0.002, + "loss": 2.363, + "step": 36360 + }, + { + "epoch": 0.1405962487049837, + "grad_norm": 0.11421018093824387, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 36370 + }, + { + "epoch": 0.14063490590836697, + "grad_norm": 0.10318736732006073, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 36380 + }, + { + "epoch": 0.14067356311175025, + "grad_norm": 0.10576699674129486, + "learning_rate": 0.002, + "loss": 2.377, + "step": 36390 + }, + { + "epoch": 0.14071222031513353, + "grad_norm": 0.10628781467676163, + "learning_rate": 0.002, + "loss": 2.3837, + "step": 36400 + }, + { + "epoch": 0.1407508775185168, + "grad_norm": 0.09710478782653809, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 36410 + }, + { + "epoch": 0.1407895347219001, + "grad_norm": 0.11761929094791412, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 36420 + }, + { + "epoch": 0.14082819192528337, + "grad_norm": 0.11446376144886017, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 36430 + }, + { + "epoch": 0.14086684912866665, + "grad_norm": 0.10575418919324875, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 36440 + }, + { + "epoch": 0.1409055063320499, + "grad_norm": 0.10004209727048874, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 36450 + }, + { + "epoch": 0.14094416353543318, + "grad_norm": 0.11382671445608139, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 36460 + }, + { + "epoch": 0.14098282073881646, + "grad_norm": 0.11979065090417862, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 36470 + }, + { + "epoch": 0.14102147794219974, + "grad_norm": 0.09754132479429245, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 36480 + }, + { + "epoch": 0.14106013514558302, + "grad_norm": 0.1170993447303772, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 36490 + }, + { + "epoch": 0.1410987923489663, + "grad_norm": 0.09756185859441757, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 36500 + }, + { + "epoch": 0.14113744955234958, + "grad_norm": 0.12066052854061127, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 36510 + }, + { + "epoch": 0.14117610675573286, + "grad_norm": 0.09300950914621353, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 36520 + }, + { + "epoch": 0.14121476395911614, + "grad_norm": 0.10591083765029907, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 36530 + }, + { + "epoch": 0.14125342116249942, + "grad_norm": 0.11248134076595306, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 36540 + }, + { + "epoch": 0.1412920783658827, + "grad_norm": 0.1965792328119278, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 36550 + }, + { + "epoch": 0.14133073556926598, + "grad_norm": 0.11744797229766846, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 36560 + }, + { + "epoch": 0.14136939277264926, + "grad_norm": 0.10491868853569031, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 36570 + }, + { + "epoch": 0.14140804997603254, + "grad_norm": 0.11006952077150345, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 36580 + }, + { + "epoch": 0.14144670717941582, + "grad_norm": 0.10664433985948563, + "learning_rate": 0.002, + "loss": 2.3863, + "step": 36590 + }, + { + "epoch": 0.1414853643827991, + "grad_norm": 0.10997500270605087, + "learning_rate": 0.002, + "loss": 2.3784, + "step": 36600 + }, + { + "epoch": 0.14152402158618238, + "grad_norm": 0.10684429109096527, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 36610 + }, + { + "epoch": 0.14156267878956566, + "grad_norm": 0.11024966835975647, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 36620 + }, + { + "epoch": 0.1416013359929489, + "grad_norm": 0.10300753265619278, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 36630 + }, + { + "epoch": 0.1416399931963322, + "grad_norm": 0.2539750635623932, + "learning_rate": 0.002, + "loss": 2.373, + "step": 36640 + }, + { + "epoch": 0.14167865039971547, + "grad_norm": 0.11048327386379242, + "learning_rate": 0.002, + "loss": 2.368, + "step": 36650 + }, + { + "epoch": 0.14171730760309875, + "grad_norm": 0.10366856306791306, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 36660 + }, + { + "epoch": 0.14175596480648203, + "grad_norm": 0.09625563025474548, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 36670 + }, + { + "epoch": 0.1417946220098653, + "grad_norm": 0.10819088667631149, + "learning_rate": 0.002, + "loss": 2.37, + "step": 36680 + }, + { + "epoch": 0.1418332792132486, + "grad_norm": 0.11163844913244247, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 36690 + }, + { + "epoch": 0.14187193641663187, + "grad_norm": 0.13152261078357697, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 36700 + }, + { + "epoch": 0.14191059362001515, + "grad_norm": 0.11616198718547821, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 36710 + }, + { + "epoch": 0.14194925082339843, + "grad_norm": 0.11571443825960159, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 36720 + }, + { + "epoch": 0.1419879080267817, + "grad_norm": 0.1276528388261795, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 36730 + }, + { + "epoch": 0.142026565230165, + "grad_norm": 0.11477814614772797, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 36740 + }, + { + "epoch": 0.14206522243354827, + "grad_norm": 0.11127634346485138, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 36750 + }, + { + "epoch": 0.14210387963693155, + "grad_norm": 0.11378784477710724, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 36760 + }, + { + "epoch": 0.14214253684031483, + "grad_norm": 0.09223242849111557, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 36770 + }, + { + "epoch": 0.1421811940436981, + "grad_norm": 0.1084481030702591, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 36780 + }, + { + "epoch": 0.1422198512470814, + "grad_norm": 0.12222377210855484, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 36790 + }, + { + "epoch": 0.14225850845046467, + "grad_norm": 0.10207165777683258, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 36800 + }, + { + "epoch": 0.14229716565384795, + "grad_norm": 0.13364824652671814, + "learning_rate": 0.002, + "loss": 2.348, + "step": 36810 + }, + { + "epoch": 0.1423358228572312, + "grad_norm": 0.10581555217504501, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 36820 + }, + { + "epoch": 0.14237448006061448, + "grad_norm": 0.10019327700138092, + "learning_rate": 0.002, + "loss": 2.3839, + "step": 36830 + }, + { + "epoch": 0.14241313726399776, + "grad_norm": 0.11175578832626343, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 36840 + }, + { + "epoch": 0.14245179446738104, + "grad_norm": 0.12977084517478943, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 36850 + }, + { + "epoch": 0.14249045167076432, + "grad_norm": 0.10484184324741364, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 36860 + }, + { + "epoch": 0.1425291088741476, + "grad_norm": 0.11148852109909058, + "learning_rate": 0.002, + "loss": 2.364, + "step": 36870 + }, + { + "epoch": 0.14256776607753088, + "grad_norm": 0.12150207161903381, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 36880 + }, + { + "epoch": 0.14260642328091416, + "grad_norm": 0.3142203986644745, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 36890 + }, + { + "epoch": 0.14264508048429744, + "grad_norm": 0.1186809092760086, + "learning_rate": 0.002, + "loss": 2.358, + "step": 36900 + }, + { + "epoch": 0.14268373768768072, + "grad_norm": 0.11978371441364288, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 36910 + }, + { + "epoch": 0.142722394891064, + "grad_norm": 0.11863017082214355, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 36920 + }, + { + "epoch": 0.14276105209444728, + "grad_norm": 0.09911804646253586, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 36930 + }, + { + "epoch": 0.14279970929783056, + "grad_norm": 0.10414470732212067, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 36940 + }, + { + "epoch": 0.14283836650121384, + "grad_norm": 0.11176794022321701, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 36950 + }, + { + "epoch": 0.14287702370459712, + "grad_norm": 0.09463711827993393, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 36960 + }, + { + "epoch": 0.1429156809079804, + "grad_norm": 0.10849887132644653, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 36970 + }, + { + "epoch": 0.14295433811136368, + "grad_norm": 0.11662127822637558, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 36980 + }, + { + "epoch": 0.14299299531474696, + "grad_norm": 0.10456910729408264, + "learning_rate": 0.002, + "loss": 2.3852, + "step": 36990 + }, + { + "epoch": 0.14303165251813021, + "grad_norm": 0.09997577965259552, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 37000 + }, + { + "epoch": 0.1430703097215135, + "grad_norm": 0.09770150482654572, + "learning_rate": 0.002, + "loss": 2.3876, + "step": 37010 + }, + { + "epoch": 0.14310896692489677, + "grad_norm": 0.10815806686878204, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 37020 + }, + { + "epoch": 0.14314762412828005, + "grad_norm": 0.10741622745990753, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 37030 + }, + { + "epoch": 0.14318628133166333, + "grad_norm": 0.09064080566167831, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 37040 + }, + { + "epoch": 0.14322493853504661, + "grad_norm": 0.12310798466205597, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 37050 + }, + { + "epoch": 0.1432635957384299, + "grad_norm": 0.11956330388784409, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 37060 + }, + { + "epoch": 0.14330225294181317, + "grad_norm": 0.09280236810445786, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 37070 + }, + { + "epoch": 0.14334091014519645, + "grad_norm": 0.10639739781618118, + "learning_rate": 0.002, + "loss": 2.3883, + "step": 37080 + }, + { + "epoch": 0.14337956734857973, + "grad_norm": 0.10321924835443497, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 37090 + }, + { + "epoch": 0.14341822455196301, + "grad_norm": 0.09975964576005936, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 37100 + }, + { + "epoch": 0.1434568817553463, + "grad_norm": 0.11580091714859009, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 37110 + }, + { + "epoch": 0.14349553895872957, + "grad_norm": 0.10225492715835571, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 37120 + }, + { + "epoch": 0.14353419616211285, + "grad_norm": 0.10724704712629318, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 37130 + }, + { + "epoch": 0.14357285336549613, + "grad_norm": 0.09924058616161346, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 37140 + }, + { + "epoch": 0.1436115105688794, + "grad_norm": 0.11980696767568588, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 37150 + }, + { + "epoch": 0.1436501677722627, + "grad_norm": 0.11811673641204834, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 37160 + }, + { + "epoch": 0.14368882497564597, + "grad_norm": 0.11168906837701797, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 37170 + }, + { + "epoch": 0.14372748217902925, + "grad_norm": 0.09915437549352646, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 37180 + }, + { + "epoch": 0.1437661393824125, + "grad_norm": 0.13308192789554596, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 37190 + }, + { + "epoch": 0.14380479658579579, + "grad_norm": 0.11475107818841934, + "learning_rate": 0.002, + "loss": 2.383, + "step": 37200 + }, + { + "epoch": 0.14384345378917907, + "grad_norm": 0.09859679639339447, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 37210 + }, + { + "epoch": 0.14388211099256235, + "grad_norm": 0.12095901370048523, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 37220 + }, + { + "epoch": 0.14392076819594563, + "grad_norm": 0.09938090294599533, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 37230 + }, + { + "epoch": 0.1439594253993289, + "grad_norm": 0.1443241834640503, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 37240 + }, + { + "epoch": 0.14399808260271219, + "grad_norm": 0.12056610733270645, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 37250 + }, + { + "epoch": 0.14403673980609547, + "grad_norm": 0.11923497915267944, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 37260 + }, + { + "epoch": 0.14407539700947875, + "grad_norm": 0.1033424586057663, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 37270 + }, + { + "epoch": 0.14411405421286202, + "grad_norm": 0.10121098160743713, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 37280 + }, + { + "epoch": 0.1441527114162453, + "grad_norm": 0.11377987265586853, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 37290 + }, + { + "epoch": 0.14419136861962858, + "grad_norm": 0.09906457364559174, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 37300 + }, + { + "epoch": 0.14423002582301186, + "grad_norm": 0.0996997058391571, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 37310 + }, + { + "epoch": 0.14426868302639514, + "grad_norm": 0.1009167730808258, + "learning_rate": 0.002, + "loss": 2.369, + "step": 37320 + }, + { + "epoch": 0.14430734022977842, + "grad_norm": 0.11943599581718445, + "learning_rate": 0.002, + "loss": 2.37, + "step": 37330 + }, + { + "epoch": 0.1443459974331617, + "grad_norm": 0.10313688963651657, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 37340 + }, + { + "epoch": 0.14438465463654498, + "grad_norm": 0.11535441875457764, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 37350 + }, + { + "epoch": 0.14442331183992826, + "grad_norm": 0.12095227837562561, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 37360 + }, + { + "epoch": 0.14446196904331154, + "grad_norm": 0.12098788470029831, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 37370 + }, + { + "epoch": 0.1445006262466948, + "grad_norm": 0.0900544747710228, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 37380 + }, + { + "epoch": 0.14453928345007808, + "grad_norm": 0.1258208453655243, + "learning_rate": 0.002, + "loss": 2.366, + "step": 37390 + }, + { + "epoch": 0.14457794065346136, + "grad_norm": 0.10623008757829666, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 37400 + }, + { + "epoch": 0.14461659785684464, + "grad_norm": 0.11886344105005264, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 37410 + }, + { + "epoch": 0.14465525506022792, + "grad_norm": 0.12853644788265228, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 37420 + }, + { + "epoch": 0.1446939122636112, + "grad_norm": 0.1244492307305336, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 37430 + }, + { + "epoch": 0.14473256946699448, + "grad_norm": 0.10314949601888657, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 37440 + }, + { + "epoch": 0.14477122667037776, + "grad_norm": 0.1144338920712471, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 37450 + }, + { + "epoch": 0.14480988387376104, + "grad_norm": 0.09299468994140625, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 37460 + }, + { + "epoch": 0.14484854107714432, + "grad_norm": 0.12412026524543762, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 37470 + }, + { + "epoch": 0.1448871982805276, + "grad_norm": 0.12529881298542023, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 37480 + }, + { + "epoch": 0.14492585548391088, + "grad_norm": 0.11511372774839401, + "learning_rate": 0.002, + "loss": 2.3888, + "step": 37490 + }, + { + "epoch": 0.14496451268729416, + "grad_norm": 0.1147739514708519, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 37500 + }, + { + "epoch": 0.14500316989067744, + "grad_norm": 0.08730924129486084, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 37510 + }, + { + "epoch": 0.14504182709406072, + "grad_norm": 0.11324899643659592, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 37520 + }, + { + "epoch": 0.145080484297444, + "grad_norm": 0.10321440547704697, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 37530 + }, + { + "epoch": 0.14511914150082728, + "grad_norm": 0.10298170894384384, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 37540 + }, + { + "epoch": 0.14515779870421056, + "grad_norm": 0.10055653750896454, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 37550 + }, + { + "epoch": 0.1451964559075938, + "grad_norm": 0.11173214018344879, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 37560 + }, + { + "epoch": 0.1452351131109771, + "grad_norm": 0.11200610548257828, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 37570 + }, + { + "epoch": 0.14527377031436037, + "grad_norm": 0.0937102884054184, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 37580 + }, + { + "epoch": 0.14531242751774365, + "grad_norm": 0.10634052008390427, + "learning_rate": 0.002, + "loss": 2.354, + "step": 37590 + }, + { + "epoch": 0.14535108472112693, + "grad_norm": 0.10749258100986481, + "learning_rate": 0.002, + "loss": 2.36, + "step": 37600 + }, + { + "epoch": 0.1453897419245102, + "grad_norm": 0.10672356188297272, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 37610 + }, + { + "epoch": 0.1454283991278935, + "grad_norm": 0.1124885305762291, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 37620 + }, + { + "epoch": 0.14546705633127677, + "grad_norm": 0.144533172249794, + "learning_rate": 0.002, + "loss": 2.358, + "step": 37630 + }, + { + "epoch": 0.14550571353466005, + "grad_norm": 0.12376045435667038, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 37640 + }, + { + "epoch": 0.14554437073804333, + "grad_norm": 0.11379161477088928, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 37650 + }, + { + "epoch": 0.1455830279414266, + "grad_norm": 0.10776171833276749, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 37660 + }, + { + "epoch": 0.1456216851448099, + "grad_norm": 0.1258264183998108, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 37670 + }, + { + "epoch": 0.14566034234819317, + "grad_norm": 0.10170570015907288, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 37680 + }, + { + "epoch": 0.14569899955157645, + "grad_norm": 0.10516185313463211, + "learning_rate": 0.002, + "loss": 2.353, + "step": 37690 + }, + { + "epoch": 0.14573765675495973, + "grad_norm": 0.10602464526891708, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 37700 + }, + { + "epoch": 0.145776313958343, + "grad_norm": 0.11193682253360748, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 37710 + }, + { + "epoch": 0.1458149711617263, + "grad_norm": 0.12408501654863358, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 37720 + }, + { + "epoch": 0.14585362836510957, + "grad_norm": 0.09819848090410233, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 37730 + }, + { + "epoch": 0.14589228556849285, + "grad_norm": 0.1574973613023758, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 37740 + }, + { + "epoch": 0.1459309427718761, + "grad_norm": 0.10473944991827011, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 37750 + }, + { + "epoch": 0.14596959997525938, + "grad_norm": 0.10519735515117645, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 37760 + }, + { + "epoch": 0.14600825717864266, + "grad_norm": 0.13095340132713318, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 37770 + }, + { + "epoch": 0.14604691438202594, + "grad_norm": 0.1454060673713684, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 37780 + }, + { + "epoch": 0.14608557158540922, + "grad_norm": 0.10329166799783707, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 37790 + }, + { + "epoch": 0.1461242287887925, + "grad_norm": 0.11781110614538193, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 37800 + }, + { + "epoch": 0.14616288599217578, + "grad_norm": 0.12178536504507065, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 37810 + }, + { + "epoch": 0.14620154319555906, + "grad_norm": 0.10970946401357651, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 37820 + }, + { + "epoch": 0.14624020039894234, + "grad_norm": 0.1142861470580101, + "learning_rate": 0.002, + "loss": 2.3858, + "step": 37830 + }, + { + "epoch": 0.14627885760232562, + "grad_norm": 0.10663673281669617, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 37840 + }, + { + "epoch": 0.1463175148057089, + "grad_norm": 0.10833430290222168, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 37850 + }, + { + "epoch": 0.14635617200909218, + "grad_norm": 0.10738880932331085, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 37860 + }, + { + "epoch": 0.14639482921247546, + "grad_norm": 0.11424372345209122, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 37870 + }, + { + "epoch": 0.14643348641585874, + "grad_norm": 0.10799358040094376, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 37880 + }, + { + "epoch": 0.14647214361924202, + "grad_norm": 0.14553822576999664, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 37890 + }, + { + "epoch": 0.1465108008226253, + "grad_norm": 0.11662301421165466, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 37900 + }, + { + "epoch": 0.14654945802600858, + "grad_norm": 0.11311762779951096, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 37910 + }, + { + "epoch": 0.14658811522939186, + "grad_norm": 0.12561675906181335, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 37920 + }, + { + "epoch": 0.1466267724327751, + "grad_norm": 0.12258616089820862, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 37930 + }, + { + "epoch": 0.1466654296361584, + "grad_norm": 0.09970700740814209, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 37940 + }, + { + "epoch": 0.14670408683954167, + "grad_norm": 0.1401025950908661, + "learning_rate": 0.002, + "loss": 2.365, + "step": 37950 + }, + { + "epoch": 0.14674274404292495, + "grad_norm": 0.10542020946741104, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 37960 + }, + { + "epoch": 0.14678140124630823, + "grad_norm": 0.1307201087474823, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 37970 + }, + { + "epoch": 0.1468200584496915, + "grad_norm": 0.1087263748049736, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 37980 + }, + { + "epoch": 0.1468587156530748, + "grad_norm": 0.0990528091788292, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 37990 + }, + { + "epoch": 0.14689737285645807, + "grad_norm": 0.13766375184059143, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 38000 + }, + { + "epoch": 0.14693603005984135, + "grad_norm": 0.10230547934770584, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 38010 + }, + { + "epoch": 0.14697468726322463, + "grad_norm": 0.11706888675689697, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 38020 + }, + { + "epoch": 0.1470133444666079, + "grad_norm": 0.11129742115736008, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 38030 + }, + { + "epoch": 0.1470520016699912, + "grad_norm": 0.09448976814746857, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 38040 + }, + { + "epoch": 0.14709065887337447, + "grad_norm": 0.11556555330753326, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 38050 + }, + { + "epoch": 0.14712931607675775, + "grad_norm": 0.10370432585477829, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 38060 + }, + { + "epoch": 0.14716797328014103, + "grad_norm": 0.16028402745723724, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 38070 + }, + { + "epoch": 0.1472066304835243, + "grad_norm": 0.12887880206108093, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 38080 + }, + { + "epoch": 0.1472452876869076, + "grad_norm": 0.12080951780080795, + "learning_rate": 0.002, + "loss": 2.374, + "step": 38090 + }, + { + "epoch": 0.14728394489029087, + "grad_norm": 0.11555592715740204, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 38100 + }, + { + "epoch": 0.14732260209367415, + "grad_norm": 0.09942886233329773, + "learning_rate": 0.002, + "loss": 2.3841, + "step": 38110 + }, + { + "epoch": 0.1473612592970574, + "grad_norm": 0.10487055033445358, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 38120 + }, + { + "epoch": 0.14739991650044068, + "grad_norm": 0.10219061374664307, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 38130 + }, + { + "epoch": 0.14743857370382396, + "grad_norm": 0.10599678754806519, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 38140 + }, + { + "epoch": 0.14747723090720724, + "grad_norm": 0.11388130486011505, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 38150 + }, + { + "epoch": 0.14751588811059052, + "grad_norm": 0.10307695716619492, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 38160 + }, + { + "epoch": 0.1475545453139738, + "grad_norm": 0.10115735977888107, + "learning_rate": 0.002, + "loss": 2.3798, + "step": 38170 + }, + { + "epoch": 0.14759320251735708, + "grad_norm": 0.11397730559110641, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 38180 + }, + { + "epoch": 0.14763185972074036, + "grad_norm": 0.10884783416986465, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 38190 + }, + { + "epoch": 0.14767051692412364, + "grad_norm": 0.10271456837654114, + "learning_rate": 0.002, + "loss": 2.3809, + "step": 38200 + }, + { + "epoch": 0.14770917412750692, + "grad_norm": 0.11193260550498962, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 38210 + }, + { + "epoch": 0.1477478313308902, + "grad_norm": 0.11704428493976593, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 38220 + }, + { + "epoch": 0.14778648853427348, + "grad_norm": 0.10461351275444031, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 38230 + }, + { + "epoch": 0.14782514573765676, + "grad_norm": 0.11719801276922226, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 38240 + }, + { + "epoch": 0.14786380294104004, + "grad_norm": 0.12601828575134277, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 38250 + }, + { + "epoch": 0.14790246014442332, + "grad_norm": 0.1058812215924263, + "learning_rate": 0.002, + "loss": 2.387, + "step": 38260 + }, + { + "epoch": 0.1479411173478066, + "grad_norm": 0.11499867588281631, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 38270 + }, + { + "epoch": 0.14797977455118988, + "grad_norm": 0.11680841445922852, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 38280 + }, + { + "epoch": 0.14801843175457316, + "grad_norm": 0.12532247602939606, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 38290 + }, + { + "epoch": 0.1480570889579564, + "grad_norm": 0.11021430045366287, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 38300 + }, + { + "epoch": 0.1480957461613397, + "grad_norm": 0.11453874409198761, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 38310 + }, + { + "epoch": 0.14813440336472297, + "grad_norm": 0.09985355287790298, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 38320 + }, + { + "epoch": 0.14817306056810625, + "grad_norm": 0.11640987545251846, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 38330 + }, + { + "epoch": 0.14821171777148953, + "grad_norm": 0.09893188625574112, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 38340 + }, + { + "epoch": 0.1482503749748728, + "grad_norm": 0.12688705325126648, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 38350 + }, + { + "epoch": 0.1482890321782561, + "grad_norm": 0.12036380916833878, + "learning_rate": 0.002, + "loss": 2.3825, + "step": 38360 + }, + { + "epoch": 0.14832768938163937, + "grad_norm": 0.09100010991096497, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 38370 + }, + { + "epoch": 0.14836634658502265, + "grad_norm": 0.10775777697563171, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 38380 + }, + { + "epoch": 0.14840500378840593, + "grad_norm": 0.12571115791797638, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 38390 + }, + { + "epoch": 0.1484436609917892, + "grad_norm": 0.10129281133413315, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 38400 + }, + { + "epoch": 0.1484823181951725, + "grad_norm": 0.11698244512081146, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 38410 + }, + { + "epoch": 0.14852097539855577, + "grad_norm": 0.09857890754938126, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 38420 + }, + { + "epoch": 0.14855963260193905, + "grad_norm": 0.11326782405376434, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 38430 + }, + { + "epoch": 0.14859828980532233, + "grad_norm": 0.11370991915464401, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 38440 + }, + { + "epoch": 0.1486369470087056, + "grad_norm": 0.10248290747404099, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 38450 + }, + { + "epoch": 0.1486756042120889, + "grad_norm": 0.09775995463132858, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 38460 + }, + { + "epoch": 0.14871426141547217, + "grad_norm": 0.1176404282450676, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 38470 + }, + { + "epoch": 0.14875291861885545, + "grad_norm": 0.10633568465709686, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 38480 + }, + { + "epoch": 0.1487915758222387, + "grad_norm": 0.13698548078536987, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 38490 + }, + { + "epoch": 0.14883023302562198, + "grad_norm": 0.10824945569038391, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 38500 + }, + { + "epoch": 0.14886889022900526, + "grad_norm": 0.0984220951795578, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 38510 + }, + { + "epoch": 0.14890754743238854, + "grad_norm": 0.13383355736732483, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 38520 + }, + { + "epoch": 0.14894620463577182, + "grad_norm": 0.1034078449010849, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 38530 + }, + { + "epoch": 0.1489848618391551, + "grad_norm": 0.10896123200654984, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 38540 + }, + { + "epoch": 0.14902351904253838, + "grad_norm": 0.0965457409620285, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 38550 + }, + { + "epoch": 0.14906217624592166, + "grad_norm": 0.11075472086668015, + "learning_rate": 0.002, + "loss": 2.363, + "step": 38560 + }, + { + "epoch": 0.14910083344930494, + "grad_norm": 0.128397598862648, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 38570 + }, + { + "epoch": 0.14913949065268822, + "grad_norm": 0.12091022729873657, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 38580 + }, + { + "epoch": 0.1491781478560715, + "grad_norm": 0.09868521243333817, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 38590 + }, + { + "epoch": 0.14921680505945478, + "grad_norm": 0.1057891920208931, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 38600 + }, + { + "epoch": 0.14925546226283806, + "grad_norm": 0.12372202426195145, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 38610 + }, + { + "epoch": 0.14929411946622134, + "grad_norm": 0.11062665283679962, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 38620 + }, + { + "epoch": 0.14933277666960462, + "grad_norm": 0.11425059288740158, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 38630 + }, + { + "epoch": 0.1493714338729879, + "grad_norm": 0.11539951711893082, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 38640 + }, + { + "epoch": 0.14941009107637118, + "grad_norm": 0.11890841275453568, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 38650 + }, + { + "epoch": 0.14944874827975446, + "grad_norm": 0.10240473598241806, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 38660 + }, + { + "epoch": 0.14948740548313771, + "grad_norm": 0.10921300947666168, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 38670 + }, + { + "epoch": 0.149526062686521, + "grad_norm": 0.114039845764637, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 38680 + }, + { + "epoch": 0.14956471988990427, + "grad_norm": 0.10230555385351181, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 38690 + }, + { + "epoch": 0.14960337709328755, + "grad_norm": 0.11473822593688965, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 38700 + }, + { + "epoch": 0.14964203429667083, + "grad_norm": 0.10442385077476501, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 38710 + }, + { + "epoch": 0.1496806915000541, + "grad_norm": 0.11906211078166962, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 38720 + }, + { + "epoch": 0.1497193487034374, + "grad_norm": 0.1230156272649765, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 38730 + }, + { + "epoch": 0.14975800590682067, + "grad_norm": 0.09884335100650787, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 38740 + }, + { + "epoch": 0.14979666311020395, + "grad_norm": 0.10977873206138611, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 38750 + }, + { + "epoch": 0.14983532031358723, + "grad_norm": 0.12378506362438202, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 38760 + }, + { + "epoch": 0.1498739775169705, + "grad_norm": 0.11741282045841217, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 38770 + }, + { + "epoch": 0.1499126347203538, + "grad_norm": 0.11203381419181824, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 38780 + }, + { + "epoch": 0.14995129192373707, + "grad_norm": 0.09905987977981567, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 38790 + }, + { + "epoch": 0.14998994912712035, + "grad_norm": 0.11241578310728073, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 38800 + }, + { + "epoch": 0.15002860633050363, + "grad_norm": 0.1121586337685585, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 38810 + }, + { + "epoch": 0.1500672635338869, + "grad_norm": 0.11441362649202347, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 38820 + }, + { + "epoch": 0.1501059207372702, + "grad_norm": 0.09616050124168396, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 38830 + }, + { + "epoch": 0.15014457794065347, + "grad_norm": 0.11325273662805557, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 38840 + }, + { + "epoch": 0.15018323514403675, + "grad_norm": 0.11043589562177658, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 38850 + }, + { + "epoch": 0.15022189234742, + "grad_norm": 0.11214675009250641, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 38860 + }, + { + "epoch": 0.15026054955080329, + "grad_norm": 0.11280752718448639, + "learning_rate": 0.002, + "loss": 2.376, + "step": 38870 + }, + { + "epoch": 0.15029920675418657, + "grad_norm": 0.11541387438774109, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 38880 + }, + { + "epoch": 0.15033786395756984, + "grad_norm": 0.1194639578461647, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 38890 + }, + { + "epoch": 0.15037652116095312, + "grad_norm": 0.10848919302225113, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 38900 + }, + { + "epoch": 0.1504151783643364, + "grad_norm": 0.14039918780326843, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 38910 + }, + { + "epoch": 0.15045383556771968, + "grad_norm": 0.11318860203027725, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 38920 + }, + { + "epoch": 0.15049249277110296, + "grad_norm": 0.10295028239488602, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 38930 + }, + { + "epoch": 0.15053114997448624, + "grad_norm": 0.11704400181770325, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 38940 + }, + { + "epoch": 0.15056980717786952, + "grad_norm": 0.09262275695800781, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 38950 + }, + { + "epoch": 0.1506084643812528, + "grad_norm": 0.11537528783082962, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 38960 + }, + { + "epoch": 0.15064712158463608, + "grad_norm": 0.11481942981481552, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 38970 + }, + { + "epoch": 0.15068577878801936, + "grad_norm": 0.1113208681344986, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 38980 + }, + { + "epoch": 0.15072443599140264, + "grad_norm": 0.12350216507911682, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 38990 + }, + { + "epoch": 0.15076309319478592, + "grad_norm": 0.11521659046411514, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 39000 + }, + { + "epoch": 0.1508017503981692, + "grad_norm": 0.10397952049970627, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 39010 + }, + { + "epoch": 0.15084040760155248, + "grad_norm": 0.11351260542869568, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 39020 + }, + { + "epoch": 0.15087906480493576, + "grad_norm": 0.10307390987873077, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 39030 + }, + { + "epoch": 0.15091772200831902, + "grad_norm": 0.1473456174135208, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 39040 + }, + { + "epoch": 0.1509563792117023, + "grad_norm": 0.10277149826288223, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 39050 + }, + { + "epoch": 0.15099503641508558, + "grad_norm": 0.10011863708496094, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 39060 + }, + { + "epoch": 0.15103369361846886, + "grad_norm": 0.10471244901418686, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 39070 + }, + { + "epoch": 0.15107235082185214, + "grad_norm": 0.11708759516477585, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 39080 + }, + { + "epoch": 0.15111100802523542, + "grad_norm": 0.10626459121704102, + "learning_rate": 0.002, + "loss": 2.357, + "step": 39090 + }, + { + "epoch": 0.1511496652286187, + "grad_norm": 0.11258683353662491, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 39100 + }, + { + "epoch": 0.15118832243200198, + "grad_norm": 0.11116767674684525, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 39110 + }, + { + "epoch": 0.15122697963538526, + "grad_norm": 0.10669376701116562, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 39120 + }, + { + "epoch": 0.15126563683876854, + "grad_norm": 0.10341840237379074, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 39130 + }, + { + "epoch": 0.15130429404215182, + "grad_norm": 0.1296912282705307, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 39140 + }, + { + "epoch": 0.1513429512455351, + "grad_norm": 0.15270934998989105, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 39150 + }, + { + "epoch": 0.15138160844891838, + "grad_norm": 0.10418158024549484, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 39160 + }, + { + "epoch": 0.15142026565230166, + "grad_norm": 0.11090809851884842, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 39170 + }, + { + "epoch": 0.15145892285568494, + "grad_norm": 0.10907279700040817, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 39180 + }, + { + "epoch": 0.15149758005906822, + "grad_norm": 0.11926936358213425, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 39190 + }, + { + "epoch": 0.1515362372624515, + "grad_norm": 0.12747298181056976, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 39200 + }, + { + "epoch": 0.15157489446583478, + "grad_norm": 0.11071181297302246, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 39210 + }, + { + "epoch": 0.15161355166921806, + "grad_norm": 0.09829133003950119, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 39220 + }, + { + "epoch": 0.1516522088726013, + "grad_norm": 0.12237284332513809, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 39230 + }, + { + "epoch": 0.1516908660759846, + "grad_norm": 0.10596148669719696, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 39240 + }, + { + "epoch": 0.15172952327936787, + "grad_norm": 0.09914745390415192, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 39250 + }, + { + "epoch": 0.15176818048275115, + "grad_norm": 0.09774194657802582, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 39260 + }, + { + "epoch": 0.15180683768613443, + "grad_norm": 0.12466096132993698, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 39270 + }, + { + "epoch": 0.1518454948895177, + "grad_norm": 0.10741476714611053, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 39280 + }, + { + "epoch": 0.151884152092901, + "grad_norm": 0.12783187627792358, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 39290 + }, + { + "epoch": 0.15192280929628427, + "grad_norm": 0.10841857641935349, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 39300 + }, + { + "epoch": 0.15196146649966755, + "grad_norm": 0.09851887822151184, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 39310 + }, + { + "epoch": 0.15200012370305083, + "grad_norm": 0.13779272139072418, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 39320 + }, + { + "epoch": 0.1520387809064341, + "grad_norm": 0.1023627445101738, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 39330 + }, + { + "epoch": 0.1520774381098174, + "grad_norm": 0.10356482118368149, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 39340 + }, + { + "epoch": 0.15211609531320067, + "grad_norm": 0.13198953866958618, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 39350 + }, + { + "epoch": 0.15215475251658395, + "grad_norm": 0.11287932097911835, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 39360 + }, + { + "epoch": 0.15219340971996723, + "grad_norm": 0.107682004570961, + "learning_rate": 0.002, + "loss": 2.382, + "step": 39370 + }, + { + "epoch": 0.1522320669233505, + "grad_norm": 0.0983332172036171, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 39380 + }, + { + "epoch": 0.1522707241267338, + "grad_norm": 0.10807695239782333, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 39390 + }, + { + "epoch": 0.15230938133011707, + "grad_norm": 0.1007051095366478, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 39400 + }, + { + "epoch": 0.15234803853350035, + "grad_norm": 0.11607711017131805, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 39410 + }, + { + "epoch": 0.1523866957368836, + "grad_norm": 0.10342895984649658, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 39420 + }, + { + "epoch": 0.15242535294026688, + "grad_norm": 0.09878204017877579, + "learning_rate": 0.002, + "loss": 2.362, + "step": 39430 + }, + { + "epoch": 0.15246401014365016, + "grad_norm": 0.12246676534414291, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 39440 + }, + { + "epoch": 0.15250266734703344, + "grad_norm": 0.10982445627450943, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 39450 + }, + { + "epoch": 0.15254132455041672, + "grad_norm": 0.10152310132980347, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 39460 + }, + { + "epoch": 0.1525799817538, + "grad_norm": 0.11473408341407776, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 39470 + }, + { + "epoch": 0.15261863895718328, + "grad_norm": 0.12105746567249298, + "learning_rate": 0.002, + "loss": 2.37, + "step": 39480 + }, + { + "epoch": 0.15265729616056656, + "grad_norm": 0.11079411208629608, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 39490 + }, + { + "epoch": 0.15269595336394984, + "grad_norm": 0.1170242428779602, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 39500 + }, + { + "epoch": 0.15273461056733312, + "grad_norm": 0.09397020190954208, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 39510 + }, + { + "epoch": 0.1527732677707164, + "grad_norm": 0.09887134283781052, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 39520 + }, + { + "epoch": 0.15281192497409968, + "grad_norm": 0.10110117495059967, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 39530 + }, + { + "epoch": 0.15285058217748296, + "grad_norm": 0.09449794888496399, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 39540 + }, + { + "epoch": 0.15288923938086624, + "grad_norm": 0.11885342746973038, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 39550 + }, + { + "epoch": 0.15292789658424952, + "grad_norm": 0.10996508598327637, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 39560 + }, + { + "epoch": 0.1529665537876328, + "grad_norm": 0.1292993724346161, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 39570 + }, + { + "epoch": 0.15300521099101608, + "grad_norm": 0.11036507785320282, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 39580 + }, + { + "epoch": 0.15304386819439936, + "grad_norm": 0.10646365582942963, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 39590 + }, + { + "epoch": 0.1530825253977826, + "grad_norm": 0.11589569598436356, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 39600 + }, + { + "epoch": 0.1531211826011659, + "grad_norm": 0.1215120479464531, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 39610 + }, + { + "epoch": 0.15315983980454917, + "grad_norm": 0.1089867576956749, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 39620 + }, + { + "epoch": 0.15319849700793245, + "grad_norm": 0.11142711341381073, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 39630 + }, + { + "epoch": 0.15323715421131573, + "grad_norm": 0.11389360576868057, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 39640 + }, + { + "epoch": 0.153275811414699, + "grad_norm": 0.1226469874382019, + "learning_rate": 0.002, + "loss": 2.372, + "step": 39650 + }, + { + "epoch": 0.1533144686180823, + "grad_norm": 0.11769437789916992, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 39660 + }, + { + "epoch": 0.15335312582146557, + "grad_norm": 0.12982621788978577, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 39670 + }, + { + "epoch": 0.15339178302484885, + "grad_norm": 0.11239667981863022, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 39680 + }, + { + "epoch": 0.15343044022823213, + "grad_norm": 0.1196182444691658, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 39690 + }, + { + "epoch": 0.1534690974316154, + "grad_norm": 0.1112075224518776, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 39700 + }, + { + "epoch": 0.1535077546349987, + "grad_norm": 0.10573304444551468, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 39710 + }, + { + "epoch": 0.15354641183838197, + "grad_norm": 0.11556225270032883, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 39720 + }, + { + "epoch": 0.15358506904176525, + "grad_norm": 0.10684444010257721, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 39730 + }, + { + "epoch": 0.15362372624514853, + "grad_norm": 0.1147504448890686, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 39740 + }, + { + "epoch": 0.1536623834485318, + "grad_norm": 0.11106063425540924, + "learning_rate": 0.002, + "loss": 2.3856, + "step": 39750 + }, + { + "epoch": 0.1537010406519151, + "grad_norm": 0.11069447547197342, + "learning_rate": 0.002, + "loss": 2.362, + "step": 39760 + }, + { + "epoch": 0.15373969785529837, + "grad_norm": 0.10675516724586487, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 39770 + }, + { + "epoch": 0.15377835505868165, + "grad_norm": 0.11702346801757812, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 39780 + }, + { + "epoch": 0.1538170122620649, + "grad_norm": 0.12388580292463303, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 39790 + }, + { + "epoch": 0.15385566946544818, + "grad_norm": 0.140107199549675, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 39800 + }, + { + "epoch": 0.15389432666883146, + "grad_norm": 0.09637805819511414, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 39810 + }, + { + "epoch": 0.15393298387221474, + "grad_norm": 0.10908488184213638, + "learning_rate": 0.002, + "loss": 2.363, + "step": 39820 + }, + { + "epoch": 0.15397164107559802, + "grad_norm": 0.09757030755281448, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 39830 + }, + { + "epoch": 0.1540102982789813, + "grad_norm": 0.1076817587018013, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 39840 + }, + { + "epoch": 0.15404895548236458, + "grad_norm": 0.11481721699237823, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 39850 + }, + { + "epoch": 0.15408761268574786, + "grad_norm": 0.09377177059650421, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 39860 + }, + { + "epoch": 0.15412626988913114, + "grad_norm": 0.1203828752040863, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 39870 + }, + { + "epoch": 0.15416492709251442, + "grad_norm": 0.11628779768943787, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 39880 + }, + { + "epoch": 0.1542035842958977, + "grad_norm": 0.10759463161230087, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 39890 + }, + { + "epoch": 0.15424224149928098, + "grad_norm": 0.10259198397397995, + "learning_rate": 0.002, + "loss": 2.373, + "step": 39900 + }, + { + "epoch": 0.15428089870266426, + "grad_norm": 0.10605776309967041, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 39910 + }, + { + "epoch": 0.15431955590604754, + "grad_norm": 0.5578303337097168, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 39920 + }, + { + "epoch": 0.15435821310943082, + "grad_norm": 0.13610078394412994, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 39930 + }, + { + "epoch": 0.1543968703128141, + "grad_norm": 0.10285218060016632, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 39940 + }, + { + "epoch": 0.15443552751619738, + "grad_norm": 0.10524322837591171, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 39950 + }, + { + "epoch": 0.15447418471958066, + "grad_norm": 0.10412564873695374, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 39960 + }, + { + "epoch": 0.1545128419229639, + "grad_norm": 0.11148788779973984, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 39970 + }, + { + "epoch": 0.1545514991263472, + "grad_norm": 0.1292443871498108, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 39980 + }, + { + "epoch": 0.15459015632973047, + "grad_norm": 0.08943097293376923, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 39990 + }, + { + "epoch": 0.15462881353311375, + "grad_norm": 0.09316378086805344, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 40000 + }, + { + "epoch": 0.15466747073649703, + "grad_norm": 0.11126011610031128, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 40010 + }, + { + "epoch": 0.1547061279398803, + "grad_norm": 0.11391105502843857, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 40020 + }, + { + "epoch": 0.1547447851432636, + "grad_norm": 0.10801863670349121, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 40030 + }, + { + "epoch": 0.15478344234664687, + "grad_norm": 0.09264006465673447, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 40040 + }, + { + "epoch": 0.15482209955003015, + "grad_norm": 0.10908648371696472, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 40050 + }, + { + "epoch": 0.15486075675341343, + "grad_norm": 0.11322391778230667, + "learning_rate": 0.002, + "loss": 2.37, + "step": 40060 + }, + { + "epoch": 0.1548994139567967, + "grad_norm": 0.10048907995223999, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 40070 + }, + { + "epoch": 0.15493807116018, + "grad_norm": 0.11480753123760223, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 40080 + }, + { + "epoch": 0.15497672836356327, + "grad_norm": 0.11866872012615204, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 40090 + }, + { + "epoch": 0.15501538556694655, + "grad_norm": 0.12018725275993347, + "learning_rate": 0.002, + "loss": 2.359, + "step": 40100 + }, + { + "epoch": 0.15505404277032983, + "grad_norm": 0.11524038016796112, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 40110 + }, + { + "epoch": 0.1550926999737131, + "grad_norm": 0.11069086194038391, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 40120 + }, + { + "epoch": 0.1551313571770964, + "grad_norm": 0.11821892112493515, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 40130 + }, + { + "epoch": 0.15517001438047967, + "grad_norm": 0.11484308540821075, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 40140 + }, + { + "epoch": 0.15520867158386295, + "grad_norm": 0.09898848831653595, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 40150 + }, + { + "epoch": 0.1552473287872462, + "grad_norm": 0.10578508675098419, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 40160 + }, + { + "epoch": 0.15528598599062948, + "grad_norm": 0.09843472391366959, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 40170 + }, + { + "epoch": 0.15532464319401276, + "grad_norm": 0.11496349424123764, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 40180 + }, + { + "epoch": 0.15536330039739604, + "grad_norm": 0.11298873275518417, + "learning_rate": 0.002, + "loss": 2.369, + "step": 40190 + }, + { + "epoch": 0.15540195760077932, + "grad_norm": 0.1040901467204094, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 40200 + }, + { + "epoch": 0.1554406148041626, + "grad_norm": 0.1098884865641594, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 40210 + }, + { + "epoch": 0.15547927200754588, + "grad_norm": 0.11201644688844681, + "learning_rate": 0.002, + "loss": 2.3807, + "step": 40220 + }, + { + "epoch": 0.15551792921092916, + "grad_norm": 0.12052586674690247, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 40230 + }, + { + "epoch": 0.15555658641431244, + "grad_norm": 0.09976432472467422, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 40240 + }, + { + "epoch": 0.15559524361769572, + "grad_norm": 0.09134536236524582, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 40250 + }, + { + "epoch": 0.155633900821079, + "grad_norm": 0.12106562405824661, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 40260 + }, + { + "epoch": 0.15567255802446228, + "grad_norm": 0.13005444407463074, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 40270 + }, + { + "epoch": 0.15571121522784556, + "grad_norm": 0.11198069900274277, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 40280 + }, + { + "epoch": 0.15574987243122884, + "grad_norm": 0.13218308985233307, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 40290 + }, + { + "epoch": 0.15578852963461212, + "grad_norm": 0.11125843226909637, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 40300 + }, + { + "epoch": 0.1558271868379954, + "grad_norm": 0.10655783861875534, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 40310 + }, + { + "epoch": 0.15586584404137868, + "grad_norm": 0.11717306077480316, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 40320 + }, + { + "epoch": 0.15590450124476196, + "grad_norm": 0.12472710013389587, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 40330 + }, + { + "epoch": 0.1559431584481452, + "grad_norm": 0.10040798783302307, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 40340 + }, + { + "epoch": 0.1559818156515285, + "grad_norm": 0.10789766907691956, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 40350 + }, + { + "epoch": 0.15602047285491177, + "grad_norm": 0.10105174034833908, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 40360 + }, + { + "epoch": 0.15605913005829505, + "grad_norm": 0.10405503958463669, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 40370 + }, + { + "epoch": 0.15609778726167833, + "grad_norm": 0.12392150610685349, + "learning_rate": 0.002, + "loss": 2.357, + "step": 40380 + }, + { + "epoch": 0.1561364444650616, + "grad_norm": 0.11654236912727356, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 40390 + }, + { + "epoch": 0.1561751016684449, + "grad_norm": 0.10785891860723495, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 40400 + }, + { + "epoch": 0.15621375887182817, + "grad_norm": 0.10248953849077225, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 40410 + }, + { + "epoch": 0.15625241607521145, + "grad_norm": 0.09905094653367996, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 40420 + }, + { + "epoch": 0.15629107327859473, + "grad_norm": 0.11263404786586761, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 40430 + }, + { + "epoch": 0.156329730481978, + "grad_norm": 0.10626372694969177, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 40440 + }, + { + "epoch": 0.1563683876853613, + "grad_norm": 0.10268019884824753, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 40450 + }, + { + "epoch": 0.15640704488874457, + "grad_norm": 0.1138499453663826, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 40460 + }, + { + "epoch": 0.15644570209212785, + "grad_norm": 0.11354564130306244, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 40470 + }, + { + "epoch": 0.15648435929551113, + "grad_norm": 0.10203731805086136, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 40480 + }, + { + "epoch": 0.1565230164988944, + "grad_norm": 0.14382880926132202, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 40490 + }, + { + "epoch": 0.1565616737022777, + "grad_norm": 0.1070975810289383, + "learning_rate": 0.002, + "loss": 2.367, + "step": 40500 + }, + { + "epoch": 0.15660033090566097, + "grad_norm": 0.10172320157289505, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 40510 + }, + { + "epoch": 0.15663898810904425, + "grad_norm": 0.13020893931388855, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 40520 + }, + { + "epoch": 0.1566776453124275, + "grad_norm": 0.11600089818239212, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 40530 + }, + { + "epoch": 0.15671630251581078, + "grad_norm": 0.12256909906864166, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 40540 + }, + { + "epoch": 0.15675495971919406, + "grad_norm": 0.11398594826459885, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 40550 + }, + { + "epoch": 0.15679361692257734, + "grad_norm": 0.10339022427797318, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 40560 + }, + { + "epoch": 0.15683227412596062, + "grad_norm": 0.11490005254745483, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 40570 + }, + { + "epoch": 0.1568709313293439, + "grad_norm": 0.10776352137327194, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 40580 + }, + { + "epoch": 0.15690958853272718, + "grad_norm": 0.09247356653213501, + "learning_rate": 0.002, + "loss": 2.3831, + "step": 40590 + }, + { + "epoch": 0.15694824573611046, + "grad_norm": 0.10917262732982635, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 40600 + }, + { + "epoch": 0.15698690293949374, + "grad_norm": 0.12253812700510025, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 40610 + }, + { + "epoch": 0.15702556014287702, + "grad_norm": 0.09024526178836823, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 40620 + }, + { + "epoch": 0.1570642173462603, + "grad_norm": 0.1180238425731659, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 40630 + }, + { + "epoch": 0.15710287454964358, + "grad_norm": 0.10546514391899109, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 40640 + }, + { + "epoch": 0.15714153175302686, + "grad_norm": 0.10448755323886871, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 40650 + }, + { + "epoch": 0.15718018895641014, + "grad_norm": 0.09461455792188644, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 40660 + }, + { + "epoch": 0.15721884615979342, + "grad_norm": 0.11377495527267456, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 40670 + }, + { + "epoch": 0.1572575033631767, + "grad_norm": 0.11607875674962997, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 40680 + }, + { + "epoch": 0.15729616056655998, + "grad_norm": 0.10454504936933517, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 40690 + }, + { + "epoch": 0.15733481776994326, + "grad_norm": 0.10954848676919937, + "learning_rate": 0.002, + "loss": 2.355, + "step": 40700 + }, + { + "epoch": 0.15737347497332652, + "grad_norm": 0.18937352299690247, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 40710 + }, + { + "epoch": 0.1574121321767098, + "grad_norm": 0.09121294319629669, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 40720 + }, + { + "epoch": 0.15745078938009308, + "grad_norm": 0.10851619392633438, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 40730 + }, + { + "epoch": 0.15748944658347636, + "grad_norm": 0.1146823838353157, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 40740 + }, + { + "epoch": 0.15752810378685964, + "grad_norm": 0.11195283383131027, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 40750 + }, + { + "epoch": 0.15756676099024292, + "grad_norm": 0.09316971898078918, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 40760 + }, + { + "epoch": 0.1576054181936262, + "grad_norm": 0.11370929330587387, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 40770 + }, + { + "epoch": 0.15764407539700948, + "grad_norm": 0.11564512550830841, + "learning_rate": 0.002, + "loss": 2.371, + "step": 40780 + }, + { + "epoch": 0.15768273260039276, + "grad_norm": 0.11623459309339523, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 40790 + }, + { + "epoch": 0.15772138980377604, + "grad_norm": 0.09869327396154404, + "learning_rate": 0.002, + "loss": 2.344, + "step": 40800 + }, + { + "epoch": 0.15776004700715932, + "grad_norm": 0.12094533443450928, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 40810 + }, + { + "epoch": 0.1577987042105426, + "grad_norm": 0.10380878299474716, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 40820 + }, + { + "epoch": 0.15783736141392588, + "grad_norm": 0.1145130842924118, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 40830 + }, + { + "epoch": 0.15787601861730916, + "grad_norm": 0.109002023935318, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 40840 + }, + { + "epoch": 0.15791467582069244, + "grad_norm": 0.1000434085726738, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 40850 + }, + { + "epoch": 0.15795333302407571, + "grad_norm": 0.1145804226398468, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 40860 + }, + { + "epoch": 0.157991990227459, + "grad_norm": 0.09565671533346176, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 40870 + }, + { + "epoch": 0.15803064743084227, + "grad_norm": 0.09479733556509018, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 40880 + }, + { + "epoch": 0.15806930463422555, + "grad_norm": 0.09191577136516571, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 40890 + }, + { + "epoch": 0.1581079618376088, + "grad_norm": 0.10246901214122772, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 40900 + }, + { + "epoch": 0.1581466190409921, + "grad_norm": 0.11019384860992432, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 40910 + }, + { + "epoch": 0.15818527624437537, + "grad_norm": 0.10496606677770615, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 40920 + }, + { + "epoch": 0.15822393344775865, + "grad_norm": 0.12050442397594452, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 40930 + }, + { + "epoch": 0.15826259065114193, + "grad_norm": 0.1075400784611702, + "learning_rate": 0.002, + "loss": 2.3835, + "step": 40940 + }, + { + "epoch": 0.1583012478545252, + "grad_norm": 0.10765066742897034, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 40950 + }, + { + "epoch": 0.1583399050579085, + "grad_norm": 0.1123281940817833, + "learning_rate": 0.002, + "loss": 2.359, + "step": 40960 + }, + { + "epoch": 0.15837856226129177, + "grad_norm": 0.12240695208311081, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 40970 + }, + { + "epoch": 0.15841721946467505, + "grad_norm": 0.12359564006328583, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 40980 + }, + { + "epoch": 0.15845587666805833, + "grad_norm": 0.12420374900102615, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 40990 + }, + { + "epoch": 0.1584945338714416, + "grad_norm": 0.10608118772506714, + "learning_rate": 0.002, + "loss": 2.392, + "step": 41000 + }, + { + "epoch": 0.1585331910748249, + "grad_norm": 0.11344565451145172, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 41010 + }, + { + "epoch": 0.15857184827820817, + "grad_norm": 0.10235242545604706, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 41020 + }, + { + "epoch": 0.15861050548159145, + "grad_norm": 0.11939121037721634, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 41030 + }, + { + "epoch": 0.15864916268497473, + "grad_norm": 0.1067543774843216, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 41040 + }, + { + "epoch": 0.158687819888358, + "grad_norm": 0.10151252895593643, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 41050 + }, + { + "epoch": 0.15872647709174129, + "grad_norm": 0.10624859482049942, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 41060 + }, + { + "epoch": 0.15876513429512457, + "grad_norm": 0.11090574413537979, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 41070 + }, + { + "epoch": 0.15880379149850785, + "grad_norm": 0.10871037095785141, + "learning_rate": 0.002, + "loss": 2.357, + "step": 41080 + }, + { + "epoch": 0.1588424487018911, + "grad_norm": 0.094578817486763, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 41090 + }, + { + "epoch": 0.15888110590527438, + "grad_norm": 0.12802806496620178, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 41100 + }, + { + "epoch": 0.15891976310865766, + "grad_norm": 0.15294228494167328, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 41110 + }, + { + "epoch": 0.15895842031204094, + "grad_norm": 0.11911449581384659, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 41120 + }, + { + "epoch": 0.15899707751542422, + "grad_norm": 0.10868462920188904, + "learning_rate": 0.002, + "loss": 2.368, + "step": 41130 + }, + { + "epoch": 0.1590357347188075, + "grad_norm": 0.1078554317355156, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 41140 + }, + { + "epoch": 0.15907439192219078, + "grad_norm": 0.11344462633132935, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 41150 + }, + { + "epoch": 0.15911304912557406, + "grad_norm": 0.10913313180208206, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 41160 + }, + { + "epoch": 0.15915170632895734, + "grad_norm": 0.094475157558918, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 41170 + }, + { + "epoch": 0.15919036353234062, + "grad_norm": 0.10378986597061157, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 41180 + }, + { + "epoch": 0.1592290207357239, + "grad_norm": 0.12117094546556473, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 41190 + }, + { + "epoch": 0.15926767793910718, + "grad_norm": 0.10164425522089005, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 41200 + }, + { + "epoch": 0.15930633514249046, + "grad_norm": 0.11677688360214233, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 41210 + }, + { + "epoch": 0.15934499234587374, + "grad_norm": 0.10177718847990036, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 41220 + }, + { + "epoch": 0.15938364954925702, + "grad_norm": 0.12472458928823471, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 41230 + }, + { + "epoch": 0.1594223067526403, + "grad_norm": 0.09913904964923859, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 41240 + }, + { + "epoch": 0.15946096395602358, + "grad_norm": 0.11345645785331726, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 41250 + }, + { + "epoch": 0.15949962115940686, + "grad_norm": 0.10212550312280655, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 41260 + }, + { + "epoch": 0.1595382783627901, + "grad_norm": 0.09881438314914703, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 41270 + }, + { + "epoch": 0.1595769355661734, + "grad_norm": 0.108587846159935, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 41280 + }, + { + "epoch": 0.15961559276955667, + "grad_norm": 0.12372417002916336, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 41290 + }, + { + "epoch": 0.15965424997293995, + "grad_norm": 0.11009825766086578, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 41300 + }, + { + "epoch": 0.15969290717632323, + "grad_norm": 0.09615076333284378, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 41310 + }, + { + "epoch": 0.1597315643797065, + "grad_norm": 0.10243486613035202, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 41320 + }, + { + "epoch": 0.1597702215830898, + "grad_norm": 0.10789047926664352, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 41330 + }, + { + "epoch": 0.15980887878647307, + "grad_norm": 0.10097122937440872, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 41340 + }, + { + "epoch": 0.15984753598985635, + "grad_norm": 0.11414719372987747, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 41350 + }, + { + "epoch": 0.15988619319323963, + "grad_norm": 0.11628225445747375, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 41360 + }, + { + "epoch": 0.1599248503966229, + "grad_norm": 0.10371656715869904, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 41370 + }, + { + "epoch": 0.1599635076000062, + "grad_norm": 0.10703284293413162, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 41380 + }, + { + "epoch": 0.16000216480338947, + "grad_norm": 0.1234053373336792, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 41390 + }, + { + "epoch": 0.16004082200677275, + "grad_norm": 0.11645738780498505, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 41400 + }, + { + "epoch": 0.16007947921015603, + "grad_norm": 0.10837571322917938, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 41410 + }, + { + "epoch": 0.1601181364135393, + "grad_norm": 0.11016348004341125, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 41420 + }, + { + "epoch": 0.1601567936169226, + "grad_norm": 0.11296390742063522, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 41430 + }, + { + "epoch": 0.16019545082030587, + "grad_norm": 0.11930079013109207, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 41440 + }, + { + "epoch": 0.16023410802368915, + "grad_norm": 0.10097251832485199, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 41450 + }, + { + "epoch": 0.1602727652270724, + "grad_norm": 0.10060349106788635, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 41460 + }, + { + "epoch": 0.16031142243045568, + "grad_norm": 0.10195745527744293, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 41470 + }, + { + "epoch": 0.16035007963383896, + "grad_norm": 0.10063726454973221, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 41480 + }, + { + "epoch": 0.16038873683722224, + "grad_norm": 0.11097043007612228, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 41490 + }, + { + "epoch": 0.16042739404060552, + "grad_norm": 0.10367852449417114, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 41500 + }, + { + "epoch": 0.1604660512439888, + "grad_norm": 0.1086919978260994, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 41510 + }, + { + "epoch": 0.16050470844737208, + "grad_norm": 0.10193134099245071, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 41520 + }, + { + "epoch": 0.16054336565075536, + "grad_norm": 0.10302462428808212, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 41530 + }, + { + "epoch": 0.16058202285413864, + "grad_norm": 0.11917942762374878, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 41540 + }, + { + "epoch": 0.16062068005752192, + "grad_norm": 0.11485613137483597, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 41550 + }, + { + "epoch": 0.1606593372609052, + "grad_norm": 0.11260788142681122, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 41560 + }, + { + "epoch": 0.16069799446428848, + "grad_norm": 0.10732334852218628, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 41570 + }, + { + "epoch": 0.16073665166767176, + "grad_norm": 0.12386949360370636, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 41580 + }, + { + "epoch": 0.16077530887105504, + "grad_norm": 0.11389769613742828, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 41590 + }, + { + "epoch": 0.16081396607443832, + "grad_norm": 0.10484343022108078, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 41600 + }, + { + "epoch": 0.1608526232778216, + "grad_norm": 0.11448004841804504, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 41610 + }, + { + "epoch": 0.16089128048120488, + "grad_norm": 0.1158195436000824, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 41620 + }, + { + "epoch": 0.16092993768458816, + "grad_norm": 0.10200965404510498, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 41630 + }, + { + "epoch": 0.1609685948879714, + "grad_norm": 0.1114157885313034, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 41640 + }, + { + "epoch": 0.1610072520913547, + "grad_norm": 0.14126089215278625, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 41650 + }, + { + "epoch": 0.16104590929473797, + "grad_norm": 0.09708284586668015, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 41660 + }, + { + "epoch": 0.16108456649812125, + "grad_norm": 0.10202867537736893, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 41670 + }, + { + "epoch": 0.16112322370150453, + "grad_norm": 0.10208814591169357, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 41680 + }, + { + "epoch": 0.1611618809048878, + "grad_norm": 0.1131867989897728, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 41690 + }, + { + "epoch": 0.1612005381082711, + "grad_norm": 0.12291889637708664, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 41700 + }, + { + "epoch": 0.16123919531165437, + "grad_norm": 0.12259145081043243, + "learning_rate": 0.002, + "loss": 2.3866, + "step": 41710 + }, + { + "epoch": 0.16127785251503765, + "grad_norm": 0.12137303501367569, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 41720 + }, + { + "epoch": 0.16131650971842093, + "grad_norm": 0.09981673955917358, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 41730 + }, + { + "epoch": 0.1613551669218042, + "grad_norm": 0.09920236468315125, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 41740 + }, + { + "epoch": 0.1613938241251875, + "grad_norm": 0.10953893512487411, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 41750 + }, + { + "epoch": 0.16143248132857077, + "grad_norm": 0.12220784276723862, + "learning_rate": 0.002, + "loss": 2.358, + "step": 41760 + }, + { + "epoch": 0.16147113853195405, + "grad_norm": 0.11718578636646271, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 41770 + }, + { + "epoch": 0.16150979573533733, + "grad_norm": 0.10739953815937042, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 41780 + }, + { + "epoch": 0.1615484529387206, + "grad_norm": 0.099624864757061, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 41790 + }, + { + "epoch": 0.1615871101421039, + "grad_norm": 0.10801070928573608, + "learning_rate": 0.002, + "loss": 2.359, + "step": 41800 + }, + { + "epoch": 0.16162576734548717, + "grad_norm": 0.10810569673776627, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 41810 + }, + { + "epoch": 0.16166442454887045, + "grad_norm": 0.09924687445163727, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 41820 + }, + { + "epoch": 0.1617030817522537, + "grad_norm": 0.10863161087036133, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 41830 + }, + { + "epoch": 0.16174173895563698, + "grad_norm": 0.1171480268239975, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 41840 + }, + { + "epoch": 0.16178039615902026, + "grad_norm": 0.12051352858543396, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 41850 + }, + { + "epoch": 0.16181905336240354, + "grad_norm": 0.11164912581443787, + "learning_rate": 0.002, + "loss": 2.361, + "step": 41860 + }, + { + "epoch": 0.16185771056578682, + "grad_norm": 0.10219510644674301, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 41870 + }, + { + "epoch": 0.1618963677691701, + "grad_norm": 0.1181781142950058, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 41880 + }, + { + "epoch": 0.16193502497255338, + "grad_norm": 0.10184772312641144, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 41890 + }, + { + "epoch": 0.16197368217593666, + "grad_norm": 0.09413854032754898, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 41900 + }, + { + "epoch": 0.16201233937931994, + "grad_norm": 0.09799924492835999, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 41910 + }, + { + "epoch": 0.16205099658270322, + "grad_norm": 0.09872174263000488, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 41920 + }, + { + "epoch": 0.1620896537860865, + "grad_norm": 0.10964304208755493, + "learning_rate": 0.002, + "loss": 2.367, + "step": 41930 + }, + { + "epoch": 0.16212831098946978, + "grad_norm": 0.1020277738571167, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 41940 + }, + { + "epoch": 0.16216696819285306, + "grad_norm": 0.10835389047861099, + "learning_rate": 0.002, + "loss": 2.365, + "step": 41950 + }, + { + "epoch": 0.16220562539623634, + "grad_norm": 0.0997660756111145, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 41960 + }, + { + "epoch": 0.16224428259961962, + "grad_norm": 0.11345890164375305, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 41970 + }, + { + "epoch": 0.1622829398030029, + "grad_norm": 0.10423390567302704, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 41980 + }, + { + "epoch": 0.16232159700638618, + "grad_norm": 0.1084778755903244, + "learning_rate": 0.002, + "loss": 2.3832, + "step": 41990 + }, + { + "epoch": 0.16236025420976946, + "grad_norm": 0.09999390691518784, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 42000 + }, + { + "epoch": 0.1623989114131527, + "grad_norm": 0.10016658902168274, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 42010 + }, + { + "epoch": 0.162437568616536, + "grad_norm": 0.10784460604190826, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 42020 + }, + { + "epoch": 0.16247622581991927, + "grad_norm": 0.10587889701128006, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 42030 + }, + { + "epoch": 0.16251488302330255, + "grad_norm": 0.1207164004445076, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 42040 + }, + { + "epoch": 0.16255354022668583, + "grad_norm": 0.109578438103199, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 42050 + }, + { + "epoch": 0.1625921974300691, + "grad_norm": 0.12938733398914337, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 42060 + }, + { + "epoch": 0.1626308546334524, + "grad_norm": 0.10855179280042648, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 42070 + }, + { + "epoch": 0.16266951183683567, + "grad_norm": 0.1001514419913292, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 42080 + }, + { + "epoch": 0.16270816904021895, + "grad_norm": 0.10790702700614929, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 42090 + }, + { + "epoch": 0.16274682624360223, + "grad_norm": 0.11273425817489624, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 42100 + }, + { + "epoch": 0.1627854834469855, + "grad_norm": 0.12000437825918198, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 42110 + }, + { + "epoch": 0.1628241406503688, + "grad_norm": 0.10086032003164291, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 42120 + }, + { + "epoch": 0.16286279785375207, + "grad_norm": 0.11639446020126343, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 42130 + }, + { + "epoch": 0.16290145505713535, + "grad_norm": 0.1023893803358078, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 42140 + }, + { + "epoch": 0.16294011226051863, + "grad_norm": 0.10636291652917862, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 42150 + }, + { + "epoch": 0.1629787694639019, + "grad_norm": 0.11687823385000229, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 42160 + }, + { + "epoch": 0.1630174266672852, + "grad_norm": 0.11895740032196045, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 42170 + }, + { + "epoch": 0.16305608387066847, + "grad_norm": 0.12592259049415588, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 42180 + }, + { + "epoch": 0.16309474107405175, + "grad_norm": 0.10185597091913223, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 42190 + }, + { + "epoch": 0.163133398277435, + "grad_norm": 0.0969577431678772, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 42200 + }, + { + "epoch": 0.16317205548081828, + "grad_norm": 0.1074877604842186, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 42210 + }, + { + "epoch": 0.16321071268420156, + "grad_norm": 0.1083407774567604, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 42220 + }, + { + "epoch": 0.16324936988758484, + "grad_norm": 0.10659298300743103, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 42230 + }, + { + "epoch": 0.16328802709096812, + "grad_norm": 0.12535052001476288, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 42240 + }, + { + "epoch": 0.1633266842943514, + "grad_norm": 0.10125640034675598, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 42250 + }, + { + "epoch": 0.16336534149773468, + "grad_norm": 0.12252649664878845, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 42260 + }, + { + "epoch": 0.16340399870111796, + "grad_norm": 0.10944797098636627, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 42270 + }, + { + "epoch": 0.16344265590450124, + "grad_norm": 0.09932339191436768, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 42280 + }, + { + "epoch": 0.16348131310788452, + "grad_norm": 0.10842996090650558, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 42290 + }, + { + "epoch": 0.1635199703112678, + "grad_norm": 0.10472535341978073, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 42300 + }, + { + "epoch": 0.16355862751465108, + "grad_norm": 0.11402362585067749, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 42310 + }, + { + "epoch": 0.16359728471803436, + "grad_norm": 0.11580096185207367, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 42320 + }, + { + "epoch": 0.16363594192141764, + "grad_norm": 0.1062471866607666, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 42330 + }, + { + "epoch": 0.16367459912480092, + "grad_norm": 0.10192801803350449, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 42340 + }, + { + "epoch": 0.1637132563281842, + "grad_norm": 0.10680807381868362, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 42350 + }, + { + "epoch": 0.16375191353156748, + "grad_norm": 0.1325022429227829, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 42360 + }, + { + "epoch": 0.16379057073495076, + "grad_norm": 0.11499712616205215, + "learning_rate": 0.002, + "loss": 2.363, + "step": 42370 + }, + { + "epoch": 0.16382922793833402, + "grad_norm": 0.10338990390300751, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 42380 + }, + { + "epoch": 0.1638678851417173, + "grad_norm": 0.11115837097167969, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 42390 + }, + { + "epoch": 0.16390654234510058, + "grad_norm": 0.09730077534914017, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 42400 + }, + { + "epoch": 0.16394519954848386, + "grad_norm": 0.1082802340388298, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 42410 + }, + { + "epoch": 0.16398385675186714, + "grad_norm": 0.11049145460128784, + "learning_rate": 0.002, + "loss": 2.365, + "step": 42420 + }, + { + "epoch": 0.16402251395525042, + "grad_norm": 0.10516611486673355, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 42430 + }, + { + "epoch": 0.1640611711586337, + "grad_norm": 0.11633968353271484, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 42440 + }, + { + "epoch": 0.16409982836201698, + "grad_norm": 0.1104530468583107, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 42450 + }, + { + "epoch": 0.16413848556540026, + "grad_norm": 0.13360534608364105, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 42460 + }, + { + "epoch": 0.16417714276878353, + "grad_norm": 0.11686535179615021, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 42470 + }, + { + "epoch": 0.16421579997216681, + "grad_norm": 0.09977176785469055, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 42480 + }, + { + "epoch": 0.1642544571755501, + "grad_norm": 0.09932684898376465, + "learning_rate": 0.002, + "loss": 2.373, + "step": 42490 + }, + { + "epoch": 0.16429311437893337, + "grad_norm": 0.10714755952358246, + "learning_rate": 0.002, + "loss": 2.367, + "step": 42500 + }, + { + "epoch": 0.16433177158231665, + "grad_norm": 0.11574291437864304, + "learning_rate": 0.002, + "loss": 2.369, + "step": 42510 + }, + { + "epoch": 0.16437042878569993, + "grad_norm": 0.10226025432348251, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 42520 + }, + { + "epoch": 0.16440908598908321, + "grad_norm": 0.13221094012260437, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 42530 + }, + { + "epoch": 0.1644477431924665, + "grad_norm": 0.1127496212720871, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 42540 + }, + { + "epoch": 0.16448640039584977, + "grad_norm": 0.16812896728515625, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 42550 + }, + { + "epoch": 0.16452505759923305, + "grad_norm": 0.11325038224458694, + "learning_rate": 0.002, + "loss": 2.3892, + "step": 42560 + }, + { + "epoch": 0.1645637148026163, + "grad_norm": 0.09131856262683868, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 42570 + }, + { + "epoch": 0.1646023720059996, + "grad_norm": 0.11105062067508698, + "learning_rate": 0.002, + "loss": 2.365, + "step": 42580 + }, + { + "epoch": 0.16464102920938287, + "grad_norm": 0.1120314747095108, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 42590 + }, + { + "epoch": 0.16467968641276615, + "grad_norm": 0.119980588555336, + "learning_rate": 0.002, + "loss": 2.3821, + "step": 42600 + }, + { + "epoch": 0.16471834361614943, + "grad_norm": 0.08983515202999115, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 42610 + }, + { + "epoch": 0.1647570008195327, + "grad_norm": 0.10229698568582535, + "learning_rate": 0.002, + "loss": 2.352, + "step": 42620 + }, + { + "epoch": 0.164795658022916, + "grad_norm": 0.10783220827579498, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 42630 + }, + { + "epoch": 0.16483431522629927, + "grad_norm": 0.1008199080824852, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 42640 + }, + { + "epoch": 0.16487297242968255, + "grad_norm": 0.10238343477249146, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 42650 + }, + { + "epoch": 0.16491162963306583, + "grad_norm": 0.11489154398441315, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 42660 + }, + { + "epoch": 0.1649502868364491, + "grad_norm": 0.12892086803913116, + "learning_rate": 0.002, + "loss": 2.373, + "step": 42670 + }, + { + "epoch": 0.16498894403983239, + "grad_norm": 0.10225165635347366, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 42680 + }, + { + "epoch": 0.16502760124321567, + "grad_norm": 0.10173000395298004, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 42690 + }, + { + "epoch": 0.16506625844659895, + "grad_norm": 0.11082421243190765, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 42700 + }, + { + "epoch": 0.16510491564998223, + "grad_norm": 0.11183352023363113, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 42710 + }, + { + "epoch": 0.1651435728533655, + "grad_norm": 0.11176785081624985, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 42720 + }, + { + "epoch": 0.16518223005674879, + "grad_norm": 0.10572459548711777, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 42730 + }, + { + "epoch": 0.16522088726013207, + "grad_norm": 0.11575083434581757, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 42740 + }, + { + "epoch": 0.16525954446351532, + "grad_norm": 0.0935940369963646, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 42750 + }, + { + "epoch": 0.1652982016668986, + "grad_norm": 0.12977440655231476, + "learning_rate": 0.002, + "loss": 2.375, + "step": 42760 + }, + { + "epoch": 0.16533685887028188, + "grad_norm": 0.10405240952968597, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 42770 + }, + { + "epoch": 0.16537551607366516, + "grad_norm": 0.11216165125370026, + "learning_rate": 0.002, + "loss": 2.356, + "step": 42780 + }, + { + "epoch": 0.16541417327704844, + "grad_norm": 0.10593120753765106, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 42790 + }, + { + "epoch": 0.16545283048043172, + "grad_norm": 0.12728847563266754, + "learning_rate": 0.002, + "loss": 2.366, + "step": 42800 + }, + { + "epoch": 0.165491487683815, + "grad_norm": 0.12727369368076324, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 42810 + }, + { + "epoch": 0.16553014488719828, + "grad_norm": 0.11364904046058655, + "learning_rate": 0.002, + "loss": 2.3845, + "step": 42820 + }, + { + "epoch": 0.16556880209058156, + "grad_norm": 0.09339181333780289, + "learning_rate": 0.002, + "loss": 2.378, + "step": 42830 + }, + { + "epoch": 0.16560745929396484, + "grad_norm": 0.14502650499343872, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 42840 + }, + { + "epoch": 0.16564611649734812, + "grad_norm": 0.10401785373687744, + "learning_rate": 0.002, + "loss": 2.364, + "step": 42850 + }, + { + "epoch": 0.1656847737007314, + "grad_norm": 0.10165320336818695, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 42860 + }, + { + "epoch": 0.16572343090411468, + "grad_norm": 0.11069615185260773, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 42870 + }, + { + "epoch": 0.16576208810749796, + "grad_norm": 0.11860626935958862, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 42880 + }, + { + "epoch": 0.16580074531088124, + "grad_norm": 0.10857722163200378, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 42890 + }, + { + "epoch": 0.16583940251426452, + "grad_norm": 0.09501806646585464, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 42900 + }, + { + "epoch": 0.1658780597176478, + "grad_norm": 0.13568583130836487, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 42910 + }, + { + "epoch": 0.16591671692103108, + "grad_norm": 0.10308828204870224, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 42920 + }, + { + "epoch": 0.16595537412441436, + "grad_norm": 0.11815542727708817, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 42930 + }, + { + "epoch": 0.1659940313277976, + "grad_norm": 0.11061900109052658, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 42940 + }, + { + "epoch": 0.1660326885311809, + "grad_norm": 0.11406237632036209, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 42950 + }, + { + "epoch": 0.16607134573456417, + "grad_norm": 0.0945214033126831, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 42960 + }, + { + "epoch": 0.16611000293794745, + "grad_norm": 0.11722251772880554, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 42970 + }, + { + "epoch": 0.16614866014133073, + "grad_norm": 0.1507737636566162, + "learning_rate": 0.002, + "loss": 2.372, + "step": 42980 + }, + { + "epoch": 0.166187317344714, + "grad_norm": 0.11134760081768036, + "learning_rate": 0.002, + "loss": 2.3787, + "step": 42990 + }, + { + "epoch": 0.1662259745480973, + "grad_norm": 0.19596156477928162, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 43000 + }, + { + "epoch": 0.16626463175148057, + "grad_norm": 0.15305453538894653, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 43010 + }, + { + "epoch": 0.16630328895486385, + "grad_norm": 0.11053252220153809, + "learning_rate": 0.002, + "loss": 2.3756, + "step": 43020 + }, + { + "epoch": 0.16634194615824713, + "grad_norm": 0.11744187027215958, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 43030 + }, + { + "epoch": 0.1663806033616304, + "grad_norm": 0.11429338902235031, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 43040 + }, + { + "epoch": 0.1664192605650137, + "grad_norm": 0.12207076698541641, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 43050 + }, + { + "epoch": 0.16645791776839697, + "grad_norm": 0.1300329566001892, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 43060 + }, + { + "epoch": 0.16649657497178025, + "grad_norm": 0.10230127722024918, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 43070 + }, + { + "epoch": 0.16653523217516353, + "grad_norm": 0.11186619848012924, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 43080 + }, + { + "epoch": 0.1665738893785468, + "grad_norm": 0.1025962233543396, + "learning_rate": 0.002, + "loss": 2.379, + "step": 43090 + }, + { + "epoch": 0.1666125465819301, + "grad_norm": 0.12221942842006683, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 43100 + }, + { + "epoch": 0.16665120378531337, + "grad_norm": 0.10172362625598907, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 43110 + }, + { + "epoch": 0.16668986098869665, + "grad_norm": 0.09649766981601715, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 43120 + }, + { + "epoch": 0.1667285181920799, + "grad_norm": 0.11706840246915817, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 43130 + }, + { + "epoch": 0.16676717539546318, + "grad_norm": 0.11006322503089905, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 43140 + }, + { + "epoch": 0.16680583259884646, + "grad_norm": 0.1078108474612236, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 43150 + }, + { + "epoch": 0.16684448980222974, + "grad_norm": 0.13114474713802338, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 43160 + }, + { + "epoch": 0.16688314700561302, + "grad_norm": 0.11985458433628082, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 43170 + }, + { + "epoch": 0.1669218042089963, + "grad_norm": 0.10740912705659866, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 43180 + }, + { + "epoch": 0.16696046141237958, + "grad_norm": 0.12001767009496689, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 43190 + }, + { + "epoch": 0.16699911861576286, + "grad_norm": 0.10463010519742966, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 43200 + }, + { + "epoch": 0.16703777581914614, + "grad_norm": 0.09417607635259628, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 43210 + }, + { + "epoch": 0.16707643302252942, + "grad_norm": 0.10411380976438522, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 43220 + }, + { + "epoch": 0.1671150902259127, + "grad_norm": 0.11311469227075577, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 43230 + }, + { + "epoch": 0.16715374742929598, + "grad_norm": 0.1006241962313652, + "learning_rate": 0.002, + "loss": 2.358, + "step": 43240 + }, + { + "epoch": 0.16719240463267926, + "grad_norm": 0.09688572585582733, + "learning_rate": 0.002, + "loss": 2.3982, + "step": 43250 + }, + { + "epoch": 0.16723106183606254, + "grad_norm": 0.12874460220336914, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 43260 + }, + { + "epoch": 0.16726971903944582, + "grad_norm": 0.10218334943056107, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 43270 + }, + { + "epoch": 0.1673083762428291, + "grad_norm": 0.11155752837657928, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 43280 + }, + { + "epoch": 0.16734703344621238, + "grad_norm": 0.0992024838924408, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 43290 + }, + { + "epoch": 0.16738569064959566, + "grad_norm": 0.11269593983888626, + "learning_rate": 0.002, + "loss": 2.353, + "step": 43300 + }, + { + "epoch": 0.1674243478529789, + "grad_norm": 0.1077861487865448, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 43310 + }, + { + "epoch": 0.1674630050563622, + "grad_norm": 0.11057788133621216, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 43320 + }, + { + "epoch": 0.16750166225974547, + "grad_norm": 0.10573381930589676, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 43330 + }, + { + "epoch": 0.16754031946312875, + "grad_norm": 0.10935483127832413, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 43340 + }, + { + "epoch": 0.16757897666651203, + "grad_norm": 0.13087883591651917, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 43350 + }, + { + "epoch": 0.1676176338698953, + "grad_norm": 0.09338037669658661, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 43360 + }, + { + "epoch": 0.1676562910732786, + "grad_norm": 0.09488900750875473, + "learning_rate": 0.002, + "loss": 2.362, + "step": 43370 + }, + { + "epoch": 0.16769494827666187, + "grad_norm": 0.10798195749521255, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 43380 + }, + { + "epoch": 0.16773360548004515, + "grad_norm": 0.10760509222745895, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 43390 + }, + { + "epoch": 0.16777226268342843, + "grad_norm": 0.11724671721458435, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 43400 + }, + { + "epoch": 0.1678109198868117, + "grad_norm": 0.10204483568668365, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 43410 + }, + { + "epoch": 0.167849577090195, + "grad_norm": 0.10492843389511108, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 43420 + }, + { + "epoch": 0.16788823429357827, + "grad_norm": 0.10826718807220459, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 43430 + }, + { + "epoch": 0.16792689149696155, + "grad_norm": 0.11438310146331787, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 43440 + }, + { + "epoch": 0.16796554870034483, + "grad_norm": 0.12006634473800659, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 43450 + }, + { + "epoch": 0.1680042059037281, + "grad_norm": 0.10296225547790527, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 43460 + }, + { + "epoch": 0.1680428631071114, + "grad_norm": 0.10148349404335022, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 43470 + }, + { + "epoch": 0.16808152031049467, + "grad_norm": 0.09886068850755692, + "learning_rate": 0.002, + "loss": 2.367, + "step": 43480 + }, + { + "epoch": 0.16812017751387795, + "grad_norm": 0.11167940497398376, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 43490 + }, + { + "epoch": 0.1681588347172612, + "grad_norm": 0.12011834233999252, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 43500 + }, + { + "epoch": 0.16819749192064448, + "grad_norm": 0.11208701878786087, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 43510 + }, + { + "epoch": 0.16823614912402776, + "grad_norm": 0.11228106915950775, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 43520 + }, + { + "epoch": 0.16827480632741104, + "grad_norm": 0.10892346501350403, + "learning_rate": 0.002, + "loss": 2.36, + "step": 43530 + }, + { + "epoch": 0.16831346353079432, + "grad_norm": 0.09330982714891434, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 43540 + }, + { + "epoch": 0.1683521207341776, + "grad_norm": 0.12238658964633942, + "learning_rate": 0.002, + "loss": 2.3801, + "step": 43550 + }, + { + "epoch": 0.16839077793756088, + "grad_norm": 0.10222240537405014, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 43560 + }, + { + "epoch": 0.16842943514094416, + "grad_norm": 0.10212979465723038, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 43570 + }, + { + "epoch": 0.16846809234432744, + "grad_norm": 0.09810635447502136, + "learning_rate": 0.002, + "loss": 2.355, + "step": 43580 + }, + { + "epoch": 0.16850674954771072, + "grad_norm": 0.11242741346359253, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 43590 + }, + { + "epoch": 0.168545406751094, + "grad_norm": 0.11861951649188995, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 43600 + }, + { + "epoch": 0.16858406395447728, + "grad_norm": 0.10650777816772461, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 43610 + }, + { + "epoch": 0.16862272115786056, + "grad_norm": 0.10390397906303406, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 43620 + }, + { + "epoch": 0.16866137836124384, + "grad_norm": 0.09910054504871368, + "learning_rate": 0.002, + "loss": 2.3793, + "step": 43630 + }, + { + "epoch": 0.16870003556462712, + "grad_norm": 0.10709985345602036, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 43640 + }, + { + "epoch": 0.1687386927680104, + "grad_norm": 0.22000548243522644, + "learning_rate": 0.002, + "loss": 2.3755, + "step": 43650 + }, + { + "epoch": 0.16877734997139368, + "grad_norm": 0.11501609534025192, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 43660 + }, + { + "epoch": 0.16881600717477696, + "grad_norm": 0.11920400708913803, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 43670 + }, + { + "epoch": 0.1688546643781602, + "grad_norm": 0.1039498895406723, + "learning_rate": 0.002, + "loss": 2.3817, + "step": 43680 + }, + { + "epoch": 0.1688933215815435, + "grad_norm": 0.11218732595443726, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 43690 + }, + { + "epoch": 0.16893197878492677, + "grad_norm": 0.10131587088108063, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 43700 + }, + { + "epoch": 0.16897063598831005, + "grad_norm": 0.14687520265579224, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 43710 + }, + { + "epoch": 0.16900929319169333, + "grad_norm": 0.1265539526939392, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 43720 + }, + { + "epoch": 0.1690479503950766, + "grad_norm": 0.10759860277175903, + "learning_rate": 0.002, + "loss": 2.368, + "step": 43730 + }, + { + "epoch": 0.1690866075984599, + "grad_norm": 0.12066882848739624, + "learning_rate": 0.002, + "loss": 2.371, + "step": 43740 + }, + { + "epoch": 0.16912526480184317, + "grad_norm": 0.10885506868362427, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 43750 + }, + { + "epoch": 0.16916392200522645, + "grad_norm": 0.09433972835540771, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 43760 + }, + { + "epoch": 0.16920257920860973, + "grad_norm": 0.11025030165910721, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 43770 + }, + { + "epoch": 0.169241236411993, + "grad_norm": 0.10352890193462372, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 43780 + }, + { + "epoch": 0.1692798936153763, + "grad_norm": 0.13102802634239197, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 43790 + }, + { + "epoch": 0.16931855081875957, + "grad_norm": 0.09540624916553497, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 43800 + }, + { + "epoch": 0.16935720802214285, + "grad_norm": 0.10578183084726334, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 43810 + }, + { + "epoch": 0.16939586522552613, + "grad_norm": 0.101011723279953, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 43820 + }, + { + "epoch": 0.1694345224289094, + "grad_norm": 0.1189325824379921, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 43830 + }, + { + "epoch": 0.1694731796322927, + "grad_norm": 0.11614560335874557, + "learning_rate": 0.002, + "loss": 2.3755, + "step": 43840 + }, + { + "epoch": 0.16951183683567597, + "grad_norm": 0.10390368103981018, + "learning_rate": 0.002, + "loss": 2.358, + "step": 43850 + }, + { + "epoch": 0.16955049403905925, + "grad_norm": 0.10693172365427017, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 43860 + }, + { + "epoch": 0.1695891512424425, + "grad_norm": 0.11329218000173569, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 43870 + }, + { + "epoch": 0.16962780844582578, + "grad_norm": 0.08312906324863434, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 43880 + }, + { + "epoch": 0.16966646564920906, + "grad_norm": 0.1291521042585373, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 43890 + }, + { + "epoch": 0.16970512285259234, + "grad_norm": 0.09508147835731506, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 43900 + }, + { + "epoch": 0.16974378005597562, + "grad_norm": 0.11148455739021301, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 43910 + }, + { + "epoch": 0.1697824372593589, + "grad_norm": 0.12895052134990692, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 43920 + }, + { + "epoch": 0.16982109446274218, + "grad_norm": 0.12822982668876648, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 43930 + }, + { + "epoch": 0.16985975166612546, + "grad_norm": 0.089112788438797, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 43940 + }, + { + "epoch": 0.16989840886950874, + "grad_norm": 0.10964418947696686, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 43950 + }, + { + "epoch": 0.16993706607289202, + "grad_norm": 0.12344907969236374, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 43960 + }, + { + "epoch": 0.1699757232762753, + "grad_norm": 0.09824126213788986, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 43970 + }, + { + "epoch": 0.17001438047965858, + "grad_norm": 0.10377447307109833, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 43980 + }, + { + "epoch": 0.17005303768304186, + "grad_norm": 0.11566921323537827, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 43990 + }, + { + "epoch": 0.17009169488642514, + "grad_norm": 0.12002314627170563, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 44000 + }, + { + "epoch": 0.17013035208980842, + "grad_norm": 0.09902594238519669, + "learning_rate": 0.002, + "loss": 2.365, + "step": 44010 + }, + { + "epoch": 0.1701690092931917, + "grad_norm": 0.0968996211886406, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 44020 + }, + { + "epoch": 0.17020766649657498, + "grad_norm": 0.12413863837718964, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 44030 + }, + { + "epoch": 0.17024632369995826, + "grad_norm": 0.1103622242808342, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 44040 + }, + { + "epoch": 0.17028498090334152, + "grad_norm": 0.11150927096605301, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 44050 + }, + { + "epoch": 0.1703236381067248, + "grad_norm": 0.13580836355686188, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 44060 + }, + { + "epoch": 0.17036229531010808, + "grad_norm": 0.11713317036628723, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 44070 + }, + { + "epoch": 0.17040095251349135, + "grad_norm": 0.11041072010993958, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 44080 + }, + { + "epoch": 0.17043960971687463, + "grad_norm": 0.13192050158977509, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 44090 + }, + { + "epoch": 0.17047826692025791, + "grad_norm": 0.1301390528678894, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 44100 + }, + { + "epoch": 0.1705169241236412, + "grad_norm": 0.12830758094787598, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 44110 + }, + { + "epoch": 0.17055558132702447, + "grad_norm": 0.10173708200454712, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 44120 + }, + { + "epoch": 0.17059423853040775, + "grad_norm": 0.09899038076400757, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 44130 + }, + { + "epoch": 0.17063289573379103, + "grad_norm": 0.10673151165246964, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 44140 + }, + { + "epoch": 0.17067155293717431, + "grad_norm": 0.10917104035615921, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 44150 + }, + { + "epoch": 0.1707102101405576, + "grad_norm": 0.10550056397914886, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 44160 + }, + { + "epoch": 0.17074886734394087, + "grad_norm": 0.10488106310367584, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 44170 + }, + { + "epoch": 0.17078752454732415, + "grad_norm": 0.13775895535945892, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 44180 + }, + { + "epoch": 0.17082618175070743, + "grad_norm": 0.10303659737110138, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 44190 + }, + { + "epoch": 0.17086483895409071, + "grad_norm": 0.11900082975625992, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 44200 + }, + { + "epoch": 0.170903496157474, + "grad_norm": 0.11325709521770477, + "learning_rate": 0.002, + "loss": 2.366, + "step": 44210 + }, + { + "epoch": 0.17094215336085727, + "grad_norm": 0.09763014316558838, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 44220 + }, + { + "epoch": 0.17098081056424055, + "grad_norm": 0.1087694838643074, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 44230 + }, + { + "epoch": 0.1710194677676238, + "grad_norm": 0.1253872960805893, + "learning_rate": 0.002, + "loss": 2.3795, + "step": 44240 + }, + { + "epoch": 0.1710581249710071, + "grad_norm": 0.16377699375152588, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 44250 + }, + { + "epoch": 0.17109678217439037, + "grad_norm": 0.10072610527276993, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 44260 + }, + { + "epoch": 0.17113543937777365, + "grad_norm": 0.0969557985663414, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 44270 + }, + { + "epoch": 0.17117409658115693, + "grad_norm": 0.11019019037485123, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 44280 + }, + { + "epoch": 0.1712127537845402, + "grad_norm": 0.10584430396556854, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 44290 + }, + { + "epoch": 0.17125141098792349, + "grad_norm": 0.10273636132478714, + "learning_rate": 0.002, + "loss": 2.368, + "step": 44300 + }, + { + "epoch": 0.17129006819130677, + "grad_norm": 0.10688309371471405, + "learning_rate": 0.002, + "loss": 2.359, + "step": 44310 + }, + { + "epoch": 0.17132872539469005, + "grad_norm": 0.1351660043001175, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 44320 + }, + { + "epoch": 0.17136738259807333, + "grad_norm": 0.12430575489997864, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 44330 + }, + { + "epoch": 0.1714060398014566, + "grad_norm": 0.12683185935020447, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 44340 + }, + { + "epoch": 0.17144469700483989, + "grad_norm": 0.1072586253285408, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 44350 + }, + { + "epoch": 0.17148335420822317, + "grad_norm": 0.10600695013999939, + "learning_rate": 0.002, + "loss": 2.361, + "step": 44360 + }, + { + "epoch": 0.17152201141160645, + "grad_norm": 0.10198380798101425, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 44370 + }, + { + "epoch": 0.17156066861498973, + "grad_norm": 0.10946477949619293, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 44380 + }, + { + "epoch": 0.171599325818373, + "grad_norm": 0.11855118721723557, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 44390 + }, + { + "epoch": 0.17163798302175629, + "grad_norm": 0.10252556949853897, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 44400 + }, + { + "epoch": 0.17167664022513957, + "grad_norm": 0.10042252391576767, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 44410 + }, + { + "epoch": 0.17171529742852282, + "grad_norm": 0.10182294249534607, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 44420 + }, + { + "epoch": 0.1717539546319061, + "grad_norm": 0.12930281460285187, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 44430 + }, + { + "epoch": 0.17179261183528938, + "grad_norm": 0.10676681250333786, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 44440 + }, + { + "epoch": 0.17183126903867266, + "grad_norm": 0.1255345195531845, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 44450 + }, + { + "epoch": 0.17186992624205594, + "grad_norm": 0.11223578453063965, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 44460 + }, + { + "epoch": 0.17190858344543922, + "grad_norm": 0.09784567356109619, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 44470 + }, + { + "epoch": 0.1719472406488225, + "grad_norm": 0.09986116737127304, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 44480 + }, + { + "epoch": 0.17198589785220578, + "grad_norm": 0.15577766299247742, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 44490 + }, + { + "epoch": 0.17202455505558906, + "grad_norm": 0.10955075919628143, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 44500 + }, + { + "epoch": 0.17206321225897234, + "grad_norm": 0.10409995913505554, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 44510 + }, + { + "epoch": 0.17210186946235562, + "grad_norm": 0.1049322858452797, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 44520 + }, + { + "epoch": 0.1721405266657389, + "grad_norm": 0.09995309263467789, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 44530 + }, + { + "epoch": 0.17217918386912218, + "grad_norm": 0.10812091827392578, + "learning_rate": 0.002, + "loss": 2.358, + "step": 44540 + }, + { + "epoch": 0.17221784107250546, + "grad_norm": 0.10199900716543198, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 44550 + }, + { + "epoch": 0.17225649827588874, + "grad_norm": 0.11742229759693146, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 44560 + }, + { + "epoch": 0.17229515547927202, + "grad_norm": 0.13238979876041412, + "learning_rate": 0.002, + "loss": 2.367, + "step": 44570 + }, + { + "epoch": 0.1723338126826553, + "grad_norm": 0.11673494428396225, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 44580 + }, + { + "epoch": 0.17237246988603858, + "grad_norm": 0.10327861458063126, + "learning_rate": 0.002, + "loss": 2.375, + "step": 44590 + }, + { + "epoch": 0.17241112708942186, + "grad_norm": 0.09699682891368866, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 44600 + }, + { + "epoch": 0.1724497842928051, + "grad_norm": 0.09948820620775223, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 44610 + }, + { + "epoch": 0.1724884414961884, + "grad_norm": 0.12126339972019196, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 44620 + }, + { + "epoch": 0.17252709869957167, + "grad_norm": 0.1148219108581543, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 44630 + }, + { + "epoch": 0.17256575590295495, + "grad_norm": 0.10499219596385956, + "learning_rate": 0.002, + "loss": 2.3755, + "step": 44640 + }, + { + "epoch": 0.17260441310633823, + "grad_norm": 0.10470159351825714, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 44650 + }, + { + "epoch": 0.1726430703097215, + "grad_norm": 0.12049949914216995, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 44660 + }, + { + "epoch": 0.1726817275131048, + "grad_norm": 0.10587108880281448, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 44670 + }, + { + "epoch": 0.17272038471648807, + "grad_norm": 0.15022005140781403, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 44680 + }, + { + "epoch": 0.17275904191987135, + "grad_norm": 0.09850125759840012, + "learning_rate": 0.002, + "loss": 2.358, + "step": 44690 + }, + { + "epoch": 0.17279769912325463, + "grad_norm": 0.1199873685836792, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 44700 + }, + { + "epoch": 0.1728363563266379, + "grad_norm": 0.10073809325695038, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 44710 + }, + { + "epoch": 0.1728750135300212, + "grad_norm": 0.10510051250457764, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 44720 + }, + { + "epoch": 0.17291367073340447, + "grad_norm": 0.1406104862689972, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 44730 + }, + { + "epoch": 0.17295232793678775, + "grad_norm": 0.0993463471531868, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 44740 + }, + { + "epoch": 0.17299098514017103, + "grad_norm": 0.11417844891548157, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 44750 + }, + { + "epoch": 0.1730296423435543, + "grad_norm": 0.11962078511714935, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 44760 + }, + { + "epoch": 0.1730682995469376, + "grad_norm": 0.12286528199911118, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 44770 + }, + { + "epoch": 0.17310695675032087, + "grad_norm": 0.09920985251665115, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 44780 + }, + { + "epoch": 0.17314561395370412, + "grad_norm": 0.10032475739717484, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 44790 + }, + { + "epoch": 0.1731842711570874, + "grad_norm": 0.0911695659160614, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 44800 + }, + { + "epoch": 0.17322292836047068, + "grad_norm": 0.11346561461687088, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 44810 + }, + { + "epoch": 0.17326158556385396, + "grad_norm": 0.11510348320007324, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 44820 + }, + { + "epoch": 0.17330024276723724, + "grad_norm": 0.11334090679883957, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 44830 + }, + { + "epoch": 0.17333889997062052, + "grad_norm": 0.1060083732008934, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 44840 + }, + { + "epoch": 0.1733775571740038, + "grad_norm": 0.11127477884292603, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 44850 + }, + { + "epoch": 0.17341621437738708, + "grad_norm": 0.10036677122116089, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 44860 + }, + { + "epoch": 0.17345487158077036, + "grad_norm": 0.12818297743797302, + "learning_rate": 0.002, + "loss": 2.3867, + "step": 44870 + }, + { + "epoch": 0.17349352878415364, + "grad_norm": 0.10412877053022385, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 44880 + }, + { + "epoch": 0.17353218598753692, + "grad_norm": 0.16107313334941864, + "learning_rate": 0.002, + "loss": 2.3788, + "step": 44890 + }, + { + "epoch": 0.1735708431909202, + "grad_norm": 0.10612837225198746, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 44900 + }, + { + "epoch": 0.17360950039430348, + "grad_norm": 0.11250829696655273, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 44910 + }, + { + "epoch": 0.17364815759768676, + "grad_norm": 0.11362912505865097, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 44920 + }, + { + "epoch": 0.17368681480107004, + "grad_norm": 0.10450930148363113, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 44930 + }, + { + "epoch": 0.17372547200445332, + "grad_norm": 0.10974158346652985, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 44940 + }, + { + "epoch": 0.1737641292078366, + "grad_norm": 0.1050289049744606, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 44950 + }, + { + "epoch": 0.17380278641121988, + "grad_norm": 0.10771483927965164, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 44960 + }, + { + "epoch": 0.17384144361460316, + "grad_norm": 0.09984450042247772, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 44970 + }, + { + "epoch": 0.1738801008179864, + "grad_norm": 0.1271042674779892, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 44980 + }, + { + "epoch": 0.1739187580213697, + "grad_norm": 0.10474269837141037, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 44990 + }, + { + "epoch": 0.17395741522475297, + "grad_norm": 0.10538092255592346, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 45000 + }, + { + "epoch": 0.17399607242813625, + "grad_norm": 0.09209699928760529, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 45010 + }, + { + "epoch": 0.17403472963151953, + "grad_norm": 0.11103249341249466, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 45020 + }, + { + "epoch": 0.1740733868349028, + "grad_norm": 0.10285907238721848, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 45030 + }, + { + "epoch": 0.1741120440382861, + "grad_norm": 0.11306691914796829, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 45040 + }, + { + "epoch": 0.17415070124166937, + "grad_norm": 0.12675841152668, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 45050 + }, + { + "epoch": 0.17418935844505265, + "grad_norm": 0.11700502038002014, + "learning_rate": 0.002, + "loss": 2.374, + "step": 45060 + }, + { + "epoch": 0.17422801564843593, + "grad_norm": 0.1016639992594719, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 45070 + }, + { + "epoch": 0.1742666728518192, + "grad_norm": 0.10172295570373535, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 45080 + }, + { + "epoch": 0.1743053300552025, + "grad_norm": 0.11756344884634018, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 45090 + }, + { + "epoch": 0.17434398725858577, + "grad_norm": 0.11248449981212616, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 45100 + }, + { + "epoch": 0.17438264446196905, + "grad_norm": 0.09576928615570068, + "learning_rate": 0.002, + "loss": 2.3796, + "step": 45110 + }, + { + "epoch": 0.17442130166535233, + "grad_norm": 0.10466929525136948, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 45120 + }, + { + "epoch": 0.1744599588687356, + "grad_norm": 0.10423614829778671, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 45130 + }, + { + "epoch": 0.1744986160721189, + "grad_norm": 0.11010349541902542, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 45140 + }, + { + "epoch": 0.17453727327550217, + "grad_norm": 0.11372268944978714, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 45150 + }, + { + "epoch": 0.17457593047888545, + "grad_norm": 0.12476648390293121, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 45160 + }, + { + "epoch": 0.1746145876822687, + "grad_norm": 0.09552644193172455, + "learning_rate": 0.002, + "loss": 2.3812, + "step": 45170 + }, + { + "epoch": 0.17465324488565198, + "grad_norm": 0.13805052638053894, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 45180 + }, + { + "epoch": 0.17469190208903526, + "grad_norm": 0.11120432615280151, + "learning_rate": 0.002, + "loss": 2.377, + "step": 45190 + }, + { + "epoch": 0.17473055929241854, + "grad_norm": 0.10684333741664886, + "learning_rate": 0.002, + "loss": 2.3829, + "step": 45200 + }, + { + "epoch": 0.17476921649580182, + "grad_norm": 0.10616767406463623, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 45210 + }, + { + "epoch": 0.1748078736991851, + "grad_norm": 0.09469354897737503, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 45220 + }, + { + "epoch": 0.17484653090256838, + "grad_norm": 0.10828037559986115, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 45230 + }, + { + "epoch": 0.17488518810595166, + "grad_norm": 0.09035887569189072, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 45240 + }, + { + "epoch": 0.17492384530933494, + "grad_norm": 0.10850965231657028, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 45250 + }, + { + "epoch": 0.17496250251271822, + "grad_norm": 0.11956586688756943, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 45260 + }, + { + "epoch": 0.1750011597161015, + "grad_norm": 0.09887845814228058, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 45270 + }, + { + "epoch": 0.17503981691948478, + "grad_norm": 0.0914347693324089, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 45280 + }, + { + "epoch": 0.17507847412286806, + "grad_norm": 0.09789085388183594, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 45290 + }, + { + "epoch": 0.17511713132625134, + "grad_norm": 0.09701121598482132, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 45300 + }, + { + "epoch": 0.17515578852963462, + "grad_norm": 0.11873602122068405, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 45310 + }, + { + "epoch": 0.1751944457330179, + "grad_norm": 0.1296318918466568, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 45320 + }, + { + "epoch": 0.17523310293640118, + "grad_norm": 0.11447659134864807, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 45330 + }, + { + "epoch": 0.17527176013978446, + "grad_norm": 0.09558804333209991, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 45340 + }, + { + "epoch": 0.1753104173431677, + "grad_norm": 0.108881376683712, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 45350 + }, + { + "epoch": 0.175349074546551, + "grad_norm": 0.12144729495048523, + "learning_rate": 0.002, + "loss": 2.362, + "step": 45360 + }, + { + "epoch": 0.17538773174993427, + "grad_norm": 0.10852917283773422, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 45370 + }, + { + "epoch": 0.17542638895331755, + "grad_norm": 0.091704361140728, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 45380 + }, + { + "epoch": 0.17546504615670083, + "grad_norm": 0.11262385547161102, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 45390 + }, + { + "epoch": 0.1755037033600841, + "grad_norm": 0.12413255125284195, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 45400 + }, + { + "epoch": 0.1755423605634674, + "grad_norm": 0.09488387405872345, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 45410 + }, + { + "epoch": 0.17558101776685067, + "grad_norm": 0.10605372488498688, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 45420 + }, + { + "epoch": 0.17561967497023395, + "grad_norm": 0.12880057096481323, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 45430 + }, + { + "epoch": 0.17565833217361723, + "grad_norm": 0.20423239469528198, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 45440 + }, + { + "epoch": 0.1756969893770005, + "grad_norm": 0.09972039610147476, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 45450 + }, + { + "epoch": 0.1757356465803838, + "grad_norm": 0.11355508863925934, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 45460 + }, + { + "epoch": 0.17577430378376707, + "grad_norm": 0.1071861982345581, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 45470 + }, + { + "epoch": 0.17581296098715035, + "grad_norm": 0.1498335301876068, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 45480 + }, + { + "epoch": 0.17585161819053363, + "grad_norm": 0.1033952608704567, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 45490 + }, + { + "epoch": 0.1758902753939169, + "grad_norm": 0.09816624969244003, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 45500 + }, + { + "epoch": 0.1759289325973002, + "grad_norm": 0.13971713185310364, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 45510 + }, + { + "epoch": 0.17596758980068347, + "grad_norm": 0.10514900088310242, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 45520 + }, + { + "epoch": 0.17600624700406675, + "grad_norm": 0.09521742910146713, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 45530 + }, + { + "epoch": 0.17604490420745, + "grad_norm": 0.10389627516269684, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 45540 + }, + { + "epoch": 0.17608356141083328, + "grad_norm": 0.11555583775043488, + "learning_rate": 0.002, + "loss": 2.367, + "step": 45550 + }, + { + "epoch": 0.17612221861421656, + "grad_norm": 0.10777294635772705, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 45560 + }, + { + "epoch": 0.17616087581759984, + "grad_norm": 0.14498485624790192, + "learning_rate": 0.002, + "loss": 2.3791, + "step": 45570 + }, + { + "epoch": 0.17619953302098312, + "grad_norm": 0.10857778787612915, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 45580 + }, + { + "epoch": 0.1762381902243664, + "grad_norm": 0.10575216263532639, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 45590 + }, + { + "epoch": 0.17627684742774968, + "grad_norm": 0.11028721928596497, + "learning_rate": 0.002, + "loss": 2.363, + "step": 45600 + }, + { + "epoch": 0.17631550463113296, + "grad_norm": 0.13524943590164185, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 45610 + }, + { + "epoch": 0.17635416183451624, + "grad_norm": 0.10539298504590988, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 45620 + }, + { + "epoch": 0.17639281903789952, + "grad_norm": 0.10771636664867401, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 45630 + }, + { + "epoch": 0.1764314762412828, + "grad_norm": 0.09472755342721939, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 45640 + }, + { + "epoch": 0.17647013344466608, + "grad_norm": 0.10767373442649841, + "learning_rate": 0.002, + "loss": 2.3826, + "step": 45650 + }, + { + "epoch": 0.17650879064804936, + "grad_norm": 0.11240257322788239, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 45660 + }, + { + "epoch": 0.17654744785143264, + "grad_norm": 0.1209542527794838, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 45670 + }, + { + "epoch": 0.17658610505481592, + "grad_norm": 0.12508618831634521, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 45680 + }, + { + "epoch": 0.1766247622581992, + "grad_norm": 0.10958380252122879, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 45690 + }, + { + "epoch": 0.17666341946158248, + "grad_norm": 0.1180795207619667, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 45700 + }, + { + "epoch": 0.17670207666496576, + "grad_norm": 0.10940097272396088, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 45710 + }, + { + "epoch": 0.17674073386834901, + "grad_norm": 0.11199961602687836, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 45720 + }, + { + "epoch": 0.1767793910717323, + "grad_norm": 0.10809573531150818, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 45730 + }, + { + "epoch": 0.17681804827511557, + "grad_norm": 0.11269965022802353, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 45740 + }, + { + "epoch": 0.17685670547849885, + "grad_norm": 0.10095714032649994, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 45750 + }, + { + "epoch": 0.17689536268188213, + "grad_norm": 0.117369644343853, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 45760 + }, + { + "epoch": 0.17693401988526541, + "grad_norm": 0.11148129403591156, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 45770 + }, + { + "epoch": 0.1769726770886487, + "grad_norm": 0.11547453701496124, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 45780 + }, + { + "epoch": 0.17701133429203197, + "grad_norm": 0.10645493119955063, + "learning_rate": 0.002, + "loss": 2.3822, + "step": 45790 + }, + { + "epoch": 0.17704999149541525, + "grad_norm": 0.11404749751091003, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 45800 + }, + { + "epoch": 0.17708864869879853, + "grad_norm": 0.11249172687530518, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 45810 + }, + { + "epoch": 0.17712730590218181, + "grad_norm": 0.10758005082607269, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 45820 + }, + { + "epoch": 0.1771659631055651, + "grad_norm": 0.09517668187618256, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 45830 + }, + { + "epoch": 0.17720462030894837, + "grad_norm": 0.09979206323623657, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 45840 + }, + { + "epoch": 0.17724327751233165, + "grad_norm": 0.12252053618431091, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 45850 + }, + { + "epoch": 0.17728193471571493, + "grad_norm": 0.09131057560443878, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 45860 + }, + { + "epoch": 0.1773205919190982, + "grad_norm": 0.1101309061050415, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 45870 + }, + { + "epoch": 0.1773592491224815, + "grad_norm": 0.10598935186862946, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 45880 + }, + { + "epoch": 0.17739790632586477, + "grad_norm": 0.09703782200813293, + "learning_rate": 0.002, + "loss": 2.368, + "step": 45890 + }, + { + "epoch": 0.17743656352924805, + "grad_norm": 0.09163852781057358, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 45900 + }, + { + "epoch": 0.1774752207326313, + "grad_norm": 0.10155810415744781, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 45910 + }, + { + "epoch": 0.17751387793601459, + "grad_norm": 0.0889534205198288, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 45920 + }, + { + "epoch": 0.17755253513939787, + "grad_norm": 0.11034919321537018, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 45930 + }, + { + "epoch": 0.17759119234278115, + "grad_norm": 0.09865278750658035, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 45940 + }, + { + "epoch": 0.17762984954616443, + "grad_norm": 0.14617250859737396, + "learning_rate": 0.002, + "loss": 2.368, + "step": 45950 + }, + { + "epoch": 0.1776685067495477, + "grad_norm": 0.10735570639371872, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 45960 + }, + { + "epoch": 0.17770716395293099, + "grad_norm": 0.11080707609653473, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 45970 + }, + { + "epoch": 0.17774582115631427, + "grad_norm": 0.10667536407709122, + "learning_rate": 0.002, + "loss": 2.3875, + "step": 45980 + }, + { + "epoch": 0.17778447835969755, + "grad_norm": 0.11528275907039642, + "learning_rate": 0.002, + "loss": 2.374, + "step": 45990 + }, + { + "epoch": 0.17782313556308083, + "grad_norm": 0.10348264873027802, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 46000 + }, + { + "epoch": 0.1778617927664641, + "grad_norm": 0.12170681357383728, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 46010 + }, + { + "epoch": 0.17790044996984739, + "grad_norm": 0.10371945053339005, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 46020 + }, + { + "epoch": 0.17793910717323067, + "grad_norm": 0.10639218986034393, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 46030 + }, + { + "epoch": 0.17797776437661394, + "grad_norm": 0.0943278968334198, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 46040 + }, + { + "epoch": 0.17801642157999722, + "grad_norm": 0.08742164820432663, + "learning_rate": 0.002, + "loss": 2.356, + "step": 46050 + }, + { + "epoch": 0.1780550787833805, + "grad_norm": 0.09669308364391327, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 46060 + }, + { + "epoch": 0.17809373598676378, + "grad_norm": 0.41715648770332336, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 46070 + }, + { + "epoch": 0.17813239319014706, + "grad_norm": 0.11329425871372223, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 46080 + }, + { + "epoch": 0.17817105039353032, + "grad_norm": 0.10821500420570374, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 46090 + }, + { + "epoch": 0.1782097075969136, + "grad_norm": 0.12273920327425003, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 46100 + }, + { + "epoch": 0.17824836480029688, + "grad_norm": 0.11014291644096375, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 46110 + }, + { + "epoch": 0.17828702200368016, + "grad_norm": 0.10393861681222916, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 46120 + }, + { + "epoch": 0.17832567920706344, + "grad_norm": 0.10663165152072906, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 46130 + }, + { + "epoch": 0.17836433641044672, + "grad_norm": 0.11056865751743317, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 46140 + }, + { + "epoch": 0.17840299361383, + "grad_norm": 0.10019081830978394, + "learning_rate": 0.002, + "loss": 2.3845, + "step": 46150 + }, + { + "epoch": 0.17844165081721328, + "grad_norm": 0.11017369478940964, + "learning_rate": 0.002, + "loss": 2.345, + "step": 46160 + }, + { + "epoch": 0.17848030802059656, + "grad_norm": 0.12934242188930511, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 46170 + }, + { + "epoch": 0.17851896522397984, + "grad_norm": 0.09688087552785873, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 46180 + }, + { + "epoch": 0.17855762242736312, + "grad_norm": 0.11228056252002716, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 46190 + }, + { + "epoch": 0.1785962796307464, + "grad_norm": 0.12104123085737228, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 46200 + }, + { + "epoch": 0.17863493683412968, + "grad_norm": 0.11429349333047867, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 46210 + }, + { + "epoch": 0.17867359403751296, + "grad_norm": 0.10362027585506439, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 46220 + }, + { + "epoch": 0.17871225124089624, + "grad_norm": 0.09616777300834656, + "learning_rate": 0.002, + "loss": 2.371, + "step": 46230 + }, + { + "epoch": 0.17875090844427952, + "grad_norm": 0.11989966779947281, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 46240 + }, + { + "epoch": 0.1787895656476628, + "grad_norm": 0.11376291513442993, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 46250 + }, + { + "epoch": 0.17882822285104608, + "grad_norm": 0.11649604141712189, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 46260 + }, + { + "epoch": 0.17886688005442936, + "grad_norm": 0.10671722888946533, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 46270 + }, + { + "epoch": 0.1789055372578126, + "grad_norm": 0.10673186182975769, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 46280 + }, + { + "epoch": 0.1789441944611959, + "grad_norm": 0.11358071118593216, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 46290 + }, + { + "epoch": 0.17898285166457917, + "grad_norm": 0.11956032365560532, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 46300 + }, + { + "epoch": 0.17902150886796245, + "grad_norm": 0.10784201323986053, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 46310 + }, + { + "epoch": 0.17906016607134573, + "grad_norm": 0.09544270485639572, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 46320 + }, + { + "epoch": 0.179098823274729, + "grad_norm": 0.10480832308530807, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 46330 + }, + { + "epoch": 0.1791374804781123, + "grad_norm": 0.11912026256322861, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 46340 + }, + { + "epoch": 0.17917613768149557, + "grad_norm": 0.11689063161611557, + "learning_rate": 0.002, + "loss": 2.3817, + "step": 46350 + }, + { + "epoch": 0.17921479488487885, + "grad_norm": 0.10498364269733429, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 46360 + }, + { + "epoch": 0.17925345208826213, + "grad_norm": 0.1057942733168602, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 46370 + }, + { + "epoch": 0.1792921092916454, + "grad_norm": 0.12397985905408859, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 46380 + }, + { + "epoch": 0.1793307664950287, + "grad_norm": 0.1134156733751297, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 46390 + }, + { + "epoch": 0.17936942369841197, + "grad_norm": 0.11008192598819733, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 46400 + }, + { + "epoch": 0.17940808090179525, + "grad_norm": 0.10683248937129974, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 46410 + }, + { + "epoch": 0.17944673810517853, + "grad_norm": 0.10753653198480606, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 46420 + }, + { + "epoch": 0.1794853953085618, + "grad_norm": 0.09765728563070297, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 46430 + }, + { + "epoch": 0.1795240525119451, + "grad_norm": 0.11700306832790375, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 46440 + }, + { + "epoch": 0.17956270971532837, + "grad_norm": 0.09504656493663788, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 46450 + }, + { + "epoch": 0.17960136691871162, + "grad_norm": 0.10488741844892502, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 46460 + }, + { + "epoch": 0.1796400241220949, + "grad_norm": 0.09921709448099136, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 46470 + }, + { + "epoch": 0.17967868132547818, + "grad_norm": 0.12173950672149658, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 46480 + }, + { + "epoch": 0.17971733852886146, + "grad_norm": 0.11671063303947449, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 46490 + }, + { + "epoch": 0.17975599573224474, + "grad_norm": 0.12096337229013443, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 46500 + }, + { + "epoch": 0.17979465293562802, + "grad_norm": 0.11629269272089005, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 46510 + }, + { + "epoch": 0.1798333101390113, + "grad_norm": 0.12332172691822052, + "learning_rate": 0.002, + "loss": 2.374, + "step": 46520 + }, + { + "epoch": 0.17987196734239458, + "grad_norm": 0.11764345318078995, + "learning_rate": 0.002, + "loss": 2.359, + "step": 46530 + }, + { + "epoch": 0.17991062454577786, + "grad_norm": 0.13840240240097046, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 46540 + }, + { + "epoch": 0.17994928174916114, + "grad_norm": 0.1019141748547554, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 46550 + }, + { + "epoch": 0.17998793895254442, + "grad_norm": 0.11623389273881912, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 46560 + }, + { + "epoch": 0.1800265961559277, + "grad_norm": 0.12610310316085815, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 46570 + }, + { + "epoch": 0.18006525335931098, + "grad_norm": 0.10052474588155746, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 46580 + }, + { + "epoch": 0.18010391056269426, + "grad_norm": 0.11499066650867462, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 46590 + }, + { + "epoch": 0.18014256776607754, + "grad_norm": 0.09875231236219406, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 46600 + }, + { + "epoch": 0.18018122496946082, + "grad_norm": 0.10460720211267471, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 46610 + }, + { + "epoch": 0.1802198821728441, + "grad_norm": 0.11153114587068558, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 46620 + }, + { + "epoch": 0.18025853937622738, + "grad_norm": 0.10913459956645966, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 46630 + }, + { + "epoch": 0.18029719657961066, + "grad_norm": 0.10822083801031113, + "learning_rate": 0.002, + "loss": 2.364, + "step": 46640 + }, + { + "epoch": 0.1803358537829939, + "grad_norm": 0.09244625270366669, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 46650 + }, + { + "epoch": 0.1803745109863772, + "grad_norm": 0.13497793674468994, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 46660 + }, + { + "epoch": 0.18041316818976047, + "grad_norm": 0.11675471812486649, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 46670 + }, + { + "epoch": 0.18045182539314375, + "grad_norm": 0.12889958918094635, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 46680 + }, + { + "epoch": 0.18049048259652703, + "grad_norm": 0.09921512752771378, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 46690 + }, + { + "epoch": 0.1805291397999103, + "grad_norm": 0.10539357364177704, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 46700 + }, + { + "epoch": 0.1805677970032936, + "grad_norm": 0.13350282609462738, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 46710 + }, + { + "epoch": 0.18060645420667687, + "grad_norm": 0.11116880923509598, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 46720 + }, + { + "epoch": 0.18064511141006015, + "grad_norm": 0.12459864467382431, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 46730 + }, + { + "epoch": 0.18068376861344343, + "grad_norm": 0.1052606999874115, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 46740 + }, + { + "epoch": 0.1807224258168267, + "grad_norm": 0.10243360698223114, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 46750 + }, + { + "epoch": 0.18076108302021, + "grad_norm": 0.12499461323022842, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 46760 + }, + { + "epoch": 0.18079974022359327, + "grad_norm": 0.1343996822834015, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 46770 + }, + { + "epoch": 0.18083839742697655, + "grad_norm": 0.11027532070875168, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 46780 + }, + { + "epoch": 0.18087705463035983, + "grad_norm": 0.10879873484373093, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 46790 + }, + { + "epoch": 0.1809157118337431, + "grad_norm": 0.2611781358718872, + "learning_rate": 0.002, + "loss": 2.364, + "step": 46800 + }, + { + "epoch": 0.1809543690371264, + "grad_norm": 0.09826645255088806, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 46810 + }, + { + "epoch": 0.18099302624050967, + "grad_norm": 0.09456578642129898, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 46820 + }, + { + "epoch": 0.18103168344389292, + "grad_norm": 0.24171464145183563, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 46830 + }, + { + "epoch": 0.1810703406472762, + "grad_norm": 0.10748471319675446, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 46840 + }, + { + "epoch": 0.18110899785065948, + "grad_norm": 0.09292565286159515, + "learning_rate": 0.002, + "loss": 2.36, + "step": 46850 + }, + { + "epoch": 0.18114765505404276, + "grad_norm": 0.09652915596961975, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 46860 + }, + { + "epoch": 0.18118631225742604, + "grad_norm": 0.1162387803196907, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 46870 + }, + { + "epoch": 0.18122496946080932, + "grad_norm": 0.1014401912689209, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 46880 + }, + { + "epoch": 0.1812636266641926, + "grad_norm": 0.11465539783239365, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 46890 + }, + { + "epoch": 0.18130228386757588, + "grad_norm": 0.11276334524154663, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 46900 + }, + { + "epoch": 0.18134094107095916, + "grad_norm": 0.12095353752374649, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 46910 + }, + { + "epoch": 0.18137959827434244, + "grad_norm": 0.10484836995601654, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 46920 + }, + { + "epoch": 0.18141825547772572, + "grad_norm": 0.13355353474617004, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 46930 + }, + { + "epoch": 0.181456912681109, + "grad_norm": 0.11671741306781769, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 46940 + }, + { + "epoch": 0.18149556988449228, + "grad_norm": 0.11582811921834946, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 46950 + }, + { + "epoch": 0.18153422708787556, + "grad_norm": 0.10815490782260895, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 46960 + }, + { + "epoch": 0.18157288429125884, + "grad_norm": 0.11243810504674911, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 46970 + }, + { + "epoch": 0.18161154149464212, + "grad_norm": 0.1018824502825737, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 46980 + }, + { + "epoch": 0.1816501986980254, + "grad_norm": 0.13891954720020294, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 46990 + }, + { + "epoch": 0.18168885590140868, + "grad_norm": 0.11140652000904083, + "learning_rate": 0.002, + "loss": 2.36, + "step": 47000 + }, + { + "epoch": 0.18172751310479196, + "grad_norm": 0.1114838719367981, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 47010 + }, + { + "epoch": 0.1817661703081752, + "grad_norm": 0.10132309049367905, + "learning_rate": 0.002, + "loss": 2.366, + "step": 47020 + }, + { + "epoch": 0.1818048275115585, + "grad_norm": 0.10207788646221161, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 47030 + }, + { + "epoch": 0.18184348471494177, + "grad_norm": 0.10041003674268723, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 47040 + }, + { + "epoch": 0.18188214191832505, + "grad_norm": 0.10246816277503967, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 47050 + }, + { + "epoch": 0.18192079912170833, + "grad_norm": 0.11263757944107056, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 47060 + }, + { + "epoch": 0.1819594563250916, + "grad_norm": 0.10960984230041504, + "learning_rate": 0.002, + "loss": 2.37, + "step": 47070 + }, + { + "epoch": 0.1819981135284749, + "grad_norm": 0.12285451591014862, + "learning_rate": 0.002, + "loss": 2.361, + "step": 47080 + }, + { + "epoch": 0.18203677073185817, + "grad_norm": 0.09926436841487885, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 47090 + }, + { + "epoch": 0.18207542793524145, + "grad_norm": 0.10922511667013168, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 47100 + }, + { + "epoch": 0.18211408513862473, + "grad_norm": 0.10222557187080383, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 47110 + }, + { + "epoch": 0.182152742342008, + "grad_norm": 0.11343535780906677, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 47120 + }, + { + "epoch": 0.1821913995453913, + "grad_norm": 0.10885771363973618, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 47130 + }, + { + "epoch": 0.18223005674877457, + "grad_norm": 0.1149262934923172, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 47140 + }, + { + "epoch": 0.18226871395215785, + "grad_norm": 0.12736138701438904, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 47150 + }, + { + "epoch": 0.18230737115554113, + "grad_norm": 0.12280838936567307, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 47160 + }, + { + "epoch": 0.1823460283589244, + "grad_norm": 0.10826502740383148, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 47170 + }, + { + "epoch": 0.1823846855623077, + "grad_norm": 0.10554526746273041, + "learning_rate": 0.002, + "loss": 2.363, + "step": 47180 + }, + { + "epoch": 0.18242334276569097, + "grad_norm": 0.11571838706731796, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 47190 + }, + { + "epoch": 0.18246199996907425, + "grad_norm": 0.12278785556554794, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 47200 + }, + { + "epoch": 0.1825006571724575, + "grad_norm": 0.11990324407815933, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 47210 + }, + { + "epoch": 0.18253931437584078, + "grad_norm": 0.12232258170843124, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 47220 + }, + { + "epoch": 0.18257797157922406, + "grad_norm": 0.11240170150995255, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 47230 + }, + { + "epoch": 0.18261662878260734, + "grad_norm": 0.11030350625514984, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 47240 + }, + { + "epoch": 0.18265528598599062, + "grad_norm": 0.11100868135690689, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 47250 + }, + { + "epoch": 0.1826939431893739, + "grad_norm": 0.11257486790418625, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 47260 + }, + { + "epoch": 0.18273260039275718, + "grad_norm": 0.11170840263366699, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 47270 + }, + { + "epoch": 0.18277125759614046, + "grad_norm": 0.11553363502025604, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 47280 + }, + { + "epoch": 0.18280991479952374, + "grad_norm": 0.09762480109930038, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 47290 + }, + { + "epoch": 0.18284857200290702, + "grad_norm": 0.10298377275466919, + "learning_rate": 0.002, + "loss": 2.364, + "step": 47300 + }, + { + "epoch": 0.1828872292062903, + "grad_norm": 0.10539838671684265, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 47310 + }, + { + "epoch": 0.18292588640967358, + "grad_norm": 0.10760633647441864, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 47320 + }, + { + "epoch": 0.18296454361305686, + "grad_norm": 0.09907570481300354, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 47330 + }, + { + "epoch": 0.18300320081644014, + "grad_norm": 0.10394413769245148, + "learning_rate": 0.002, + "loss": 2.359, + "step": 47340 + }, + { + "epoch": 0.18304185801982342, + "grad_norm": 0.08843515068292618, + "learning_rate": 0.002, + "loss": 2.36, + "step": 47350 + }, + { + "epoch": 0.1830805152232067, + "grad_norm": 0.11288506537675858, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 47360 + }, + { + "epoch": 0.18311917242658998, + "grad_norm": 0.12361892312765121, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 47370 + }, + { + "epoch": 0.18315782962997326, + "grad_norm": 0.11280398070812225, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 47380 + }, + { + "epoch": 0.18319648683335651, + "grad_norm": 0.09853893518447876, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 47390 + }, + { + "epoch": 0.1832351440367398, + "grad_norm": 0.11054504662752151, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 47400 + }, + { + "epoch": 0.18327380124012307, + "grad_norm": 0.10531944036483765, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 47410 + }, + { + "epoch": 0.18331245844350635, + "grad_norm": 0.10795912146568298, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 47420 + }, + { + "epoch": 0.18335111564688963, + "grad_norm": 0.09988661110401154, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 47430 + }, + { + "epoch": 0.18338977285027291, + "grad_norm": 0.10467620193958282, + "learning_rate": 0.002, + "loss": 2.355, + "step": 47440 + }, + { + "epoch": 0.1834284300536562, + "grad_norm": 0.10298123955726624, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 47450 + }, + { + "epoch": 0.18346708725703947, + "grad_norm": 0.0956730917096138, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 47460 + }, + { + "epoch": 0.18350574446042275, + "grad_norm": 0.10356737673282623, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 47470 + }, + { + "epoch": 0.18354440166380603, + "grad_norm": 0.11420086771249771, + "learning_rate": 0.002, + "loss": 2.359, + "step": 47480 + }, + { + "epoch": 0.1835830588671893, + "grad_norm": 0.1024063304066658, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 47490 + }, + { + "epoch": 0.1836217160705726, + "grad_norm": 0.11216024309396744, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 47500 + }, + { + "epoch": 0.18366037327395587, + "grad_norm": 0.10744771361351013, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 47510 + }, + { + "epoch": 0.18369903047733915, + "grad_norm": 0.09520062059164047, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 47520 + }, + { + "epoch": 0.18373768768072243, + "grad_norm": 0.10963470488786697, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 47530 + }, + { + "epoch": 0.1837763448841057, + "grad_norm": 0.11262558400630951, + "learning_rate": 0.002, + "loss": 2.352, + "step": 47540 + }, + { + "epoch": 0.183815002087489, + "grad_norm": 0.10754802823066711, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 47550 + }, + { + "epoch": 0.18385365929087227, + "grad_norm": 0.11214997619390488, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 47560 + }, + { + "epoch": 0.18389231649425555, + "grad_norm": 0.12302026897668839, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 47570 + }, + { + "epoch": 0.1839309736976388, + "grad_norm": 0.10110511630773544, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 47580 + }, + { + "epoch": 0.18396963090102209, + "grad_norm": 0.09465829282999039, + "learning_rate": 0.002, + "loss": 2.3845, + "step": 47590 + }, + { + "epoch": 0.18400828810440537, + "grad_norm": 0.09735490381717682, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 47600 + }, + { + "epoch": 0.18404694530778865, + "grad_norm": 0.1238107830286026, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 47610 + }, + { + "epoch": 0.18408560251117193, + "grad_norm": 0.10707355290651321, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 47620 + }, + { + "epoch": 0.1841242597145552, + "grad_norm": 0.10860617458820343, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 47630 + }, + { + "epoch": 0.18416291691793849, + "grad_norm": 0.10216601192951202, + "learning_rate": 0.002, + "loss": 2.3789, + "step": 47640 + }, + { + "epoch": 0.18420157412132176, + "grad_norm": 0.1095500960946083, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 47650 + }, + { + "epoch": 0.18424023132470504, + "grad_norm": 0.09551633149385452, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 47660 + }, + { + "epoch": 0.18427888852808832, + "grad_norm": 0.11457042396068573, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 47670 + }, + { + "epoch": 0.1843175457314716, + "grad_norm": 0.10601530224084854, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 47680 + }, + { + "epoch": 0.18435620293485488, + "grad_norm": 0.10594504326581955, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 47690 + }, + { + "epoch": 0.18439486013823816, + "grad_norm": 0.11967021971940994, + "learning_rate": 0.002, + "loss": 2.3905, + "step": 47700 + }, + { + "epoch": 0.18443351734162144, + "grad_norm": 0.11337132006883621, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 47710 + }, + { + "epoch": 0.18447217454500472, + "grad_norm": 0.11595602333545685, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 47720 + }, + { + "epoch": 0.184510831748388, + "grad_norm": 0.10115175694227219, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 47730 + }, + { + "epoch": 0.18454948895177128, + "grad_norm": 0.11340272426605225, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 47740 + }, + { + "epoch": 0.18458814615515456, + "grad_norm": 0.1012321338057518, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 47750 + }, + { + "epoch": 0.18462680335853782, + "grad_norm": 0.1182548850774765, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 47760 + }, + { + "epoch": 0.1846654605619211, + "grad_norm": 0.1108071506023407, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 47770 + }, + { + "epoch": 0.18470411776530438, + "grad_norm": 0.11040622740983963, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 47780 + }, + { + "epoch": 0.18474277496868766, + "grad_norm": 0.1175733357667923, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 47790 + }, + { + "epoch": 0.18478143217207094, + "grad_norm": 0.1209789365530014, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 47800 + }, + { + "epoch": 0.18482008937545422, + "grad_norm": 0.1346805989742279, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 47810 + }, + { + "epoch": 0.1848587465788375, + "grad_norm": 0.10459670424461365, + "learning_rate": 0.002, + "loss": 2.368, + "step": 47820 + }, + { + "epoch": 0.18489740378222078, + "grad_norm": 0.14861343801021576, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 47830 + }, + { + "epoch": 0.18493606098560406, + "grad_norm": 0.10830673575401306, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 47840 + }, + { + "epoch": 0.18497471818898734, + "grad_norm": 0.12613213062286377, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 47850 + }, + { + "epoch": 0.18501337539237062, + "grad_norm": 0.12823881208896637, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 47860 + }, + { + "epoch": 0.1850520325957539, + "grad_norm": 0.10592518746852875, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 47870 + }, + { + "epoch": 0.18509068979913718, + "grad_norm": 0.10592817515134811, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 47880 + }, + { + "epoch": 0.18512934700252046, + "grad_norm": 0.1027887687087059, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 47890 + }, + { + "epoch": 0.18516800420590374, + "grad_norm": 0.10653171688318253, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 47900 + }, + { + "epoch": 0.18520666140928702, + "grad_norm": 0.11139184981584549, + "learning_rate": 0.002, + "loss": 2.368, + "step": 47910 + }, + { + "epoch": 0.1852453186126703, + "grad_norm": 0.11169011890888214, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 47920 + }, + { + "epoch": 0.18528397581605358, + "grad_norm": 0.08974819630384445, + "learning_rate": 0.002, + "loss": 2.353, + "step": 47930 + }, + { + "epoch": 0.18532263301943686, + "grad_norm": 0.10861147195100784, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 47940 + }, + { + "epoch": 0.1853612902228201, + "grad_norm": 0.12688162922859192, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 47950 + }, + { + "epoch": 0.1853999474262034, + "grad_norm": 0.09429048746824265, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 47960 + }, + { + "epoch": 0.18543860462958667, + "grad_norm": 0.10829395055770874, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 47970 + }, + { + "epoch": 0.18547726183296995, + "grad_norm": 0.11658895760774612, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 47980 + }, + { + "epoch": 0.18551591903635323, + "grad_norm": 0.11807240545749664, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 47990 + }, + { + "epoch": 0.1855545762397365, + "grad_norm": 0.10759463906288147, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 48000 + }, + { + "epoch": 0.1855932334431198, + "grad_norm": 0.0997847467660904, + "learning_rate": 0.002, + "loss": 2.356, + "step": 48010 + }, + { + "epoch": 0.18563189064650307, + "grad_norm": 0.12707951664924622, + "learning_rate": 0.002, + "loss": 2.362, + "step": 48020 + }, + { + "epoch": 0.18567054784988635, + "grad_norm": 0.10862939059734344, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 48030 + }, + { + "epoch": 0.18570920505326963, + "grad_norm": 0.11815818399190903, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 48040 + }, + { + "epoch": 0.1857478622566529, + "grad_norm": 0.10583402961492538, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 48050 + }, + { + "epoch": 0.1857865194600362, + "grad_norm": 0.11162041127681732, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 48060 + }, + { + "epoch": 0.18582517666341947, + "grad_norm": 0.1119517907500267, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 48070 + }, + { + "epoch": 0.18586383386680275, + "grad_norm": 0.11928005516529083, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 48080 + }, + { + "epoch": 0.18590249107018603, + "grad_norm": 0.11648055911064148, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 48090 + }, + { + "epoch": 0.1859411482735693, + "grad_norm": 0.111610546708107, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 48100 + }, + { + "epoch": 0.1859798054769526, + "grad_norm": 0.10913601517677307, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 48110 + }, + { + "epoch": 0.18601846268033587, + "grad_norm": 0.1146617978811264, + "learning_rate": 0.002, + "loss": 2.37, + "step": 48120 + }, + { + "epoch": 0.18605711988371912, + "grad_norm": 0.11511615663766861, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 48130 + }, + { + "epoch": 0.1860957770871024, + "grad_norm": 0.13018366694450378, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 48140 + }, + { + "epoch": 0.18613443429048568, + "grad_norm": 0.10985668003559113, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 48150 + }, + { + "epoch": 0.18617309149386896, + "grad_norm": 0.10770490020513535, + "learning_rate": 0.002, + "loss": 2.371, + "step": 48160 + }, + { + "epoch": 0.18621174869725224, + "grad_norm": 0.10217759013175964, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 48170 + }, + { + "epoch": 0.18625040590063552, + "grad_norm": 0.1058979406952858, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 48180 + }, + { + "epoch": 0.1862890631040188, + "grad_norm": 0.11669570952653885, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 48190 + }, + { + "epoch": 0.18632772030740208, + "grad_norm": 0.15146414935588837, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 48200 + }, + { + "epoch": 0.18636637751078536, + "grad_norm": 0.11123238503932953, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 48210 + }, + { + "epoch": 0.18640503471416864, + "grad_norm": 0.11966782063245773, + "learning_rate": 0.002, + "loss": 2.353, + "step": 48220 + }, + { + "epoch": 0.18644369191755192, + "grad_norm": 0.08881065249443054, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 48230 + }, + { + "epoch": 0.1864823491209352, + "grad_norm": 0.11233676224946976, + "learning_rate": 0.002, + "loss": 2.37, + "step": 48240 + }, + { + "epoch": 0.18652100632431848, + "grad_norm": 0.1251545548439026, + "learning_rate": 0.002, + "loss": 2.368, + "step": 48250 + }, + { + "epoch": 0.18655966352770176, + "grad_norm": 0.10777512192726135, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 48260 + }, + { + "epoch": 0.18659832073108504, + "grad_norm": 0.10856172442436218, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 48270 + }, + { + "epoch": 0.18663697793446832, + "grad_norm": 0.11710671335458755, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 48280 + }, + { + "epoch": 0.1866756351378516, + "grad_norm": 0.13152992725372314, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 48290 + }, + { + "epoch": 0.18671429234123488, + "grad_norm": 0.10631529986858368, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 48300 + }, + { + "epoch": 0.18675294954461816, + "grad_norm": 0.09661126136779785, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 48310 + }, + { + "epoch": 0.1867916067480014, + "grad_norm": 0.10520081222057343, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 48320 + }, + { + "epoch": 0.1868302639513847, + "grad_norm": 0.11317374557256699, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 48330 + }, + { + "epoch": 0.18686892115476797, + "grad_norm": 0.09754928201436996, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 48340 + }, + { + "epoch": 0.18690757835815125, + "grad_norm": 0.10474085062742233, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 48350 + }, + { + "epoch": 0.18694623556153453, + "grad_norm": 0.10286795347929001, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 48360 + }, + { + "epoch": 0.1869848927649178, + "grad_norm": 0.10378462821245193, + "learning_rate": 0.002, + "loss": 2.3814, + "step": 48370 + }, + { + "epoch": 0.1870235499683011, + "grad_norm": 0.11171542853116989, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 48380 + }, + { + "epoch": 0.18706220717168437, + "grad_norm": 0.11779604852199554, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 48390 + }, + { + "epoch": 0.18710086437506765, + "grad_norm": 0.0927729532122612, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 48400 + }, + { + "epoch": 0.18713952157845093, + "grad_norm": 0.13855071365833282, + "learning_rate": 0.002, + "loss": 2.359, + "step": 48410 + }, + { + "epoch": 0.1871781787818342, + "grad_norm": 0.10825547575950623, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 48420 + }, + { + "epoch": 0.1872168359852175, + "grad_norm": 0.10999954491853714, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 48430 + }, + { + "epoch": 0.18725549318860077, + "grad_norm": 0.10651865601539612, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 48440 + }, + { + "epoch": 0.18729415039198405, + "grad_norm": 0.09782829135656357, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 48450 + }, + { + "epoch": 0.18733280759536733, + "grad_norm": 0.15147368609905243, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 48460 + }, + { + "epoch": 0.1873714647987506, + "grad_norm": 0.21194928884506226, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 48470 + }, + { + "epoch": 0.1874101220021339, + "grad_norm": 0.1071695014834404, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 48480 + }, + { + "epoch": 0.18744877920551717, + "grad_norm": 0.1013324111700058, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 48490 + }, + { + "epoch": 0.18748743640890042, + "grad_norm": 0.10426542907953262, + "learning_rate": 0.002, + "loss": 2.353, + "step": 48500 + }, + { + "epoch": 0.1875260936122837, + "grad_norm": 0.10406072437763214, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 48510 + }, + { + "epoch": 0.18756475081566698, + "grad_norm": 0.10694553703069687, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 48520 + }, + { + "epoch": 0.18760340801905026, + "grad_norm": 0.10278517007827759, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 48530 + }, + { + "epoch": 0.18764206522243354, + "grad_norm": 0.12315454334020615, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 48540 + }, + { + "epoch": 0.18768072242581682, + "grad_norm": 0.10810661315917969, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 48550 + }, + { + "epoch": 0.1877193796292001, + "grad_norm": 0.10936526954174042, + "learning_rate": 0.002, + "loss": 2.358, + "step": 48560 + }, + { + "epoch": 0.18775803683258338, + "grad_norm": 0.10920289158821106, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 48570 + }, + { + "epoch": 0.18779669403596666, + "grad_norm": 0.10547734797000885, + "learning_rate": 0.002, + "loss": 2.351, + "step": 48580 + }, + { + "epoch": 0.18783535123934994, + "grad_norm": 0.09385696798563004, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 48590 + }, + { + "epoch": 0.18787400844273322, + "grad_norm": 0.1407233476638794, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 48600 + }, + { + "epoch": 0.1879126656461165, + "grad_norm": 0.11766829341650009, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 48610 + }, + { + "epoch": 0.18795132284949978, + "grad_norm": 0.13642434775829315, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 48620 + }, + { + "epoch": 0.18798998005288306, + "grad_norm": 0.11041362583637238, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 48630 + }, + { + "epoch": 0.18802863725626634, + "grad_norm": 0.112607941031456, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 48640 + }, + { + "epoch": 0.18806729445964962, + "grad_norm": 0.10543616116046906, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 48650 + }, + { + "epoch": 0.1881059516630329, + "grad_norm": 0.11784285306930542, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 48660 + }, + { + "epoch": 0.18814460886641618, + "grad_norm": 0.09574959427118301, + "learning_rate": 0.002, + "loss": 2.367, + "step": 48670 + }, + { + "epoch": 0.18818326606979946, + "grad_norm": 0.11014584451913834, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 48680 + }, + { + "epoch": 0.1882219232731827, + "grad_norm": 0.1165049821138382, + "learning_rate": 0.002, + "loss": 2.362, + "step": 48690 + }, + { + "epoch": 0.188260580476566, + "grad_norm": 0.10616659373044968, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 48700 + }, + { + "epoch": 0.18829923767994927, + "grad_norm": 0.11664260178804398, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 48710 + }, + { + "epoch": 0.18833789488333255, + "grad_norm": 0.1024443507194519, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 48720 + }, + { + "epoch": 0.18837655208671583, + "grad_norm": 0.1058788001537323, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 48730 + }, + { + "epoch": 0.1884152092900991, + "grad_norm": 0.10449366271495819, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 48740 + }, + { + "epoch": 0.1884538664934824, + "grad_norm": 0.11875820904970169, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 48750 + }, + { + "epoch": 0.18849252369686567, + "grad_norm": 0.11891734600067139, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 48760 + }, + { + "epoch": 0.18853118090024895, + "grad_norm": 0.10855202376842499, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 48770 + }, + { + "epoch": 0.18856983810363223, + "grad_norm": 0.1013849750161171, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 48780 + }, + { + "epoch": 0.1886084953070155, + "grad_norm": 0.13188014924526215, + "learning_rate": 0.002, + "loss": 2.364, + "step": 48790 + }, + { + "epoch": 0.1886471525103988, + "grad_norm": 0.09956081956624985, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 48800 + }, + { + "epoch": 0.18868580971378207, + "grad_norm": 0.10106679052114487, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 48810 + }, + { + "epoch": 0.18872446691716535, + "grad_norm": 0.12552915513515472, + "learning_rate": 0.002, + "loss": 2.368, + "step": 48820 + }, + { + "epoch": 0.18876312412054863, + "grad_norm": 0.1399880200624466, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 48830 + }, + { + "epoch": 0.1888017813239319, + "grad_norm": 0.10296420007944107, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 48840 + }, + { + "epoch": 0.1888404385273152, + "grad_norm": 0.12583224475383759, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 48850 + }, + { + "epoch": 0.18887909573069847, + "grad_norm": 0.11771658062934875, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 48860 + }, + { + "epoch": 0.18891775293408172, + "grad_norm": 0.095871701836586, + "learning_rate": 0.002, + "loss": 2.355, + "step": 48870 + }, + { + "epoch": 0.188956410137465, + "grad_norm": 0.10693106800317764, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 48880 + }, + { + "epoch": 0.18899506734084828, + "grad_norm": 0.12797410786151886, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 48890 + }, + { + "epoch": 0.18903372454423156, + "grad_norm": 0.09604047238826752, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 48900 + }, + { + "epoch": 0.18907238174761484, + "grad_norm": 0.12026006728410721, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 48910 + }, + { + "epoch": 0.18911103895099812, + "grad_norm": 0.11032012104988098, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 48920 + }, + { + "epoch": 0.1891496961543814, + "grad_norm": 0.1259058266878128, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 48930 + }, + { + "epoch": 0.18918835335776468, + "grad_norm": 0.10738653689622879, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 48940 + }, + { + "epoch": 0.18922701056114796, + "grad_norm": 0.10654452443122864, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 48950 + }, + { + "epoch": 0.18926566776453124, + "grad_norm": 0.10034621506929398, + "learning_rate": 0.002, + "loss": 2.354, + "step": 48960 + }, + { + "epoch": 0.18930432496791452, + "grad_norm": 0.11938867717981339, + "learning_rate": 0.002, + "loss": 2.3764, + "step": 48970 + }, + { + "epoch": 0.1893429821712978, + "grad_norm": 0.10956014692783356, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 48980 + }, + { + "epoch": 0.18938163937468108, + "grad_norm": 0.093159019947052, + "learning_rate": 0.002, + "loss": 2.362, + "step": 48990 + }, + { + "epoch": 0.18942029657806436, + "grad_norm": 0.11266133189201355, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 49000 + }, + { + "epoch": 0.18945895378144764, + "grad_norm": 0.10753951221704483, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 49010 + }, + { + "epoch": 0.18949761098483092, + "grad_norm": 0.10640676319599152, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 49020 + }, + { + "epoch": 0.1895362681882142, + "grad_norm": 0.10722913593053818, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 49030 + }, + { + "epoch": 0.18957492539159748, + "grad_norm": 0.11566959321498871, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 49040 + }, + { + "epoch": 0.18961358259498076, + "grad_norm": 0.097598135471344, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 49050 + }, + { + "epoch": 0.18965223979836401, + "grad_norm": 0.1264805644750595, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 49060 + }, + { + "epoch": 0.1896908970017473, + "grad_norm": 0.11760244518518448, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 49070 + }, + { + "epoch": 0.18972955420513057, + "grad_norm": 0.11690422892570496, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 49080 + }, + { + "epoch": 0.18976821140851385, + "grad_norm": 0.10791273415088654, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 49090 + }, + { + "epoch": 0.18980686861189713, + "grad_norm": 0.09621349722146988, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 49100 + }, + { + "epoch": 0.1898455258152804, + "grad_norm": 0.11259328573942184, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 49110 + }, + { + "epoch": 0.1898841830186637, + "grad_norm": 0.11389777064323425, + "learning_rate": 0.002, + "loss": 2.338, + "step": 49120 + }, + { + "epoch": 0.18992284022204697, + "grad_norm": 0.10725866258144379, + "learning_rate": 0.002, + "loss": 2.349, + "step": 49130 + }, + { + "epoch": 0.18996149742543025, + "grad_norm": 0.08963416516780853, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 49140 + }, + { + "epoch": 0.19000015462881353, + "grad_norm": 0.13235047459602356, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 49150 + }, + { + "epoch": 0.1900388118321968, + "grad_norm": 0.11058180034160614, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 49160 + }, + { + "epoch": 0.1900774690355801, + "grad_norm": 0.11632433533668518, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 49170 + }, + { + "epoch": 0.19011612623896337, + "grad_norm": 0.0975283533334732, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 49180 + }, + { + "epoch": 0.19015478344234665, + "grad_norm": 0.09888440370559692, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 49190 + }, + { + "epoch": 0.19019344064572993, + "grad_norm": 0.10686645656824112, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 49200 + }, + { + "epoch": 0.1902320978491132, + "grad_norm": 0.11044856160879135, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 49210 + }, + { + "epoch": 0.1902707550524965, + "grad_norm": 0.1261729896068573, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 49220 + }, + { + "epoch": 0.19030941225587977, + "grad_norm": 0.10963032394647598, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 49230 + }, + { + "epoch": 0.19034806945926305, + "grad_norm": 0.10861487686634064, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 49240 + }, + { + "epoch": 0.1903867266626463, + "grad_norm": 0.12178757786750793, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 49250 + }, + { + "epoch": 0.19042538386602959, + "grad_norm": 0.09885502606630325, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 49260 + }, + { + "epoch": 0.19046404106941286, + "grad_norm": 0.10452932864427567, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 49270 + }, + { + "epoch": 0.19050269827279614, + "grad_norm": 0.11776251345872879, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 49280 + }, + { + "epoch": 0.19054135547617942, + "grad_norm": 0.12242273986339569, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 49290 + }, + { + "epoch": 0.1905800126795627, + "grad_norm": 0.11257563531398773, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 49300 + }, + { + "epoch": 0.19061866988294598, + "grad_norm": 0.10450883209705353, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 49310 + }, + { + "epoch": 0.19065732708632926, + "grad_norm": 0.11983456462621689, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 49320 + }, + { + "epoch": 0.19069598428971254, + "grad_norm": 0.10491986572742462, + "learning_rate": 0.002, + "loss": 2.3799, + "step": 49330 + }, + { + "epoch": 0.19073464149309582, + "grad_norm": 0.11008800566196442, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 49340 + }, + { + "epoch": 0.1907732986964791, + "grad_norm": 0.10659252852201462, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 49350 + }, + { + "epoch": 0.19081195589986238, + "grad_norm": 0.10718666017055511, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 49360 + }, + { + "epoch": 0.19085061310324566, + "grad_norm": 0.10284899175167084, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 49370 + }, + { + "epoch": 0.19088927030662894, + "grad_norm": 0.10044750571250916, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 49380 + }, + { + "epoch": 0.19092792751001222, + "grad_norm": 0.11064857989549637, + "learning_rate": 0.002, + "loss": 2.363, + "step": 49390 + }, + { + "epoch": 0.1909665847133955, + "grad_norm": 0.12401686608791351, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 49400 + }, + { + "epoch": 0.19100524191677878, + "grad_norm": 0.18540844321250916, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 49410 + }, + { + "epoch": 0.19104389912016206, + "grad_norm": 0.10839496552944183, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 49420 + }, + { + "epoch": 0.19108255632354532, + "grad_norm": 0.09727524220943451, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 49430 + }, + { + "epoch": 0.1911212135269286, + "grad_norm": 0.09541568905115128, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 49440 + }, + { + "epoch": 0.19115987073031188, + "grad_norm": 0.09279711544513702, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 49450 + }, + { + "epoch": 0.19119852793369516, + "grad_norm": 0.1435684859752655, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 49460 + }, + { + "epoch": 0.19123718513707844, + "grad_norm": 0.09051964432001114, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 49470 + }, + { + "epoch": 0.19127584234046172, + "grad_norm": 0.10041384398937225, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 49480 + }, + { + "epoch": 0.191314499543845, + "grad_norm": 0.12758305668830872, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 49490 + }, + { + "epoch": 0.19135315674722828, + "grad_norm": 0.1029890701174736, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 49500 + }, + { + "epoch": 0.19139181395061156, + "grad_norm": 0.11915244907140732, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 49510 + }, + { + "epoch": 0.19143047115399484, + "grad_norm": 0.11430905759334564, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 49520 + }, + { + "epoch": 0.19146912835737812, + "grad_norm": 0.11046679317951202, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 49530 + }, + { + "epoch": 0.1915077855607614, + "grad_norm": 0.0970064029097557, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 49540 + }, + { + "epoch": 0.19154644276414468, + "grad_norm": 0.11496380716562271, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 49550 + }, + { + "epoch": 0.19158509996752796, + "grad_norm": 0.10296601802110672, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 49560 + }, + { + "epoch": 0.19162375717091124, + "grad_norm": 0.10830742120742798, + "learning_rate": 0.002, + "loss": 2.359, + "step": 49570 + }, + { + "epoch": 0.19166241437429452, + "grad_norm": 0.1091323122382164, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 49580 + }, + { + "epoch": 0.1917010715776778, + "grad_norm": 0.118216872215271, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 49590 + }, + { + "epoch": 0.19173972878106108, + "grad_norm": 0.12398912012577057, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 49600 + }, + { + "epoch": 0.19177838598444436, + "grad_norm": 0.10063641518354416, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 49610 + }, + { + "epoch": 0.1918170431878276, + "grad_norm": 0.10972592234611511, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 49620 + }, + { + "epoch": 0.1918557003912109, + "grad_norm": 0.1127837672829628, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 49630 + }, + { + "epoch": 0.19189435759459417, + "grad_norm": 0.10776273161172867, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 49640 + }, + { + "epoch": 0.19193301479797745, + "grad_norm": 0.09368200600147247, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 49650 + }, + { + "epoch": 0.19197167200136073, + "grad_norm": 0.11541017889976501, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 49660 + }, + { + "epoch": 0.192010329204744, + "grad_norm": 0.11094188690185547, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 49670 + }, + { + "epoch": 0.1920489864081273, + "grad_norm": 0.09609813988208771, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 49680 + }, + { + "epoch": 0.19208764361151057, + "grad_norm": 0.14049966633319855, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 49690 + }, + { + "epoch": 0.19212630081489385, + "grad_norm": 0.10160891711711884, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 49700 + }, + { + "epoch": 0.19216495801827713, + "grad_norm": 0.10994237661361694, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 49710 + }, + { + "epoch": 0.1922036152216604, + "grad_norm": 0.10198924690485, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 49720 + }, + { + "epoch": 0.1922422724250437, + "grad_norm": 0.09823419898748398, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 49730 + }, + { + "epoch": 0.19228092962842697, + "grad_norm": 0.1106940433382988, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 49740 + }, + { + "epoch": 0.19231958683181025, + "grad_norm": 0.11430583149194717, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 49750 + }, + { + "epoch": 0.19235824403519353, + "grad_norm": 0.10822034627199173, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 49760 + }, + { + "epoch": 0.1923969012385768, + "grad_norm": 0.12003743648529053, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 49770 + }, + { + "epoch": 0.1924355584419601, + "grad_norm": 0.09463027864694595, + "learning_rate": 0.002, + "loss": 2.365, + "step": 49780 + }, + { + "epoch": 0.19247421564534337, + "grad_norm": 0.11937935650348663, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 49790 + }, + { + "epoch": 0.19251287284872662, + "grad_norm": 0.1150641217827797, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 49800 + }, + { + "epoch": 0.1925515300521099, + "grad_norm": 0.10229261964559555, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 49810 + }, + { + "epoch": 0.19259018725549318, + "grad_norm": 0.1260485053062439, + "learning_rate": 0.002, + "loss": 2.361, + "step": 49820 + }, + { + "epoch": 0.19262884445887646, + "grad_norm": 0.10355617851018906, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 49830 + }, + { + "epoch": 0.19266750166225974, + "grad_norm": 0.10395023226737976, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 49840 + }, + { + "epoch": 0.19270615886564302, + "grad_norm": 0.11862505972385406, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 49850 + }, + { + "epoch": 0.1927448160690263, + "grad_norm": 0.12586042284965515, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 49860 + }, + { + "epoch": 0.19278347327240958, + "grad_norm": 0.10471781343221664, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 49870 + }, + { + "epoch": 0.19282213047579286, + "grad_norm": 0.12276338040828705, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 49880 + }, + { + "epoch": 0.19286078767917614, + "grad_norm": 0.11906957626342773, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 49890 + }, + { + "epoch": 0.19289944488255942, + "grad_norm": 0.126028373837471, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 49900 + }, + { + "epoch": 0.1929381020859427, + "grad_norm": 0.10770002752542496, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 49910 + }, + { + "epoch": 0.19297675928932598, + "grad_norm": 0.1063154861330986, + "learning_rate": 0.002, + "loss": 2.343, + "step": 49920 + }, + { + "epoch": 0.19301541649270926, + "grad_norm": 0.10494157671928406, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 49930 + }, + { + "epoch": 0.19305407369609254, + "grad_norm": 0.10478539019823074, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 49940 + }, + { + "epoch": 0.19309273089947582, + "grad_norm": 0.0903603732585907, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 49950 + }, + { + "epoch": 0.1931313881028591, + "grad_norm": 0.10410972684621811, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 49960 + }, + { + "epoch": 0.19317004530624238, + "grad_norm": 0.1011076495051384, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 49970 + }, + { + "epoch": 0.19320870250962566, + "grad_norm": 0.10331527888774872, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 49980 + }, + { + "epoch": 0.1932473597130089, + "grad_norm": 0.12761783599853516, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 49990 + }, + { + "epoch": 0.1932860169163922, + "grad_norm": 0.1133008822798729, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 50000 + }, + { + "epoch": 0.19332467411977547, + "grad_norm": 0.10571694374084473, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 50010 + }, + { + "epoch": 0.19336333132315875, + "grad_norm": 0.10567709803581238, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 50020 + }, + { + "epoch": 0.19340198852654203, + "grad_norm": 0.10329192876815796, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 50030 + }, + { + "epoch": 0.1934406457299253, + "grad_norm": 0.12079528719186783, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 50040 + }, + { + "epoch": 0.1934793029333086, + "grad_norm": 0.10283046960830688, + "learning_rate": 0.002, + "loss": 2.364, + "step": 50050 + }, + { + "epoch": 0.19351796013669187, + "grad_norm": 0.10554220527410507, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 50060 + }, + { + "epoch": 0.19355661734007515, + "grad_norm": 0.10391606390476227, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 50070 + }, + { + "epoch": 0.19359527454345843, + "grad_norm": 0.10973990708589554, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 50080 + }, + { + "epoch": 0.1936339317468417, + "grad_norm": 0.10813383758068085, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 50090 + }, + { + "epoch": 0.193672588950225, + "grad_norm": 0.10249023884534836, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 50100 + }, + { + "epoch": 0.19371124615360827, + "grad_norm": 0.11180222779512405, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 50110 + }, + { + "epoch": 0.19374990335699155, + "grad_norm": 0.09287769347429276, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 50120 + }, + { + "epoch": 0.19378856056037483, + "grad_norm": 0.11256791651248932, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 50130 + }, + { + "epoch": 0.1938272177637581, + "grad_norm": 0.10587641596794128, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 50140 + }, + { + "epoch": 0.1938658749671414, + "grad_norm": 0.10730777680873871, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 50150 + }, + { + "epoch": 0.19390453217052467, + "grad_norm": 0.10186601430177689, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 50160 + }, + { + "epoch": 0.19394318937390792, + "grad_norm": 0.11971145868301392, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 50170 + }, + { + "epoch": 0.1939818465772912, + "grad_norm": 0.09542839974164963, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 50180 + }, + { + "epoch": 0.19402050378067448, + "grad_norm": 0.10131485760211945, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 50190 + }, + { + "epoch": 0.19405916098405776, + "grad_norm": 0.10544635355472565, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 50200 + }, + { + "epoch": 0.19409781818744104, + "grad_norm": 0.12036942690610886, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 50210 + }, + { + "epoch": 0.19413647539082432, + "grad_norm": 0.12579873204231262, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 50220 + }, + { + "epoch": 0.1941751325942076, + "grad_norm": 0.10959774255752563, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 50230 + }, + { + "epoch": 0.19421378979759088, + "grad_norm": 0.10837439447641373, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 50240 + }, + { + "epoch": 0.19425244700097416, + "grad_norm": 0.10460014641284943, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 50250 + }, + { + "epoch": 0.19429110420435744, + "grad_norm": 0.11192625761032104, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 50260 + }, + { + "epoch": 0.19432976140774072, + "grad_norm": 0.09709108620882034, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 50270 + }, + { + "epoch": 0.194368418611124, + "grad_norm": 0.12948331236839294, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 50280 + }, + { + "epoch": 0.19440707581450728, + "grad_norm": 0.12882012128829956, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 50290 + }, + { + "epoch": 0.19444573301789056, + "grad_norm": 0.09529069811105728, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 50300 + }, + { + "epoch": 0.19448439022127384, + "grad_norm": 0.09715364873409271, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 50310 + }, + { + "epoch": 0.19452304742465712, + "grad_norm": 0.09917979687452316, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 50320 + }, + { + "epoch": 0.1945617046280404, + "grad_norm": 0.32154932618141174, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 50330 + }, + { + "epoch": 0.19460036183142368, + "grad_norm": 0.12019972503185272, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 50340 + }, + { + "epoch": 0.19463901903480696, + "grad_norm": 0.12406770139932632, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 50350 + }, + { + "epoch": 0.1946776762381902, + "grad_norm": 0.10514701157808304, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 50360 + }, + { + "epoch": 0.1947163334415735, + "grad_norm": 0.11640505492687225, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 50370 + }, + { + "epoch": 0.19475499064495677, + "grad_norm": 0.1056564599275589, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 50380 + }, + { + "epoch": 0.19479364784834005, + "grad_norm": 0.127691850066185, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 50390 + }, + { + "epoch": 0.19483230505172333, + "grad_norm": 0.09586314857006073, + "learning_rate": 0.002, + "loss": 2.356, + "step": 50400 + }, + { + "epoch": 0.1948709622551066, + "grad_norm": 0.08516710251569748, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 50410 + }, + { + "epoch": 0.1949096194584899, + "grad_norm": 0.10823085159063339, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 50420 + }, + { + "epoch": 0.19494827666187317, + "grad_norm": 0.11516916751861572, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 50430 + }, + { + "epoch": 0.19498693386525645, + "grad_norm": 0.11594846844673157, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 50440 + }, + { + "epoch": 0.19502559106863973, + "grad_norm": 0.1126951277256012, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 50450 + }, + { + "epoch": 0.195064248272023, + "grad_norm": 0.10487323999404907, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 50460 + }, + { + "epoch": 0.1951029054754063, + "grad_norm": 0.11344010382890701, + "learning_rate": 0.002, + "loss": 2.367, + "step": 50470 + }, + { + "epoch": 0.19514156267878957, + "grad_norm": 0.1146879568696022, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 50480 + }, + { + "epoch": 0.19518021988217285, + "grad_norm": 0.10187899321317673, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 50490 + }, + { + "epoch": 0.19521887708555613, + "grad_norm": 0.11327874660491943, + "learning_rate": 0.002, + "loss": 2.361, + "step": 50500 + }, + { + "epoch": 0.1952575342889394, + "grad_norm": 0.11856690049171448, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 50510 + }, + { + "epoch": 0.1952961914923227, + "grad_norm": 0.11800426989793777, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 50520 + }, + { + "epoch": 0.19533484869570597, + "grad_norm": 0.10900002717971802, + "learning_rate": 0.002, + "loss": 2.3786, + "step": 50530 + }, + { + "epoch": 0.19537350589908922, + "grad_norm": 0.12437902390956879, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 50540 + }, + { + "epoch": 0.1954121631024725, + "grad_norm": 0.0991293415427208, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 50550 + }, + { + "epoch": 0.19545082030585578, + "grad_norm": 0.12089473009109497, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 50560 + }, + { + "epoch": 0.19548947750923906, + "grad_norm": 0.11330953240394592, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 50570 + }, + { + "epoch": 0.19552813471262234, + "grad_norm": 0.10841525346040726, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 50580 + }, + { + "epoch": 0.19556679191600562, + "grad_norm": 0.10499054193496704, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 50590 + }, + { + "epoch": 0.1956054491193889, + "grad_norm": 0.11165262013673782, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 50600 + }, + { + "epoch": 0.19564410632277218, + "grad_norm": 0.11451289802789688, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 50610 + }, + { + "epoch": 0.19568276352615546, + "grad_norm": 0.09710144251585007, + "learning_rate": 0.002, + "loss": 2.355, + "step": 50620 + }, + { + "epoch": 0.19572142072953874, + "grad_norm": 0.09652550518512726, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 50630 + }, + { + "epoch": 0.19576007793292202, + "grad_norm": 0.13575007021427155, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 50640 + }, + { + "epoch": 0.1957987351363053, + "grad_norm": 0.09911160171031952, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 50650 + }, + { + "epoch": 0.19583739233968858, + "grad_norm": 0.10242325067520142, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 50660 + }, + { + "epoch": 0.19587604954307186, + "grad_norm": 0.1086401492357254, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 50670 + }, + { + "epoch": 0.19591470674645514, + "grad_norm": 0.1018538549542427, + "learning_rate": 0.002, + "loss": 2.354, + "step": 50680 + }, + { + "epoch": 0.19595336394983842, + "grad_norm": 0.10752106457948685, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 50690 + }, + { + "epoch": 0.1959920211532217, + "grad_norm": 0.11646592617034912, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 50700 + }, + { + "epoch": 0.19603067835660498, + "grad_norm": 0.10107237845659256, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 50710 + }, + { + "epoch": 0.19606933555998826, + "grad_norm": 0.11378759145736694, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 50720 + }, + { + "epoch": 0.1961079927633715, + "grad_norm": 0.11688712984323502, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 50730 + }, + { + "epoch": 0.1961466499667548, + "grad_norm": 0.13264037668704987, + "learning_rate": 0.002, + "loss": 2.353, + "step": 50740 + }, + { + "epoch": 0.19618530717013807, + "grad_norm": 0.09421103447675705, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 50750 + }, + { + "epoch": 0.19622396437352135, + "grad_norm": 0.1111598014831543, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 50760 + }, + { + "epoch": 0.19626262157690463, + "grad_norm": 0.10464281588792801, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 50770 + }, + { + "epoch": 0.1963012787802879, + "grad_norm": 0.12343593686819077, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 50780 + }, + { + "epoch": 0.1963399359836712, + "grad_norm": 0.11526413261890411, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 50790 + }, + { + "epoch": 0.19637859318705447, + "grad_norm": 0.10543012619018555, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 50800 + }, + { + "epoch": 0.19641725039043775, + "grad_norm": 0.09727109223604202, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 50810 + }, + { + "epoch": 0.19645590759382103, + "grad_norm": 0.12500298023223877, + "learning_rate": 0.002, + "loss": 2.357, + "step": 50820 + }, + { + "epoch": 0.1964945647972043, + "grad_norm": 0.10165640711784363, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 50830 + }, + { + "epoch": 0.1965332220005876, + "grad_norm": 0.11737733334302902, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 50840 + }, + { + "epoch": 0.19657187920397087, + "grad_norm": 0.12259113043546677, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 50850 + }, + { + "epoch": 0.19661053640735415, + "grad_norm": 0.10291915386915207, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 50860 + }, + { + "epoch": 0.19664919361073743, + "grad_norm": 0.0999462828040123, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 50870 + }, + { + "epoch": 0.1966878508141207, + "grad_norm": 0.10957477986812592, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 50880 + }, + { + "epoch": 0.196726508017504, + "grad_norm": 0.10866300761699677, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 50890 + }, + { + "epoch": 0.19676516522088727, + "grad_norm": 0.4267626106739044, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 50900 + }, + { + "epoch": 0.19680382242427052, + "grad_norm": 0.13349512219429016, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 50910 + }, + { + "epoch": 0.1968424796276538, + "grad_norm": 0.10224291682243347, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 50920 + }, + { + "epoch": 0.19688113683103708, + "grad_norm": 0.10515494644641876, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 50930 + }, + { + "epoch": 0.19691979403442036, + "grad_norm": 0.12216347455978394, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 50940 + }, + { + "epoch": 0.19695845123780364, + "grad_norm": 0.10043194890022278, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 50950 + }, + { + "epoch": 0.19699710844118692, + "grad_norm": 0.10455900430679321, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 50960 + }, + { + "epoch": 0.1970357656445702, + "grad_norm": 0.17741772532463074, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 50970 + }, + { + "epoch": 0.19707442284795348, + "grad_norm": 0.11157890409231186, + "learning_rate": 0.002, + "loss": 2.35, + "step": 50980 + }, + { + "epoch": 0.19711308005133676, + "grad_norm": 0.09297435730695724, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 50990 + }, + { + "epoch": 0.19715173725472004, + "grad_norm": 0.1312807947397232, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 51000 + }, + { + "epoch": 0.19719039445810332, + "grad_norm": 0.11647715419530869, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 51010 + }, + { + "epoch": 0.1972290516614866, + "grad_norm": 0.12451427429914474, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 51020 + }, + { + "epoch": 0.19726770886486988, + "grad_norm": 0.10510771721601486, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 51030 + }, + { + "epoch": 0.19730636606825316, + "grad_norm": 0.11461485922336578, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 51040 + }, + { + "epoch": 0.19734502327163644, + "grad_norm": 0.12008356302976608, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 51050 + }, + { + "epoch": 0.19738368047501972, + "grad_norm": 0.08818504959344864, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 51060 + }, + { + "epoch": 0.197422337678403, + "grad_norm": 0.1345818042755127, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 51070 + }, + { + "epoch": 0.19746099488178628, + "grad_norm": 0.10152529925107956, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 51080 + }, + { + "epoch": 0.19749965208516956, + "grad_norm": 0.11092906445264816, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 51090 + }, + { + "epoch": 0.19753830928855282, + "grad_norm": 0.09306729584932327, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 51100 + }, + { + "epoch": 0.1975769664919361, + "grad_norm": 0.15187200903892517, + "learning_rate": 0.002, + "loss": 2.348, + "step": 51110 + }, + { + "epoch": 0.19761562369531938, + "grad_norm": 0.1150587648153305, + "learning_rate": 0.002, + "loss": 2.365, + "step": 51120 + }, + { + "epoch": 0.19765428089870266, + "grad_norm": 0.114975206553936, + "learning_rate": 0.002, + "loss": 2.365, + "step": 51130 + }, + { + "epoch": 0.19769293810208594, + "grad_norm": 0.11752933263778687, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 51140 + }, + { + "epoch": 0.19773159530546922, + "grad_norm": 0.11128650605678558, + "learning_rate": 0.002, + "loss": 2.3751, + "step": 51150 + }, + { + "epoch": 0.1977702525088525, + "grad_norm": 0.12327593564987183, + "learning_rate": 0.002, + "loss": 2.333, + "step": 51160 + }, + { + "epoch": 0.19780890971223578, + "grad_norm": 0.09861862659454346, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 51170 + }, + { + "epoch": 0.19784756691561906, + "grad_norm": 0.10292232781648636, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 51180 + }, + { + "epoch": 0.19788622411900234, + "grad_norm": 0.10943485051393509, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 51190 + }, + { + "epoch": 0.19792488132238562, + "grad_norm": 0.10931216925382614, + "learning_rate": 0.002, + "loss": 2.3815, + "step": 51200 + }, + { + "epoch": 0.1979635385257689, + "grad_norm": 0.11949874460697174, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 51210 + }, + { + "epoch": 0.19800219572915218, + "grad_norm": 0.10317299515008926, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 51220 + }, + { + "epoch": 0.19804085293253545, + "grad_norm": 0.11775912344455719, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 51230 + }, + { + "epoch": 0.19807951013591873, + "grad_norm": 0.1264687180519104, + "learning_rate": 0.002, + "loss": 2.3782, + "step": 51240 + }, + { + "epoch": 0.19811816733930201, + "grad_norm": 0.10325953364372253, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 51250 + }, + { + "epoch": 0.1981568245426853, + "grad_norm": 0.129286527633667, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 51260 + }, + { + "epoch": 0.19819548174606857, + "grad_norm": 0.10436193645000458, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 51270 + }, + { + "epoch": 0.19823413894945185, + "grad_norm": 0.10270849615335464, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 51280 + }, + { + "epoch": 0.1982727961528351, + "grad_norm": 0.09914952516555786, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 51290 + }, + { + "epoch": 0.1983114533562184, + "grad_norm": 0.10150845348834991, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 51300 + }, + { + "epoch": 0.19835011055960167, + "grad_norm": 0.11425089091062546, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 51310 + }, + { + "epoch": 0.19838876776298495, + "grad_norm": 0.10831782966852188, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 51320 + }, + { + "epoch": 0.19842742496636823, + "grad_norm": 0.09986311942338943, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 51330 + }, + { + "epoch": 0.1984660821697515, + "grad_norm": 0.10581366717815399, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 51340 + }, + { + "epoch": 0.1985047393731348, + "grad_norm": 0.09615137428045273, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 51350 + }, + { + "epoch": 0.19854339657651807, + "grad_norm": 0.12368248403072357, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 51360 + }, + { + "epoch": 0.19858205377990135, + "grad_norm": 0.11965450644493103, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 51370 + }, + { + "epoch": 0.19862071098328463, + "grad_norm": 0.1004866436123848, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 51380 + }, + { + "epoch": 0.1986593681866679, + "grad_norm": 0.112697534263134, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 51390 + }, + { + "epoch": 0.1986980253900512, + "grad_norm": 0.09046253561973572, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 51400 + }, + { + "epoch": 0.19873668259343447, + "grad_norm": 0.09781434386968613, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 51410 + }, + { + "epoch": 0.19877533979681775, + "grad_norm": 0.11699929088354111, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 51420 + }, + { + "epoch": 0.19881399700020103, + "grad_norm": 0.10510469973087311, + "learning_rate": 0.002, + "loss": 2.361, + "step": 51430 + }, + { + "epoch": 0.1988526542035843, + "grad_norm": 0.10240361839532852, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 51440 + }, + { + "epoch": 0.19889131140696759, + "grad_norm": 0.09002921730279922, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 51450 + }, + { + "epoch": 0.19892996861035087, + "grad_norm": 0.12872596085071564, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 51460 + }, + { + "epoch": 0.19896862581373412, + "grad_norm": 0.10436994582414627, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 51470 + }, + { + "epoch": 0.1990072830171174, + "grad_norm": 0.10890268534421921, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 51480 + }, + { + "epoch": 0.19904594022050068, + "grad_norm": 0.1062799021601677, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 51490 + }, + { + "epoch": 0.19908459742388396, + "grad_norm": 0.11125991493463516, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 51500 + }, + { + "epoch": 0.19912325462726724, + "grad_norm": 0.11942413449287415, + "learning_rate": 0.002, + "loss": 2.3825, + "step": 51510 + }, + { + "epoch": 0.19916191183065052, + "grad_norm": 0.11848786473274231, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 51520 + }, + { + "epoch": 0.1992005690340338, + "grad_norm": 0.10823323577642441, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 51530 + }, + { + "epoch": 0.19923922623741708, + "grad_norm": 0.1108582392334938, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 51540 + }, + { + "epoch": 0.19927788344080036, + "grad_norm": 0.1172277182340622, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 51550 + }, + { + "epoch": 0.19931654064418364, + "grad_norm": 0.19322575628757477, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 51560 + }, + { + "epoch": 0.19935519784756692, + "grad_norm": 0.11942016333341599, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 51570 + }, + { + "epoch": 0.1993938550509502, + "grad_norm": 0.1157030388712883, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 51580 + }, + { + "epoch": 0.19943251225433348, + "grad_norm": 0.09611659497022629, + "learning_rate": 0.002, + "loss": 2.3808, + "step": 51590 + }, + { + "epoch": 0.19947116945771676, + "grad_norm": 0.12318339943885803, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 51600 + }, + { + "epoch": 0.19950982666110004, + "grad_norm": 0.1084495484828949, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 51610 + }, + { + "epoch": 0.19954848386448332, + "grad_norm": 0.10069076716899872, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 51620 + }, + { + "epoch": 0.1995871410678666, + "grad_norm": 0.1215478777885437, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 51630 + }, + { + "epoch": 0.19962579827124988, + "grad_norm": 0.11098402738571167, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 51640 + }, + { + "epoch": 0.19966445547463316, + "grad_norm": 0.10591264814138412, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 51650 + }, + { + "epoch": 0.1997031126780164, + "grad_norm": 0.0950435921549797, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 51660 + }, + { + "epoch": 0.1997417698813997, + "grad_norm": 0.11709853261709213, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 51670 + }, + { + "epoch": 0.19978042708478297, + "grad_norm": 0.09581815451383591, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 51680 + }, + { + "epoch": 0.19981908428816625, + "grad_norm": 0.1009148582816124, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 51690 + }, + { + "epoch": 0.19985774149154953, + "grad_norm": 0.1481183022260666, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 51700 + }, + { + "epoch": 0.1998963986949328, + "grad_norm": 0.10128211230039597, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 51710 + }, + { + "epoch": 0.1999350558983161, + "grad_norm": 0.10426509380340576, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 51720 + }, + { + "epoch": 0.19997371310169937, + "grad_norm": 0.11983291804790497, + "learning_rate": 0.002, + "loss": 2.3816, + "step": 51730 + }, + { + "epoch": 0.20001237030508265, + "grad_norm": 0.102561816573143, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 51740 + }, + { + "epoch": 0.20005102750846593, + "grad_norm": 0.10453730076551437, + "learning_rate": 0.002, + "loss": 2.357, + "step": 51750 + }, + { + "epoch": 0.2000896847118492, + "grad_norm": 0.12531380355358124, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 51760 + }, + { + "epoch": 0.2001283419152325, + "grad_norm": 0.10815022140741348, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 51770 + }, + { + "epoch": 0.20016699911861577, + "grad_norm": 0.11682631075382233, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 51780 + }, + { + "epoch": 0.20020565632199905, + "grad_norm": 0.10224246978759766, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 51790 + }, + { + "epoch": 0.20024431352538233, + "grad_norm": 0.1255955845117569, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 51800 + }, + { + "epoch": 0.2002829707287656, + "grad_norm": 0.11466054618358612, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 51810 + }, + { + "epoch": 0.2003216279321489, + "grad_norm": 0.10104741901159286, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 51820 + }, + { + "epoch": 0.20036028513553217, + "grad_norm": 0.10511884838342667, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 51830 + }, + { + "epoch": 0.20039894233891542, + "grad_norm": 0.11252196878194809, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 51840 + }, + { + "epoch": 0.2004375995422987, + "grad_norm": 0.10455108433961868, + "learning_rate": 0.002, + "loss": 2.3836, + "step": 51850 + }, + { + "epoch": 0.20047625674568198, + "grad_norm": 0.10854092985391617, + "learning_rate": 0.002, + "loss": 2.362, + "step": 51860 + }, + { + "epoch": 0.20051491394906526, + "grad_norm": 0.12762337923049927, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 51870 + }, + { + "epoch": 0.20055357115244854, + "grad_norm": 0.12892848253250122, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 51880 + }, + { + "epoch": 0.20059222835583182, + "grad_norm": 0.09376315027475357, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 51890 + }, + { + "epoch": 0.2006308855592151, + "grad_norm": 0.126753568649292, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 51900 + }, + { + "epoch": 0.20066954276259838, + "grad_norm": 0.11163308471441269, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 51910 + }, + { + "epoch": 0.20070819996598166, + "grad_norm": 0.09663382172584534, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 51920 + }, + { + "epoch": 0.20074685716936494, + "grad_norm": 0.10127527266740799, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 51930 + }, + { + "epoch": 0.20078551437274822, + "grad_norm": 0.11793727427721024, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 51940 + }, + { + "epoch": 0.2008241715761315, + "grad_norm": 0.10714305192232132, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 51950 + }, + { + "epoch": 0.20086282877951478, + "grad_norm": 0.09726440906524658, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 51960 + }, + { + "epoch": 0.20090148598289806, + "grad_norm": 0.14774122834205627, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 51970 + }, + { + "epoch": 0.20094014318628134, + "grad_norm": 0.10895711183547974, + "learning_rate": 0.002, + "loss": 2.36, + "step": 51980 + }, + { + "epoch": 0.20097880038966462, + "grad_norm": 0.12761323153972626, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 51990 + }, + { + "epoch": 0.2010174575930479, + "grad_norm": 0.10693914443254471, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 52000 + }, + { + "epoch": 0.20105611479643118, + "grad_norm": 0.10339793562889099, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 52010 + }, + { + "epoch": 0.20109477199981446, + "grad_norm": 0.11272173374891281, + "learning_rate": 0.002, + "loss": 2.373, + "step": 52020 + }, + { + "epoch": 0.2011334292031977, + "grad_norm": 0.19333140552043915, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 52030 + }, + { + "epoch": 0.201172086406581, + "grad_norm": 0.12649370729923248, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 52040 + }, + { + "epoch": 0.20121074360996427, + "grad_norm": 0.10977057367563248, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 52050 + }, + { + "epoch": 0.20124940081334755, + "grad_norm": 0.12432746589183807, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 52060 + }, + { + "epoch": 0.20128805801673083, + "grad_norm": 0.11161592602729797, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 52070 + }, + { + "epoch": 0.2013267152201141, + "grad_norm": 0.10561925172805786, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 52080 + }, + { + "epoch": 0.2013653724234974, + "grad_norm": 0.10858765244483948, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 52090 + }, + { + "epoch": 0.20140402962688067, + "grad_norm": 0.09681866317987442, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 52100 + }, + { + "epoch": 0.20144268683026395, + "grad_norm": 0.09604979306459427, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 52110 + }, + { + "epoch": 0.20148134403364723, + "grad_norm": 0.10785799473524094, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 52120 + }, + { + "epoch": 0.2015200012370305, + "grad_norm": 0.11797741055488586, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 52130 + }, + { + "epoch": 0.2015586584404138, + "grad_norm": 0.1102878525853157, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 52140 + }, + { + "epoch": 0.20159731564379707, + "grad_norm": 0.11702249944210052, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 52150 + }, + { + "epoch": 0.20163597284718035, + "grad_norm": 0.13625964522361755, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 52160 + }, + { + "epoch": 0.20167463005056363, + "grad_norm": 0.10420363396406174, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 52170 + }, + { + "epoch": 0.2017132872539469, + "grad_norm": 0.09645235538482666, + "learning_rate": 0.002, + "loss": 2.366, + "step": 52180 + }, + { + "epoch": 0.2017519444573302, + "grad_norm": 0.09708897024393082, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 52190 + }, + { + "epoch": 0.20179060166071347, + "grad_norm": 0.1467909961938858, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 52200 + }, + { + "epoch": 0.20182925886409672, + "grad_norm": 0.10608675330877304, + "learning_rate": 0.002, + "loss": 2.347, + "step": 52210 + }, + { + "epoch": 0.20186791606748, + "grad_norm": 0.11547255516052246, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 52220 + }, + { + "epoch": 0.20190657327086328, + "grad_norm": 0.10597088187932968, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 52230 + }, + { + "epoch": 0.20194523047424656, + "grad_norm": 0.11478706449270248, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 52240 + }, + { + "epoch": 0.20198388767762984, + "grad_norm": 0.09801722317934036, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 52250 + }, + { + "epoch": 0.20202254488101312, + "grad_norm": 0.10778038203716278, + "learning_rate": 0.002, + "loss": 2.363, + "step": 52260 + }, + { + "epoch": 0.2020612020843964, + "grad_norm": 0.10066181421279907, + "learning_rate": 0.002, + "loss": 2.362, + "step": 52270 + }, + { + "epoch": 0.20209985928777968, + "grad_norm": 0.11359509825706482, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 52280 + }, + { + "epoch": 0.20213851649116296, + "grad_norm": 0.10373353958129883, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 52290 + }, + { + "epoch": 0.20217717369454624, + "grad_norm": 0.12140042334794998, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 52300 + }, + { + "epoch": 0.20221583089792952, + "grad_norm": 0.1029222160577774, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 52310 + }, + { + "epoch": 0.2022544881013128, + "grad_norm": 0.10438024997711182, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 52320 + }, + { + "epoch": 0.20229314530469608, + "grad_norm": 0.1253383904695511, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 52330 + }, + { + "epoch": 0.20233180250807936, + "grad_norm": 0.0986919105052948, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 52340 + }, + { + "epoch": 0.20237045971146264, + "grad_norm": 0.15070602297782898, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 52350 + }, + { + "epoch": 0.20240911691484592, + "grad_norm": 0.1333739310503006, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 52360 + }, + { + "epoch": 0.2024477741182292, + "grad_norm": 0.10192544013261795, + "learning_rate": 0.002, + "loss": 2.371, + "step": 52370 + }, + { + "epoch": 0.20248643132161248, + "grad_norm": 0.11291380226612091, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 52380 + }, + { + "epoch": 0.20252508852499576, + "grad_norm": 0.11298597604036331, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 52390 + }, + { + "epoch": 0.202563745728379, + "grad_norm": 0.11294633895158768, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 52400 + }, + { + "epoch": 0.2026024029317623, + "grad_norm": 0.09718062728643417, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 52410 + }, + { + "epoch": 0.20264106013514557, + "grad_norm": 0.09357758611440659, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 52420 + }, + { + "epoch": 0.20267971733852885, + "grad_norm": 0.10638459771871567, + "learning_rate": 0.002, + "loss": 2.3755, + "step": 52430 + }, + { + "epoch": 0.20271837454191213, + "grad_norm": 0.10469187796115875, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 52440 + }, + { + "epoch": 0.2027570317452954, + "grad_norm": 0.11636140942573547, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 52450 + }, + { + "epoch": 0.2027956889486787, + "grad_norm": 0.10756337642669678, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 52460 + }, + { + "epoch": 0.20283434615206197, + "grad_norm": 0.10654626041650772, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 52470 + }, + { + "epoch": 0.20287300335544525, + "grad_norm": 0.08959631621837616, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 52480 + }, + { + "epoch": 0.20291166055882853, + "grad_norm": 0.10842543840408325, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 52490 + }, + { + "epoch": 0.2029503177622118, + "grad_norm": 0.12206081300973892, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 52500 + }, + { + "epoch": 0.2029889749655951, + "grad_norm": 0.1023700013756752, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 52510 + }, + { + "epoch": 0.20302763216897837, + "grad_norm": 0.11099164187908173, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 52520 + }, + { + "epoch": 0.20306628937236165, + "grad_norm": 0.10270822048187256, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 52530 + }, + { + "epoch": 0.20310494657574493, + "grad_norm": 0.11300281435251236, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 52540 + }, + { + "epoch": 0.2031436037791282, + "grad_norm": 0.12924693524837494, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 52550 + }, + { + "epoch": 0.2031822609825115, + "grad_norm": 0.11208979785442352, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 52560 + }, + { + "epoch": 0.20322091818589477, + "grad_norm": 0.1158527284860611, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 52570 + }, + { + "epoch": 0.20325957538927802, + "grad_norm": 0.11639354377985, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 52580 + }, + { + "epoch": 0.2032982325926613, + "grad_norm": 0.09775479882955551, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 52590 + }, + { + "epoch": 0.20333688979604458, + "grad_norm": 0.0977337658405304, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 52600 + }, + { + "epoch": 0.20337554699942786, + "grad_norm": 0.1110336184501648, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 52610 + }, + { + "epoch": 0.20341420420281114, + "grad_norm": 0.10928376764059067, + "learning_rate": 0.002, + "loss": 2.348, + "step": 52620 + }, + { + "epoch": 0.20345286140619442, + "grad_norm": 0.11134651303291321, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 52630 + }, + { + "epoch": 0.2034915186095777, + "grad_norm": 0.10814300179481506, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 52640 + }, + { + "epoch": 0.20353017581296098, + "grad_norm": 0.10304401814937592, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 52650 + }, + { + "epoch": 0.20356883301634426, + "grad_norm": 0.10869714617729187, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 52660 + }, + { + "epoch": 0.20360749021972754, + "grad_norm": 0.10580986738204956, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 52670 + }, + { + "epoch": 0.20364614742311082, + "grad_norm": 0.09266664832830429, + "learning_rate": 0.002, + "loss": 2.345, + "step": 52680 + }, + { + "epoch": 0.2036848046264941, + "grad_norm": 0.10514344274997711, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 52690 + }, + { + "epoch": 0.20372346182987738, + "grad_norm": 0.10906578600406647, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 52700 + }, + { + "epoch": 0.20376211903326066, + "grad_norm": 0.11037307977676392, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 52710 + }, + { + "epoch": 0.20380077623664394, + "grad_norm": 0.09810831397771835, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 52720 + }, + { + "epoch": 0.20383943344002722, + "grad_norm": 0.1301860213279724, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 52730 + }, + { + "epoch": 0.2038780906434105, + "grad_norm": 0.10767398029565811, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 52740 + }, + { + "epoch": 0.20391674784679378, + "grad_norm": 0.09629841148853302, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 52750 + }, + { + "epoch": 0.20395540505017706, + "grad_norm": 0.12146098166704178, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 52760 + }, + { + "epoch": 0.20399406225356032, + "grad_norm": 0.1003902480006218, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 52770 + }, + { + "epoch": 0.2040327194569436, + "grad_norm": 0.11783084273338318, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 52780 + }, + { + "epoch": 0.20407137666032688, + "grad_norm": 0.10537533462047577, + "learning_rate": 0.002, + "loss": 2.3801, + "step": 52790 + }, + { + "epoch": 0.20411003386371016, + "grad_norm": 0.10663650184869766, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 52800 + }, + { + "epoch": 0.20414869106709344, + "grad_norm": 0.11123210191726685, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 52810 + }, + { + "epoch": 0.20418734827047672, + "grad_norm": 0.09847772866487503, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 52820 + }, + { + "epoch": 0.20422600547386, + "grad_norm": 0.10249926149845123, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 52830 + }, + { + "epoch": 0.20426466267724327, + "grad_norm": 0.16679657995700836, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 52840 + }, + { + "epoch": 0.20430331988062655, + "grad_norm": 0.11221053451299667, + "learning_rate": 0.002, + "loss": 2.3833, + "step": 52850 + }, + { + "epoch": 0.20434197708400983, + "grad_norm": 0.11050441116094589, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 52860 + }, + { + "epoch": 0.20438063428739311, + "grad_norm": 0.10989818722009659, + "learning_rate": 0.002, + "loss": 2.377, + "step": 52870 + }, + { + "epoch": 0.2044192914907764, + "grad_norm": 0.1037473976612091, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 52880 + }, + { + "epoch": 0.20445794869415967, + "grad_norm": 0.11152027547359467, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 52890 + }, + { + "epoch": 0.20449660589754295, + "grad_norm": 0.11552037298679352, + "learning_rate": 0.002, + "loss": 2.364, + "step": 52900 + }, + { + "epoch": 0.20453526310092623, + "grad_norm": 0.10706567764282227, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 52910 + }, + { + "epoch": 0.20457392030430951, + "grad_norm": 0.10491839796304703, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 52920 + }, + { + "epoch": 0.2046125775076928, + "grad_norm": 0.11457972228527069, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 52930 + }, + { + "epoch": 0.20465123471107607, + "grad_norm": 0.11204688251018524, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 52940 + }, + { + "epoch": 0.20468989191445933, + "grad_norm": 0.1085963174700737, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 52950 + }, + { + "epoch": 0.2047285491178426, + "grad_norm": 0.11827849596738815, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 52960 + }, + { + "epoch": 0.2047672063212259, + "grad_norm": 0.10177775472402573, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 52970 + }, + { + "epoch": 0.20480586352460917, + "grad_norm": 0.09992183744907379, + "learning_rate": 0.002, + "loss": 2.366, + "step": 52980 + }, + { + "epoch": 0.20484452072799245, + "grad_norm": 0.10621283203363419, + "learning_rate": 0.002, + "loss": 2.356, + "step": 52990 + }, + { + "epoch": 0.20488317793137573, + "grad_norm": 0.10804471373558044, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 53000 + }, + { + "epoch": 0.204921835134759, + "grad_norm": 0.10280714184045792, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 53010 + }, + { + "epoch": 0.20496049233814229, + "grad_norm": 0.10437464714050293, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 53020 + }, + { + "epoch": 0.20499914954152557, + "grad_norm": 0.11850042641162872, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 53030 + }, + { + "epoch": 0.20503780674490885, + "grad_norm": 0.11176618188619614, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 53040 + }, + { + "epoch": 0.20507646394829213, + "grad_norm": 0.095208078622818, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 53050 + }, + { + "epoch": 0.2051151211516754, + "grad_norm": 0.09806636720895767, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 53060 + }, + { + "epoch": 0.20515377835505869, + "grad_norm": 0.10049755126237869, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 53070 + }, + { + "epoch": 0.20519243555844197, + "grad_norm": 0.11506608873605728, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 53080 + }, + { + "epoch": 0.20523109276182525, + "grad_norm": 0.1036606952548027, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 53090 + }, + { + "epoch": 0.20526974996520853, + "grad_norm": 0.10761820524930954, + "learning_rate": 0.002, + "loss": 2.355, + "step": 53100 + }, + { + "epoch": 0.2053084071685918, + "grad_norm": 0.10791739821434021, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 53110 + }, + { + "epoch": 0.20534706437197509, + "grad_norm": 0.09568759053945541, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 53120 + }, + { + "epoch": 0.20538572157535837, + "grad_norm": 0.1140003353357315, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 53130 + }, + { + "epoch": 0.20542437877874162, + "grad_norm": 0.1259390115737915, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 53140 + }, + { + "epoch": 0.2054630359821249, + "grad_norm": 0.1129179373383522, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 53150 + }, + { + "epoch": 0.20550169318550818, + "grad_norm": 0.09243609011173248, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 53160 + }, + { + "epoch": 0.20554035038889146, + "grad_norm": 0.10274989157915115, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 53170 + }, + { + "epoch": 0.20557900759227474, + "grad_norm": 0.10646963864564896, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 53180 + }, + { + "epoch": 0.20561766479565802, + "grad_norm": 0.11503088474273682, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 53190 + }, + { + "epoch": 0.2056563219990413, + "grad_norm": 0.10646315664052963, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 53200 + }, + { + "epoch": 0.20569497920242458, + "grad_norm": 0.0975303053855896, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 53210 + }, + { + "epoch": 0.20573363640580786, + "grad_norm": 0.11746834218502045, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 53220 + }, + { + "epoch": 0.20577229360919114, + "grad_norm": 0.10905087739229202, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 53230 + }, + { + "epoch": 0.20581095081257442, + "grad_norm": 0.11742258071899414, + "learning_rate": 0.002, + "loss": 2.382, + "step": 53240 + }, + { + "epoch": 0.2058496080159577, + "grad_norm": 0.09772148728370667, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 53250 + }, + { + "epoch": 0.20588826521934098, + "grad_norm": 0.12421903759241104, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 53260 + }, + { + "epoch": 0.20592692242272426, + "grad_norm": 0.10841967165470123, + "learning_rate": 0.002, + "loss": 2.367, + "step": 53270 + }, + { + "epoch": 0.20596557962610754, + "grad_norm": 0.1038578525185585, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 53280 + }, + { + "epoch": 0.20600423682949082, + "grad_norm": 0.10237884521484375, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 53290 + }, + { + "epoch": 0.2060428940328741, + "grad_norm": 0.10999718308448792, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 53300 + }, + { + "epoch": 0.20608155123625738, + "grad_norm": 0.18075010180473328, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 53310 + }, + { + "epoch": 0.20612020843964066, + "grad_norm": 0.1075098067522049, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 53320 + }, + { + "epoch": 0.2061588656430239, + "grad_norm": 0.11189239472150803, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 53330 + }, + { + "epoch": 0.2061975228464072, + "grad_norm": 0.10663151741027832, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 53340 + }, + { + "epoch": 0.20623618004979047, + "grad_norm": 0.11522796005010605, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 53350 + }, + { + "epoch": 0.20627483725317375, + "grad_norm": 0.0978827029466629, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 53360 + }, + { + "epoch": 0.20631349445655703, + "grad_norm": 0.11668947339057922, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 53370 + }, + { + "epoch": 0.2063521516599403, + "grad_norm": 0.1104608029127121, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 53380 + }, + { + "epoch": 0.2063908088633236, + "grad_norm": 0.11412586271762848, + "learning_rate": 0.002, + "loss": 2.352, + "step": 53390 + }, + { + "epoch": 0.20642946606670687, + "grad_norm": 0.1089489609003067, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 53400 + }, + { + "epoch": 0.20646812327009015, + "grad_norm": 0.18193396925926208, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 53410 + }, + { + "epoch": 0.20650678047347343, + "grad_norm": 0.1322345733642578, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 53420 + }, + { + "epoch": 0.2065454376768567, + "grad_norm": 0.10003045946359634, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 53430 + }, + { + "epoch": 0.20658409488024, + "grad_norm": 0.08871643245220184, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 53440 + }, + { + "epoch": 0.20662275208362327, + "grad_norm": 0.10830266773700714, + "learning_rate": 0.002, + "loss": 2.358, + "step": 53450 + }, + { + "epoch": 0.20666140928700655, + "grad_norm": 0.09646753966808319, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 53460 + }, + { + "epoch": 0.20670006649038983, + "grad_norm": 0.09747888892889023, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 53470 + }, + { + "epoch": 0.2067387236937731, + "grad_norm": 0.11742375046014786, + "learning_rate": 0.002, + "loss": 2.362, + "step": 53480 + }, + { + "epoch": 0.2067773808971564, + "grad_norm": 0.10999744385480881, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 53490 + }, + { + "epoch": 0.20681603810053967, + "grad_norm": 0.11388671398162842, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 53500 + }, + { + "epoch": 0.20685469530392292, + "grad_norm": 0.10879701375961304, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 53510 + }, + { + "epoch": 0.2068933525073062, + "grad_norm": 0.12783634662628174, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 53520 + }, + { + "epoch": 0.20693200971068948, + "grad_norm": 0.10533930361270905, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 53530 + }, + { + "epoch": 0.20697066691407276, + "grad_norm": 0.12564559280872345, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 53540 + }, + { + "epoch": 0.20700932411745604, + "grad_norm": 0.1137334555387497, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 53550 + }, + { + "epoch": 0.20704798132083932, + "grad_norm": 0.1020798459649086, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 53560 + }, + { + "epoch": 0.2070866385242226, + "grad_norm": 0.11555466800928116, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 53570 + }, + { + "epoch": 0.20712529572760588, + "grad_norm": 0.1030665710568428, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 53580 + }, + { + "epoch": 0.20716395293098916, + "grad_norm": 0.13051795959472656, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 53590 + }, + { + "epoch": 0.20720261013437244, + "grad_norm": 0.10592159628868103, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 53600 + }, + { + "epoch": 0.20724126733775572, + "grad_norm": 0.6249107122421265, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 53610 + }, + { + "epoch": 0.207279924541139, + "grad_norm": 0.11390656977891922, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 53620 + }, + { + "epoch": 0.20731858174452228, + "grad_norm": 0.1781582087278366, + "learning_rate": 0.002, + "loss": 2.384, + "step": 53630 + }, + { + "epoch": 0.20735723894790556, + "grad_norm": 0.11048318445682526, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 53640 + }, + { + "epoch": 0.20739589615128884, + "grad_norm": 0.11458776891231537, + "learning_rate": 0.002, + "loss": 2.354, + "step": 53650 + }, + { + "epoch": 0.20743455335467212, + "grad_norm": 0.11732518672943115, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 53660 + }, + { + "epoch": 0.2074732105580554, + "grad_norm": 0.0880560651421547, + "learning_rate": 0.002, + "loss": 2.362, + "step": 53670 + }, + { + "epoch": 0.20751186776143868, + "grad_norm": 0.08990538120269775, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 53680 + }, + { + "epoch": 0.20755052496482196, + "grad_norm": 0.10926368832588196, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 53690 + }, + { + "epoch": 0.2075891821682052, + "grad_norm": 0.11573584377765656, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 53700 + }, + { + "epoch": 0.2076278393715885, + "grad_norm": 0.1370205134153366, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 53710 + }, + { + "epoch": 0.20766649657497177, + "grad_norm": 0.11262237280607224, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 53720 + }, + { + "epoch": 0.20770515377835505, + "grad_norm": 0.1154111921787262, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 53730 + }, + { + "epoch": 0.20774381098173833, + "grad_norm": 0.09423351287841797, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 53740 + }, + { + "epoch": 0.2077824681851216, + "grad_norm": 0.10815108567476273, + "learning_rate": 0.002, + "loss": 2.358, + "step": 53750 + }, + { + "epoch": 0.2078211253885049, + "grad_norm": 0.10672447830438614, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 53760 + }, + { + "epoch": 0.20785978259188817, + "grad_norm": 0.09784535318613052, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 53770 + }, + { + "epoch": 0.20789843979527145, + "grad_norm": 0.12865093350410461, + "learning_rate": 0.002, + "loss": 2.377, + "step": 53780 + }, + { + "epoch": 0.20793709699865473, + "grad_norm": 0.10319879651069641, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 53790 + }, + { + "epoch": 0.207975754202038, + "grad_norm": 0.11751110851764679, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 53800 + }, + { + "epoch": 0.2080144114054213, + "grad_norm": 0.10024145245552063, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 53810 + }, + { + "epoch": 0.20805306860880457, + "grad_norm": 0.11266933381557465, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 53820 + }, + { + "epoch": 0.20809172581218785, + "grad_norm": 0.09626465290784836, + "learning_rate": 0.002, + "loss": 2.37, + "step": 53830 + }, + { + "epoch": 0.20813038301557113, + "grad_norm": 0.10873950272798538, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 53840 + }, + { + "epoch": 0.2081690402189544, + "grad_norm": 0.09650988131761551, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 53850 + }, + { + "epoch": 0.2082076974223377, + "grad_norm": 0.09024965018033981, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 53860 + }, + { + "epoch": 0.20824635462572097, + "grad_norm": 0.13784830272197723, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 53870 + }, + { + "epoch": 0.20828501182910422, + "grad_norm": 0.0888957530260086, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 53880 + }, + { + "epoch": 0.2083236690324875, + "grad_norm": 0.11602488905191422, + "learning_rate": 0.002, + "loss": 2.351, + "step": 53890 + }, + { + "epoch": 0.20836232623587078, + "grad_norm": 0.09725574404001236, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 53900 + }, + { + "epoch": 0.20840098343925406, + "grad_norm": 0.10566900670528412, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 53910 + }, + { + "epoch": 0.20843964064263734, + "grad_norm": 0.11023826897144318, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 53920 + }, + { + "epoch": 0.20847829784602062, + "grad_norm": 0.1042385846376419, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 53930 + }, + { + "epoch": 0.2085169550494039, + "grad_norm": 0.11207899451255798, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 53940 + }, + { + "epoch": 0.20855561225278718, + "grad_norm": 0.10884620994329453, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 53950 + }, + { + "epoch": 0.20859426945617046, + "grad_norm": 0.10588477551937103, + "learning_rate": 0.002, + "loss": 2.364, + "step": 53960 + }, + { + "epoch": 0.20863292665955374, + "grad_norm": 0.10225141793489456, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 53970 + }, + { + "epoch": 0.20867158386293702, + "grad_norm": 0.11052332818508148, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 53980 + }, + { + "epoch": 0.2087102410663203, + "grad_norm": 0.12436167895793915, + "learning_rate": 0.002, + "loss": 2.37, + "step": 53990 + }, + { + "epoch": 0.20874889826970358, + "grad_norm": 0.10896704345941544, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 54000 + }, + { + "epoch": 0.20878755547308686, + "grad_norm": 0.11404981464147568, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 54010 + }, + { + "epoch": 0.20882621267647014, + "grad_norm": 0.12331146001815796, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 54020 + }, + { + "epoch": 0.20886486987985342, + "grad_norm": 0.09879045188426971, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 54030 + }, + { + "epoch": 0.2089035270832367, + "grad_norm": 0.11368494480848312, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 54040 + }, + { + "epoch": 0.20894218428661998, + "grad_norm": 0.13123731315135956, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 54050 + }, + { + "epoch": 0.20898084149000326, + "grad_norm": 0.10231734067201614, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 54060 + }, + { + "epoch": 0.2090194986933865, + "grad_norm": 0.10188768804073334, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 54070 + }, + { + "epoch": 0.2090581558967698, + "grad_norm": 0.10839677602052689, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 54080 + }, + { + "epoch": 0.20909681310015307, + "grad_norm": 0.09975928068161011, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 54090 + }, + { + "epoch": 0.20913547030353635, + "grad_norm": 0.11016577482223511, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 54100 + }, + { + "epoch": 0.20917412750691963, + "grad_norm": 0.11556573957204819, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 54110 + }, + { + "epoch": 0.2092127847103029, + "grad_norm": 0.10059158504009247, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 54120 + }, + { + "epoch": 0.2092514419136862, + "grad_norm": 0.11386764794588089, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 54130 + }, + { + "epoch": 0.20929009911706947, + "grad_norm": 0.13921064138412476, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 54140 + }, + { + "epoch": 0.20932875632045275, + "grad_norm": 0.1022169217467308, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 54150 + }, + { + "epoch": 0.20936741352383603, + "grad_norm": 0.10318666696548462, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 54160 + }, + { + "epoch": 0.2094060707272193, + "grad_norm": 0.1251523792743683, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 54170 + }, + { + "epoch": 0.2094447279306026, + "grad_norm": 0.11519023776054382, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 54180 + }, + { + "epoch": 0.20948338513398587, + "grad_norm": 0.09141872823238373, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 54190 + }, + { + "epoch": 0.20952204233736915, + "grad_norm": 0.11835416406393051, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 54200 + }, + { + "epoch": 0.20956069954075243, + "grad_norm": 0.1005050390958786, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 54210 + }, + { + "epoch": 0.2095993567441357, + "grad_norm": 0.1052476167678833, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 54220 + }, + { + "epoch": 0.209638013947519, + "grad_norm": 0.11354995518922806, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 54230 + }, + { + "epoch": 0.20967667115090227, + "grad_norm": 0.11486457288265228, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 54240 + }, + { + "epoch": 0.20971532835428552, + "grad_norm": 0.10793580114841461, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 54250 + }, + { + "epoch": 0.2097539855576688, + "grad_norm": 0.11628241837024689, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 54260 + }, + { + "epoch": 0.20979264276105208, + "grad_norm": 0.10537659376859665, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 54270 + }, + { + "epoch": 0.20983129996443536, + "grad_norm": 0.09963370114564896, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 54280 + }, + { + "epoch": 0.20986995716781864, + "grad_norm": 0.10262928158044815, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 54290 + }, + { + "epoch": 0.20990861437120192, + "grad_norm": 0.13343960046768188, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 54300 + }, + { + "epoch": 0.2099472715745852, + "grad_norm": 0.1169983446598053, + "learning_rate": 0.002, + "loss": 2.367, + "step": 54310 + }, + { + "epoch": 0.20998592877796848, + "grad_norm": 0.12719640135765076, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 54320 + }, + { + "epoch": 0.21002458598135176, + "grad_norm": 0.0954410657286644, + "learning_rate": 0.002, + "loss": 2.354, + "step": 54330 + }, + { + "epoch": 0.21006324318473504, + "grad_norm": 0.10446008294820786, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 54340 + }, + { + "epoch": 0.21010190038811832, + "grad_norm": 0.14668498933315277, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 54350 + }, + { + "epoch": 0.2101405575915016, + "grad_norm": 0.11192698031663895, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 54360 + }, + { + "epoch": 0.21017921479488488, + "grad_norm": 0.10454915463924408, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 54370 + }, + { + "epoch": 0.21021787199826816, + "grad_norm": 0.10452423244714737, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 54380 + }, + { + "epoch": 0.21025652920165144, + "grad_norm": 0.10957881063222885, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 54390 + }, + { + "epoch": 0.21029518640503472, + "grad_norm": 0.10211426019668579, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 54400 + }, + { + "epoch": 0.210333843608418, + "grad_norm": 0.09771328419446945, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 54410 + }, + { + "epoch": 0.21037250081180128, + "grad_norm": 0.10654677450656891, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 54420 + }, + { + "epoch": 0.21041115801518456, + "grad_norm": 0.10297355055809021, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 54430 + }, + { + "epoch": 0.21044981521856782, + "grad_norm": 0.094737708568573, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 54440 + }, + { + "epoch": 0.2104884724219511, + "grad_norm": 0.10669811069965363, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 54450 + }, + { + "epoch": 0.21052712962533437, + "grad_norm": 0.09515495598316193, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 54460 + }, + { + "epoch": 0.21056578682871765, + "grad_norm": 0.11773104220628738, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 54470 + }, + { + "epoch": 0.21060444403210093, + "grad_norm": 0.09968582540750504, + "learning_rate": 0.002, + "loss": 2.352, + "step": 54480 + }, + { + "epoch": 0.21064310123548421, + "grad_norm": 0.10157714784145355, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 54490 + }, + { + "epoch": 0.2106817584388675, + "grad_norm": 0.09814414381980896, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 54500 + }, + { + "epoch": 0.21072041564225077, + "grad_norm": 0.12160098552703857, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 54510 + }, + { + "epoch": 0.21075907284563405, + "grad_norm": 0.1207745298743248, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 54520 + }, + { + "epoch": 0.21079773004901733, + "grad_norm": 0.11184488236904144, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 54530 + }, + { + "epoch": 0.21083638725240061, + "grad_norm": 0.11148563027381897, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 54540 + }, + { + "epoch": 0.2108750444557839, + "grad_norm": 0.092194102704525, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 54550 + }, + { + "epoch": 0.21091370165916717, + "grad_norm": 0.11377835273742676, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 54560 + }, + { + "epoch": 0.21095235886255045, + "grad_norm": 0.10908497869968414, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 54570 + }, + { + "epoch": 0.21099101606593373, + "grad_norm": 0.12056691944599152, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 54580 + }, + { + "epoch": 0.21102967326931701, + "grad_norm": 0.09635305404663086, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 54590 + }, + { + "epoch": 0.2110683304727003, + "grad_norm": 0.09015733003616333, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 54600 + }, + { + "epoch": 0.21110698767608357, + "grad_norm": 0.1049027368426323, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 54610 + }, + { + "epoch": 0.21114564487946683, + "grad_norm": 0.11679041385650635, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 54620 + }, + { + "epoch": 0.2111843020828501, + "grad_norm": 0.11454702913761139, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 54630 + }, + { + "epoch": 0.21122295928623339, + "grad_norm": 0.11232933402061462, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 54640 + }, + { + "epoch": 0.21126161648961667, + "grad_norm": 0.10257542878389359, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 54650 + }, + { + "epoch": 0.21130027369299995, + "grad_norm": 0.09737318754196167, + "learning_rate": 0.002, + "loss": 2.375, + "step": 54660 + }, + { + "epoch": 0.21133893089638323, + "grad_norm": 0.1093873605132103, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 54670 + }, + { + "epoch": 0.2113775880997665, + "grad_norm": 0.10621381551027298, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 54680 + }, + { + "epoch": 0.21141624530314979, + "grad_norm": 0.1178363785147667, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 54690 + }, + { + "epoch": 0.21145490250653307, + "grad_norm": 0.10773173719644547, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 54700 + }, + { + "epoch": 0.21149355970991635, + "grad_norm": 0.12191834300756454, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 54710 + }, + { + "epoch": 0.21153221691329963, + "grad_norm": 0.10326214879751205, + "learning_rate": 0.002, + "loss": 2.343, + "step": 54720 + }, + { + "epoch": 0.2115708741166829, + "grad_norm": 0.12878748774528503, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 54730 + }, + { + "epoch": 0.21160953132006619, + "grad_norm": 0.10724682360887527, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 54740 + }, + { + "epoch": 0.21164818852344947, + "grad_norm": 0.09913008660078049, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 54750 + }, + { + "epoch": 0.21168684572683275, + "grad_norm": 0.1128566637635231, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 54760 + }, + { + "epoch": 0.21172550293021603, + "grad_norm": 0.10462232679128647, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 54770 + }, + { + "epoch": 0.2117641601335993, + "grad_norm": 0.12480421364307404, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 54780 + }, + { + "epoch": 0.21180281733698259, + "grad_norm": 0.10443200170993805, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 54790 + }, + { + "epoch": 0.21184147454036586, + "grad_norm": 0.09765107929706573, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 54800 + }, + { + "epoch": 0.21188013174374912, + "grad_norm": 0.10396154969930649, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 54810 + }, + { + "epoch": 0.2119187889471324, + "grad_norm": 0.11567749083042145, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 54820 + }, + { + "epoch": 0.21195744615051568, + "grad_norm": 0.09926524013280869, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 54830 + }, + { + "epoch": 0.21199610335389896, + "grad_norm": 0.1023450493812561, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 54840 + }, + { + "epoch": 0.21203476055728224, + "grad_norm": 0.13101065158843994, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 54850 + }, + { + "epoch": 0.21207341776066552, + "grad_norm": 0.1092870682477951, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 54860 + }, + { + "epoch": 0.2121120749640488, + "grad_norm": 0.10586284846067429, + "learning_rate": 0.002, + "loss": 2.371, + "step": 54870 + }, + { + "epoch": 0.21215073216743208, + "grad_norm": 0.1010589674115181, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 54880 + }, + { + "epoch": 0.21218938937081536, + "grad_norm": 0.22518308460712433, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 54890 + }, + { + "epoch": 0.21222804657419864, + "grad_norm": 0.0966985747218132, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 54900 + }, + { + "epoch": 0.21226670377758192, + "grad_norm": 0.0973641648888588, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 54910 + }, + { + "epoch": 0.2123053609809652, + "grad_norm": 0.10347139835357666, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 54920 + }, + { + "epoch": 0.21234401818434848, + "grad_norm": 0.10601606220006943, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 54930 + }, + { + "epoch": 0.21238267538773176, + "grad_norm": 0.12179460376501083, + "learning_rate": 0.002, + "loss": 2.344, + "step": 54940 + }, + { + "epoch": 0.21242133259111504, + "grad_norm": 0.11832549422979355, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 54950 + }, + { + "epoch": 0.21245998979449832, + "grad_norm": 0.11510959267616272, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 54960 + }, + { + "epoch": 0.2124986469978816, + "grad_norm": 0.10815947502851486, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 54970 + }, + { + "epoch": 0.21253730420126488, + "grad_norm": 0.09001940488815308, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 54980 + }, + { + "epoch": 0.21257596140464813, + "grad_norm": 0.12041858583688736, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 54990 + }, + { + "epoch": 0.2126146186080314, + "grad_norm": 0.1027616411447525, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 55000 + }, + { + "epoch": 0.2126532758114147, + "grad_norm": 0.10954099148511887, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 55010 + }, + { + "epoch": 0.21269193301479797, + "grad_norm": 0.1290101706981659, + "learning_rate": 0.002, + "loss": 2.345, + "step": 55020 + }, + { + "epoch": 0.21273059021818125, + "grad_norm": 0.09908808767795563, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 55030 + }, + { + "epoch": 0.21276924742156453, + "grad_norm": 0.1017303317785263, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 55040 + }, + { + "epoch": 0.2128079046249478, + "grad_norm": 0.11294441670179367, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 55050 + }, + { + "epoch": 0.2128465618283311, + "grad_norm": 0.10736481845378876, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 55060 + }, + { + "epoch": 0.21288521903171437, + "grad_norm": 0.09974376112222672, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 55070 + }, + { + "epoch": 0.21292387623509765, + "grad_norm": 0.12165064364671707, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 55080 + }, + { + "epoch": 0.21296253343848093, + "grad_norm": 0.10961667448282242, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 55090 + }, + { + "epoch": 0.2130011906418642, + "grad_norm": 0.11452201008796692, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 55100 + }, + { + "epoch": 0.2130398478452475, + "grad_norm": 0.10695372521877289, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 55110 + }, + { + "epoch": 0.21307850504863077, + "grad_norm": 0.09367763251066208, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 55120 + }, + { + "epoch": 0.21311716225201405, + "grad_norm": 0.09776943176984787, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 55130 + }, + { + "epoch": 0.21315581945539733, + "grad_norm": 0.124178446829319, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 55140 + }, + { + "epoch": 0.2131944766587806, + "grad_norm": 0.10170625895261765, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 55150 + }, + { + "epoch": 0.2132331338621639, + "grad_norm": 0.10083147138357162, + "learning_rate": 0.002, + "loss": 2.368, + "step": 55160 + }, + { + "epoch": 0.21327179106554717, + "grad_norm": 0.11676553636789322, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 55170 + }, + { + "epoch": 0.21331044826893042, + "grad_norm": 0.10173624008893967, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 55180 + }, + { + "epoch": 0.2133491054723137, + "grad_norm": 0.1213875263929367, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 55190 + }, + { + "epoch": 0.21338776267569698, + "grad_norm": 0.11385396867990494, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 55200 + }, + { + "epoch": 0.21342641987908026, + "grad_norm": 0.11575232446193695, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 55210 + }, + { + "epoch": 0.21346507708246354, + "grad_norm": 0.10983169823884964, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 55220 + }, + { + "epoch": 0.21350373428584682, + "grad_norm": 0.10628439486026764, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 55230 + }, + { + "epoch": 0.2135423914892301, + "grad_norm": 0.11618947982788086, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 55240 + }, + { + "epoch": 0.21358104869261338, + "grad_norm": 0.09852719306945801, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 55250 + }, + { + "epoch": 0.21361970589599666, + "grad_norm": 0.12594835460186005, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 55260 + }, + { + "epoch": 0.21365836309937994, + "grad_norm": 0.10377329587936401, + "learning_rate": 0.002, + "loss": 2.359, + "step": 55270 + }, + { + "epoch": 0.21369702030276322, + "grad_norm": 0.12225122004747391, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 55280 + }, + { + "epoch": 0.2137356775061465, + "grad_norm": 0.09943044930696487, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 55290 + }, + { + "epoch": 0.21377433470952978, + "grad_norm": 0.1107863187789917, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 55300 + }, + { + "epoch": 0.21381299191291306, + "grad_norm": 0.11024272441864014, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 55310 + }, + { + "epoch": 0.21385164911629634, + "grad_norm": 0.15556177496910095, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 55320 + }, + { + "epoch": 0.21389030631967962, + "grad_norm": 0.11116694658994675, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 55330 + }, + { + "epoch": 0.2139289635230629, + "grad_norm": 0.09645438939332962, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 55340 + }, + { + "epoch": 0.21396762072644618, + "grad_norm": 0.12381944805383682, + "learning_rate": 0.002, + "loss": 2.366, + "step": 55350 + }, + { + "epoch": 0.21400627792982946, + "grad_norm": 0.11018754541873932, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 55360 + }, + { + "epoch": 0.2140449351332127, + "grad_norm": 0.1027529314160347, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 55370 + }, + { + "epoch": 0.214083592336596, + "grad_norm": 0.09542679786682129, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 55380 + }, + { + "epoch": 0.21412224953997927, + "grad_norm": 0.12536196410655975, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 55390 + }, + { + "epoch": 0.21416090674336255, + "grad_norm": 0.11031936854124069, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 55400 + }, + { + "epoch": 0.21419956394674583, + "grad_norm": 0.12041087448596954, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 55410 + }, + { + "epoch": 0.2142382211501291, + "grad_norm": 0.11016488820314407, + "learning_rate": 0.002, + "loss": 2.357, + "step": 55420 + }, + { + "epoch": 0.2142768783535124, + "grad_norm": 0.13809560239315033, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 55430 + }, + { + "epoch": 0.21431553555689567, + "grad_norm": 0.09298276156187057, + "learning_rate": 0.002, + "loss": 2.351, + "step": 55440 + }, + { + "epoch": 0.21435419276027895, + "grad_norm": 0.11793974041938782, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 55450 + }, + { + "epoch": 0.21439284996366223, + "grad_norm": 0.10310234129428864, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 55460 + }, + { + "epoch": 0.2144315071670455, + "grad_norm": 0.11061696708202362, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 55470 + }, + { + "epoch": 0.2144701643704288, + "grad_norm": 0.11232610791921616, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 55480 + }, + { + "epoch": 0.21450882157381207, + "grad_norm": 0.10409107804298401, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 55490 + }, + { + "epoch": 0.21454747877719535, + "grad_norm": 0.1074514091014862, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 55500 + }, + { + "epoch": 0.21458613598057863, + "grad_norm": 0.12128176540136337, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 55510 + }, + { + "epoch": 0.2146247931839619, + "grad_norm": 0.09980496019124985, + "learning_rate": 0.002, + "loss": 2.362, + "step": 55520 + }, + { + "epoch": 0.2146634503873452, + "grad_norm": 0.10034727305173874, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 55530 + }, + { + "epoch": 0.21470210759072847, + "grad_norm": 0.10929981619119644, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 55540 + }, + { + "epoch": 0.21474076479411172, + "grad_norm": 0.10616452991962433, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 55550 + }, + { + "epoch": 0.214779421997495, + "grad_norm": 0.10621832311153412, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 55560 + }, + { + "epoch": 0.21481807920087828, + "grad_norm": 0.09339945763349533, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 55570 + }, + { + "epoch": 0.21485673640426156, + "grad_norm": 0.12555859982967377, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 55580 + }, + { + "epoch": 0.21489539360764484, + "grad_norm": 0.09851313382387161, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 55590 + }, + { + "epoch": 0.21493405081102812, + "grad_norm": 0.10451821982860565, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 55600 + }, + { + "epoch": 0.2149727080144114, + "grad_norm": 0.09720192104578018, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 55610 + }, + { + "epoch": 0.21501136521779468, + "grad_norm": 0.09352467209100723, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 55620 + }, + { + "epoch": 0.21505002242117796, + "grad_norm": 0.10066934674978256, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 55630 + }, + { + "epoch": 0.21508867962456124, + "grad_norm": 0.13200034201145172, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 55640 + }, + { + "epoch": 0.21512733682794452, + "grad_norm": 0.10262254625558853, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 55650 + }, + { + "epoch": 0.2151659940313278, + "grad_norm": 0.11579084396362305, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 55660 + }, + { + "epoch": 0.21520465123471108, + "grad_norm": 0.10719157755374908, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 55670 + }, + { + "epoch": 0.21524330843809436, + "grad_norm": 0.12055464088916779, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 55680 + }, + { + "epoch": 0.21528196564147764, + "grad_norm": 0.1007857471704483, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 55690 + }, + { + "epoch": 0.21532062284486092, + "grad_norm": 0.10116320103406906, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 55700 + }, + { + "epoch": 0.2153592800482442, + "grad_norm": 0.10648627579212189, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 55710 + }, + { + "epoch": 0.21539793725162748, + "grad_norm": 0.09784390777349472, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 55720 + }, + { + "epoch": 0.21543659445501076, + "grad_norm": 0.1030447855591774, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 55730 + }, + { + "epoch": 0.215475251658394, + "grad_norm": 0.09437708556652069, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 55740 + }, + { + "epoch": 0.2155139088617773, + "grad_norm": 0.12527985870838165, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 55750 + }, + { + "epoch": 0.21555256606516057, + "grad_norm": 0.0960555300116539, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 55760 + }, + { + "epoch": 0.21559122326854385, + "grad_norm": 0.12125767767429352, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 55770 + }, + { + "epoch": 0.21562988047192713, + "grad_norm": 0.10940881818532944, + "learning_rate": 0.002, + "loss": 2.339, + "step": 55780 + }, + { + "epoch": 0.2156685376753104, + "grad_norm": 0.11511075496673584, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 55790 + }, + { + "epoch": 0.2157071948786937, + "grad_norm": 0.1234438568353653, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 55800 + }, + { + "epoch": 0.21574585208207697, + "grad_norm": 0.0952814370393753, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 55810 + }, + { + "epoch": 0.21578450928546025, + "grad_norm": 0.10598743706941605, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 55820 + }, + { + "epoch": 0.21582316648884353, + "grad_norm": 0.11886392533779144, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 55830 + }, + { + "epoch": 0.2158618236922268, + "grad_norm": 0.09600363671779633, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 55840 + }, + { + "epoch": 0.2159004808956101, + "grad_norm": 0.12411284446716309, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 55850 + }, + { + "epoch": 0.21593913809899337, + "grad_norm": 0.11406931281089783, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 55860 + }, + { + "epoch": 0.21597779530237665, + "grad_norm": 0.09808206558227539, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 55870 + }, + { + "epoch": 0.21601645250575993, + "grad_norm": 0.09981367737054825, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 55880 + }, + { + "epoch": 0.2160551097091432, + "grad_norm": 0.11539043486118317, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 55890 + }, + { + "epoch": 0.2160937669125265, + "grad_norm": 0.09959662705659866, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 55900 + }, + { + "epoch": 0.21613242411590977, + "grad_norm": 0.10524085909128189, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 55910 + }, + { + "epoch": 0.21617108131929302, + "grad_norm": 0.10049695521593094, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 55920 + }, + { + "epoch": 0.2162097385226763, + "grad_norm": 0.11792772263288498, + "learning_rate": 0.002, + "loss": 2.354, + "step": 55930 + }, + { + "epoch": 0.21624839572605958, + "grad_norm": 0.1288314014673233, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 55940 + }, + { + "epoch": 0.21628705292944286, + "grad_norm": 0.11100339889526367, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 55950 + }, + { + "epoch": 0.21632571013282614, + "grad_norm": 0.10390551388263702, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 55960 + }, + { + "epoch": 0.21636436733620942, + "grad_norm": 0.10269264876842499, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 55970 + }, + { + "epoch": 0.2164030245395927, + "grad_norm": 0.1092856377363205, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 55980 + }, + { + "epoch": 0.21644168174297598, + "grad_norm": 0.1061163917183876, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 55990 + }, + { + "epoch": 0.21648033894635926, + "grad_norm": 0.09067103266716003, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 56000 + }, + { + "epoch": 0.21651899614974254, + "grad_norm": 0.1044374480843544, + "learning_rate": 0.002, + "loss": 2.372, + "step": 56010 + }, + { + "epoch": 0.21655765335312582, + "grad_norm": 0.1103706806898117, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 56020 + }, + { + "epoch": 0.2165963105565091, + "grad_norm": 0.09729396551847458, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 56030 + }, + { + "epoch": 0.21663496775989238, + "grad_norm": 0.11313861608505249, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 56040 + }, + { + "epoch": 0.21667362496327566, + "grad_norm": 0.09405819326639175, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 56050 + }, + { + "epoch": 0.21671228216665894, + "grad_norm": 0.11277754604816437, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 56060 + }, + { + "epoch": 0.21675093937004222, + "grad_norm": 0.10887305438518524, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 56070 + }, + { + "epoch": 0.2167895965734255, + "grad_norm": 0.08707216382026672, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 56080 + }, + { + "epoch": 0.21682825377680878, + "grad_norm": 0.10491017252206802, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 56090 + }, + { + "epoch": 0.21686691098019206, + "grad_norm": 0.11531982570886612, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 56100 + }, + { + "epoch": 0.21690556818357531, + "grad_norm": 0.09856010228395462, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 56110 + }, + { + "epoch": 0.2169442253869586, + "grad_norm": 0.11213725060224533, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 56120 + }, + { + "epoch": 0.21698288259034187, + "grad_norm": 0.11464966833591461, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 56130 + }, + { + "epoch": 0.21702153979372515, + "grad_norm": 0.12422860413789749, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 56140 + }, + { + "epoch": 0.21706019699710843, + "grad_norm": 0.10502666980028152, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 56150 + }, + { + "epoch": 0.21709885420049171, + "grad_norm": 0.0971233919262886, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 56160 + }, + { + "epoch": 0.217137511403875, + "grad_norm": 0.1192779615521431, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 56170 + }, + { + "epoch": 0.21717616860725827, + "grad_norm": 0.10713006556034088, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 56180 + }, + { + "epoch": 0.21721482581064155, + "grad_norm": 0.10372406989336014, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 56190 + }, + { + "epoch": 0.21725348301402483, + "grad_norm": 0.11853165179491043, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 56200 + }, + { + "epoch": 0.21729214021740811, + "grad_norm": 0.09557045996189117, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 56210 + }, + { + "epoch": 0.2173307974207914, + "grad_norm": 0.12145307660102844, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 56220 + }, + { + "epoch": 0.21736945462417467, + "grad_norm": 0.10478398203849792, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 56230 + }, + { + "epoch": 0.21740811182755795, + "grad_norm": 0.11328970640897751, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 56240 + }, + { + "epoch": 0.21744676903094123, + "grad_norm": 0.10635059326887131, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 56250 + }, + { + "epoch": 0.2174854262343245, + "grad_norm": 0.1067148745059967, + "learning_rate": 0.002, + "loss": 2.367, + "step": 56260 + }, + { + "epoch": 0.2175240834377078, + "grad_norm": 0.11332537978887558, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 56270 + }, + { + "epoch": 0.21756274064109107, + "grad_norm": 0.11176859587430954, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 56280 + }, + { + "epoch": 0.21760139784447433, + "grad_norm": 0.13134358823299408, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 56290 + }, + { + "epoch": 0.2176400550478576, + "grad_norm": 0.1118798553943634, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 56300 + }, + { + "epoch": 0.21767871225124089, + "grad_norm": 0.1096770390868187, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 56310 + }, + { + "epoch": 0.21771736945462417, + "grad_norm": 0.10314035415649414, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 56320 + }, + { + "epoch": 0.21775602665800745, + "grad_norm": 0.11302123218774796, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 56330 + }, + { + "epoch": 0.21779468386139073, + "grad_norm": 0.10143207758665085, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 56340 + }, + { + "epoch": 0.217833341064774, + "grad_norm": 0.13196396827697754, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 56350 + }, + { + "epoch": 0.21787199826815729, + "grad_norm": 0.12415259331464767, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 56360 + }, + { + "epoch": 0.21791065547154057, + "grad_norm": 0.10905799269676208, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 56370 + }, + { + "epoch": 0.21794931267492385, + "grad_norm": 0.0945185050368309, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 56380 + }, + { + "epoch": 0.21798796987830713, + "grad_norm": 0.12273037433624268, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 56390 + }, + { + "epoch": 0.2180266270816904, + "grad_norm": 0.1234779804944992, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 56400 + }, + { + "epoch": 0.21806528428507369, + "grad_norm": 0.10489704459905624, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 56410 + }, + { + "epoch": 0.21810394148845696, + "grad_norm": 0.1268286556005478, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 56420 + }, + { + "epoch": 0.21814259869184024, + "grad_norm": 0.09708143025636673, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 56430 + }, + { + "epoch": 0.21818125589522352, + "grad_norm": 0.10344573110342026, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 56440 + }, + { + "epoch": 0.2182199130986068, + "grad_norm": 0.11367353051900864, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 56450 + }, + { + "epoch": 0.21825857030199008, + "grad_norm": 0.11198901385068893, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 56460 + }, + { + "epoch": 0.21829722750537336, + "grad_norm": 0.11139329522848129, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 56470 + }, + { + "epoch": 0.21833588470875662, + "grad_norm": 0.2934470772743225, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 56480 + }, + { + "epoch": 0.2183745419121399, + "grad_norm": 0.09208117425441742, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 56490 + }, + { + "epoch": 0.21841319911552318, + "grad_norm": 0.09002061933279037, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 56500 + }, + { + "epoch": 0.21845185631890646, + "grad_norm": 0.10619490593671799, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 56510 + }, + { + "epoch": 0.21849051352228974, + "grad_norm": 0.1191694512963295, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 56520 + }, + { + "epoch": 0.21852917072567302, + "grad_norm": 0.09826357662677765, + "learning_rate": 0.002, + "loss": 2.372, + "step": 56530 + }, + { + "epoch": 0.2185678279290563, + "grad_norm": 0.10082120448350906, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 56540 + }, + { + "epoch": 0.21860648513243958, + "grad_norm": 0.1363697350025177, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 56550 + }, + { + "epoch": 0.21864514233582286, + "grad_norm": 0.11739884316921234, + "learning_rate": 0.002, + "loss": 2.375, + "step": 56560 + }, + { + "epoch": 0.21868379953920614, + "grad_norm": 0.09619259089231491, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 56570 + }, + { + "epoch": 0.21872245674258942, + "grad_norm": 0.10125040262937546, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 56580 + }, + { + "epoch": 0.2187611139459727, + "grad_norm": 0.6269662976264954, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 56590 + }, + { + "epoch": 0.21879977114935598, + "grad_norm": 0.14983363449573517, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 56600 + }, + { + "epoch": 0.21883842835273926, + "grad_norm": 0.1266162395477295, + "learning_rate": 0.002, + "loss": 2.35, + "step": 56610 + }, + { + "epoch": 0.21887708555612254, + "grad_norm": 0.10967439413070679, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 56620 + }, + { + "epoch": 0.21891574275950582, + "grad_norm": 0.09150734543800354, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 56630 + }, + { + "epoch": 0.2189543999628891, + "grad_norm": 0.2657480239868164, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 56640 + }, + { + "epoch": 0.21899305716627238, + "grad_norm": 0.0987754538655281, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 56650 + }, + { + "epoch": 0.21903171436965563, + "grad_norm": 0.09909951686859131, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 56660 + }, + { + "epoch": 0.2190703715730389, + "grad_norm": 0.09344511479139328, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 56670 + }, + { + "epoch": 0.2191090287764222, + "grad_norm": 0.0909435972571373, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 56680 + }, + { + "epoch": 0.21914768597980547, + "grad_norm": 0.11159311980009079, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 56690 + }, + { + "epoch": 0.21918634318318875, + "grad_norm": 0.09787567704916, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 56700 + }, + { + "epoch": 0.21922500038657203, + "grad_norm": 0.09632629156112671, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 56710 + }, + { + "epoch": 0.2192636575899553, + "grad_norm": 0.10457716882228851, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 56720 + }, + { + "epoch": 0.2193023147933386, + "grad_norm": 0.12746752798557281, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 56730 + }, + { + "epoch": 0.21934097199672187, + "grad_norm": 0.10214285552501678, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 56740 + }, + { + "epoch": 0.21937962920010515, + "grad_norm": 0.12154365330934525, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 56750 + }, + { + "epoch": 0.21941828640348843, + "grad_norm": 0.11975551396608353, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 56760 + }, + { + "epoch": 0.2194569436068717, + "grad_norm": 0.10890713334083557, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 56770 + }, + { + "epoch": 0.219495600810255, + "grad_norm": 0.11588835716247559, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 56780 + }, + { + "epoch": 0.21953425801363827, + "grad_norm": 0.09341085702180862, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 56790 + }, + { + "epoch": 0.21957291521702155, + "grad_norm": 0.10048364102840424, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 56800 + }, + { + "epoch": 0.21961157242040483, + "grad_norm": 0.11687114834785461, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 56810 + }, + { + "epoch": 0.2196502296237881, + "grad_norm": 0.10780845582485199, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 56820 + }, + { + "epoch": 0.2196888868271714, + "grad_norm": 0.10212460160255432, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 56830 + }, + { + "epoch": 0.21972754403055467, + "grad_norm": 0.1121063157916069, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 56840 + }, + { + "epoch": 0.21976620123393792, + "grad_norm": 0.10919679701328278, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 56850 + }, + { + "epoch": 0.2198048584373212, + "grad_norm": 0.10835408419370651, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 56860 + }, + { + "epoch": 0.21984351564070448, + "grad_norm": 0.10051613301038742, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 56870 + }, + { + "epoch": 0.21988217284408776, + "grad_norm": 0.119876429438591, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 56880 + }, + { + "epoch": 0.21992083004747104, + "grad_norm": 0.1138211339712143, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 56890 + }, + { + "epoch": 0.21995948725085432, + "grad_norm": 0.10223833471536636, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 56900 + }, + { + "epoch": 0.2199981444542376, + "grad_norm": 0.10718075186014175, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 56910 + }, + { + "epoch": 0.22003680165762088, + "grad_norm": 0.11498738080263138, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 56920 + }, + { + "epoch": 0.22007545886100416, + "grad_norm": 0.11930633336305618, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 56930 + }, + { + "epoch": 0.22011411606438744, + "grad_norm": 0.11556489765644073, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 56940 + }, + { + "epoch": 0.22015277326777072, + "grad_norm": 0.10762212425470352, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 56950 + }, + { + "epoch": 0.220191430471154, + "grad_norm": 0.09085428714752197, + "learning_rate": 0.002, + "loss": 2.37, + "step": 56960 + }, + { + "epoch": 0.22023008767453728, + "grad_norm": 0.10463374853134155, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 56970 + }, + { + "epoch": 0.22026874487792056, + "grad_norm": 0.10341199487447739, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 56980 + }, + { + "epoch": 0.22030740208130384, + "grad_norm": 0.10309240221977234, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 56990 + }, + { + "epoch": 0.22034605928468712, + "grad_norm": 0.13077403604984283, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 57000 + }, + { + "epoch": 0.2203847164880704, + "grad_norm": 0.09969010204076767, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 57010 + }, + { + "epoch": 0.22042337369145368, + "grad_norm": 0.18511107563972473, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 57020 + }, + { + "epoch": 0.22046203089483693, + "grad_norm": 0.111596018075943, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 57030 + }, + { + "epoch": 0.2205006880982202, + "grad_norm": 0.10208901017904282, + "learning_rate": 0.002, + "loss": 2.3734, + "step": 57040 + }, + { + "epoch": 0.2205393453016035, + "grad_norm": 0.09552479535341263, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 57050 + }, + { + "epoch": 0.22057800250498677, + "grad_norm": 0.10220164060592651, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 57060 + }, + { + "epoch": 0.22061665970837005, + "grad_norm": 0.09908819943666458, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 57070 + }, + { + "epoch": 0.22065531691175333, + "grad_norm": 0.10516798496246338, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 57080 + }, + { + "epoch": 0.2206939741151366, + "grad_norm": 0.11086596548557281, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 57090 + }, + { + "epoch": 0.2207326313185199, + "grad_norm": 0.09934603422880173, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 57100 + }, + { + "epoch": 0.22077128852190317, + "grad_norm": 0.10181740671396255, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 57110 + }, + { + "epoch": 0.22080994572528645, + "grad_norm": 0.11644434183835983, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 57120 + }, + { + "epoch": 0.22084860292866973, + "grad_norm": 0.12942452728748322, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 57130 + }, + { + "epoch": 0.220887260132053, + "grad_norm": 0.11943166702985764, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 57140 + }, + { + "epoch": 0.2209259173354363, + "grad_norm": 0.13018329441547394, + "learning_rate": 0.002, + "loss": 2.371, + "step": 57150 + }, + { + "epoch": 0.22096457453881957, + "grad_norm": 0.1093996912240982, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 57160 + }, + { + "epoch": 0.22100323174220285, + "grad_norm": 0.10882691293954849, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 57170 + }, + { + "epoch": 0.22104188894558613, + "grad_norm": 0.11444193124771118, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 57180 + }, + { + "epoch": 0.2210805461489694, + "grad_norm": 0.12251651287078857, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 57190 + }, + { + "epoch": 0.2211192033523527, + "grad_norm": 0.11783339083194733, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 57200 + }, + { + "epoch": 0.22115786055573597, + "grad_norm": 0.09975133091211319, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 57210 + }, + { + "epoch": 0.22119651775911922, + "grad_norm": 0.12574127316474915, + "learning_rate": 0.002, + "loss": 2.3777, + "step": 57220 + }, + { + "epoch": 0.2212351749625025, + "grad_norm": 0.10755207389593124, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 57230 + }, + { + "epoch": 0.22127383216588578, + "grad_norm": 0.13826222717761993, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 57240 + }, + { + "epoch": 0.22131248936926906, + "grad_norm": 0.11009145528078079, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 57250 + }, + { + "epoch": 0.22135114657265234, + "grad_norm": 0.10710586607456207, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 57260 + }, + { + "epoch": 0.22138980377603562, + "grad_norm": 0.11547312140464783, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 57270 + }, + { + "epoch": 0.2214284609794189, + "grad_norm": 0.10780449956655502, + "learning_rate": 0.002, + "loss": 2.356, + "step": 57280 + }, + { + "epoch": 0.22146711818280218, + "grad_norm": 0.11152885854244232, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 57290 + }, + { + "epoch": 0.22150577538618546, + "grad_norm": 0.10193509608507156, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 57300 + }, + { + "epoch": 0.22154443258956874, + "grad_norm": 0.12414531409740448, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 57310 + }, + { + "epoch": 0.22158308979295202, + "grad_norm": 0.1089351624250412, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 57320 + }, + { + "epoch": 0.2216217469963353, + "grad_norm": 0.11467059701681137, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 57330 + }, + { + "epoch": 0.22166040419971858, + "grad_norm": 0.12027007341384888, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 57340 + }, + { + "epoch": 0.22169906140310186, + "grad_norm": 0.11830438673496246, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 57350 + }, + { + "epoch": 0.22173771860648514, + "grad_norm": 0.11438164114952087, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 57360 + }, + { + "epoch": 0.22177637580986842, + "grad_norm": 0.09567909687757492, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 57370 + }, + { + "epoch": 0.2218150330132517, + "grad_norm": 0.13299918174743652, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 57380 + }, + { + "epoch": 0.22185369021663498, + "grad_norm": 0.10379625111818314, + "learning_rate": 0.002, + "loss": 2.351, + "step": 57390 + }, + { + "epoch": 0.22189234742001826, + "grad_norm": 0.11183536052703857, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 57400 + }, + { + "epoch": 0.2219310046234015, + "grad_norm": 0.1073332354426384, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 57410 + }, + { + "epoch": 0.2219696618267848, + "grad_norm": 0.13551676273345947, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 57420 + }, + { + "epoch": 0.22200831903016807, + "grad_norm": 0.10104433447122574, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 57430 + }, + { + "epoch": 0.22204697623355135, + "grad_norm": 0.09403500705957413, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 57440 + }, + { + "epoch": 0.22208563343693463, + "grad_norm": 0.11605286598205566, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 57450 + }, + { + "epoch": 0.2221242906403179, + "grad_norm": 0.11487416177988052, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 57460 + }, + { + "epoch": 0.2221629478437012, + "grad_norm": 0.0884837657213211, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 57470 + }, + { + "epoch": 0.22220160504708447, + "grad_norm": 0.11717015504837036, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 57480 + }, + { + "epoch": 0.22224026225046775, + "grad_norm": 0.10914857685565948, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 57490 + }, + { + "epoch": 0.22227891945385103, + "grad_norm": 0.10230378806591034, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 57500 + }, + { + "epoch": 0.2223175766572343, + "grad_norm": 0.13150063157081604, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 57510 + }, + { + "epoch": 0.2223562338606176, + "grad_norm": 0.1172526627779007, + "learning_rate": 0.002, + "loss": 2.343, + "step": 57520 + }, + { + "epoch": 0.22239489106400087, + "grad_norm": 0.09546870738267899, + "learning_rate": 0.002, + "loss": 2.348, + "step": 57530 + }, + { + "epoch": 0.22243354826738415, + "grad_norm": 0.10355856269598007, + "learning_rate": 0.002, + "loss": 2.352, + "step": 57540 + }, + { + "epoch": 0.22247220547076743, + "grad_norm": 0.09346092492341995, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 57550 + }, + { + "epoch": 0.2225108626741507, + "grad_norm": 0.1043282151222229, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 57560 + }, + { + "epoch": 0.222549519877534, + "grad_norm": 0.10177697986364365, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 57570 + }, + { + "epoch": 0.22258817708091727, + "grad_norm": 0.1164051815867424, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 57580 + }, + { + "epoch": 0.22262683428430052, + "grad_norm": 0.0973474457859993, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 57590 + }, + { + "epoch": 0.2226654914876838, + "grad_norm": 0.0994463786482811, + "learning_rate": 0.002, + "loss": 2.358, + "step": 57600 + }, + { + "epoch": 0.22270414869106708, + "grad_norm": 0.11317646503448486, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 57610 + }, + { + "epoch": 0.22274280589445036, + "grad_norm": 0.10045770555734634, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 57620 + }, + { + "epoch": 0.22278146309783364, + "grad_norm": 0.11160579323768616, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 57630 + }, + { + "epoch": 0.22282012030121692, + "grad_norm": 0.11086764931678772, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 57640 + }, + { + "epoch": 0.2228587775046002, + "grad_norm": 0.1002127155661583, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 57650 + }, + { + "epoch": 0.22289743470798348, + "grad_norm": 0.11504166573286057, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 57660 + }, + { + "epoch": 0.22293609191136676, + "grad_norm": 0.10598036646842957, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 57670 + }, + { + "epoch": 0.22297474911475004, + "grad_norm": 0.11816691607236862, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 57680 + }, + { + "epoch": 0.22301340631813332, + "grad_norm": 0.10995496809482574, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 57690 + }, + { + "epoch": 0.2230520635215166, + "grad_norm": 0.10727677494287491, + "learning_rate": 0.002, + "loss": 2.346, + "step": 57700 + }, + { + "epoch": 0.22309072072489988, + "grad_norm": 0.12346167117357254, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 57710 + }, + { + "epoch": 0.22312937792828316, + "grad_norm": 0.11119364947080612, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 57720 + }, + { + "epoch": 0.22316803513166644, + "grad_norm": 0.10107563436031342, + "learning_rate": 0.002, + "loss": 2.359, + "step": 57730 + }, + { + "epoch": 0.22320669233504972, + "grad_norm": 0.11088485270738602, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 57740 + }, + { + "epoch": 0.223245349538433, + "grad_norm": 0.09493140131235123, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 57750 + }, + { + "epoch": 0.22328400674181628, + "grad_norm": 0.09776858985424042, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 57760 + }, + { + "epoch": 0.22332266394519956, + "grad_norm": 0.1105109453201294, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 57770 + }, + { + "epoch": 0.22336132114858281, + "grad_norm": 0.09360536187887192, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 57780 + }, + { + "epoch": 0.2233999783519661, + "grad_norm": 0.15051110088825226, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 57790 + }, + { + "epoch": 0.22343863555534937, + "grad_norm": 0.10816194117069244, + "learning_rate": 0.002, + "loss": 2.3824, + "step": 57800 + }, + { + "epoch": 0.22347729275873265, + "grad_norm": 0.11494158208370209, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 57810 + }, + { + "epoch": 0.22351594996211593, + "grad_norm": 0.11450178176164627, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 57820 + }, + { + "epoch": 0.22355460716549921, + "grad_norm": 0.10050127655267715, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 57830 + }, + { + "epoch": 0.2235932643688825, + "grad_norm": 0.09223105013370514, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 57840 + }, + { + "epoch": 0.22363192157226577, + "grad_norm": 0.09952362626791, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 57850 + }, + { + "epoch": 0.22367057877564905, + "grad_norm": 0.11158590018749237, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 57860 + }, + { + "epoch": 0.22370923597903233, + "grad_norm": 0.24610917270183563, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 57870 + }, + { + "epoch": 0.2237478931824156, + "grad_norm": 0.10274453461170197, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 57880 + }, + { + "epoch": 0.2237865503857989, + "grad_norm": 0.1181669682264328, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 57890 + }, + { + "epoch": 0.22382520758918217, + "grad_norm": 0.11647023260593414, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 57900 + }, + { + "epoch": 0.22386386479256545, + "grad_norm": 0.10563940554857254, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 57910 + }, + { + "epoch": 0.22390252199594873, + "grad_norm": 0.13255992531776428, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 57920 + }, + { + "epoch": 0.223941179199332, + "grad_norm": 0.11975933611392975, + "learning_rate": 0.002, + "loss": 2.351, + "step": 57930 + }, + { + "epoch": 0.2239798364027153, + "grad_norm": 0.10210419446229935, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 57940 + }, + { + "epoch": 0.22401849360609857, + "grad_norm": 0.10627969354391098, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 57950 + }, + { + "epoch": 0.22405715080948183, + "grad_norm": 0.14683599770069122, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 57960 + }, + { + "epoch": 0.2240958080128651, + "grad_norm": 0.1016928106546402, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 57970 + }, + { + "epoch": 0.22413446521624839, + "grad_norm": 0.11856039613485336, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 57980 + }, + { + "epoch": 0.22417312241963167, + "grad_norm": 0.10540778189897537, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 57990 + }, + { + "epoch": 0.22421177962301495, + "grad_norm": 0.10758113116025925, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 58000 + }, + { + "epoch": 0.22425043682639823, + "grad_norm": 0.11392021924257278, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 58010 + }, + { + "epoch": 0.2242890940297815, + "grad_norm": 0.10420487821102142, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 58020 + }, + { + "epoch": 0.22432775123316478, + "grad_norm": 0.12214615195989609, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 58030 + }, + { + "epoch": 0.22436640843654806, + "grad_norm": 0.12638214230537415, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 58040 + }, + { + "epoch": 0.22440506563993134, + "grad_norm": 0.10415442287921906, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 58050 + }, + { + "epoch": 0.22444372284331462, + "grad_norm": 0.10841263085603714, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 58060 + }, + { + "epoch": 0.2244823800466979, + "grad_norm": 0.10981647670269012, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 58070 + }, + { + "epoch": 0.22452103725008118, + "grad_norm": 0.1304844319820404, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 58080 + }, + { + "epoch": 0.22455969445346446, + "grad_norm": 0.10160935670137405, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 58090 + }, + { + "epoch": 0.22459835165684774, + "grad_norm": 0.11425960063934326, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 58100 + }, + { + "epoch": 0.22463700886023102, + "grad_norm": 0.1108141615986824, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 58110 + }, + { + "epoch": 0.2246756660636143, + "grad_norm": 0.0970022901892662, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 58120 + }, + { + "epoch": 0.22471432326699758, + "grad_norm": 0.11046090722084045, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 58130 + }, + { + "epoch": 0.22475298047038086, + "grad_norm": 0.11191964149475098, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 58140 + }, + { + "epoch": 0.22479163767376412, + "grad_norm": 0.11110240966081619, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 58150 + }, + { + "epoch": 0.2248302948771474, + "grad_norm": 0.12536807358264923, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 58160 + }, + { + "epoch": 0.22486895208053068, + "grad_norm": 0.1163516640663147, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 58170 + }, + { + "epoch": 0.22490760928391396, + "grad_norm": 0.11710215359926224, + "learning_rate": 0.002, + "loss": 2.371, + "step": 58180 + }, + { + "epoch": 0.22494626648729724, + "grad_norm": 0.0901746153831482, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 58190 + }, + { + "epoch": 0.22498492369068052, + "grad_norm": 0.12406687438488007, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 58200 + }, + { + "epoch": 0.2250235808940638, + "grad_norm": 0.1148986741900444, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 58210 + }, + { + "epoch": 0.22506223809744708, + "grad_norm": 0.11530084162950516, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 58220 + }, + { + "epoch": 0.22510089530083036, + "grad_norm": 0.10330082476139069, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 58230 + }, + { + "epoch": 0.22513955250421364, + "grad_norm": 0.10616800934076309, + "learning_rate": 0.002, + "loss": 2.3818, + "step": 58240 + }, + { + "epoch": 0.22517820970759692, + "grad_norm": 0.11210612207651138, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 58250 + }, + { + "epoch": 0.2252168669109802, + "grad_norm": 0.10534332692623138, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 58260 + }, + { + "epoch": 0.22525552411436348, + "grad_norm": 0.11809881031513214, + "learning_rate": 0.002, + "loss": 2.357, + "step": 58270 + }, + { + "epoch": 0.22529418131774676, + "grad_norm": 0.11005978286266327, + "learning_rate": 0.002, + "loss": 2.354, + "step": 58280 + }, + { + "epoch": 0.22533283852113004, + "grad_norm": 0.10193295776844025, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 58290 + }, + { + "epoch": 0.22537149572451332, + "grad_norm": 0.11820928752422333, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 58300 + }, + { + "epoch": 0.2254101529278966, + "grad_norm": 0.11508919298648834, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 58310 + }, + { + "epoch": 0.22544881013127988, + "grad_norm": 0.10235237330198288, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 58320 + }, + { + "epoch": 0.22548746733466313, + "grad_norm": 0.0997655987739563, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 58330 + }, + { + "epoch": 0.2255261245380464, + "grad_norm": 0.10590886324644089, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 58340 + }, + { + "epoch": 0.2255647817414297, + "grad_norm": 0.11804696917533875, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 58350 + }, + { + "epoch": 0.22560343894481297, + "grad_norm": 0.1085897833108902, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 58360 + }, + { + "epoch": 0.22564209614819625, + "grad_norm": 0.10946321487426758, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 58370 + }, + { + "epoch": 0.22568075335157953, + "grad_norm": 0.11689482629299164, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 58380 + }, + { + "epoch": 0.2257194105549628, + "grad_norm": 0.1097857654094696, + "learning_rate": 0.002, + "loss": 2.362, + "step": 58390 + }, + { + "epoch": 0.2257580677583461, + "grad_norm": 0.09730439633131027, + "learning_rate": 0.002, + "loss": 2.345, + "step": 58400 + }, + { + "epoch": 0.22579672496172937, + "grad_norm": 0.09880250692367554, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 58410 + }, + { + "epoch": 0.22583538216511265, + "grad_norm": 0.1136285811662674, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 58420 + }, + { + "epoch": 0.22587403936849593, + "grad_norm": 0.1159689649939537, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 58430 + }, + { + "epoch": 0.2259126965718792, + "grad_norm": 0.10962986201047897, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 58440 + }, + { + "epoch": 0.2259513537752625, + "grad_norm": 0.12517063319683075, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 58450 + }, + { + "epoch": 0.22599001097864577, + "grad_norm": 0.10460768640041351, + "learning_rate": 0.002, + "loss": 2.361, + "step": 58460 + }, + { + "epoch": 0.22602866818202905, + "grad_norm": 0.1050831526517868, + "learning_rate": 0.002, + "loss": 2.367, + "step": 58470 + }, + { + "epoch": 0.22606732538541233, + "grad_norm": 0.10631754994392395, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 58480 + }, + { + "epoch": 0.2261059825887956, + "grad_norm": 0.1022988110780716, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 58490 + }, + { + "epoch": 0.2261446397921789, + "grad_norm": 0.0885656401515007, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 58500 + }, + { + "epoch": 0.22618329699556217, + "grad_norm": 0.10334338992834091, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 58510 + }, + { + "epoch": 0.22622195419894542, + "grad_norm": 0.12360124289989471, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 58520 + }, + { + "epoch": 0.2262606114023287, + "grad_norm": 0.10091336816549301, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 58530 + }, + { + "epoch": 0.22629926860571198, + "grad_norm": 0.09060948342084885, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 58540 + }, + { + "epoch": 0.22633792580909526, + "grad_norm": 0.11909083276987076, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 58550 + }, + { + "epoch": 0.22637658301247854, + "grad_norm": 0.11162544786930084, + "learning_rate": 0.002, + "loss": 2.3817, + "step": 58560 + }, + { + "epoch": 0.22641524021586182, + "grad_norm": 0.11788932979106903, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 58570 + }, + { + "epoch": 0.2264538974192451, + "grad_norm": 0.1028372049331665, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 58580 + }, + { + "epoch": 0.22649255462262838, + "grad_norm": 0.09843498468399048, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 58590 + }, + { + "epoch": 0.22653121182601166, + "grad_norm": 0.1286923885345459, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 58600 + }, + { + "epoch": 0.22656986902939494, + "grad_norm": 0.09629490971565247, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 58610 + }, + { + "epoch": 0.22660852623277822, + "grad_norm": 0.11018511652946472, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 58620 + }, + { + "epoch": 0.2266471834361615, + "grad_norm": 0.09881836175918579, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 58630 + }, + { + "epoch": 0.22668584063954478, + "grad_norm": 0.1260494887828827, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 58640 + }, + { + "epoch": 0.22672449784292806, + "grad_norm": 0.11758597940206528, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 58650 + }, + { + "epoch": 0.22676315504631134, + "grad_norm": 0.11921512335538864, + "learning_rate": 0.002, + "loss": 2.385, + "step": 58660 + }, + { + "epoch": 0.22680181224969462, + "grad_norm": 0.1015913262963295, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 58670 + }, + { + "epoch": 0.2268404694530779, + "grad_norm": 0.14632849395275116, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 58680 + }, + { + "epoch": 0.22687912665646118, + "grad_norm": 0.1042994037270546, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 58690 + }, + { + "epoch": 0.22691778385984443, + "grad_norm": 0.10337947309017181, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 58700 + }, + { + "epoch": 0.2269564410632277, + "grad_norm": 0.12961353361606598, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 58710 + }, + { + "epoch": 0.226995098266611, + "grad_norm": 0.09461600333452225, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 58720 + }, + { + "epoch": 0.22703375546999427, + "grad_norm": 0.1150931864976883, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 58730 + }, + { + "epoch": 0.22707241267337755, + "grad_norm": 0.13251717388629913, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 58740 + }, + { + "epoch": 0.22711106987676083, + "grad_norm": 0.09482169896364212, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 58750 + }, + { + "epoch": 0.2271497270801441, + "grad_norm": 0.10086926817893982, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 58760 + }, + { + "epoch": 0.2271883842835274, + "grad_norm": 0.10259281098842621, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 58770 + }, + { + "epoch": 0.22722704148691067, + "grad_norm": 0.11963513493537903, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 58780 + }, + { + "epoch": 0.22726569869029395, + "grad_norm": 0.09868388622999191, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 58790 + }, + { + "epoch": 0.22730435589367723, + "grad_norm": 0.10000711679458618, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 58800 + }, + { + "epoch": 0.2273430130970605, + "grad_norm": 0.0994882881641388, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 58810 + }, + { + "epoch": 0.2273816703004438, + "grad_norm": 0.10000110417604446, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 58820 + }, + { + "epoch": 0.22742032750382707, + "grad_norm": 0.10472284257411957, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 58830 + }, + { + "epoch": 0.22745898470721035, + "grad_norm": 0.11229816824197769, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 58840 + }, + { + "epoch": 0.22749764191059363, + "grad_norm": 0.11290360987186432, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 58850 + }, + { + "epoch": 0.2275362991139769, + "grad_norm": 0.12206948548555374, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 58860 + }, + { + "epoch": 0.2275749563173602, + "grad_norm": 0.1078030988574028, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 58870 + }, + { + "epoch": 0.22761361352074347, + "grad_norm": 0.11390410363674164, + "learning_rate": 0.002, + "loss": 2.367, + "step": 58880 + }, + { + "epoch": 0.22765227072412672, + "grad_norm": 0.11171270161867142, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 58890 + }, + { + "epoch": 0.22769092792751, + "grad_norm": 0.12419595569372177, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 58900 + }, + { + "epoch": 0.22772958513089328, + "grad_norm": 0.08958700299263, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 58910 + }, + { + "epoch": 0.22776824233427656, + "grad_norm": 0.10190758109092712, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 58920 + }, + { + "epoch": 0.22780689953765984, + "grad_norm": 0.1246049553155899, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 58930 + }, + { + "epoch": 0.22784555674104312, + "grad_norm": 0.10058507323265076, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 58940 + }, + { + "epoch": 0.2278842139444264, + "grad_norm": 0.10306686908006668, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 58950 + }, + { + "epoch": 0.22792287114780968, + "grad_norm": 0.0994425043463707, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 58960 + }, + { + "epoch": 0.22796152835119296, + "grad_norm": 0.11332330852746964, + "learning_rate": 0.002, + "loss": 2.353, + "step": 58970 + }, + { + "epoch": 0.22800018555457624, + "grad_norm": 0.11806744337081909, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 58980 + }, + { + "epoch": 0.22803884275795952, + "grad_norm": 0.09620604664087296, + "learning_rate": 0.002, + "loss": 2.36, + "step": 58990 + }, + { + "epoch": 0.2280774999613428, + "grad_norm": 0.10248254984617233, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 59000 + }, + { + "epoch": 0.22811615716472608, + "grad_norm": 0.1298442929983139, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 59010 + }, + { + "epoch": 0.22815481436810936, + "grad_norm": 0.10102105140686035, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 59020 + }, + { + "epoch": 0.22819347157149264, + "grad_norm": 0.13595005869865417, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 59030 + }, + { + "epoch": 0.22823212877487592, + "grad_norm": 0.10934408009052277, + "learning_rate": 0.002, + "loss": 2.356, + "step": 59040 + }, + { + "epoch": 0.2282707859782592, + "grad_norm": 0.09380467981100082, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 59050 + }, + { + "epoch": 0.22830944318164248, + "grad_norm": 0.10101353377103806, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 59060 + }, + { + "epoch": 0.22834810038502576, + "grad_norm": 0.10406546294689178, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 59070 + }, + { + "epoch": 0.228386757588409, + "grad_norm": 0.1326468586921692, + "learning_rate": 0.002, + "loss": 2.369, + "step": 59080 + }, + { + "epoch": 0.2284254147917923, + "grad_norm": 0.108613021671772, + "learning_rate": 0.002, + "loss": 2.3761, + "step": 59090 + }, + { + "epoch": 0.22846407199517557, + "grad_norm": 0.10440162569284439, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 59100 + }, + { + "epoch": 0.22850272919855885, + "grad_norm": 0.11736375093460083, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 59110 + }, + { + "epoch": 0.22854138640194213, + "grad_norm": 0.10554061084985733, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 59120 + }, + { + "epoch": 0.2285800436053254, + "grad_norm": 0.0910368263721466, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 59130 + }, + { + "epoch": 0.2286187008087087, + "grad_norm": 0.09408886730670929, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 59140 + }, + { + "epoch": 0.22865735801209197, + "grad_norm": 0.11463332176208496, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 59150 + }, + { + "epoch": 0.22869601521547525, + "grad_norm": 0.11015474796295166, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 59160 + }, + { + "epoch": 0.22873467241885853, + "grad_norm": 0.11497203260660172, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 59170 + }, + { + "epoch": 0.2287733296222418, + "grad_norm": 0.10321945697069168, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 59180 + }, + { + "epoch": 0.2288119868256251, + "grad_norm": 0.11147177219390869, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 59190 + }, + { + "epoch": 0.22885064402900837, + "grad_norm": 0.11212391406297684, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 59200 + }, + { + "epoch": 0.22888930123239165, + "grad_norm": 0.10757148265838623, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 59210 + }, + { + "epoch": 0.22892795843577493, + "grad_norm": 0.11025886237621307, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 59220 + }, + { + "epoch": 0.2289666156391582, + "grad_norm": 0.10767524689435959, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 59230 + }, + { + "epoch": 0.2290052728425415, + "grad_norm": 0.1106267124414444, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 59240 + }, + { + "epoch": 0.22904393004592477, + "grad_norm": 0.11064332723617554, + "learning_rate": 0.002, + "loss": 2.35, + "step": 59250 + }, + { + "epoch": 0.22908258724930802, + "grad_norm": 0.10452796518802643, + "learning_rate": 0.002, + "loss": 2.379, + "step": 59260 + }, + { + "epoch": 0.2291212444526913, + "grad_norm": 0.15363597869873047, + "learning_rate": 0.002, + "loss": 2.374, + "step": 59270 + }, + { + "epoch": 0.22915990165607458, + "grad_norm": 0.1117829754948616, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 59280 + }, + { + "epoch": 0.22919855885945786, + "grad_norm": 0.10392409563064575, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 59290 + }, + { + "epoch": 0.22923721606284114, + "grad_norm": 0.1403128057718277, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 59300 + }, + { + "epoch": 0.22927587326622442, + "grad_norm": 0.09336331486701965, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 59310 + }, + { + "epoch": 0.2293145304696077, + "grad_norm": 0.10608049482107162, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 59320 + }, + { + "epoch": 0.22935318767299098, + "grad_norm": 0.13066236674785614, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 59330 + }, + { + "epoch": 0.22939184487637426, + "grad_norm": 0.1086943969130516, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 59340 + }, + { + "epoch": 0.22943050207975754, + "grad_norm": 0.11340256035327911, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 59350 + }, + { + "epoch": 0.22946915928314082, + "grad_norm": 0.11242882907390594, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 59360 + }, + { + "epoch": 0.2295078164865241, + "grad_norm": 0.10541485995054245, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 59370 + }, + { + "epoch": 0.22954647368990738, + "grad_norm": 0.10490956902503967, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 59380 + }, + { + "epoch": 0.22958513089329066, + "grad_norm": 0.11540848761796951, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 59390 + }, + { + "epoch": 0.22962378809667394, + "grad_norm": 0.10599116235971451, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 59400 + }, + { + "epoch": 0.22966244530005722, + "grad_norm": 0.10107158869504929, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 59410 + }, + { + "epoch": 0.2297011025034405, + "grad_norm": 0.16774043440818787, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 59420 + }, + { + "epoch": 0.22973975970682378, + "grad_norm": 0.10241368412971497, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 59430 + }, + { + "epoch": 0.22977841691020706, + "grad_norm": 0.11344046890735626, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 59440 + }, + { + "epoch": 0.22981707411359031, + "grad_norm": 0.134840726852417, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 59450 + }, + { + "epoch": 0.2298557313169736, + "grad_norm": 0.11046073585748672, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 59460 + }, + { + "epoch": 0.22989438852035687, + "grad_norm": 0.09477958083152771, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 59470 + }, + { + "epoch": 0.22993304572374015, + "grad_norm": 0.1265595704317093, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 59480 + }, + { + "epoch": 0.22997170292712343, + "grad_norm": 0.10804461687803268, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 59490 + }, + { + "epoch": 0.2300103601305067, + "grad_norm": 0.11189703643321991, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 59500 + }, + { + "epoch": 0.23004901733389, + "grad_norm": 0.09515593200922012, + "learning_rate": 0.002, + "loss": 2.363, + "step": 59510 + }, + { + "epoch": 0.23008767453727327, + "grad_norm": 0.12038534879684448, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 59520 + }, + { + "epoch": 0.23012633174065655, + "grad_norm": 0.11073701083660126, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 59530 + }, + { + "epoch": 0.23016498894403983, + "grad_norm": 0.11130673438310623, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 59540 + }, + { + "epoch": 0.2302036461474231, + "grad_norm": 0.11496109515428543, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 59550 + }, + { + "epoch": 0.2302423033508064, + "grad_norm": 0.10915815830230713, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 59560 + }, + { + "epoch": 0.23028096055418967, + "grad_norm": 0.11543036997318268, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 59570 + }, + { + "epoch": 0.23031961775757295, + "grad_norm": 0.11247259378433228, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 59580 + }, + { + "epoch": 0.23035827496095623, + "grad_norm": 0.10884750634431839, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 59590 + }, + { + "epoch": 0.2303969321643395, + "grad_norm": 0.09827539324760437, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 59600 + }, + { + "epoch": 0.2304355893677228, + "grad_norm": 0.093353271484375, + "learning_rate": 0.002, + "loss": 2.351, + "step": 59610 + }, + { + "epoch": 0.23047424657110607, + "grad_norm": 0.11901076883077621, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 59620 + }, + { + "epoch": 0.23051290377448933, + "grad_norm": 0.09644563496112823, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 59630 + }, + { + "epoch": 0.2305515609778726, + "grad_norm": 0.09767736494541168, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 59640 + }, + { + "epoch": 0.23059021818125588, + "grad_norm": 0.13363195955753326, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 59650 + }, + { + "epoch": 0.23062887538463916, + "grad_norm": 0.1053873673081398, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 59660 + }, + { + "epoch": 0.23066753258802244, + "grad_norm": 0.10517704486846924, + "learning_rate": 0.002, + "loss": 2.358, + "step": 59670 + }, + { + "epoch": 0.23070618979140572, + "grad_norm": 0.1103927344083786, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 59680 + }, + { + "epoch": 0.230744846994789, + "grad_norm": 0.11776106804609299, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 59690 + }, + { + "epoch": 0.23078350419817228, + "grad_norm": 0.0981423407793045, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 59700 + }, + { + "epoch": 0.23082216140155556, + "grad_norm": 0.11345241963863373, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 59710 + }, + { + "epoch": 0.23086081860493884, + "grad_norm": 0.10204170644283295, + "learning_rate": 0.002, + "loss": 2.343, + "step": 59720 + }, + { + "epoch": 0.23089947580832212, + "grad_norm": 0.10411133617162704, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 59730 + }, + { + "epoch": 0.2309381330117054, + "grad_norm": 0.09219200909137726, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 59740 + }, + { + "epoch": 0.23097679021508868, + "grad_norm": 0.12752839922904968, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 59750 + }, + { + "epoch": 0.23101544741847196, + "grad_norm": 0.12858721613883972, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 59760 + }, + { + "epoch": 0.23105410462185524, + "grad_norm": 0.10879185795783997, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 59770 + }, + { + "epoch": 0.23109276182523852, + "grad_norm": 0.09733107686042786, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 59780 + }, + { + "epoch": 0.2311314190286218, + "grad_norm": 0.09520354866981506, + "learning_rate": 0.002, + "loss": 2.361, + "step": 59790 + }, + { + "epoch": 0.23117007623200508, + "grad_norm": 0.11922292411327362, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 59800 + }, + { + "epoch": 0.23120873343538836, + "grad_norm": 0.10134036839008331, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 59810 + }, + { + "epoch": 0.23124739063877162, + "grad_norm": 0.12658289074897766, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 59820 + }, + { + "epoch": 0.2312860478421549, + "grad_norm": 0.10828740149736404, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 59830 + }, + { + "epoch": 0.23132470504553818, + "grad_norm": 0.10405316203832626, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 59840 + }, + { + "epoch": 0.23136336224892146, + "grad_norm": 0.12723368406295776, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 59850 + }, + { + "epoch": 0.23140201945230474, + "grad_norm": 0.11608844250440598, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 59860 + }, + { + "epoch": 0.23144067665568802, + "grad_norm": 0.09689639508724213, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 59870 + }, + { + "epoch": 0.2314793338590713, + "grad_norm": 0.10837455838918686, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 59880 + }, + { + "epoch": 0.23151799106245458, + "grad_norm": 0.1091194823384285, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 59890 + }, + { + "epoch": 0.23155664826583786, + "grad_norm": 0.10450857132673264, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 59900 + }, + { + "epoch": 0.23159530546922114, + "grad_norm": 0.11571555584669113, + "learning_rate": 0.002, + "loss": 2.36, + "step": 59910 + }, + { + "epoch": 0.23163396267260442, + "grad_norm": 0.10868965834379196, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 59920 + }, + { + "epoch": 0.2316726198759877, + "grad_norm": 0.10212372988462448, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 59930 + }, + { + "epoch": 0.23171127707937098, + "grad_norm": 0.10903532803058624, + "learning_rate": 0.002, + "loss": 2.356, + "step": 59940 + }, + { + "epoch": 0.23174993428275426, + "grad_norm": 0.09267522394657135, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 59950 + }, + { + "epoch": 0.23178859148613754, + "grad_norm": 0.1206112876534462, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 59960 + }, + { + "epoch": 0.23182724868952082, + "grad_norm": 0.10821081697940826, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 59970 + }, + { + "epoch": 0.2318659058929041, + "grad_norm": 0.09382860362529755, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 59980 + }, + { + "epoch": 0.23190456309628737, + "grad_norm": 0.10157258808612823, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 59990 + }, + { + "epoch": 0.23194322029967063, + "grad_norm": 0.09857197850942612, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 60000 + }, + { + "epoch": 0.2319818775030539, + "grad_norm": 0.11237531155347824, + "learning_rate": 0.002, + "loss": 2.3771, + "step": 60010 + }, + { + "epoch": 0.2320205347064372, + "grad_norm": 0.11936473846435547, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 60020 + }, + { + "epoch": 0.23205919190982047, + "grad_norm": 0.10506843775510788, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 60030 + }, + { + "epoch": 0.23209784911320375, + "grad_norm": 0.1060006245970726, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 60040 + }, + { + "epoch": 0.23213650631658703, + "grad_norm": 0.10951527208089828, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 60050 + }, + { + "epoch": 0.2321751635199703, + "grad_norm": 0.09036508202552795, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 60060 + }, + { + "epoch": 0.2322138207233536, + "grad_norm": 0.09523717314004898, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 60070 + }, + { + "epoch": 0.23225247792673687, + "grad_norm": 0.11686524003744125, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 60080 + }, + { + "epoch": 0.23229113513012015, + "grad_norm": 0.10026539862155914, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 60090 + }, + { + "epoch": 0.23232979233350343, + "grad_norm": 0.1290268748998642, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 60100 + }, + { + "epoch": 0.2323684495368867, + "grad_norm": 0.10675327479839325, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 60110 + }, + { + "epoch": 0.23240710674027, + "grad_norm": 0.1130603775382042, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 60120 + }, + { + "epoch": 0.23244576394365327, + "grad_norm": 0.0931505411863327, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 60130 + }, + { + "epoch": 0.23248442114703655, + "grad_norm": 0.11370448768138885, + "learning_rate": 0.002, + "loss": 2.343, + "step": 60140 + }, + { + "epoch": 0.23252307835041983, + "grad_norm": 0.13120517134666443, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 60150 + }, + { + "epoch": 0.2325617355538031, + "grad_norm": 0.1023140400648117, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 60160 + }, + { + "epoch": 0.23260039275718639, + "grad_norm": 0.09642255306243896, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 60170 + }, + { + "epoch": 0.23263904996056967, + "grad_norm": 0.11504430323839188, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 60180 + }, + { + "epoch": 0.23267770716395292, + "grad_norm": 0.10445452481508255, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 60190 + }, + { + "epoch": 0.2327163643673362, + "grad_norm": 0.10418809205293655, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 60200 + }, + { + "epoch": 0.23275502157071948, + "grad_norm": 0.11320661008358002, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 60210 + }, + { + "epoch": 0.23279367877410276, + "grad_norm": 0.10116437077522278, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 60220 + }, + { + "epoch": 0.23283233597748604, + "grad_norm": 0.11768436431884766, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 60230 + }, + { + "epoch": 0.23287099318086932, + "grad_norm": 0.10723967850208282, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 60240 + }, + { + "epoch": 0.2329096503842526, + "grad_norm": 0.1054241880774498, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 60250 + }, + { + "epoch": 0.23294830758763588, + "grad_norm": 0.10430363565683365, + "learning_rate": 0.002, + "loss": 2.351, + "step": 60260 + }, + { + "epoch": 0.23298696479101916, + "grad_norm": 0.12296608835458755, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 60270 + }, + { + "epoch": 0.23302562199440244, + "grad_norm": 0.12378449738025665, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 60280 + }, + { + "epoch": 0.23306427919778572, + "grad_norm": 0.10923901200294495, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 60290 + }, + { + "epoch": 0.233102936401169, + "grad_norm": 0.10164798051118851, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 60300 + }, + { + "epoch": 0.23314159360455228, + "grad_norm": 0.1040244922041893, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 60310 + }, + { + "epoch": 0.23318025080793556, + "grad_norm": 0.11190349608659744, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 60320 + }, + { + "epoch": 0.23321890801131884, + "grad_norm": 0.09140828996896744, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 60330 + }, + { + "epoch": 0.23325756521470212, + "grad_norm": 0.10454193502664566, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 60340 + }, + { + "epoch": 0.2332962224180854, + "grad_norm": 0.11909783631563187, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 60350 + }, + { + "epoch": 0.23333487962146868, + "grad_norm": 0.09467803686857224, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 60360 + }, + { + "epoch": 0.23337353682485193, + "grad_norm": 0.10158351808786392, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 60370 + }, + { + "epoch": 0.2334121940282352, + "grad_norm": 0.09694387018680573, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 60380 + }, + { + "epoch": 0.2334508512316185, + "grad_norm": 0.09731967002153397, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 60390 + }, + { + "epoch": 0.23348950843500177, + "grad_norm": 0.11340931057929993, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 60400 + }, + { + "epoch": 0.23352816563838505, + "grad_norm": 0.10410448163747787, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 60410 + }, + { + "epoch": 0.23356682284176833, + "grad_norm": 0.10245625674724579, + "learning_rate": 0.002, + "loss": 2.343, + "step": 60420 + }, + { + "epoch": 0.2336054800451516, + "grad_norm": 0.10402455925941467, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 60430 + }, + { + "epoch": 0.2336441372485349, + "grad_norm": 0.10691343247890472, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 60440 + }, + { + "epoch": 0.23368279445191817, + "grad_norm": 0.12020547688007355, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 60450 + }, + { + "epoch": 0.23372145165530145, + "grad_norm": 0.12145112454891205, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 60460 + }, + { + "epoch": 0.23376010885868473, + "grad_norm": 0.10276582092046738, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 60470 + }, + { + "epoch": 0.233798766062068, + "grad_norm": 0.09935696423053741, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 60480 + }, + { + "epoch": 0.2338374232654513, + "grad_norm": 0.10785102844238281, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 60490 + }, + { + "epoch": 0.23387608046883457, + "grad_norm": 0.1112961694598198, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 60500 + }, + { + "epoch": 0.23391473767221785, + "grad_norm": 0.09906793385744095, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 60510 + }, + { + "epoch": 0.23395339487560113, + "grad_norm": 0.12156188488006592, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 60520 + }, + { + "epoch": 0.2339920520789844, + "grad_norm": 0.12288404256105423, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 60530 + }, + { + "epoch": 0.2340307092823677, + "grad_norm": 0.1141674593091011, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 60540 + }, + { + "epoch": 0.23406936648575097, + "grad_norm": 0.11037001758813858, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 60550 + }, + { + "epoch": 0.23410802368913422, + "grad_norm": 0.12164504826068878, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 60560 + }, + { + "epoch": 0.2341466808925175, + "grad_norm": 0.11643952131271362, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 60570 + }, + { + "epoch": 0.23418533809590078, + "grad_norm": 0.09684377908706665, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 60580 + }, + { + "epoch": 0.23422399529928406, + "grad_norm": 0.09952197968959808, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 60590 + }, + { + "epoch": 0.23426265250266734, + "grad_norm": 0.10011433064937592, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 60600 + }, + { + "epoch": 0.23430130970605062, + "grad_norm": 0.11262384802103043, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 60610 + }, + { + "epoch": 0.2343399669094339, + "grad_norm": 0.10437977313995361, + "learning_rate": 0.002, + "loss": 2.364, + "step": 60620 + }, + { + "epoch": 0.23437862411281718, + "grad_norm": 0.08822324872016907, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 60630 + }, + { + "epoch": 0.23441728131620046, + "grad_norm": 0.10701391100883484, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 60640 + }, + { + "epoch": 0.23445593851958374, + "grad_norm": 0.10735175013542175, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 60650 + }, + { + "epoch": 0.23449459572296702, + "grad_norm": 0.11642715334892273, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 60660 + }, + { + "epoch": 0.2345332529263503, + "grad_norm": 0.13378360867500305, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 60670 + }, + { + "epoch": 0.23457191012973358, + "grad_norm": 0.10131211578845978, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 60680 + }, + { + "epoch": 0.23461056733311686, + "grad_norm": 0.10409022122621536, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 60690 + }, + { + "epoch": 0.23464922453650014, + "grad_norm": 0.10755995661020279, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 60700 + }, + { + "epoch": 0.23468788173988342, + "grad_norm": 0.08830675482749939, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 60710 + }, + { + "epoch": 0.2347265389432667, + "grad_norm": 0.10361380130052567, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 60720 + }, + { + "epoch": 0.23476519614664998, + "grad_norm": 0.09997949749231339, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 60730 + }, + { + "epoch": 0.23480385335003323, + "grad_norm": 0.1282014101743698, + "learning_rate": 0.002, + "loss": 2.377, + "step": 60740 + }, + { + "epoch": 0.2348425105534165, + "grad_norm": 0.10668148845434189, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 60750 + }, + { + "epoch": 0.2348811677567998, + "grad_norm": 0.09716004133224487, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 60760 + }, + { + "epoch": 0.23491982496018307, + "grad_norm": 0.11875343322753906, + "learning_rate": 0.002, + "loss": 2.369, + "step": 60770 + }, + { + "epoch": 0.23495848216356635, + "grad_norm": 0.10365365445613861, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 60780 + }, + { + "epoch": 0.23499713936694963, + "grad_norm": 0.09198535978794098, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 60790 + }, + { + "epoch": 0.2350357965703329, + "grad_norm": 0.10645847767591476, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 60800 + }, + { + "epoch": 0.2350744537737162, + "grad_norm": 0.10718676447868347, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 60810 + }, + { + "epoch": 0.23511311097709947, + "grad_norm": 0.116599440574646, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 60820 + }, + { + "epoch": 0.23515176818048275, + "grad_norm": 0.10643000900745392, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 60830 + }, + { + "epoch": 0.23519042538386603, + "grad_norm": 0.11804504692554474, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 60840 + }, + { + "epoch": 0.2352290825872493, + "grad_norm": 0.11182983964681625, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 60850 + }, + { + "epoch": 0.2352677397906326, + "grad_norm": 0.1037188172340393, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 60860 + }, + { + "epoch": 0.23530639699401587, + "grad_norm": 0.12416130304336548, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 60870 + }, + { + "epoch": 0.23534505419739915, + "grad_norm": 0.11619950830936432, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 60880 + }, + { + "epoch": 0.23538371140078243, + "grad_norm": 0.13519085943698883, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 60890 + }, + { + "epoch": 0.2354223686041657, + "grad_norm": 0.10871560871601105, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 60900 + }, + { + "epoch": 0.235461025807549, + "grad_norm": 0.18231026828289032, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 60910 + }, + { + "epoch": 0.23549968301093227, + "grad_norm": 0.15642951428890228, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 60920 + }, + { + "epoch": 0.23553834021431552, + "grad_norm": 0.10259267687797546, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 60930 + }, + { + "epoch": 0.2355769974176988, + "grad_norm": 0.10545379668474197, + "learning_rate": 0.002, + "loss": 2.357, + "step": 60940 + }, + { + "epoch": 0.23561565462108208, + "grad_norm": 0.08988619595766068, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 60950 + }, + { + "epoch": 0.23565431182446536, + "grad_norm": 0.11802471429109573, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 60960 + }, + { + "epoch": 0.23569296902784864, + "grad_norm": 0.27038830518722534, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 60970 + }, + { + "epoch": 0.23573162623123192, + "grad_norm": 0.10791066288948059, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 60980 + }, + { + "epoch": 0.2357702834346152, + "grad_norm": 0.13653592765331268, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 60990 + }, + { + "epoch": 0.23580894063799848, + "grad_norm": 0.10227449238300323, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 61000 + }, + { + "epoch": 0.23584759784138176, + "grad_norm": 0.10759121924638748, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 61010 + }, + { + "epoch": 0.23588625504476504, + "grad_norm": 0.09532109647989273, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 61020 + }, + { + "epoch": 0.23592491224814832, + "grad_norm": 0.09882092475891113, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 61030 + }, + { + "epoch": 0.2359635694515316, + "grad_norm": 0.1165454238653183, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 61040 + }, + { + "epoch": 0.23600222665491488, + "grad_norm": 0.09766757488250732, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 61050 + }, + { + "epoch": 0.23604088385829816, + "grad_norm": 0.09498842805624008, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 61060 + }, + { + "epoch": 0.23607954106168144, + "grad_norm": 0.1085570901632309, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 61070 + }, + { + "epoch": 0.23611819826506472, + "grad_norm": 0.09749253839254379, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 61080 + }, + { + "epoch": 0.236156855468448, + "grad_norm": 0.09708741307258606, + "learning_rate": 0.002, + "loss": 2.363, + "step": 61090 + }, + { + "epoch": 0.23619551267183128, + "grad_norm": 0.10069181025028229, + "learning_rate": 0.002, + "loss": 2.3839, + "step": 61100 + }, + { + "epoch": 0.23623416987521456, + "grad_norm": 0.15146248042583466, + "learning_rate": 0.002, + "loss": 2.348, + "step": 61110 + }, + { + "epoch": 0.2362728270785978, + "grad_norm": 0.11124181747436523, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 61120 + }, + { + "epoch": 0.2363114842819811, + "grad_norm": 0.09074581414461136, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 61130 + }, + { + "epoch": 0.23635014148536437, + "grad_norm": 0.09841548651456833, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 61140 + }, + { + "epoch": 0.23638879868874765, + "grad_norm": 0.11487787216901779, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 61150 + }, + { + "epoch": 0.23642745589213093, + "grad_norm": 0.10396473854780197, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 61160 + }, + { + "epoch": 0.2364661130955142, + "grad_norm": 0.11128471791744232, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 61170 + }, + { + "epoch": 0.2365047702988975, + "grad_norm": 0.12065692991018295, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 61180 + }, + { + "epoch": 0.23654342750228077, + "grad_norm": 0.10189115256071091, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 61190 + }, + { + "epoch": 0.23658208470566405, + "grad_norm": 0.1232014149427414, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 61200 + }, + { + "epoch": 0.23662074190904733, + "grad_norm": 0.11311760544776917, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 61210 + }, + { + "epoch": 0.2366593991124306, + "grad_norm": 0.10827051103115082, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 61220 + }, + { + "epoch": 0.2366980563158139, + "grad_norm": 0.12067557126283646, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 61230 + }, + { + "epoch": 0.23673671351919717, + "grad_norm": 0.11931245774030685, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 61240 + }, + { + "epoch": 0.23677537072258045, + "grad_norm": 0.10347548872232437, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 61250 + }, + { + "epoch": 0.23681402792596373, + "grad_norm": 0.12837807834148407, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 61260 + }, + { + "epoch": 0.236852685129347, + "grad_norm": 0.10006406903266907, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 61270 + }, + { + "epoch": 0.2368913423327303, + "grad_norm": 0.10347016155719757, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 61280 + }, + { + "epoch": 0.23692999953611357, + "grad_norm": 0.14867345988750458, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 61290 + }, + { + "epoch": 0.23696865673949682, + "grad_norm": 0.1082024946808815, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 61300 + }, + { + "epoch": 0.2370073139428801, + "grad_norm": 0.10785643011331558, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 61310 + }, + { + "epoch": 0.23704597114626338, + "grad_norm": 0.10544189065694809, + "learning_rate": 0.002, + "loss": 2.366, + "step": 61320 + }, + { + "epoch": 0.23708462834964666, + "grad_norm": 0.11011648178100586, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 61330 + }, + { + "epoch": 0.23712328555302994, + "grad_norm": 0.09242957830429077, + "learning_rate": 0.002, + "loss": 2.363, + "step": 61340 + }, + { + "epoch": 0.23716194275641322, + "grad_norm": 0.10205409675836563, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 61350 + }, + { + "epoch": 0.2372005999597965, + "grad_norm": 0.11633102595806122, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 61360 + }, + { + "epoch": 0.23723925716317978, + "grad_norm": 0.11982908844947815, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 61370 + }, + { + "epoch": 0.23727791436656306, + "grad_norm": 0.09505453705787659, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 61380 + }, + { + "epoch": 0.23731657156994634, + "grad_norm": 0.1351066529750824, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 61390 + }, + { + "epoch": 0.23735522877332962, + "grad_norm": 0.10480768233537674, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 61400 + }, + { + "epoch": 0.2373938859767129, + "grad_norm": 0.10703454911708832, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 61410 + }, + { + "epoch": 0.23743254318009618, + "grad_norm": 0.11127861589193344, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 61420 + }, + { + "epoch": 0.23747120038347946, + "grad_norm": 0.1108800619840622, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 61430 + }, + { + "epoch": 0.23750985758686274, + "grad_norm": 0.12055522203445435, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 61440 + }, + { + "epoch": 0.23754851479024602, + "grad_norm": 0.11530635505914688, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 61450 + }, + { + "epoch": 0.2375871719936293, + "grad_norm": 0.13346411287784576, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 61460 + }, + { + "epoch": 0.23762582919701258, + "grad_norm": 0.11563769727945328, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 61470 + }, + { + "epoch": 0.23766448640039586, + "grad_norm": 0.12334870547056198, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 61480 + }, + { + "epoch": 0.23770314360377912, + "grad_norm": 0.4291572868824005, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 61490 + }, + { + "epoch": 0.2377418008071624, + "grad_norm": 0.129678413271904, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 61500 + }, + { + "epoch": 0.23778045801054568, + "grad_norm": 0.10555509477853775, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 61510 + }, + { + "epoch": 0.23781911521392896, + "grad_norm": 0.11403724551200867, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 61520 + }, + { + "epoch": 0.23785777241731224, + "grad_norm": 0.08737763017416, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 61530 + }, + { + "epoch": 0.23789642962069552, + "grad_norm": 0.10799986124038696, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 61540 + }, + { + "epoch": 0.2379350868240788, + "grad_norm": 0.11616066843271255, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 61550 + }, + { + "epoch": 0.23797374402746208, + "grad_norm": 0.095784492790699, + "learning_rate": 0.002, + "loss": 2.345, + "step": 61560 + }, + { + "epoch": 0.23801240123084536, + "grad_norm": 0.09789443761110306, + "learning_rate": 0.002, + "loss": 2.368, + "step": 61570 + }, + { + "epoch": 0.23805105843422864, + "grad_norm": 0.10828810930252075, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 61580 + }, + { + "epoch": 0.23808971563761192, + "grad_norm": 0.10432812571525574, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 61590 + }, + { + "epoch": 0.2381283728409952, + "grad_norm": 0.10405416041612625, + "learning_rate": 0.002, + "loss": 2.366, + "step": 61600 + }, + { + "epoch": 0.23816703004437847, + "grad_norm": 0.0994420126080513, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 61610 + }, + { + "epoch": 0.23820568724776175, + "grad_norm": 0.11040692776441574, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 61620 + }, + { + "epoch": 0.23824434445114503, + "grad_norm": 0.11844494193792343, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 61630 + }, + { + "epoch": 0.23828300165452831, + "grad_norm": 0.10033336281776428, + "learning_rate": 0.002, + "loss": 2.368, + "step": 61640 + }, + { + "epoch": 0.2383216588579116, + "grad_norm": 0.10760051757097244, + "learning_rate": 0.002, + "loss": 2.364, + "step": 61650 + }, + { + "epoch": 0.23836031606129487, + "grad_norm": 0.10519957542419434, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 61660 + }, + { + "epoch": 0.23839897326467813, + "grad_norm": 0.0907941609621048, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 61670 + }, + { + "epoch": 0.2384376304680614, + "grad_norm": 0.11044026911258698, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 61680 + }, + { + "epoch": 0.2384762876714447, + "grad_norm": 0.12676018476486206, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 61690 + }, + { + "epoch": 0.23851494487482797, + "grad_norm": 0.11517304182052612, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 61700 + }, + { + "epoch": 0.23855360207821125, + "grad_norm": 0.1039179265499115, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 61710 + }, + { + "epoch": 0.23859225928159453, + "grad_norm": 0.1346934586763382, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 61720 + }, + { + "epoch": 0.2386309164849778, + "grad_norm": 0.10842429846525192, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 61730 + }, + { + "epoch": 0.2386695736883611, + "grad_norm": 0.11558246612548828, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 61740 + }, + { + "epoch": 0.23870823089174437, + "grad_norm": 0.10950513184070587, + "learning_rate": 0.002, + "loss": 2.342, + "step": 61750 + }, + { + "epoch": 0.23874688809512765, + "grad_norm": 0.11494038254022598, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 61760 + }, + { + "epoch": 0.23878554529851093, + "grad_norm": 0.1149371787905693, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 61770 + }, + { + "epoch": 0.2388242025018942, + "grad_norm": 0.10944673418998718, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 61780 + }, + { + "epoch": 0.23886285970527749, + "grad_norm": 0.09396765381097794, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 61790 + }, + { + "epoch": 0.23890151690866077, + "grad_norm": 0.09576135128736496, + "learning_rate": 0.002, + "loss": 2.368, + "step": 61800 + }, + { + "epoch": 0.23894017411204405, + "grad_norm": 0.10625293105840683, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 61810 + }, + { + "epoch": 0.23897883131542733, + "grad_norm": 0.09761402755975723, + "learning_rate": 0.002, + "loss": 2.3806, + "step": 61820 + }, + { + "epoch": 0.2390174885188106, + "grad_norm": 0.1327749788761139, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 61830 + }, + { + "epoch": 0.23905614572219389, + "grad_norm": 0.10481736809015274, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 61840 + }, + { + "epoch": 0.23909480292557717, + "grad_norm": 0.11769228428602219, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 61850 + }, + { + "epoch": 0.23913346012896042, + "grad_norm": 0.1069621592760086, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 61860 + }, + { + "epoch": 0.2391721173323437, + "grad_norm": 0.11873393505811691, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 61870 + }, + { + "epoch": 0.23921077453572698, + "grad_norm": 0.10264294594526291, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 61880 + }, + { + "epoch": 0.23924943173911026, + "grad_norm": 0.11171815544366837, + "learning_rate": 0.002, + "loss": 2.35, + "step": 61890 + }, + { + "epoch": 0.23928808894249354, + "grad_norm": 0.09807415306568146, + "learning_rate": 0.002, + "loss": 2.357, + "step": 61900 + }, + { + "epoch": 0.23932674614587682, + "grad_norm": 0.10718560963869095, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 61910 + }, + { + "epoch": 0.2393654033492601, + "grad_norm": 0.09984877705574036, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 61920 + }, + { + "epoch": 0.23940406055264338, + "grad_norm": 0.11579443514347076, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 61930 + }, + { + "epoch": 0.23944271775602666, + "grad_norm": 0.09079534560441971, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 61940 + }, + { + "epoch": 0.23948137495940994, + "grad_norm": 0.13144853711128235, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 61950 + }, + { + "epoch": 0.23952003216279322, + "grad_norm": 0.11411819607019424, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 61960 + }, + { + "epoch": 0.2395586893661765, + "grad_norm": 0.10079851001501083, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 61970 + }, + { + "epoch": 0.23959734656955978, + "grad_norm": 0.10619625449180603, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 61980 + }, + { + "epoch": 0.23963600377294306, + "grad_norm": 0.10123034566640854, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 61990 + }, + { + "epoch": 0.23967466097632634, + "grad_norm": 0.10666295140981674, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 62000 + }, + { + "epoch": 0.23971331817970962, + "grad_norm": 0.10037509351968765, + "learning_rate": 0.002, + "loss": 2.355, + "step": 62010 + }, + { + "epoch": 0.2397519753830929, + "grad_norm": 0.11558493226766586, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 62020 + }, + { + "epoch": 0.23979063258647618, + "grad_norm": 0.23745326697826385, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 62030 + }, + { + "epoch": 0.23982928978985943, + "grad_norm": 0.10613018274307251, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 62040 + }, + { + "epoch": 0.2398679469932427, + "grad_norm": 0.09879384934902191, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 62050 + }, + { + "epoch": 0.239906604196626, + "grad_norm": 0.09674771875143051, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 62060 + }, + { + "epoch": 0.23994526140000927, + "grad_norm": 0.12776194512844086, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 62070 + }, + { + "epoch": 0.23998391860339255, + "grad_norm": 0.105409637093544, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 62080 + }, + { + "epoch": 0.24002257580677583, + "grad_norm": 0.11625168472528458, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 62090 + }, + { + "epoch": 0.2400612330101591, + "grad_norm": 0.10988224297761917, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 62100 + }, + { + "epoch": 0.2400998902135424, + "grad_norm": 0.10473316162824631, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 62110 + }, + { + "epoch": 0.24013854741692567, + "grad_norm": 0.1141149252653122, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 62120 + }, + { + "epoch": 0.24017720462030895, + "grad_norm": 0.1093212142586708, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 62130 + }, + { + "epoch": 0.24021586182369223, + "grad_norm": 0.10351050645112991, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 62140 + }, + { + "epoch": 0.2402545190270755, + "grad_norm": 0.11109581589698792, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 62150 + }, + { + "epoch": 0.2402931762304588, + "grad_norm": 0.11361625045537949, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 62160 + }, + { + "epoch": 0.24033183343384207, + "grad_norm": 0.1043824851512909, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 62170 + }, + { + "epoch": 0.24037049063722535, + "grad_norm": 0.11515588313341141, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 62180 + }, + { + "epoch": 0.24040914784060863, + "grad_norm": 0.13307060301303864, + "learning_rate": 0.002, + "loss": 2.357, + "step": 62190 + }, + { + "epoch": 0.2404478050439919, + "grad_norm": 0.10510449856519699, + "learning_rate": 0.002, + "loss": 2.344, + "step": 62200 + }, + { + "epoch": 0.2404864622473752, + "grad_norm": 0.11014141887426376, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 62210 + }, + { + "epoch": 0.24052511945075847, + "grad_norm": 0.10206637531518936, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 62220 + }, + { + "epoch": 0.24056377665414172, + "grad_norm": 0.10467716306447983, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 62230 + }, + { + "epoch": 0.240602433857525, + "grad_norm": 0.13094080984592438, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 62240 + }, + { + "epoch": 0.24064109106090828, + "grad_norm": 0.12012360990047455, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 62250 + }, + { + "epoch": 0.24067974826429156, + "grad_norm": 0.1046094223856926, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 62260 + }, + { + "epoch": 0.24071840546767484, + "grad_norm": 0.11625311523675919, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 62270 + }, + { + "epoch": 0.24075706267105812, + "grad_norm": 0.11781848222017288, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 62280 + }, + { + "epoch": 0.2407957198744414, + "grad_norm": 0.10195542126893997, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 62290 + }, + { + "epoch": 0.24083437707782468, + "grad_norm": 0.13691802322864532, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 62300 + }, + { + "epoch": 0.24087303428120796, + "grad_norm": 0.10711605101823807, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 62310 + }, + { + "epoch": 0.24091169148459124, + "grad_norm": 0.09732840210199356, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 62320 + }, + { + "epoch": 0.24095034868797452, + "grad_norm": 0.10126104950904846, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 62330 + }, + { + "epoch": 0.2409890058913578, + "grad_norm": 0.11862193793058395, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 62340 + }, + { + "epoch": 0.24102766309474108, + "grad_norm": 0.10822226852178574, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 62350 + }, + { + "epoch": 0.24106632029812436, + "grad_norm": 0.12469718605279922, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 62360 + }, + { + "epoch": 0.24110497750150764, + "grad_norm": 0.11491573601961136, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 62370 + }, + { + "epoch": 0.24114363470489092, + "grad_norm": 0.44333580136299133, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 62380 + }, + { + "epoch": 0.2411822919082742, + "grad_norm": 0.11015953868627548, + "learning_rate": 0.002, + "loss": 2.3851, + "step": 62390 + }, + { + "epoch": 0.24122094911165748, + "grad_norm": 0.09991714358329773, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 62400 + }, + { + "epoch": 0.24125960631504073, + "grad_norm": 0.09338116645812988, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 62410 + }, + { + "epoch": 0.241298263518424, + "grad_norm": 0.1053207665681839, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 62420 + }, + { + "epoch": 0.2413369207218073, + "grad_norm": 0.1153879463672638, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 62430 + }, + { + "epoch": 0.24137557792519057, + "grad_norm": 0.0946960523724556, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 62440 + }, + { + "epoch": 0.24141423512857385, + "grad_norm": 0.10729081183671951, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 62450 + }, + { + "epoch": 0.24145289233195713, + "grad_norm": 0.1094650998711586, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 62460 + }, + { + "epoch": 0.2414915495353404, + "grad_norm": 0.1119939312338829, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 62470 + }, + { + "epoch": 0.2415302067387237, + "grad_norm": 0.10452145338058472, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 62480 + }, + { + "epoch": 0.24156886394210697, + "grad_norm": 0.11885945498943329, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 62490 + }, + { + "epoch": 0.24160752114549025, + "grad_norm": 0.11475305259227753, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 62500 + }, + { + "epoch": 0.24164617834887353, + "grad_norm": 0.11734220385551453, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 62510 + }, + { + "epoch": 0.2416848355522568, + "grad_norm": 0.10288964956998825, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 62520 + }, + { + "epoch": 0.2417234927556401, + "grad_norm": 0.12288731336593628, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 62530 + }, + { + "epoch": 0.24176214995902337, + "grad_norm": 0.10755060613155365, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 62540 + }, + { + "epoch": 0.24180080716240665, + "grad_norm": 0.10444962233304977, + "learning_rate": 0.002, + "loss": 2.352, + "step": 62550 + }, + { + "epoch": 0.24183946436578993, + "grad_norm": 0.1046118289232254, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 62560 + }, + { + "epoch": 0.2418781215691732, + "grad_norm": 0.10155737400054932, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 62570 + }, + { + "epoch": 0.2419167787725565, + "grad_norm": 0.11808685958385468, + "learning_rate": 0.002, + "loss": 2.356, + "step": 62580 + }, + { + "epoch": 0.24195543597593977, + "grad_norm": 0.09973792731761932, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 62590 + }, + { + "epoch": 0.24199409317932302, + "grad_norm": 0.10925379395484924, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 62600 + }, + { + "epoch": 0.2420327503827063, + "grad_norm": 0.1127404049038887, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 62610 + }, + { + "epoch": 0.24207140758608958, + "grad_norm": 0.10948999971151352, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 62620 + }, + { + "epoch": 0.24211006478947286, + "grad_norm": 0.13799403607845306, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 62630 + }, + { + "epoch": 0.24214872199285614, + "grad_norm": 0.11451857537031174, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 62640 + }, + { + "epoch": 0.24218737919623942, + "grad_norm": 0.11420796811580658, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 62650 + }, + { + "epoch": 0.2422260363996227, + "grad_norm": 0.11695457994937897, + "learning_rate": 0.002, + "loss": 2.357, + "step": 62660 + }, + { + "epoch": 0.24226469360300598, + "grad_norm": 0.10169561952352524, + "learning_rate": 0.002, + "loss": 2.352, + "step": 62670 + }, + { + "epoch": 0.24230335080638926, + "grad_norm": 0.11712782829999924, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 62680 + }, + { + "epoch": 0.24234200800977254, + "grad_norm": 0.11228419095277786, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 62690 + }, + { + "epoch": 0.24238066521315582, + "grad_norm": 0.12641753256320953, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 62700 + }, + { + "epoch": 0.2424193224165391, + "grad_norm": 0.11136075109243393, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 62710 + }, + { + "epoch": 0.24245797961992238, + "grad_norm": 0.1106911227107048, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 62720 + }, + { + "epoch": 0.24249663682330566, + "grad_norm": 0.09890349209308624, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 62730 + }, + { + "epoch": 0.24253529402668894, + "grad_norm": 0.13540855050086975, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 62740 + }, + { + "epoch": 0.24257395123007222, + "grad_norm": 0.13993631303310394, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 62750 + }, + { + "epoch": 0.2426126084334555, + "grad_norm": 0.11235027760267258, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 62760 + }, + { + "epoch": 0.24265126563683878, + "grad_norm": 0.10542025417089462, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 62770 + }, + { + "epoch": 0.24268992284022203, + "grad_norm": 0.12307173758745193, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 62780 + }, + { + "epoch": 0.2427285800436053, + "grad_norm": 0.12093117088079453, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 62790 + }, + { + "epoch": 0.2427672372469886, + "grad_norm": 0.09366226196289062, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 62800 + }, + { + "epoch": 0.24280589445037187, + "grad_norm": 0.11713926494121552, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 62810 + }, + { + "epoch": 0.24284455165375515, + "grad_norm": 0.10119593888521194, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 62820 + }, + { + "epoch": 0.24288320885713843, + "grad_norm": 0.13729046285152435, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 62830 + }, + { + "epoch": 0.2429218660605217, + "grad_norm": 0.1037496030330658, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 62840 + }, + { + "epoch": 0.242960523263905, + "grad_norm": 0.10583599656820297, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 62850 + }, + { + "epoch": 0.24299918046728827, + "grad_norm": 0.09716931730508804, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 62860 + }, + { + "epoch": 0.24303783767067155, + "grad_norm": 0.11291979253292084, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 62870 + }, + { + "epoch": 0.24307649487405483, + "grad_norm": 0.1146511435508728, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 62880 + }, + { + "epoch": 0.2431151520774381, + "grad_norm": 0.11932969093322754, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 62890 + }, + { + "epoch": 0.2431538092808214, + "grad_norm": 0.12777186930179596, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 62900 + }, + { + "epoch": 0.24319246648420467, + "grad_norm": 0.11459875106811523, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 62910 + }, + { + "epoch": 0.24323112368758795, + "grad_norm": 0.11585894227027893, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 62920 + }, + { + "epoch": 0.24326978089097123, + "grad_norm": 0.1289733350276947, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 62930 + }, + { + "epoch": 0.2433084380943545, + "grad_norm": 0.10749118030071259, + "learning_rate": 0.002, + "loss": 2.347, + "step": 62940 + }, + { + "epoch": 0.2433470952977378, + "grad_norm": 0.10447361320257187, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 62950 + }, + { + "epoch": 0.24338575250112107, + "grad_norm": 0.11179212480783463, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 62960 + }, + { + "epoch": 0.24342440970450432, + "grad_norm": 0.1024908795952797, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 62970 + }, + { + "epoch": 0.2434630669078876, + "grad_norm": 0.10360907018184662, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 62980 + }, + { + "epoch": 0.24350172411127088, + "grad_norm": 0.11925695836544037, + "learning_rate": 0.002, + "loss": 2.348, + "step": 62990 + }, + { + "epoch": 0.24354038131465416, + "grad_norm": 0.11314789205789566, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 63000 + }, + { + "epoch": 0.24357903851803744, + "grad_norm": 0.09414984285831451, + "learning_rate": 0.002, + "loss": 2.349, + "step": 63010 + }, + { + "epoch": 0.24361769572142072, + "grad_norm": 0.1293078511953354, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 63020 + }, + { + "epoch": 0.243656352924804, + "grad_norm": 0.1282479166984558, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 63030 + }, + { + "epoch": 0.24369501012818728, + "grad_norm": 0.11113385856151581, + "learning_rate": 0.002, + "loss": 2.349, + "step": 63040 + }, + { + "epoch": 0.24373366733157056, + "grad_norm": 0.10495486855506897, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 63050 + }, + { + "epoch": 0.24377232453495384, + "grad_norm": 0.1034003272652626, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 63060 + }, + { + "epoch": 0.24381098173833712, + "grad_norm": 0.103383369743824, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 63070 + }, + { + "epoch": 0.2438496389417204, + "grad_norm": 0.13411355018615723, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 63080 + }, + { + "epoch": 0.24388829614510368, + "grad_norm": 0.09459537267684937, + "learning_rate": 0.002, + "loss": 2.3757, + "step": 63090 + }, + { + "epoch": 0.24392695334848696, + "grad_norm": 0.11230204254388809, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 63100 + }, + { + "epoch": 0.24396561055187024, + "grad_norm": 0.10179710388183594, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 63110 + }, + { + "epoch": 0.24400426775525352, + "grad_norm": 0.10394256561994553, + "learning_rate": 0.002, + "loss": 2.367, + "step": 63120 + }, + { + "epoch": 0.2440429249586368, + "grad_norm": 0.12284719198942184, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 63130 + }, + { + "epoch": 0.24408158216202008, + "grad_norm": 0.10525992512702942, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 63140 + }, + { + "epoch": 0.24412023936540336, + "grad_norm": 0.11592471599578857, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 63150 + }, + { + "epoch": 0.24415889656878662, + "grad_norm": 0.10198235511779785, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 63160 + }, + { + "epoch": 0.2441975537721699, + "grad_norm": 0.10999837517738342, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 63170 + }, + { + "epoch": 0.24423621097555318, + "grad_norm": 0.13410034775733948, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 63180 + }, + { + "epoch": 0.24427486817893646, + "grad_norm": 0.12082164734601974, + "learning_rate": 0.002, + "loss": 2.349, + "step": 63190 + }, + { + "epoch": 0.24431352538231974, + "grad_norm": 0.09982411563396454, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 63200 + }, + { + "epoch": 0.24435218258570301, + "grad_norm": 0.1246073842048645, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 63210 + }, + { + "epoch": 0.2443908397890863, + "grad_norm": 0.10156312584877014, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 63220 + }, + { + "epoch": 0.24442949699246957, + "grad_norm": 0.16856735944747925, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 63230 + }, + { + "epoch": 0.24446815419585285, + "grad_norm": 0.12928234040737152, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 63240 + }, + { + "epoch": 0.24450681139923613, + "grad_norm": 0.11333297938108444, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 63250 + }, + { + "epoch": 0.24454546860261941, + "grad_norm": 0.10946240276098251, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 63260 + }, + { + "epoch": 0.2445841258060027, + "grad_norm": 0.09647426754236221, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 63270 + }, + { + "epoch": 0.24462278300938597, + "grad_norm": 0.10767856240272522, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 63280 + }, + { + "epoch": 0.24466144021276925, + "grad_norm": 0.11887312680482864, + "learning_rate": 0.002, + "loss": 2.361, + "step": 63290 + }, + { + "epoch": 0.24470009741615253, + "grad_norm": 0.12779520452022552, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 63300 + }, + { + "epoch": 0.24473875461953581, + "grad_norm": 0.11230748146772385, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 63310 + }, + { + "epoch": 0.2447774118229191, + "grad_norm": 0.11133432388305664, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 63320 + }, + { + "epoch": 0.24481606902630237, + "grad_norm": 0.10257939249277115, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 63330 + }, + { + "epoch": 0.24485472622968563, + "grad_norm": 0.11476735770702362, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 63340 + }, + { + "epoch": 0.2448933834330689, + "grad_norm": 0.11152663081884384, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 63350 + }, + { + "epoch": 0.2449320406364522, + "grad_norm": 0.1259431391954422, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 63360 + }, + { + "epoch": 0.24497069783983547, + "grad_norm": 0.09820462018251419, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 63370 + }, + { + "epoch": 0.24500935504321875, + "grad_norm": 0.13023404777050018, + "learning_rate": 0.002, + "loss": 2.365, + "step": 63380 + }, + { + "epoch": 0.24504801224660203, + "grad_norm": 0.11050461232662201, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 63390 + }, + { + "epoch": 0.2450866694499853, + "grad_norm": 0.0972348302602768, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 63400 + }, + { + "epoch": 0.24512532665336859, + "grad_norm": 0.12149663269519806, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 63410 + }, + { + "epoch": 0.24516398385675187, + "grad_norm": 0.1159098818898201, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 63420 + }, + { + "epoch": 0.24520264106013515, + "grad_norm": 0.10914923250675201, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 63430 + }, + { + "epoch": 0.24524129826351843, + "grad_norm": 0.12055275589227676, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 63440 + }, + { + "epoch": 0.2452799554669017, + "grad_norm": 0.1446305811405182, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 63450 + }, + { + "epoch": 0.24531861267028499, + "grad_norm": 0.11570040136575699, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 63460 + }, + { + "epoch": 0.24535726987366827, + "grad_norm": 0.10370668768882751, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 63470 + }, + { + "epoch": 0.24539592707705155, + "grad_norm": 0.11650431901216507, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 63480 + }, + { + "epoch": 0.24543458428043483, + "grad_norm": 0.13468730449676514, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 63490 + }, + { + "epoch": 0.2454732414838181, + "grad_norm": 0.11168798804283142, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 63500 + }, + { + "epoch": 0.24551189868720139, + "grad_norm": 0.1285007745027542, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 63510 + }, + { + "epoch": 0.24555055589058467, + "grad_norm": 0.10346856713294983, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 63520 + }, + { + "epoch": 0.24558921309396792, + "grad_norm": 0.11400487273931503, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 63530 + }, + { + "epoch": 0.2456278702973512, + "grad_norm": 0.10188063234090805, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 63540 + }, + { + "epoch": 0.24566652750073448, + "grad_norm": 0.10284477472305298, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 63550 + }, + { + "epoch": 0.24570518470411776, + "grad_norm": 0.10538157820701599, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 63560 + }, + { + "epoch": 0.24574384190750104, + "grad_norm": 0.10660867393016815, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 63570 + }, + { + "epoch": 0.24578249911088432, + "grad_norm": 0.13760226964950562, + "learning_rate": 0.002, + "loss": 2.359, + "step": 63580 + }, + { + "epoch": 0.2458211563142676, + "grad_norm": 0.10196994245052338, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 63590 + }, + { + "epoch": 0.24585981351765088, + "grad_norm": 0.10736634582281113, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 63600 + }, + { + "epoch": 0.24589847072103416, + "grad_norm": 0.11615607142448425, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 63610 + }, + { + "epoch": 0.24593712792441744, + "grad_norm": 0.11788640916347504, + "learning_rate": 0.002, + "loss": 2.368, + "step": 63620 + }, + { + "epoch": 0.24597578512780072, + "grad_norm": 0.10808571428060532, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 63630 + }, + { + "epoch": 0.246014442331184, + "grad_norm": 0.14803314208984375, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 63640 + }, + { + "epoch": 0.24605309953456728, + "grad_norm": 0.11327030509710312, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 63650 + }, + { + "epoch": 0.24609175673795056, + "grad_norm": 0.11561069637537003, + "learning_rate": 0.002, + "loss": 2.345, + "step": 63660 + }, + { + "epoch": 0.24613041394133384, + "grad_norm": 0.10180526971817017, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 63670 + }, + { + "epoch": 0.24616907114471712, + "grad_norm": 0.10653835535049438, + "learning_rate": 0.002, + "loss": 2.356, + "step": 63680 + }, + { + "epoch": 0.2462077283481004, + "grad_norm": 0.11258967220783234, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 63690 + }, + { + "epoch": 0.24624638555148368, + "grad_norm": 0.11039800941944122, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 63700 + }, + { + "epoch": 0.24628504275486693, + "grad_norm": 0.13323748111724854, + "learning_rate": 0.002, + "loss": 2.364, + "step": 63710 + }, + { + "epoch": 0.2463236999582502, + "grad_norm": 0.1323200762271881, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 63720 + }, + { + "epoch": 0.2463623571616335, + "grad_norm": 0.10117621719837189, + "learning_rate": 0.002, + "loss": 2.3859, + "step": 63730 + }, + { + "epoch": 0.24640101436501677, + "grad_norm": 0.12777069211006165, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 63740 + }, + { + "epoch": 0.24643967156840005, + "grad_norm": 0.11329413205385208, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 63750 + }, + { + "epoch": 0.24647832877178333, + "grad_norm": 0.16249066591262817, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 63760 + }, + { + "epoch": 0.2465169859751666, + "grad_norm": 0.09796962887048721, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 63770 + }, + { + "epoch": 0.2465556431785499, + "grad_norm": 0.15831460058689117, + "learning_rate": 0.002, + "loss": 2.3828, + "step": 63780 + }, + { + "epoch": 0.24659430038193317, + "grad_norm": 0.12122119963169098, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 63790 + }, + { + "epoch": 0.24663295758531645, + "grad_norm": 0.11653528362512589, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 63800 + }, + { + "epoch": 0.24667161478869973, + "grad_norm": 0.13036595284938812, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 63810 + }, + { + "epoch": 0.246710271992083, + "grad_norm": 0.10035198926925659, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 63820 + }, + { + "epoch": 0.2467489291954663, + "grad_norm": 0.10289250314235687, + "learning_rate": 0.002, + "loss": 2.337, + "step": 63830 + }, + { + "epoch": 0.24678758639884957, + "grad_norm": 0.12327629327774048, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 63840 + }, + { + "epoch": 0.24682624360223285, + "grad_norm": 0.094744473695755, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 63850 + }, + { + "epoch": 0.24686490080561613, + "grad_norm": 0.11189937591552734, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 63860 + }, + { + "epoch": 0.2469035580089994, + "grad_norm": 0.11155198514461517, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 63870 + }, + { + "epoch": 0.2469422152123827, + "grad_norm": 0.11133726686239243, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 63880 + }, + { + "epoch": 0.24698087241576597, + "grad_norm": 0.11880885809659958, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 63890 + }, + { + "epoch": 0.24701952961914922, + "grad_norm": 0.1178770512342453, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 63900 + }, + { + "epoch": 0.2470581868225325, + "grad_norm": 0.10876142978668213, + "learning_rate": 0.002, + "loss": 2.356, + "step": 63910 + }, + { + "epoch": 0.24709684402591578, + "grad_norm": 0.10888998955488205, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 63920 + }, + { + "epoch": 0.24713550122929906, + "grad_norm": 0.09465057402849197, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 63930 + }, + { + "epoch": 0.24717415843268234, + "grad_norm": 0.12009290605783463, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 63940 + }, + { + "epoch": 0.24721281563606562, + "grad_norm": 0.13506600260734558, + "learning_rate": 0.002, + "loss": 2.35, + "step": 63950 + }, + { + "epoch": 0.2472514728394489, + "grad_norm": 0.12046731263399124, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 63960 + }, + { + "epoch": 0.24729013004283218, + "grad_norm": 0.11283211410045624, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 63970 + }, + { + "epoch": 0.24732878724621546, + "grad_norm": 0.10055335611104965, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 63980 + }, + { + "epoch": 0.24736744444959874, + "grad_norm": 0.11106719076633453, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 63990 + }, + { + "epoch": 0.24740610165298202, + "grad_norm": 0.14089414477348328, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 64000 + }, + { + "epoch": 0.2474447588563653, + "grad_norm": 0.10141649097204208, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 64010 + }, + { + "epoch": 0.24748341605974858, + "grad_norm": 0.11181320995092392, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 64020 + }, + { + "epoch": 0.24752207326313186, + "grad_norm": 0.09916166961193085, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 64030 + }, + { + "epoch": 0.24756073046651514, + "grad_norm": 0.14971262216567993, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 64040 + }, + { + "epoch": 0.24759938766989842, + "grad_norm": 0.10490719974040985, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 64050 + }, + { + "epoch": 0.2476380448732817, + "grad_norm": 0.1055225357413292, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 64060 + }, + { + "epoch": 0.24767670207666498, + "grad_norm": 0.10268551856279373, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 64070 + }, + { + "epoch": 0.24771535928004823, + "grad_norm": 0.12704938650131226, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 64080 + }, + { + "epoch": 0.2477540164834315, + "grad_norm": 0.11193748563528061, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 64090 + }, + { + "epoch": 0.2477926736868148, + "grad_norm": 0.110689178109169, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 64100 + }, + { + "epoch": 0.24783133089019807, + "grad_norm": 0.12341715395450592, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 64110 + }, + { + "epoch": 0.24786998809358135, + "grad_norm": 0.11892364919185638, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 64120 + }, + { + "epoch": 0.24790864529696463, + "grad_norm": 0.10833492130041122, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 64130 + }, + { + "epoch": 0.2479473025003479, + "grad_norm": 0.12388889491558075, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 64140 + }, + { + "epoch": 0.2479859597037312, + "grad_norm": 0.10363738983869553, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 64150 + }, + { + "epoch": 0.24802461690711447, + "grad_norm": 0.1311815083026886, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 64160 + }, + { + "epoch": 0.24806327411049775, + "grad_norm": 0.09794235974550247, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 64170 + }, + { + "epoch": 0.24810193131388103, + "grad_norm": 0.15324456989765167, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 64180 + }, + { + "epoch": 0.2481405885172643, + "grad_norm": 0.11453873664140701, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 64190 + }, + { + "epoch": 0.2481792457206476, + "grad_norm": 0.0997665673494339, + "learning_rate": 0.002, + "loss": 2.35, + "step": 64200 + }, + { + "epoch": 0.24821790292403087, + "grad_norm": 0.10649903118610382, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 64210 + }, + { + "epoch": 0.24825656012741415, + "grad_norm": 0.10414480417966843, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 64220 + }, + { + "epoch": 0.24829521733079743, + "grad_norm": 0.13123467564582825, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 64230 + }, + { + "epoch": 0.2483338745341807, + "grad_norm": 0.11330459266901016, + "learning_rate": 0.002, + "loss": 2.342, + "step": 64240 + }, + { + "epoch": 0.248372531737564, + "grad_norm": 0.09887401759624481, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 64250 + }, + { + "epoch": 0.24841118894094727, + "grad_norm": 0.09318116307258606, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 64260 + }, + { + "epoch": 0.24844984614433052, + "grad_norm": 0.13932709395885468, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 64270 + }, + { + "epoch": 0.2484885033477138, + "grad_norm": 0.12776115536689758, + "learning_rate": 0.002, + "loss": 2.342, + "step": 64280 + }, + { + "epoch": 0.24852716055109708, + "grad_norm": 0.10516197234392166, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 64290 + }, + { + "epoch": 0.24856581775448036, + "grad_norm": 0.10422108322381973, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 64300 + }, + { + "epoch": 0.24860447495786364, + "grad_norm": 0.1057262048125267, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 64310 + }, + { + "epoch": 0.24864313216124692, + "grad_norm": 0.12558288872241974, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 64320 + }, + { + "epoch": 0.2486817893646302, + "grad_norm": 0.10270664840936661, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 64330 + }, + { + "epoch": 0.24872044656801348, + "grad_norm": 0.12882837653160095, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 64340 + }, + { + "epoch": 0.24875910377139676, + "grad_norm": 0.11752016097307205, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 64350 + }, + { + "epoch": 0.24879776097478004, + "grad_norm": 0.10369332134723663, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 64360 + }, + { + "epoch": 0.24883641817816332, + "grad_norm": 0.09841576963663101, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 64370 + }, + { + "epoch": 0.2488750753815466, + "grad_norm": 0.11956728249788284, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 64380 + }, + { + "epoch": 0.24891373258492988, + "grad_norm": 0.11087989062070847, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 64390 + }, + { + "epoch": 0.24895238978831316, + "grad_norm": 0.1197686493396759, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 64400 + }, + { + "epoch": 0.24899104699169644, + "grad_norm": 0.10942673683166504, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 64410 + }, + { + "epoch": 0.24902970419507972, + "grad_norm": 0.1125696524977684, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 64420 + }, + { + "epoch": 0.249068361398463, + "grad_norm": 0.12106586247682571, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 64430 + }, + { + "epoch": 0.24910701860184628, + "grad_norm": 0.09818416088819504, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 64440 + }, + { + "epoch": 0.24914567580522953, + "grad_norm": 0.10371017456054688, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 64450 + }, + { + "epoch": 0.2491843330086128, + "grad_norm": 0.10142534971237183, + "learning_rate": 0.002, + "loss": 2.371, + "step": 64460 + }, + { + "epoch": 0.2492229902119961, + "grad_norm": 0.15823061764240265, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 64470 + }, + { + "epoch": 0.24926164741537937, + "grad_norm": 0.10784637182950974, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 64480 + }, + { + "epoch": 0.24930030461876265, + "grad_norm": 0.10801376402378082, + "learning_rate": 0.002, + "loss": 2.354, + "step": 64490 + }, + { + "epoch": 0.24933896182214593, + "grad_norm": 0.09740674495697021, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 64500 + }, + { + "epoch": 0.2493776190255292, + "grad_norm": 0.11784724146127701, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 64510 + }, + { + "epoch": 0.2494162762289125, + "grad_norm": 0.11282724142074585, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 64520 + }, + { + "epoch": 0.24945493343229577, + "grad_norm": 0.11594454199075699, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 64530 + }, + { + "epoch": 0.24949359063567905, + "grad_norm": 0.09549959003925323, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 64540 + }, + { + "epoch": 0.24953224783906233, + "grad_norm": 0.11274974793195724, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 64550 + }, + { + "epoch": 0.2495709050424456, + "grad_norm": 0.10660584270954132, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 64560 + }, + { + "epoch": 0.2496095622458289, + "grad_norm": 0.09843463450670242, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 64570 + }, + { + "epoch": 0.24964821944921217, + "grad_norm": 0.11696872115135193, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 64580 + }, + { + "epoch": 0.24968687665259545, + "grad_norm": 0.12023330479860306, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 64590 + }, + { + "epoch": 0.24972553385597873, + "grad_norm": 0.10271915048360825, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 64600 + }, + { + "epoch": 0.249764191059362, + "grad_norm": 0.11523495614528656, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 64610 + }, + { + "epoch": 0.2498028482627453, + "grad_norm": 0.1043362021446228, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 64620 + }, + { + "epoch": 0.24984150546612857, + "grad_norm": 0.1054345890879631, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 64630 + }, + { + "epoch": 0.24988016266951182, + "grad_norm": 0.13760827481746674, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 64640 + }, + { + "epoch": 0.2499188198728951, + "grad_norm": 0.11963876336812973, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 64650 + }, + { + "epoch": 0.24995747707627838, + "grad_norm": 0.09870851784944534, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 64660 + }, + { + "epoch": 0.24999613427966166, + "grad_norm": 0.13970845937728882, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 64670 + }, + { + "epoch": 0.25003479148304497, + "grad_norm": 0.15180478990077972, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 64680 + }, + { + "epoch": 0.2500734486864282, + "grad_norm": 0.1098647192120552, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 64690 + }, + { + "epoch": 0.25011210588981153, + "grad_norm": 0.11667878180742264, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 64700 + }, + { + "epoch": 0.2501507630931948, + "grad_norm": 0.15805795788764954, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 64710 + }, + { + "epoch": 0.2501894202965781, + "grad_norm": 0.14936715364456177, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 64720 + }, + { + "epoch": 0.25022807749996134, + "grad_norm": 0.09799555689096451, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 64730 + }, + { + "epoch": 0.2502667347033446, + "grad_norm": 0.09978866577148438, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 64740 + }, + { + "epoch": 0.2503053919067279, + "grad_norm": 0.10472051799297333, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 64750 + }, + { + "epoch": 0.25034404911011116, + "grad_norm": 0.10219122469425201, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 64760 + }, + { + "epoch": 0.25038270631349446, + "grad_norm": 0.0982801541686058, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 64770 + }, + { + "epoch": 0.2504213635168777, + "grad_norm": 0.09922726452350616, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 64780 + }, + { + "epoch": 0.250460020720261, + "grad_norm": 0.11779270321130753, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 64790 + }, + { + "epoch": 0.2504986779236443, + "grad_norm": 0.12202689051628113, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 64800 + }, + { + "epoch": 0.2505373351270276, + "grad_norm": 0.1136423796415329, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 64810 + }, + { + "epoch": 0.25057599233041083, + "grad_norm": 0.10997912287712097, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 64820 + }, + { + "epoch": 0.25061464953379414, + "grad_norm": 0.10405836254358292, + "learning_rate": 0.002, + "loss": 2.339, + "step": 64830 + }, + { + "epoch": 0.2506533067371774, + "grad_norm": 0.10198356956243515, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 64840 + }, + { + "epoch": 0.2506919639405607, + "grad_norm": 0.1193748265504837, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 64850 + }, + { + "epoch": 0.25073062114394395, + "grad_norm": 0.0984581783413887, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 64860 + }, + { + "epoch": 0.25076927834732726, + "grad_norm": 0.12913978099822998, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 64870 + }, + { + "epoch": 0.2508079355507105, + "grad_norm": 0.10153471678495407, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 64880 + }, + { + "epoch": 0.2508465927540938, + "grad_norm": 0.09593507647514343, + "learning_rate": 0.002, + "loss": 2.3866, + "step": 64890 + }, + { + "epoch": 0.2508852499574771, + "grad_norm": 0.1064973697066307, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 64900 + }, + { + "epoch": 0.2509239071608604, + "grad_norm": 0.10728035122156143, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 64910 + }, + { + "epoch": 0.25096256436424363, + "grad_norm": 0.13115960359573364, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 64920 + }, + { + "epoch": 0.2510012215676269, + "grad_norm": 0.11741824448108673, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 64930 + }, + { + "epoch": 0.2510398787710102, + "grad_norm": 0.10732880979776382, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 64940 + }, + { + "epoch": 0.25107853597439345, + "grad_norm": 0.11535921692848206, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 64950 + }, + { + "epoch": 0.25111719317777675, + "grad_norm": 0.11269236356019974, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 64960 + }, + { + "epoch": 0.25115585038116, + "grad_norm": 0.10160496830940247, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 64970 + }, + { + "epoch": 0.2511945075845433, + "grad_norm": 0.11177372187376022, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 64980 + }, + { + "epoch": 0.25123316478792657, + "grad_norm": 0.12382085621356964, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 64990 + }, + { + "epoch": 0.2512718219913099, + "grad_norm": 0.12273989617824554, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 65000 + }, + { + "epoch": 0.2513104791946931, + "grad_norm": 0.10529609769582748, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 65010 + }, + { + "epoch": 0.25134913639807643, + "grad_norm": 0.12380048632621765, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 65020 + }, + { + "epoch": 0.2513877936014597, + "grad_norm": 0.11554655432701111, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 65030 + }, + { + "epoch": 0.251426450804843, + "grad_norm": 0.12273314595222473, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 65040 + }, + { + "epoch": 0.25146510800822625, + "grad_norm": 0.12644176185131073, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 65050 + }, + { + "epoch": 0.25150376521160955, + "grad_norm": 0.10715832561254501, + "learning_rate": 0.002, + "loss": 2.353, + "step": 65060 + }, + { + "epoch": 0.2515424224149928, + "grad_norm": 0.11817505210638046, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 65070 + }, + { + "epoch": 0.2515810796183761, + "grad_norm": 0.1156371533870697, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 65080 + }, + { + "epoch": 0.25161973682175937, + "grad_norm": 0.10240025073289871, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 65090 + }, + { + "epoch": 0.2516583940251426, + "grad_norm": 0.1039934903383255, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 65100 + }, + { + "epoch": 0.2516970512285259, + "grad_norm": 0.12380948662757874, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 65110 + }, + { + "epoch": 0.2517357084319092, + "grad_norm": 0.11189432442188263, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 65120 + }, + { + "epoch": 0.2517743656352925, + "grad_norm": 0.0978197380900383, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 65130 + }, + { + "epoch": 0.25181302283867574, + "grad_norm": 0.11754926294088364, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 65140 + }, + { + "epoch": 0.25185168004205905, + "grad_norm": 0.10339406132698059, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 65150 + }, + { + "epoch": 0.2518903372454423, + "grad_norm": 0.11164677143096924, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 65160 + }, + { + "epoch": 0.2519289944488256, + "grad_norm": 0.09547177702188492, + "learning_rate": 0.002, + "loss": 2.3805, + "step": 65170 + }, + { + "epoch": 0.25196765165220886, + "grad_norm": 0.11362284421920776, + "learning_rate": 0.002, + "loss": 2.3753, + "step": 65180 + }, + { + "epoch": 0.25200630885559216, + "grad_norm": 0.09665997326374054, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 65190 + }, + { + "epoch": 0.2520449660589754, + "grad_norm": 0.12740235030651093, + "learning_rate": 0.002, + "loss": 2.339, + "step": 65200 + }, + { + "epoch": 0.2520836232623587, + "grad_norm": 0.10088548064231873, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 65210 + }, + { + "epoch": 0.252122280465742, + "grad_norm": 0.08670290559530258, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 65220 + }, + { + "epoch": 0.2521609376691253, + "grad_norm": 0.11755174398422241, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 65230 + }, + { + "epoch": 0.25219959487250854, + "grad_norm": 0.11353892087936401, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 65240 + }, + { + "epoch": 0.25223825207589184, + "grad_norm": 0.11044956743717194, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 65250 + }, + { + "epoch": 0.2522769092792751, + "grad_norm": 0.10184010118246078, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 65260 + }, + { + "epoch": 0.2523155664826584, + "grad_norm": 0.11825776845216751, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 65270 + }, + { + "epoch": 0.25235422368604166, + "grad_norm": 0.11049504578113556, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 65280 + }, + { + "epoch": 0.2523928808894249, + "grad_norm": 0.11343789845705032, + "learning_rate": 0.002, + "loss": 2.374, + "step": 65290 + }, + { + "epoch": 0.2524315380928082, + "grad_norm": 0.10674849152565002, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 65300 + }, + { + "epoch": 0.25247019529619147, + "grad_norm": 0.11158914119005203, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 65310 + }, + { + "epoch": 0.2525088524995748, + "grad_norm": 0.13002395629882812, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 65320 + }, + { + "epoch": 0.25254750970295803, + "grad_norm": 0.10284659266471863, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 65330 + }, + { + "epoch": 0.25258616690634134, + "grad_norm": 0.09975076466798782, + "learning_rate": 0.002, + "loss": 2.3719, + "step": 65340 + }, + { + "epoch": 0.2526248241097246, + "grad_norm": 0.0994315892457962, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 65350 + }, + { + "epoch": 0.2526634813131079, + "grad_norm": 0.13250277936458588, + "learning_rate": 0.002, + "loss": 2.359, + "step": 65360 + }, + { + "epoch": 0.25270213851649115, + "grad_norm": 0.10943359136581421, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 65370 + }, + { + "epoch": 0.25274079571987446, + "grad_norm": 0.09684669971466064, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 65380 + }, + { + "epoch": 0.2527794529232577, + "grad_norm": 0.10850505530834198, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 65390 + }, + { + "epoch": 0.252818110126641, + "grad_norm": 0.2953296899795532, + "learning_rate": 0.002, + "loss": 2.3732, + "step": 65400 + }, + { + "epoch": 0.25285676733002427, + "grad_norm": 0.12961512804031372, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 65410 + }, + { + "epoch": 0.2528954245334076, + "grad_norm": 0.12566010653972626, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 65420 + }, + { + "epoch": 0.25293408173679083, + "grad_norm": 0.13647234439849854, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 65430 + }, + { + "epoch": 0.25297273894017414, + "grad_norm": 0.12771177291870117, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 65440 + }, + { + "epoch": 0.2530113961435574, + "grad_norm": 0.10627250373363495, + "learning_rate": 0.002, + "loss": 2.3773, + "step": 65450 + }, + { + "epoch": 0.2530500533469407, + "grad_norm": 0.10476811975240707, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 65460 + }, + { + "epoch": 0.25308871055032395, + "grad_norm": 0.09948313981294632, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 65470 + }, + { + "epoch": 0.2531273677537072, + "grad_norm": 0.12087182700634003, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 65480 + }, + { + "epoch": 0.2531660249570905, + "grad_norm": 0.10390983521938324, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 65490 + }, + { + "epoch": 0.25320468216047376, + "grad_norm": 0.10002344846725464, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 65500 + }, + { + "epoch": 0.25324333936385707, + "grad_norm": 0.10570525377988815, + "learning_rate": 0.002, + "loss": 2.3783, + "step": 65510 + }, + { + "epoch": 0.2532819965672403, + "grad_norm": 0.1476055532693863, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 65520 + }, + { + "epoch": 0.2533206537706236, + "grad_norm": 0.10201095044612885, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 65530 + }, + { + "epoch": 0.2533593109740069, + "grad_norm": 0.13619953393936157, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 65540 + }, + { + "epoch": 0.2533979681773902, + "grad_norm": 0.1017579585313797, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 65550 + }, + { + "epoch": 0.25343662538077344, + "grad_norm": 0.1178319901227951, + "learning_rate": 0.002, + "loss": 2.356, + "step": 65560 + }, + { + "epoch": 0.25347528258415675, + "grad_norm": 0.11252082884311676, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 65570 + }, + { + "epoch": 0.25351393978754, + "grad_norm": 0.1279241144657135, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 65580 + }, + { + "epoch": 0.2535525969909233, + "grad_norm": 0.11474581062793732, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 65590 + }, + { + "epoch": 0.25359125419430656, + "grad_norm": 0.16519343852996826, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 65600 + }, + { + "epoch": 0.25362991139768987, + "grad_norm": 0.11091284453868866, + "learning_rate": 0.002, + "loss": 2.3778, + "step": 65610 + }, + { + "epoch": 0.2536685686010731, + "grad_norm": 0.11500389873981476, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 65620 + }, + { + "epoch": 0.2537072258044564, + "grad_norm": 0.10793469846248627, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 65630 + }, + { + "epoch": 0.2537458830078397, + "grad_norm": 0.1048169657588005, + "learning_rate": 0.002, + "loss": 2.352, + "step": 65640 + }, + { + "epoch": 0.253784540211223, + "grad_norm": 0.1076226532459259, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 65650 + }, + { + "epoch": 0.25382319741460624, + "grad_norm": 0.11597827076911926, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 65660 + }, + { + "epoch": 0.2538618546179895, + "grad_norm": 0.09700324386358261, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 65670 + }, + { + "epoch": 0.2539005118213728, + "grad_norm": 0.11959628015756607, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 65680 + }, + { + "epoch": 0.25393916902475605, + "grad_norm": 0.1079908162355423, + "learning_rate": 0.002, + "loss": 2.353, + "step": 65690 + }, + { + "epoch": 0.25397782622813936, + "grad_norm": 0.1234707236289978, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 65700 + }, + { + "epoch": 0.2540164834315226, + "grad_norm": 0.10080688446760178, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 65710 + }, + { + "epoch": 0.2540551406349059, + "grad_norm": 0.11054587364196777, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 65720 + }, + { + "epoch": 0.25409379783828917, + "grad_norm": 0.12429597973823547, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 65730 + }, + { + "epoch": 0.2541324550416725, + "grad_norm": 0.09979277104139328, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 65740 + }, + { + "epoch": 0.25417111224505573, + "grad_norm": 0.14310751855373383, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 65750 + }, + { + "epoch": 0.25420976944843904, + "grad_norm": 0.11847370117902756, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 65760 + }, + { + "epoch": 0.2542484266518223, + "grad_norm": 0.11517151445150375, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 65770 + }, + { + "epoch": 0.2542870838552056, + "grad_norm": 0.11966444551944733, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 65780 + }, + { + "epoch": 0.25432574105858885, + "grad_norm": 0.12495843321084976, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 65790 + }, + { + "epoch": 0.25436439826197216, + "grad_norm": 0.11208898574113846, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 65800 + }, + { + "epoch": 0.2544030554653554, + "grad_norm": 0.1017102375626564, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 65810 + }, + { + "epoch": 0.2544417126687387, + "grad_norm": 0.10564885288476944, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 65820 + }, + { + "epoch": 0.25448036987212197, + "grad_norm": 0.11047804355621338, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 65830 + }, + { + "epoch": 0.2545190270755052, + "grad_norm": 0.11530250310897827, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 65840 + }, + { + "epoch": 0.25455768427888853, + "grad_norm": 0.1151326596736908, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 65850 + }, + { + "epoch": 0.2545963414822718, + "grad_norm": 0.14109362661838531, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 65860 + }, + { + "epoch": 0.2546349986856551, + "grad_norm": 0.10470999777317047, + "learning_rate": 0.002, + "loss": 2.353, + "step": 65870 + }, + { + "epoch": 0.25467365588903834, + "grad_norm": 0.12141033262014389, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 65880 + }, + { + "epoch": 0.25471231309242165, + "grad_norm": 0.1011107936501503, + "learning_rate": 0.002, + "loss": 2.3792, + "step": 65890 + }, + { + "epoch": 0.2547509702958049, + "grad_norm": 0.11235532909631729, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 65900 + }, + { + "epoch": 0.2547896274991882, + "grad_norm": 0.10267923027276993, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 65910 + }, + { + "epoch": 0.25482828470257146, + "grad_norm": 0.10743974894285202, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 65920 + }, + { + "epoch": 0.25486694190595477, + "grad_norm": 0.11289151012897491, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 65930 + }, + { + "epoch": 0.254905599109338, + "grad_norm": 0.11451549828052521, + "learning_rate": 0.002, + "loss": 2.363, + "step": 65940 + }, + { + "epoch": 0.25494425631272133, + "grad_norm": 0.10237868130207062, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 65950 + }, + { + "epoch": 0.2549829135161046, + "grad_norm": 0.11435071378946304, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 65960 + }, + { + "epoch": 0.2550215707194879, + "grad_norm": 0.12266162037849426, + "learning_rate": 0.002, + "loss": 2.352, + "step": 65970 + }, + { + "epoch": 0.25506022792287114, + "grad_norm": 0.10688599199056625, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 65980 + }, + { + "epoch": 0.25509888512625445, + "grad_norm": 0.10259024798870087, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 65990 + }, + { + "epoch": 0.2551375423296377, + "grad_norm": 0.11779743432998657, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 66000 + }, + { + "epoch": 0.255176199533021, + "grad_norm": 0.12171625345945358, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 66010 + }, + { + "epoch": 0.25521485673640426, + "grad_norm": 0.11175300180912018, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 66020 + }, + { + "epoch": 0.2552535139397875, + "grad_norm": 0.12124433368444443, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 66030 + }, + { + "epoch": 0.2552921711431708, + "grad_norm": 0.11927662044763565, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 66040 + }, + { + "epoch": 0.2553308283465541, + "grad_norm": 0.11759792268276215, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 66050 + }, + { + "epoch": 0.2553694855499374, + "grad_norm": 0.09895379096269608, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 66060 + }, + { + "epoch": 0.25540814275332063, + "grad_norm": 0.1130814403295517, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 66070 + }, + { + "epoch": 0.25544679995670394, + "grad_norm": 0.10689357668161392, + "learning_rate": 0.002, + "loss": 2.345, + "step": 66080 + }, + { + "epoch": 0.2554854571600872, + "grad_norm": 0.11816614866256714, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 66090 + }, + { + "epoch": 0.2555241143634705, + "grad_norm": 0.11300661414861679, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 66100 + }, + { + "epoch": 0.25556277156685375, + "grad_norm": 0.11420150101184845, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 66110 + }, + { + "epoch": 0.25560142877023706, + "grad_norm": 0.10286738723516464, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 66120 + }, + { + "epoch": 0.2556400859736203, + "grad_norm": 0.11612996459007263, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 66130 + }, + { + "epoch": 0.2556787431770036, + "grad_norm": 0.13170316815376282, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 66140 + }, + { + "epoch": 0.2557174003803869, + "grad_norm": 0.10963205248117447, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 66150 + }, + { + "epoch": 0.2557560575837702, + "grad_norm": 0.1187463030219078, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 66160 + }, + { + "epoch": 0.25579471478715343, + "grad_norm": 0.114934541285038, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 66170 + }, + { + "epoch": 0.25583337199053674, + "grad_norm": 0.10894732922315598, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 66180 + }, + { + "epoch": 0.25587202919392, + "grad_norm": 0.11884298920631409, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 66190 + }, + { + "epoch": 0.2559106863973033, + "grad_norm": 0.1013733372092247, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 66200 + }, + { + "epoch": 0.25594934360068655, + "grad_norm": 0.11026433855295181, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 66210 + }, + { + "epoch": 0.2559880008040698, + "grad_norm": 0.09665088355541229, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 66220 + }, + { + "epoch": 0.2560266580074531, + "grad_norm": 0.13314181566238403, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 66230 + }, + { + "epoch": 0.25606531521083636, + "grad_norm": 0.10690948367118835, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 66240 + }, + { + "epoch": 0.25610397241421967, + "grad_norm": 0.11940506845712662, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 66250 + }, + { + "epoch": 0.2561426296176029, + "grad_norm": 0.13842318952083588, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 66260 + }, + { + "epoch": 0.25618128682098623, + "grad_norm": 0.11469965428113937, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 66270 + }, + { + "epoch": 0.2562199440243695, + "grad_norm": 0.09851006418466568, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 66280 + }, + { + "epoch": 0.2562586012277528, + "grad_norm": 0.11406917124986649, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 66290 + }, + { + "epoch": 0.25629725843113604, + "grad_norm": 0.11309903860092163, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 66300 + }, + { + "epoch": 0.25633591563451935, + "grad_norm": 0.11170071363449097, + "learning_rate": 0.002, + "loss": 2.3813, + "step": 66310 + }, + { + "epoch": 0.2563745728379026, + "grad_norm": 0.10675996541976929, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 66320 + }, + { + "epoch": 0.2564132300412859, + "grad_norm": 0.1084975078701973, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 66330 + }, + { + "epoch": 0.25645188724466916, + "grad_norm": 0.12220773845911026, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 66340 + }, + { + "epoch": 0.25649054444805247, + "grad_norm": 0.12375793606042862, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 66350 + }, + { + "epoch": 0.2565292016514357, + "grad_norm": 0.12166187912225723, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 66360 + }, + { + "epoch": 0.25656785885481903, + "grad_norm": 0.10858607292175293, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 66370 + }, + { + "epoch": 0.2566065160582023, + "grad_norm": 0.10314149409532547, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 66380 + }, + { + "epoch": 0.2566451732615856, + "grad_norm": 0.13672682642936707, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 66390 + }, + { + "epoch": 0.25668383046496884, + "grad_norm": 0.11390886455774307, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 66400 + }, + { + "epoch": 0.2567224876683521, + "grad_norm": 0.1029527485370636, + "learning_rate": 0.002, + "loss": 2.362, + "step": 66410 + }, + { + "epoch": 0.2567611448717354, + "grad_norm": 0.1255478709936142, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 66420 + }, + { + "epoch": 0.25679980207511865, + "grad_norm": 0.10678042471408844, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 66430 + }, + { + "epoch": 0.25683845927850196, + "grad_norm": 0.11441392451524734, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 66440 + }, + { + "epoch": 0.2568771164818852, + "grad_norm": 0.10975232720375061, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 66450 + }, + { + "epoch": 0.2569157736852685, + "grad_norm": 0.12330281734466553, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 66460 + }, + { + "epoch": 0.2569544308886518, + "grad_norm": 0.1308504343032837, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 66470 + }, + { + "epoch": 0.2569930880920351, + "grad_norm": 0.11605305969715118, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 66480 + }, + { + "epoch": 0.25703174529541833, + "grad_norm": 0.10903593897819519, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 66490 + }, + { + "epoch": 0.25707040249880164, + "grad_norm": 0.12700045108795166, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 66500 + }, + { + "epoch": 0.2571090597021849, + "grad_norm": 0.09870582073926926, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 66510 + }, + { + "epoch": 0.2571477169055682, + "grad_norm": 0.09971556067466736, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 66520 + }, + { + "epoch": 0.25718637410895145, + "grad_norm": 0.10839555412530899, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 66530 + }, + { + "epoch": 0.25722503131233476, + "grad_norm": 0.11747601628303528, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 66540 + }, + { + "epoch": 0.257263688515718, + "grad_norm": 0.11553992331027985, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 66550 + }, + { + "epoch": 0.2573023457191013, + "grad_norm": 0.12473028898239136, + "learning_rate": 0.002, + "loss": 2.367, + "step": 66560 + }, + { + "epoch": 0.2573410029224846, + "grad_norm": 0.10672403872013092, + "learning_rate": 0.002, + "loss": 2.364, + "step": 66570 + }, + { + "epoch": 0.2573796601258679, + "grad_norm": 0.1254693865776062, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 66580 + }, + { + "epoch": 0.25741831732925113, + "grad_norm": 0.12008104473352432, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 66590 + }, + { + "epoch": 0.2574569745326344, + "grad_norm": 0.11767129600048065, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 66600 + }, + { + "epoch": 0.2574956317360177, + "grad_norm": 0.11705000698566437, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 66610 + }, + { + "epoch": 0.25753428893940095, + "grad_norm": 0.11520517617464066, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 66620 + }, + { + "epoch": 0.25757294614278425, + "grad_norm": 0.10603370517492294, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 66630 + }, + { + "epoch": 0.2576116033461675, + "grad_norm": 0.14179281890392303, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 66640 + }, + { + "epoch": 0.2576502605495508, + "grad_norm": 0.10854744166135788, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 66650 + }, + { + "epoch": 0.25768891775293407, + "grad_norm": 0.10364288836717606, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 66660 + }, + { + "epoch": 0.2577275749563174, + "grad_norm": 0.11076736450195312, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 66670 + }, + { + "epoch": 0.2577662321597006, + "grad_norm": 0.12479634582996368, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 66680 + }, + { + "epoch": 0.25780488936308393, + "grad_norm": 0.11340264976024628, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 66690 + }, + { + "epoch": 0.2578435465664672, + "grad_norm": 0.13055060803890228, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 66700 + }, + { + "epoch": 0.2578822037698505, + "grad_norm": 0.09846454113721848, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 66710 + }, + { + "epoch": 0.25792086097323375, + "grad_norm": 0.1017606109380722, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 66720 + }, + { + "epoch": 0.25795951817661705, + "grad_norm": 0.11539702862501144, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 66730 + }, + { + "epoch": 0.2579981753800003, + "grad_norm": 0.1274324655532837, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 66740 + }, + { + "epoch": 0.2580368325833836, + "grad_norm": 0.11746063828468323, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 66750 + }, + { + "epoch": 0.25807548978676687, + "grad_norm": 0.11583682149648666, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 66760 + }, + { + "epoch": 0.2581141469901501, + "grad_norm": 0.11930841952562332, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 66770 + }, + { + "epoch": 0.2581528041935334, + "grad_norm": 0.10679621249437332, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 66780 + }, + { + "epoch": 0.2581914613969167, + "grad_norm": 0.11766018718481064, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 66790 + }, + { + "epoch": 0.2582301186003, + "grad_norm": 0.1263052523136139, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 66800 + }, + { + "epoch": 0.25826877580368324, + "grad_norm": 0.10427229851484299, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 66810 + }, + { + "epoch": 0.25830743300706654, + "grad_norm": 0.11562903225421906, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 66820 + }, + { + "epoch": 0.2583460902104498, + "grad_norm": 0.11342374235391617, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 66830 + }, + { + "epoch": 0.2583847474138331, + "grad_norm": 0.11070325970649719, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 66840 + }, + { + "epoch": 0.25842340461721636, + "grad_norm": 0.1250879019498825, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 66850 + }, + { + "epoch": 0.25846206182059966, + "grad_norm": 0.10222569853067398, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 66860 + }, + { + "epoch": 0.2585007190239829, + "grad_norm": 0.11579307913780212, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 66870 + }, + { + "epoch": 0.2585393762273662, + "grad_norm": 0.12290211766958237, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 66880 + }, + { + "epoch": 0.2585780334307495, + "grad_norm": 0.10187076777219772, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 66890 + }, + { + "epoch": 0.2586166906341328, + "grad_norm": 0.10334745049476624, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 66900 + }, + { + "epoch": 0.25865534783751604, + "grad_norm": 0.1190209835767746, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 66910 + }, + { + "epoch": 0.25869400504089934, + "grad_norm": 0.36490097641944885, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 66920 + }, + { + "epoch": 0.2587326622442826, + "grad_norm": 0.13372598588466644, + "learning_rate": 0.002, + "loss": 2.347, + "step": 66930 + }, + { + "epoch": 0.2587713194476659, + "grad_norm": 0.10478980094194412, + "learning_rate": 0.002, + "loss": 2.371, + "step": 66940 + }, + { + "epoch": 0.25880997665104916, + "grad_norm": 0.10454052686691284, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 66950 + }, + { + "epoch": 0.2588486338544324, + "grad_norm": 0.12227415293455124, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 66960 + }, + { + "epoch": 0.2588872910578157, + "grad_norm": 0.1253330558538437, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 66970 + }, + { + "epoch": 0.25892594826119897, + "grad_norm": 0.10769782960414886, + "learning_rate": 0.002, + "loss": 2.35, + "step": 66980 + }, + { + "epoch": 0.2589646054645823, + "grad_norm": 0.11193307489156723, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 66990 + }, + { + "epoch": 0.25900326266796553, + "grad_norm": 0.12134253233671188, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 67000 + }, + { + "epoch": 0.25904191987134884, + "grad_norm": 0.11123788356781006, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 67010 + }, + { + "epoch": 0.2590805770747321, + "grad_norm": 0.11672520637512207, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 67020 + }, + { + "epoch": 0.2591192342781154, + "grad_norm": 0.12144597619771957, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 67030 + }, + { + "epoch": 0.25915789148149865, + "grad_norm": 0.12573914229869843, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 67040 + }, + { + "epoch": 0.25919654868488196, + "grad_norm": 0.1065409928560257, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 67050 + }, + { + "epoch": 0.2592352058882652, + "grad_norm": 0.23819462954998016, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 67060 + }, + { + "epoch": 0.2592738630916485, + "grad_norm": 0.11490265280008316, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 67070 + }, + { + "epoch": 0.25931252029503177, + "grad_norm": 0.11505532264709473, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 67080 + }, + { + "epoch": 0.2593511774984151, + "grad_norm": 0.11441343277692795, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 67090 + }, + { + "epoch": 0.2593898347017983, + "grad_norm": 0.09996909648180008, + "learning_rate": 0.002, + "loss": 2.346, + "step": 67100 + }, + { + "epoch": 0.25942849190518164, + "grad_norm": 0.1097947359085083, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 67110 + }, + { + "epoch": 0.2594671491085649, + "grad_norm": 0.12589269876480103, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 67120 + }, + { + "epoch": 0.2595058063119482, + "grad_norm": 0.09830565750598907, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 67130 + }, + { + "epoch": 0.25954446351533145, + "grad_norm": 0.10417470335960388, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 67140 + }, + { + "epoch": 0.2595831207187147, + "grad_norm": 0.11758775264024734, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 67150 + }, + { + "epoch": 0.259621777922098, + "grad_norm": 0.13093090057373047, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 67160 + }, + { + "epoch": 0.25966043512548126, + "grad_norm": 0.18829749524593353, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 67170 + }, + { + "epoch": 0.25969909232886457, + "grad_norm": 0.12005820125341415, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 67180 + }, + { + "epoch": 0.2597377495322478, + "grad_norm": 0.10506993532180786, + "learning_rate": 0.002, + "loss": 2.368, + "step": 67190 + }, + { + "epoch": 0.2597764067356311, + "grad_norm": 0.09339757263660431, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 67200 + }, + { + "epoch": 0.2598150639390144, + "grad_norm": 0.11311523616313934, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 67210 + }, + { + "epoch": 0.2598537211423977, + "grad_norm": 0.13281090557575226, + "learning_rate": 0.002, + "loss": 2.349, + "step": 67220 + }, + { + "epoch": 0.25989237834578094, + "grad_norm": 0.11049260199069977, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 67230 + }, + { + "epoch": 0.25993103554916425, + "grad_norm": 0.10846215486526489, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 67240 + }, + { + "epoch": 0.2599696927525475, + "grad_norm": 0.11961953341960907, + "learning_rate": 0.002, + "loss": 2.356, + "step": 67250 + }, + { + "epoch": 0.2600083499559308, + "grad_norm": 0.14370691776275635, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 67260 + }, + { + "epoch": 0.26004700715931406, + "grad_norm": 0.11629011482000351, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 67270 + }, + { + "epoch": 0.26008566436269737, + "grad_norm": 0.09897922724485397, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 67280 + }, + { + "epoch": 0.2601243215660806, + "grad_norm": 0.13134950399398804, + "learning_rate": 0.002, + "loss": 2.362, + "step": 67290 + }, + { + "epoch": 0.2601629787694639, + "grad_norm": 0.155740886926651, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 67300 + }, + { + "epoch": 0.2602016359728472, + "grad_norm": 0.11856327950954437, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 67310 + }, + { + "epoch": 0.2602402931762305, + "grad_norm": 0.10641349107027054, + "learning_rate": 0.002, + "loss": 2.356, + "step": 67320 + }, + { + "epoch": 0.26027895037961374, + "grad_norm": 0.11414900422096252, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 67330 + }, + { + "epoch": 0.260317607582997, + "grad_norm": 0.1279212087392807, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 67340 + }, + { + "epoch": 0.2603562647863803, + "grad_norm": 0.09392930567264557, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 67350 + }, + { + "epoch": 0.26039492198976355, + "grad_norm": 0.1068810224533081, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 67360 + }, + { + "epoch": 0.26043357919314686, + "grad_norm": 0.10254529863595963, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 67370 + }, + { + "epoch": 0.2604722363965301, + "grad_norm": 0.10807019472122192, + "learning_rate": 0.002, + "loss": 2.351, + "step": 67380 + }, + { + "epoch": 0.2605108935999134, + "grad_norm": 0.11821964383125305, + "learning_rate": 0.002, + "loss": 2.357, + "step": 67390 + }, + { + "epoch": 0.26054955080329667, + "grad_norm": 0.11181916296482086, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 67400 + }, + { + "epoch": 0.26058820800668, + "grad_norm": 0.11617095023393631, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 67410 + }, + { + "epoch": 0.26062686521006323, + "grad_norm": 0.10184059292078018, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 67420 + }, + { + "epoch": 0.26066552241344654, + "grad_norm": 0.11712180823087692, + "learning_rate": 0.002, + "loss": 2.357, + "step": 67430 + }, + { + "epoch": 0.2607041796168298, + "grad_norm": 0.10022317618131638, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 67440 + }, + { + "epoch": 0.2607428368202131, + "grad_norm": 0.10569388419389725, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 67450 + }, + { + "epoch": 0.26078149402359635, + "grad_norm": 0.10248992592096329, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 67460 + }, + { + "epoch": 0.26082015122697966, + "grad_norm": 0.1069984957575798, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 67470 + }, + { + "epoch": 0.2608588084303629, + "grad_norm": 0.1299649029970169, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 67480 + }, + { + "epoch": 0.2608974656337462, + "grad_norm": 0.37685492634773254, + "learning_rate": 0.002, + "loss": 2.386, + "step": 67490 + }, + { + "epoch": 0.26093612283712947, + "grad_norm": 0.3009447157382965, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 67500 + }, + { + "epoch": 0.2609747800405127, + "grad_norm": 0.12166126817464828, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 67510 + }, + { + "epoch": 0.26101343724389603, + "grad_norm": 0.1006769984960556, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 67520 + }, + { + "epoch": 0.2610520944472793, + "grad_norm": 0.11278746277093887, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 67530 + }, + { + "epoch": 0.2610907516506626, + "grad_norm": 0.11016260087490082, + "learning_rate": 0.002, + "loss": 2.355, + "step": 67540 + }, + { + "epoch": 0.26112940885404584, + "grad_norm": 0.1064244732260704, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 67550 + }, + { + "epoch": 0.26116806605742915, + "grad_norm": 0.11934314668178558, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 67560 + }, + { + "epoch": 0.2612067232608124, + "grad_norm": 0.11332755535840988, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 67570 + }, + { + "epoch": 0.2612453804641957, + "grad_norm": 0.12420445680618286, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 67580 + }, + { + "epoch": 0.26128403766757896, + "grad_norm": 0.11511637270450592, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 67590 + }, + { + "epoch": 0.26132269487096227, + "grad_norm": 0.11553613096475601, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 67600 + }, + { + "epoch": 0.2613613520743455, + "grad_norm": 0.1271434724330902, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 67610 + }, + { + "epoch": 0.26140000927772883, + "grad_norm": 0.11405181139707565, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 67620 + }, + { + "epoch": 0.2614386664811121, + "grad_norm": 0.11136946082115173, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 67630 + }, + { + "epoch": 0.2614773236844954, + "grad_norm": 0.1222151666879654, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 67640 + }, + { + "epoch": 0.26151598088787864, + "grad_norm": 0.11218507587909698, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 67650 + }, + { + "epoch": 0.26155463809126195, + "grad_norm": 0.10514702647924423, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 67660 + }, + { + "epoch": 0.2615932952946452, + "grad_norm": 0.11863667517900467, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 67670 + }, + { + "epoch": 0.2616319524980285, + "grad_norm": 0.10940532386302948, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 67680 + }, + { + "epoch": 0.26167060970141176, + "grad_norm": 0.11782362312078476, + "learning_rate": 0.002, + "loss": 2.35, + "step": 67690 + }, + { + "epoch": 0.261709266904795, + "grad_norm": 0.11867765337228775, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 67700 + }, + { + "epoch": 0.2617479241081783, + "grad_norm": 0.10951172560453415, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 67710 + }, + { + "epoch": 0.2617865813115616, + "grad_norm": 0.11456424742937088, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 67720 + }, + { + "epoch": 0.2618252385149449, + "grad_norm": 0.11477063596248627, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 67730 + }, + { + "epoch": 0.26186389571832813, + "grad_norm": 0.11264248192310333, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 67740 + }, + { + "epoch": 0.26190255292171144, + "grad_norm": 0.10030915588140488, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 67750 + }, + { + "epoch": 0.2619412101250947, + "grad_norm": 0.10117532312870026, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 67760 + }, + { + "epoch": 0.261979867328478, + "grad_norm": 0.11972854286432266, + "learning_rate": 0.002, + "loss": 2.348, + "step": 67770 + }, + { + "epoch": 0.26201852453186125, + "grad_norm": 0.11656410247087479, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 67780 + }, + { + "epoch": 0.26205718173524456, + "grad_norm": 0.12312465161085129, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 67790 + }, + { + "epoch": 0.2620958389386278, + "grad_norm": 0.11731352657079697, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 67800 + }, + { + "epoch": 0.2621344961420111, + "grad_norm": 0.1069423109292984, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 67810 + }, + { + "epoch": 0.26217315334539437, + "grad_norm": 0.09852375090122223, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 67820 + }, + { + "epoch": 0.2622118105487777, + "grad_norm": 0.1118987649679184, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 67830 + }, + { + "epoch": 0.26225046775216093, + "grad_norm": 0.12264939397573471, + "learning_rate": 0.002, + "loss": 2.361, + "step": 67840 + }, + { + "epoch": 0.26228912495554424, + "grad_norm": 0.10814981162548065, + "learning_rate": 0.002, + "loss": 2.345, + "step": 67850 + }, + { + "epoch": 0.2623277821589275, + "grad_norm": 0.10167591273784637, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 67860 + }, + { + "epoch": 0.2623664393623108, + "grad_norm": 0.10804091393947601, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 67870 + }, + { + "epoch": 0.26240509656569405, + "grad_norm": 0.14333422482013702, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 67880 + }, + { + "epoch": 0.2624437537690773, + "grad_norm": 0.13869328796863556, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 67890 + }, + { + "epoch": 0.2624824109724606, + "grad_norm": 0.10745614022016525, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 67900 + }, + { + "epoch": 0.26252106817584386, + "grad_norm": 0.10607896000146866, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 67910 + }, + { + "epoch": 0.26255972537922717, + "grad_norm": 0.10167323052883148, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 67920 + }, + { + "epoch": 0.2625983825826104, + "grad_norm": 0.09899301081895828, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 67930 + }, + { + "epoch": 0.26263703978599373, + "grad_norm": 0.11459238082170486, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 67940 + }, + { + "epoch": 0.262675696989377, + "grad_norm": 0.11651584506034851, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 67950 + }, + { + "epoch": 0.2627143541927603, + "grad_norm": 0.10737515985965729, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 67960 + }, + { + "epoch": 0.26275301139614354, + "grad_norm": 0.1321590393781662, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 67970 + }, + { + "epoch": 0.26279166859952685, + "grad_norm": 0.1289292573928833, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 67980 + }, + { + "epoch": 0.2628303258029101, + "grad_norm": 0.11426378786563873, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 67990 + }, + { + "epoch": 0.2628689830062934, + "grad_norm": 0.120346799492836, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 68000 + }, + { + "epoch": 0.26290764020967666, + "grad_norm": 0.12574881315231323, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 68010 + }, + { + "epoch": 0.26294629741305997, + "grad_norm": 0.09758058190345764, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 68020 + }, + { + "epoch": 0.2629849546164432, + "grad_norm": 0.10797390341758728, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 68030 + }, + { + "epoch": 0.26302361181982653, + "grad_norm": 0.09947437047958374, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 68040 + }, + { + "epoch": 0.2630622690232098, + "grad_norm": 0.10770823061466217, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 68050 + }, + { + "epoch": 0.2631009262265931, + "grad_norm": 0.10565075278282166, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 68060 + }, + { + "epoch": 0.26313958342997634, + "grad_norm": 0.11738776415586472, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 68070 + }, + { + "epoch": 0.2631782406333596, + "grad_norm": 0.13340911269187927, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 68080 + }, + { + "epoch": 0.2632168978367429, + "grad_norm": 0.10474201291799545, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 68090 + }, + { + "epoch": 0.26325555504012615, + "grad_norm": 0.14180724322795868, + "learning_rate": 0.002, + "loss": 2.352, + "step": 68100 + }, + { + "epoch": 0.26329421224350946, + "grad_norm": 0.12632973492145538, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 68110 + }, + { + "epoch": 0.2633328694468927, + "grad_norm": 0.10560580343008041, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 68120 + }, + { + "epoch": 0.263371526650276, + "grad_norm": 0.09889001399278641, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 68130 + }, + { + "epoch": 0.2634101838536593, + "grad_norm": 0.15841642022132874, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 68140 + }, + { + "epoch": 0.2634488410570426, + "grad_norm": 0.09572744369506836, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 68150 + }, + { + "epoch": 0.26348749826042583, + "grad_norm": 0.10834426432847977, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 68160 + }, + { + "epoch": 0.26352615546380914, + "grad_norm": 0.11525410413742065, + "learning_rate": 0.002, + "loss": 2.345, + "step": 68170 + }, + { + "epoch": 0.2635648126671924, + "grad_norm": 0.11463785916566849, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 68180 + }, + { + "epoch": 0.2636034698705757, + "grad_norm": 0.11806154251098633, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 68190 + }, + { + "epoch": 0.26364212707395895, + "grad_norm": 0.11061472445726395, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 68200 + }, + { + "epoch": 0.26368078427734226, + "grad_norm": 0.10829704999923706, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 68210 + }, + { + "epoch": 0.2637194414807255, + "grad_norm": 0.11386598646640778, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 68220 + }, + { + "epoch": 0.2637580986841088, + "grad_norm": 0.10931659489870071, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 68230 + }, + { + "epoch": 0.2637967558874921, + "grad_norm": 0.1091981828212738, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 68240 + }, + { + "epoch": 0.2638354130908754, + "grad_norm": 0.13077843189239502, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 68250 + }, + { + "epoch": 0.26387407029425863, + "grad_norm": 0.13880740106105804, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 68260 + }, + { + "epoch": 0.2639127274976419, + "grad_norm": 0.11458608508110046, + "learning_rate": 0.002, + "loss": 2.361, + "step": 68270 + }, + { + "epoch": 0.2639513847010252, + "grad_norm": 0.14922989904880524, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 68280 + }, + { + "epoch": 0.26399004190440845, + "grad_norm": 0.11000477522611618, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 68290 + }, + { + "epoch": 0.26402869910779175, + "grad_norm": 0.09621061384677887, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 68300 + }, + { + "epoch": 0.264067356311175, + "grad_norm": 0.1066533625125885, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 68310 + }, + { + "epoch": 0.2641060135145583, + "grad_norm": 0.1137138158082962, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 68320 + }, + { + "epoch": 0.26414467071794157, + "grad_norm": 0.10997021198272705, + "learning_rate": 0.002, + "loss": 2.368, + "step": 68330 + }, + { + "epoch": 0.2641833279213249, + "grad_norm": 0.10986141115427017, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 68340 + }, + { + "epoch": 0.2642219851247081, + "grad_norm": 0.12483604997396469, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 68350 + }, + { + "epoch": 0.26426064232809143, + "grad_norm": 0.10350396484136581, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 68360 + }, + { + "epoch": 0.2642992995314747, + "grad_norm": 0.09706956893205643, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 68370 + }, + { + "epoch": 0.264337956734858, + "grad_norm": 0.10872679203748703, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 68380 + }, + { + "epoch": 0.26437661393824125, + "grad_norm": 0.12254256755113602, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 68390 + }, + { + "epoch": 0.26441527114162455, + "grad_norm": 0.11372974514961243, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 68400 + }, + { + "epoch": 0.2644539283450078, + "grad_norm": 0.1338338702917099, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 68410 + }, + { + "epoch": 0.2644925855483911, + "grad_norm": 0.10851424187421799, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 68420 + }, + { + "epoch": 0.26453124275177436, + "grad_norm": 0.1281265914440155, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 68430 + }, + { + "epoch": 0.2645698999551576, + "grad_norm": 0.18946754932403564, + "learning_rate": 0.002, + "loss": 2.351, + "step": 68440 + }, + { + "epoch": 0.2646085571585409, + "grad_norm": 0.11525660008192062, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 68450 + }, + { + "epoch": 0.2646472143619242, + "grad_norm": 0.10857021808624268, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 68460 + }, + { + "epoch": 0.2646858715653075, + "grad_norm": 0.11583767086267471, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 68470 + }, + { + "epoch": 0.26472452876869074, + "grad_norm": 0.11408522725105286, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 68480 + }, + { + "epoch": 0.26476318597207404, + "grad_norm": 0.10725697875022888, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 68490 + }, + { + "epoch": 0.2648018431754573, + "grad_norm": 0.12834322452545166, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 68500 + }, + { + "epoch": 0.2648405003788406, + "grad_norm": 0.11649401485919952, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 68510 + }, + { + "epoch": 0.26487915758222386, + "grad_norm": 0.12402457743883133, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 68520 + }, + { + "epoch": 0.26491781478560716, + "grad_norm": 0.11356259882450104, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 68530 + }, + { + "epoch": 0.2649564719889904, + "grad_norm": 0.09514134377241135, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 68540 + }, + { + "epoch": 0.2649951291923737, + "grad_norm": 0.12344872951507568, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 68550 + }, + { + "epoch": 0.265033786395757, + "grad_norm": 0.11852506548166275, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 68560 + }, + { + "epoch": 0.2650724435991403, + "grad_norm": 0.11700794845819473, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 68570 + }, + { + "epoch": 0.26511110080252354, + "grad_norm": 0.11226040124893188, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 68580 + }, + { + "epoch": 0.26514975800590684, + "grad_norm": 0.11499864608049393, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 68590 + }, + { + "epoch": 0.2651884152092901, + "grad_norm": 0.10765230655670166, + "learning_rate": 0.002, + "loss": 2.352, + "step": 68600 + }, + { + "epoch": 0.2652270724126734, + "grad_norm": 0.10035645961761475, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 68610 + }, + { + "epoch": 0.26526572961605666, + "grad_norm": 0.1291760504245758, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 68620 + }, + { + "epoch": 0.2653043868194399, + "grad_norm": 0.13323894143104553, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 68630 + }, + { + "epoch": 0.2653430440228232, + "grad_norm": 0.10504741221666336, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 68640 + }, + { + "epoch": 0.26538170122620647, + "grad_norm": 0.12466032058000565, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 68650 + }, + { + "epoch": 0.2654203584295898, + "grad_norm": 0.10273690521717072, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 68660 + }, + { + "epoch": 0.265459015632973, + "grad_norm": 0.1336238533258438, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 68670 + }, + { + "epoch": 0.26549767283635634, + "grad_norm": 0.1103762686252594, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 68680 + }, + { + "epoch": 0.2655363300397396, + "grad_norm": 0.09904215484857559, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 68690 + }, + { + "epoch": 0.2655749872431229, + "grad_norm": 0.10359543561935425, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 68700 + }, + { + "epoch": 0.26561364444650615, + "grad_norm": 0.11706099659204483, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 68710 + }, + { + "epoch": 0.26565230164988946, + "grad_norm": 0.10935186594724655, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 68720 + }, + { + "epoch": 0.2656909588532727, + "grad_norm": 0.1055302545428276, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 68730 + }, + { + "epoch": 0.265729616056656, + "grad_norm": 0.10079481452703476, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 68740 + }, + { + "epoch": 0.26576827326003927, + "grad_norm": 0.12839989364147186, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 68750 + }, + { + "epoch": 0.2658069304634226, + "grad_norm": 0.10385528951883316, + "learning_rate": 0.002, + "loss": 2.355, + "step": 68760 + }, + { + "epoch": 0.2658455876668058, + "grad_norm": 0.1055331826210022, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 68770 + }, + { + "epoch": 0.26588424487018913, + "grad_norm": 0.10499408841133118, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 68780 + }, + { + "epoch": 0.2659229020735724, + "grad_norm": 0.1188826709985733, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 68790 + }, + { + "epoch": 0.2659615592769557, + "grad_norm": 0.1307165026664734, + "learning_rate": 0.002, + "loss": 2.365, + "step": 68800 + }, + { + "epoch": 0.26600021648033895, + "grad_norm": 0.10107318311929703, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 68810 + }, + { + "epoch": 0.2660388736837222, + "grad_norm": 0.1242791935801506, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 68820 + }, + { + "epoch": 0.2660775308871055, + "grad_norm": 0.11397421360015869, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 68830 + }, + { + "epoch": 0.26611618809048876, + "grad_norm": 0.1081373542547226, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 68840 + }, + { + "epoch": 0.26615484529387207, + "grad_norm": 0.11046163737773895, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 68850 + }, + { + "epoch": 0.2661935024972553, + "grad_norm": 0.11250414699316025, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 68860 + }, + { + "epoch": 0.2662321597006386, + "grad_norm": 0.11318423599004745, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 68870 + }, + { + "epoch": 0.2662708169040219, + "grad_norm": 0.11168055981397629, + "learning_rate": 0.002, + "loss": 2.353, + "step": 68880 + }, + { + "epoch": 0.2663094741074052, + "grad_norm": 0.10483946651220322, + "learning_rate": 0.002, + "loss": 2.365, + "step": 68890 + }, + { + "epoch": 0.26634813131078844, + "grad_norm": 0.11276476085186005, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 68900 + }, + { + "epoch": 0.26638678851417175, + "grad_norm": 0.12012232840061188, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 68910 + }, + { + "epoch": 0.266425445717555, + "grad_norm": 0.15014702081680298, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 68920 + }, + { + "epoch": 0.2664641029209383, + "grad_norm": 0.09493235498666763, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 68930 + }, + { + "epoch": 0.26650276012432156, + "grad_norm": 0.141010582447052, + "learning_rate": 0.002, + "loss": 2.359, + "step": 68940 + }, + { + "epoch": 0.26654141732770487, + "grad_norm": 0.10899730026721954, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 68950 + }, + { + "epoch": 0.2665800745310881, + "grad_norm": 0.115720734000206, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 68960 + }, + { + "epoch": 0.2666187317344714, + "grad_norm": 0.10011117160320282, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 68970 + }, + { + "epoch": 0.2666573889378547, + "grad_norm": 0.0918012335896492, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 68980 + }, + { + "epoch": 0.266696046141238, + "grad_norm": 0.16054445505142212, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 68990 + }, + { + "epoch": 0.26673470334462124, + "grad_norm": 0.09328989684581757, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 69000 + }, + { + "epoch": 0.2667733605480045, + "grad_norm": 0.11078935861587524, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 69010 + }, + { + "epoch": 0.2668120177513878, + "grad_norm": 0.12276352941989899, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 69020 + }, + { + "epoch": 0.26685067495477105, + "grad_norm": 0.1035182997584343, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 69030 + }, + { + "epoch": 0.26688933215815436, + "grad_norm": 0.10880587249994278, + "learning_rate": 0.002, + "loss": 2.346, + "step": 69040 + }, + { + "epoch": 0.2669279893615376, + "grad_norm": 0.1148485541343689, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 69050 + }, + { + "epoch": 0.2669666465649209, + "grad_norm": 0.11542642116546631, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 69060 + }, + { + "epoch": 0.26700530376830417, + "grad_norm": 0.09781506657600403, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 69070 + }, + { + "epoch": 0.2670439609716875, + "grad_norm": 0.11610165238380432, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 69080 + }, + { + "epoch": 0.26708261817507073, + "grad_norm": 0.11108443886041641, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 69090 + }, + { + "epoch": 0.26712127537845404, + "grad_norm": 0.09576795995235443, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 69100 + }, + { + "epoch": 0.2671599325818373, + "grad_norm": 0.12218323349952698, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 69110 + }, + { + "epoch": 0.2671985897852206, + "grad_norm": 0.12643787264823914, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 69120 + }, + { + "epoch": 0.26723724698860385, + "grad_norm": 0.11990267783403397, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 69130 + }, + { + "epoch": 0.26727590419198716, + "grad_norm": 0.10244489461183548, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 69140 + }, + { + "epoch": 0.2673145613953704, + "grad_norm": 0.10932030528783798, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 69150 + }, + { + "epoch": 0.2673532185987537, + "grad_norm": 0.1066504642367363, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 69160 + }, + { + "epoch": 0.26739187580213697, + "grad_norm": 0.10350240767002106, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 69170 + }, + { + "epoch": 0.2674305330055202, + "grad_norm": 0.10027860105037689, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 69180 + }, + { + "epoch": 0.26746919020890353, + "grad_norm": 0.10422100126743317, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 69190 + }, + { + "epoch": 0.2675078474122868, + "grad_norm": 0.11838383227586746, + "learning_rate": 0.002, + "loss": 2.344, + "step": 69200 + }, + { + "epoch": 0.2675465046156701, + "grad_norm": 0.11060896515846252, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 69210 + }, + { + "epoch": 0.26758516181905334, + "grad_norm": 0.1625586599111557, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 69220 + }, + { + "epoch": 0.26762381902243665, + "grad_norm": 0.11933229863643646, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 69230 + }, + { + "epoch": 0.2676624762258199, + "grad_norm": 0.12870526313781738, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 69240 + }, + { + "epoch": 0.2677011334292032, + "grad_norm": 0.10569017380475998, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 69250 + }, + { + "epoch": 0.26773979063258646, + "grad_norm": 0.11131662875413895, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 69260 + }, + { + "epoch": 0.26777844783596977, + "grad_norm": 0.15190604329109192, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 69270 + }, + { + "epoch": 0.267817105039353, + "grad_norm": 0.10054759681224823, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 69280 + }, + { + "epoch": 0.26785576224273633, + "grad_norm": 0.10481575131416321, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 69290 + }, + { + "epoch": 0.2678944194461196, + "grad_norm": 0.11136908084154129, + "learning_rate": 0.002, + "loss": 2.364, + "step": 69300 + }, + { + "epoch": 0.2679330766495029, + "grad_norm": 0.10886896401643753, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 69310 + }, + { + "epoch": 0.26797173385288614, + "grad_norm": 0.10115396231412888, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 69320 + }, + { + "epoch": 0.26801039105626945, + "grad_norm": 0.11328067630529404, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 69330 + }, + { + "epoch": 0.2680490482596527, + "grad_norm": 0.09565749019384384, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 69340 + }, + { + "epoch": 0.268087705463036, + "grad_norm": 0.15927979350090027, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 69350 + }, + { + "epoch": 0.26812636266641926, + "grad_norm": 0.11798939853906631, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 69360 + }, + { + "epoch": 0.2681650198698025, + "grad_norm": 0.1248922348022461, + "learning_rate": 0.002, + "loss": 2.347, + "step": 69370 + }, + { + "epoch": 0.2682036770731858, + "grad_norm": 0.11001662909984589, + "learning_rate": 0.002, + "loss": 2.354, + "step": 69380 + }, + { + "epoch": 0.26824233427656907, + "grad_norm": 0.10940846800804138, + "learning_rate": 0.002, + "loss": 2.3754, + "step": 69390 + }, + { + "epoch": 0.2682809914799524, + "grad_norm": 0.10409963130950928, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 69400 + }, + { + "epoch": 0.26831964868333563, + "grad_norm": 0.10977187752723694, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 69410 + }, + { + "epoch": 0.26835830588671894, + "grad_norm": 0.10507378727197647, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 69420 + }, + { + "epoch": 0.2683969630901022, + "grad_norm": 0.12260796874761581, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 69430 + }, + { + "epoch": 0.2684356202934855, + "grad_norm": 0.13973881304264069, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 69440 + }, + { + "epoch": 0.26847427749686875, + "grad_norm": 0.12037132680416107, + "learning_rate": 0.002, + "loss": 2.367, + "step": 69450 + }, + { + "epoch": 0.26851293470025206, + "grad_norm": 0.11014335602521896, + "learning_rate": 0.002, + "loss": 2.364, + "step": 69460 + }, + { + "epoch": 0.2685515919036353, + "grad_norm": 0.10039269924163818, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 69470 + }, + { + "epoch": 0.2685902491070186, + "grad_norm": 0.11346255242824554, + "learning_rate": 0.002, + "loss": 2.357, + "step": 69480 + }, + { + "epoch": 0.26862890631040187, + "grad_norm": 0.12792587280273438, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 69490 + }, + { + "epoch": 0.2686675635137852, + "grad_norm": 0.09618958830833435, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 69500 + }, + { + "epoch": 0.26870622071716843, + "grad_norm": 0.10815145820379257, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 69510 + }, + { + "epoch": 0.26874487792055174, + "grad_norm": 0.12378823012113571, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 69520 + }, + { + "epoch": 0.268783535123935, + "grad_norm": 0.11091689020395279, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 69530 + }, + { + "epoch": 0.2688221923273183, + "grad_norm": 0.1288810521364212, + "learning_rate": 0.002, + "loss": 2.346, + "step": 69540 + }, + { + "epoch": 0.26886084953070155, + "grad_norm": 0.11948135495185852, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 69550 + }, + { + "epoch": 0.2688995067340848, + "grad_norm": 0.11650735884904861, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 69560 + }, + { + "epoch": 0.2689381639374681, + "grad_norm": 0.12489194422960281, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 69570 + }, + { + "epoch": 0.26897682114085136, + "grad_norm": 0.11284519731998444, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 69580 + }, + { + "epoch": 0.26901547834423467, + "grad_norm": 0.10499247908592224, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 69590 + }, + { + "epoch": 0.2690541355476179, + "grad_norm": 0.11322970688343048, + "learning_rate": 0.002, + "loss": 2.36, + "step": 69600 + }, + { + "epoch": 0.26909279275100123, + "grad_norm": 0.10059107840061188, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 69610 + }, + { + "epoch": 0.2691314499543845, + "grad_norm": 0.11188896745443344, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 69620 + }, + { + "epoch": 0.2691701071577678, + "grad_norm": 0.10746940225362778, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 69630 + }, + { + "epoch": 0.26920876436115104, + "grad_norm": 0.10799665004014969, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 69640 + }, + { + "epoch": 0.26924742156453435, + "grad_norm": 0.10684413462877274, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 69650 + }, + { + "epoch": 0.2692860787679176, + "grad_norm": 0.11300649493932724, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 69660 + }, + { + "epoch": 0.2693247359713009, + "grad_norm": 0.11004617065191269, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 69670 + }, + { + "epoch": 0.26936339317468416, + "grad_norm": 0.09407416731119156, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 69680 + }, + { + "epoch": 0.26940205037806747, + "grad_norm": 0.14031341671943665, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 69690 + }, + { + "epoch": 0.2694407075814507, + "grad_norm": 0.11828654259443283, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 69700 + }, + { + "epoch": 0.26947936478483403, + "grad_norm": 0.11903201043605804, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 69710 + }, + { + "epoch": 0.2695180219882173, + "grad_norm": 0.1179431676864624, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 69720 + }, + { + "epoch": 0.2695566791916006, + "grad_norm": 0.1048092171549797, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 69730 + }, + { + "epoch": 0.26959533639498384, + "grad_norm": 0.1114017441868782, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 69740 + }, + { + "epoch": 0.2696339935983671, + "grad_norm": 0.10939405858516693, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 69750 + }, + { + "epoch": 0.2696726508017504, + "grad_norm": 0.11268754303455353, + "learning_rate": 0.002, + "loss": 2.33, + "step": 69760 + }, + { + "epoch": 0.26971130800513365, + "grad_norm": 0.10703985393047333, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 69770 + }, + { + "epoch": 0.26974996520851696, + "grad_norm": 0.10865245014429092, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 69780 + }, + { + "epoch": 0.2697886224119002, + "grad_norm": 0.11256400495767593, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 69790 + }, + { + "epoch": 0.2698272796152835, + "grad_norm": 0.10484255105257034, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 69800 + }, + { + "epoch": 0.2698659368186668, + "grad_norm": 0.12422183156013489, + "learning_rate": 0.002, + "loss": 2.357, + "step": 69810 + }, + { + "epoch": 0.2699045940220501, + "grad_norm": 0.1264592707157135, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 69820 + }, + { + "epoch": 0.26994325122543333, + "grad_norm": 0.1143057644367218, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 69830 + }, + { + "epoch": 0.26998190842881664, + "grad_norm": 0.10738317668437958, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 69840 + }, + { + "epoch": 0.2700205656321999, + "grad_norm": 0.11261006444692612, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 69850 + }, + { + "epoch": 0.2700592228355832, + "grad_norm": 0.1316865086555481, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 69860 + }, + { + "epoch": 0.27009788003896645, + "grad_norm": 0.10657945275306702, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 69870 + }, + { + "epoch": 0.27013653724234976, + "grad_norm": 0.10246672481298447, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 69880 + }, + { + "epoch": 0.270175194445733, + "grad_norm": 0.12110228091478348, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 69890 + }, + { + "epoch": 0.2702138516491163, + "grad_norm": 0.1109195128083229, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 69900 + }, + { + "epoch": 0.2702525088524996, + "grad_norm": 0.11075662076473236, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 69910 + }, + { + "epoch": 0.2702911660558828, + "grad_norm": 0.0981040745973587, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 69920 + }, + { + "epoch": 0.27032982325926613, + "grad_norm": 0.13120904564857483, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 69930 + }, + { + "epoch": 0.2703684804626494, + "grad_norm": 0.10848618298768997, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 69940 + }, + { + "epoch": 0.2704071376660327, + "grad_norm": 0.31828004121780396, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 69950 + }, + { + "epoch": 0.27044579486941595, + "grad_norm": 0.10648057609796524, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 69960 + }, + { + "epoch": 0.27048445207279925, + "grad_norm": 0.11374471336603165, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 69970 + }, + { + "epoch": 0.2705231092761825, + "grad_norm": 0.11823917180299759, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 69980 + }, + { + "epoch": 0.2705617664795658, + "grad_norm": 0.09965338557958603, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 69990 + }, + { + "epoch": 0.27060042368294907, + "grad_norm": 0.10684505105018616, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 70000 + }, + { + "epoch": 0.2706390808863324, + "grad_norm": 0.11145705729722977, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 70010 + }, + { + "epoch": 0.2706777380897156, + "grad_norm": 0.11647245287895203, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 70020 + }, + { + "epoch": 0.27071639529309893, + "grad_norm": 0.1162240207195282, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 70030 + }, + { + "epoch": 0.2707550524964822, + "grad_norm": 0.10605619847774506, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 70040 + }, + { + "epoch": 0.2707937096998655, + "grad_norm": 0.11280248314142227, + "learning_rate": 0.002, + "loss": 2.359, + "step": 70050 + }, + { + "epoch": 0.27083236690324874, + "grad_norm": 0.10601935535669327, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 70060 + }, + { + "epoch": 0.27087102410663205, + "grad_norm": 0.11940313875675201, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 70070 + }, + { + "epoch": 0.2709096813100153, + "grad_norm": 0.10181540250778198, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 70080 + }, + { + "epoch": 0.2709483385133986, + "grad_norm": 0.10978764295578003, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 70090 + }, + { + "epoch": 0.27098699571678186, + "grad_norm": 0.11411808431148529, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 70100 + }, + { + "epoch": 0.2710256529201651, + "grad_norm": 0.09403868019580841, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 70110 + }, + { + "epoch": 0.2710643101235484, + "grad_norm": 0.11685250699520111, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 70120 + }, + { + "epoch": 0.2711029673269317, + "grad_norm": 0.10953215509653091, + "learning_rate": 0.002, + "loss": 2.339, + "step": 70130 + }, + { + "epoch": 0.271141624530315, + "grad_norm": 0.1205730140209198, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 70140 + }, + { + "epoch": 0.27118028173369824, + "grad_norm": 0.12347997725009918, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 70150 + }, + { + "epoch": 0.27121893893708154, + "grad_norm": 0.1174677386879921, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 70160 + }, + { + "epoch": 0.2712575961404648, + "grad_norm": 0.11836795508861542, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 70170 + }, + { + "epoch": 0.2712962533438481, + "grad_norm": 0.10880803316831589, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 70180 + }, + { + "epoch": 0.27133491054723136, + "grad_norm": 0.11703497916460037, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 70190 + }, + { + "epoch": 0.27137356775061466, + "grad_norm": 0.1004214659333229, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 70200 + }, + { + "epoch": 0.2714122249539979, + "grad_norm": 0.1273142695426941, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 70210 + }, + { + "epoch": 0.2714508821573812, + "grad_norm": 0.11645844578742981, + "learning_rate": 0.002, + "loss": 2.344, + "step": 70220 + }, + { + "epoch": 0.2714895393607645, + "grad_norm": 0.09346353262662888, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 70230 + }, + { + "epoch": 0.2715281965641478, + "grad_norm": 0.10376016795635223, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 70240 + }, + { + "epoch": 0.27156685376753104, + "grad_norm": 0.1030738353729248, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 70250 + }, + { + "epoch": 0.27160551097091434, + "grad_norm": 0.14229042828083038, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 70260 + }, + { + "epoch": 0.2716441681742976, + "grad_norm": 0.10577567666769028, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 70270 + }, + { + "epoch": 0.2716828253776809, + "grad_norm": 0.10997182875871658, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 70280 + }, + { + "epoch": 0.27172148258106416, + "grad_norm": 0.11268234252929688, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 70290 + }, + { + "epoch": 0.2717601397844474, + "grad_norm": 0.09050731360912323, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 70300 + }, + { + "epoch": 0.2717987969878307, + "grad_norm": 0.09953863173723221, + "learning_rate": 0.002, + "loss": 2.35, + "step": 70310 + }, + { + "epoch": 0.27183745419121397, + "grad_norm": 0.10677150636911392, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 70320 + }, + { + "epoch": 0.2718761113945973, + "grad_norm": 0.0925314724445343, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 70330 + }, + { + "epoch": 0.2719147685979805, + "grad_norm": 0.11543618142604828, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 70340 + }, + { + "epoch": 0.27195342580136384, + "grad_norm": 0.10473021119832993, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 70350 + }, + { + "epoch": 0.2719920830047471, + "grad_norm": 0.11223046481609344, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 70360 + }, + { + "epoch": 0.2720307402081304, + "grad_norm": 0.10345500707626343, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 70370 + }, + { + "epoch": 0.27206939741151365, + "grad_norm": 0.13544058799743652, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 70380 + }, + { + "epoch": 0.27210805461489695, + "grad_norm": 0.11092148721218109, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 70390 + }, + { + "epoch": 0.2721467118182802, + "grad_norm": 0.10791967064142227, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 70400 + }, + { + "epoch": 0.2721853690216635, + "grad_norm": 0.10910028964281082, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 70410 + }, + { + "epoch": 0.27222402622504677, + "grad_norm": 0.11330778151750565, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 70420 + }, + { + "epoch": 0.2722626834284301, + "grad_norm": 0.10065264999866486, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 70430 + }, + { + "epoch": 0.2723013406318133, + "grad_norm": 0.10658468306064606, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 70440 + }, + { + "epoch": 0.27233999783519663, + "grad_norm": 0.10998087376356125, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 70450 + }, + { + "epoch": 0.2723786550385799, + "grad_norm": 0.1141766682267189, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 70460 + }, + { + "epoch": 0.2724173122419632, + "grad_norm": 0.11142230778932571, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 70470 + }, + { + "epoch": 0.27245596944534645, + "grad_norm": 0.12735013663768768, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 70480 + }, + { + "epoch": 0.2724946266487297, + "grad_norm": 0.1139666959643364, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 70490 + }, + { + "epoch": 0.272533283852113, + "grad_norm": 0.11391907185316086, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 70500 + }, + { + "epoch": 0.27257194105549626, + "grad_norm": 0.10967147350311279, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 70510 + }, + { + "epoch": 0.27261059825887957, + "grad_norm": 0.12889058887958527, + "learning_rate": 0.002, + "loss": 2.354, + "step": 70520 + }, + { + "epoch": 0.2726492554622628, + "grad_norm": 0.11671818792819977, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 70530 + }, + { + "epoch": 0.2726879126656461, + "grad_norm": 0.13470597565174103, + "learning_rate": 0.002, + "loss": 2.371, + "step": 70540 + }, + { + "epoch": 0.2727265698690294, + "grad_norm": 0.1302533745765686, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 70550 + }, + { + "epoch": 0.2727652270724127, + "grad_norm": 0.10315962880849838, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 70560 + }, + { + "epoch": 0.27280388427579594, + "grad_norm": 0.113266222178936, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 70570 + }, + { + "epoch": 0.27284254147917925, + "grad_norm": 0.11444780230522156, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 70580 + }, + { + "epoch": 0.2728811986825625, + "grad_norm": 0.12794393301010132, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 70590 + }, + { + "epoch": 0.2729198558859458, + "grad_norm": 0.08904918283224106, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 70600 + }, + { + "epoch": 0.27295851308932906, + "grad_norm": 0.10772236436605453, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 70610 + }, + { + "epoch": 0.27299717029271237, + "grad_norm": 0.0939546599984169, + "learning_rate": 0.002, + "loss": 2.355, + "step": 70620 + }, + { + "epoch": 0.2730358274960956, + "grad_norm": 0.10614132881164551, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 70630 + }, + { + "epoch": 0.2730744846994789, + "grad_norm": 0.1227312833070755, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 70640 + }, + { + "epoch": 0.2731131419028622, + "grad_norm": 0.10652358829975128, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 70650 + }, + { + "epoch": 0.2731517991062455, + "grad_norm": 0.10573123395442963, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 70660 + }, + { + "epoch": 0.27319045630962874, + "grad_norm": 0.14353644847869873, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 70670 + }, + { + "epoch": 0.273229113513012, + "grad_norm": 0.10298144817352295, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 70680 + }, + { + "epoch": 0.2732677707163953, + "grad_norm": 0.10104386508464813, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 70690 + }, + { + "epoch": 0.27330642791977855, + "grad_norm": 0.12633801996707916, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 70700 + }, + { + "epoch": 0.27334508512316186, + "grad_norm": 0.1258019357919693, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 70710 + }, + { + "epoch": 0.2733837423265451, + "grad_norm": 0.11459273844957352, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 70720 + }, + { + "epoch": 0.2734223995299284, + "grad_norm": 0.12588898837566376, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 70730 + }, + { + "epoch": 0.27346105673331167, + "grad_norm": 0.10593008249998093, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 70740 + }, + { + "epoch": 0.273499713936695, + "grad_norm": 0.11217566579580307, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 70750 + }, + { + "epoch": 0.27353837114007823, + "grad_norm": 0.11181171983480453, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 70760 + }, + { + "epoch": 0.27357702834346154, + "grad_norm": 0.1203831285238266, + "learning_rate": 0.002, + "loss": 2.377, + "step": 70770 + }, + { + "epoch": 0.2736156855468448, + "grad_norm": 0.10951201617717743, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 70780 + }, + { + "epoch": 0.2736543427502281, + "grad_norm": 0.12592849135398865, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 70790 + }, + { + "epoch": 0.27369299995361135, + "grad_norm": 0.10407140851020813, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 70800 + }, + { + "epoch": 0.27373165715699466, + "grad_norm": 0.1266588568687439, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 70810 + }, + { + "epoch": 0.2737703143603779, + "grad_norm": 0.110927052795887, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 70820 + }, + { + "epoch": 0.2738089715637612, + "grad_norm": 0.12094797939062119, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 70830 + }, + { + "epoch": 0.27384762876714447, + "grad_norm": 0.11010003089904785, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 70840 + }, + { + "epoch": 0.2738862859705277, + "grad_norm": 0.10637608915567398, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 70850 + }, + { + "epoch": 0.27392494317391103, + "grad_norm": 0.10273260623216629, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 70860 + }, + { + "epoch": 0.2739636003772943, + "grad_norm": 0.10922182351350784, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 70870 + }, + { + "epoch": 0.2740022575806776, + "grad_norm": 0.12877719104290009, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 70880 + }, + { + "epoch": 0.27404091478406084, + "grad_norm": 0.11414679139852524, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 70890 + }, + { + "epoch": 0.27407957198744415, + "grad_norm": 0.12255305796861649, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 70900 + }, + { + "epoch": 0.2741182291908274, + "grad_norm": 0.10669559985399246, + "learning_rate": 0.002, + "loss": 2.355, + "step": 70910 + }, + { + "epoch": 0.2741568863942107, + "grad_norm": 0.09727916121482849, + "learning_rate": 0.002, + "loss": 2.356, + "step": 70920 + }, + { + "epoch": 0.27419554359759396, + "grad_norm": 0.11017953604459763, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 70930 + }, + { + "epoch": 0.27423420080097727, + "grad_norm": 0.1090618297457695, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 70940 + }, + { + "epoch": 0.2742728580043605, + "grad_norm": 0.11688750237226486, + "learning_rate": 0.002, + "loss": 2.358, + "step": 70950 + }, + { + "epoch": 0.27431151520774383, + "grad_norm": 0.11693534255027771, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 70960 + }, + { + "epoch": 0.2743501724111271, + "grad_norm": 0.10778669267892838, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 70970 + }, + { + "epoch": 0.2743888296145104, + "grad_norm": 0.11466331034898758, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 70980 + }, + { + "epoch": 0.27442748681789364, + "grad_norm": 0.12696002423763275, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 70990 + }, + { + "epoch": 0.27446614402127695, + "grad_norm": 0.09766501933336258, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 71000 + }, + { + "epoch": 0.2745048012246602, + "grad_norm": 0.11292238533496857, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 71010 + }, + { + "epoch": 0.2745434584280435, + "grad_norm": 0.11505763232707977, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 71020 + }, + { + "epoch": 0.27458211563142676, + "grad_norm": 0.13340947031974792, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 71030 + }, + { + "epoch": 0.27462077283481, + "grad_norm": 0.11638954281806946, + "learning_rate": 0.002, + "loss": 2.361, + "step": 71040 + }, + { + "epoch": 0.2746594300381933, + "grad_norm": 0.09738632291555405, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 71050 + }, + { + "epoch": 0.27469808724157657, + "grad_norm": 0.1314162015914917, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 71060 + }, + { + "epoch": 0.2747367444449599, + "grad_norm": 0.10656953603029251, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 71070 + }, + { + "epoch": 0.27477540164834313, + "grad_norm": 0.22209644317626953, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 71080 + }, + { + "epoch": 0.27481405885172644, + "grad_norm": 0.11620117723941803, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 71090 + }, + { + "epoch": 0.2748527160551097, + "grad_norm": 0.0935266986489296, + "learning_rate": 0.002, + "loss": 2.357, + "step": 71100 + }, + { + "epoch": 0.274891373258493, + "grad_norm": 0.10850471258163452, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 71110 + }, + { + "epoch": 0.27493003046187625, + "grad_norm": 0.10303536802530289, + "learning_rate": 0.002, + "loss": 2.357, + "step": 71120 + }, + { + "epoch": 0.27496868766525956, + "grad_norm": 0.12467154115438461, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 71130 + }, + { + "epoch": 0.2750073448686428, + "grad_norm": 0.10738974064588547, + "learning_rate": 0.002, + "loss": 2.346, + "step": 71140 + }, + { + "epoch": 0.2750460020720261, + "grad_norm": 0.3521445691585541, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 71150 + }, + { + "epoch": 0.27508465927540937, + "grad_norm": 0.09956452250480652, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 71160 + }, + { + "epoch": 0.2751233164787927, + "grad_norm": 0.12146948277950287, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 71170 + }, + { + "epoch": 0.27516197368217593, + "grad_norm": 0.10550107061862946, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 71180 + }, + { + "epoch": 0.27520063088555924, + "grad_norm": 0.09655023366212845, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 71190 + }, + { + "epoch": 0.2752392880889425, + "grad_norm": 0.11164681613445282, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 71200 + }, + { + "epoch": 0.2752779452923258, + "grad_norm": 0.10607494413852692, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 71210 + }, + { + "epoch": 0.27531660249570905, + "grad_norm": 0.1139015182852745, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 71220 + }, + { + "epoch": 0.2753552596990923, + "grad_norm": 0.10601265728473663, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 71230 + }, + { + "epoch": 0.2753939169024756, + "grad_norm": 0.1114029809832573, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 71240 + }, + { + "epoch": 0.27543257410585886, + "grad_norm": 0.12024658173322678, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 71250 + }, + { + "epoch": 0.27547123130924217, + "grad_norm": 0.11128765344619751, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 71260 + }, + { + "epoch": 0.2755098885126254, + "grad_norm": 0.10613062232732773, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 71270 + }, + { + "epoch": 0.27554854571600873, + "grad_norm": 0.09886675328016281, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 71280 + }, + { + "epoch": 0.275587202919392, + "grad_norm": 0.10275650769472122, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 71290 + }, + { + "epoch": 0.2756258601227753, + "grad_norm": 0.12820738554000854, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 71300 + }, + { + "epoch": 0.27566451732615854, + "grad_norm": 0.10675428062677383, + "learning_rate": 0.002, + "loss": 2.357, + "step": 71310 + }, + { + "epoch": 0.27570317452954185, + "grad_norm": 0.10081598907709122, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 71320 + }, + { + "epoch": 0.2757418317329251, + "grad_norm": 0.1125272884964943, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 71330 + }, + { + "epoch": 0.2757804889363084, + "grad_norm": 0.11027172952890396, + "learning_rate": 0.002, + "loss": 2.359, + "step": 71340 + }, + { + "epoch": 0.27581914613969166, + "grad_norm": 0.10339958220720291, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 71350 + }, + { + "epoch": 0.27585780334307497, + "grad_norm": 0.08919887244701385, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 71360 + }, + { + "epoch": 0.2758964605464582, + "grad_norm": 0.12407740205526352, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 71370 + }, + { + "epoch": 0.27593511774984153, + "grad_norm": 0.11857614666223526, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 71380 + }, + { + "epoch": 0.2759737749532248, + "grad_norm": 0.1211429014801979, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 71390 + }, + { + "epoch": 0.2760124321566081, + "grad_norm": 0.11378125846385956, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 71400 + }, + { + "epoch": 0.27605108935999134, + "grad_norm": 0.11991725862026215, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 71410 + }, + { + "epoch": 0.2760897465633746, + "grad_norm": 0.12218762189149857, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 71420 + }, + { + "epoch": 0.2761284037667579, + "grad_norm": 0.10152004659175873, + "learning_rate": 0.002, + "loss": 2.3766, + "step": 71430 + }, + { + "epoch": 0.27616706097014115, + "grad_norm": 0.10593225806951523, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 71440 + }, + { + "epoch": 0.27620571817352446, + "grad_norm": 0.10519793629646301, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 71450 + }, + { + "epoch": 0.2762443753769077, + "grad_norm": 0.0969279333949089, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 71460 + }, + { + "epoch": 0.276283032580291, + "grad_norm": 0.09336300939321518, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 71470 + }, + { + "epoch": 0.2763216897836743, + "grad_norm": 0.10432116687297821, + "learning_rate": 0.002, + "loss": 2.358, + "step": 71480 + }, + { + "epoch": 0.2763603469870576, + "grad_norm": 0.11239740252494812, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 71490 + }, + { + "epoch": 0.27639900419044083, + "grad_norm": 0.10245117545127869, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 71500 + }, + { + "epoch": 0.27643766139382414, + "grad_norm": 0.11861246824264526, + "learning_rate": 0.002, + "loss": 2.349, + "step": 71510 + }, + { + "epoch": 0.2764763185972074, + "grad_norm": 0.11511997878551483, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 71520 + }, + { + "epoch": 0.2765149758005907, + "grad_norm": 0.10100384801626205, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 71530 + }, + { + "epoch": 0.27655363300397395, + "grad_norm": 0.126571923494339, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 71540 + }, + { + "epoch": 0.27659229020735726, + "grad_norm": 0.1063634380698204, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 71550 + }, + { + "epoch": 0.2766309474107405, + "grad_norm": 0.11067322641611099, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 71560 + }, + { + "epoch": 0.2766696046141238, + "grad_norm": 0.12833568453788757, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 71570 + }, + { + "epoch": 0.2767082618175071, + "grad_norm": 0.10962745547294617, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 71580 + }, + { + "epoch": 0.2767469190208903, + "grad_norm": 0.11526691168546677, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 71590 + }, + { + "epoch": 0.27678557622427363, + "grad_norm": 0.10324928164482117, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 71600 + }, + { + "epoch": 0.2768242334276569, + "grad_norm": 0.11164212971925735, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 71610 + }, + { + "epoch": 0.2768628906310402, + "grad_norm": 0.12060465663671494, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 71620 + }, + { + "epoch": 0.27690154783442344, + "grad_norm": 0.10645802319049835, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 71630 + }, + { + "epoch": 0.27694020503780675, + "grad_norm": 0.11743126064538956, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 71640 + }, + { + "epoch": 0.27697886224119, + "grad_norm": 0.1196308434009552, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 71650 + }, + { + "epoch": 0.2770175194445733, + "grad_norm": 0.12106142938137054, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 71660 + }, + { + "epoch": 0.27705617664795656, + "grad_norm": 0.10936318337917328, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 71670 + }, + { + "epoch": 0.2770948338513399, + "grad_norm": 0.11110043525695801, + "learning_rate": 0.002, + "loss": 2.355, + "step": 71680 + }, + { + "epoch": 0.2771334910547231, + "grad_norm": 0.10624603182077408, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 71690 + }, + { + "epoch": 0.27717214825810643, + "grad_norm": 0.09473574906587601, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 71700 + }, + { + "epoch": 0.2772108054614897, + "grad_norm": 0.14341023564338684, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 71710 + }, + { + "epoch": 0.277249462664873, + "grad_norm": 0.11800472438335419, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 71720 + }, + { + "epoch": 0.27728811986825624, + "grad_norm": 0.11726002395153046, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 71730 + }, + { + "epoch": 0.27732677707163955, + "grad_norm": 0.09608127921819687, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 71740 + }, + { + "epoch": 0.2773654342750228, + "grad_norm": 0.11779412627220154, + "learning_rate": 0.002, + "loss": 2.357, + "step": 71750 + }, + { + "epoch": 0.2774040914784061, + "grad_norm": 0.12998349964618683, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 71760 + }, + { + "epoch": 0.27744274868178936, + "grad_norm": 0.1138630211353302, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 71770 + }, + { + "epoch": 0.2774814058851726, + "grad_norm": 0.11318184435367584, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 71780 + }, + { + "epoch": 0.2775200630885559, + "grad_norm": 0.11765322089195251, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 71790 + }, + { + "epoch": 0.2775587202919392, + "grad_norm": 0.09930826723575592, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 71800 + }, + { + "epoch": 0.2775973774953225, + "grad_norm": 0.10807865113019943, + "learning_rate": 0.002, + "loss": 2.347, + "step": 71810 + }, + { + "epoch": 0.27763603469870574, + "grad_norm": 0.09903464466333389, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 71820 + }, + { + "epoch": 0.27767469190208904, + "grad_norm": 0.11516023427248001, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 71830 + }, + { + "epoch": 0.2777133491054723, + "grad_norm": 0.11118941754102707, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 71840 + }, + { + "epoch": 0.2777520063088556, + "grad_norm": 0.10780233889818192, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 71850 + }, + { + "epoch": 0.27779066351223886, + "grad_norm": 0.11606789380311966, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 71860 + }, + { + "epoch": 0.27782932071562216, + "grad_norm": 0.14477357268333435, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 71870 + }, + { + "epoch": 0.2778679779190054, + "grad_norm": 0.10391691327095032, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 71880 + }, + { + "epoch": 0.2779066351223887, + "grad_norm": 0.09747461974620819, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 71890 + }, + { + "epoch": 0.277945292325772, + "grad_norm": 0.10803426802158356, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 71900 + }, + { + "epoch": 0.2779839495291553, + "grad_norm": 0.3230709731578827, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 71910 + }, + { + "epoch": 0.27802260673253854, + "grad_norm": 0.1216980516910553, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 71920 + }, + { + "epoch": 0.27806126393592184, + "grad_norm": 0.09765301644802094, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 71930 + }, + { + "epoch": 0.2780999211393051, + "grad_norm": 0.11342030018568039, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 71940 + }, + { + "epoch": 0.2781385783426884, + "grad_norm": 0.115015409886837, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 71950 + }, + { + "epoch": 0.27817723554607166, + "grad_norm": 0.1250956952571869, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 71960 + }, + { + "epoch": 0.2782158927494549, + "grad_norm": 0.10484731942415237, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 71970 + }, + { + "epoch": 0.2782545499528382, + "grad_norm": 0.12462014704942703, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 71980 + }, + { + "epoch": 0.27829320715622147, + "grad_norm": 0.12200839817523956, + "learning_rate": 0.002, + "loss": 2.358, + "step": 71990 + }, + { + "epoch": 0.2783318643596048, + "grad_norm": 0.11157329380512238, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 72000 + }, + { + "epoch": 0.278370521562988, + "grad_norm": 0.11760713160037994, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 72010 + }, + { + "epoch": 0.27840917876637133, + "grad_norm": 0.12260770052671432, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 72020 + }, + { + "epoch": 0.2784478359697546, + "grad_norm": 0.11049578338861465, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 72030 + }, + { + "epoch": 0.2784864931731379, + "grad_norm": 0.10234503448009491, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 72040 + }, + { + "epoch": 0.27852515037652115, + "grad_norm": 0.11964405328035355, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 72050 + }, + { + "epoch": 0.27856380757990445, + "grad_norm": 0.10206586122512817, + "learning_rate": 0.002, + "loss": 2.351, + "step": 72060 + }, + { + "epoch": 0.2786024647832877, + "grad_norm": 0.10977603495121002, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 72070 + }, + { + "epoch": 0.278641121986671, + "grad_norm": 0.10649268329143524, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 72080 + }, + { + "epoch": 0.27867977919005427, + "grad_norm": 0.11123888194561005, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 72090 + }, + { + "epoch": 0.2787184363934376, + "grad_norm": 0.10736609250307083, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 72100 + }, + { + "epoch": 0.2787570935968208, + "grad_norm": 0.1288762390613556, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 72110 + }, + { + "epoch": 0.27879575080020413, + "grad_norm": 0.10738539695739746, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 72120 + }, + { + "epoch": 0.2788344080035874, + "grad_norm": 0.10828015953302383, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 72130 + }, + { + "epoch": 0.2788730652069707, + "grad_norm": 0.11473634839057922, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 72140 + }, + { + "epoch": 0.27891172241035395, + "grad_norm": 0.12921828031539917, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 72150 + }, + { + "epoch": 0.2789503796137372, + "grad_norm": 0.11381390690803528, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 72160 + }, + { + "epoch": 0.2789890368171205, + "grad_norm": 0.1263791173696518, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 72170 + }, + { + "epoch": 0.27902769402050376, + "grad_norm": 0.16024067997932434, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 72180 + }, + { + "epoch": 0.27906635122388707, + "grad_norm": 0.12159726768732071, + "learning_rate": 0.002, + "loss": 2.346, + "step": 72190 + }, + { + "epoch": 0.2791050084272703, + "grad_norm": 0.11728315055370331, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 72200 + }, + { + "epoch": 0.2791436656306536, + "grad_norm": 0.11379316449165344, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 72210 + }, + { + "epoch": 0.2791823228340369, + "grad_norm": 0.10281714797019958, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 72220 + }, + { + "epoch": 0.2792209800374202, + "grad_norm": 0.11599381268024445, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 72230 + }, + { + "epoch": 0.27925963724080344, + "grad_norm": 0.1538357138633728, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 72240 + }, + { + "epoch": 0.27929829444418675, + "grad_norm": 0.10871226340532303, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 72250 + }, + { + "epoch": 0.27933695164757, + "grad_norm": 0.1313319057226181, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 72260 + }, + { + "epoch": 0.2793756088509533, + "grad_norm": 0.10848644375801086, + "learning_rate": 0.002, + "loss": 2.347, + "step": 72270 + }, + { + "epoch": 0.27941426605433656, + "grad_norm": 0.11999375373125076, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 72280 + }, + { + "epoch": 0.27945292325771987, + "grad_norm": 0.102352075278759, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 72290 + }, + { + "epoch": 0.2794915804611031, + "grad_norm": 0.09912648797035217, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 72300 + }, + { + "epoch": 0.2795302376644864, + "grad_norm": 0.12556324899196625, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 72310 + }, + { + "epoch": 0.2795688948678697, + "grad_norm": 0.09553080052137375, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 72320 + }, + { + "epoch": 0.279607552071253, + "grad_norm": 0.10858116298913956, + "learning_rate": 0.002, + "loss": 2.36, + "step": 72330 + }, + { + "epoch": 0.27964620927463624, + "grad_norm": 0.14677628874778748, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 72340 + }, + { + "epoch": 0.2796848664780195, + "grad_norm": 0.1237170547246933, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 72350 + }, + { + "epoch": 0.2797235236814028, + "grad_norm": 0.11486019194126129, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 72360 + }, + { + "epoch": 0.27976218088478605, + "grad_norm": 0.0969633013010025, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 72370 + }, + { + "epoch": 0.27980083808816936, + "grad_norm": 0.13080482184886932, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 72380 + }, + { + "epoch": 0.2798394952915526, + "grad_norm": 0.09762602299451828, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 72390 + }, + { + "epoch": 0.2798781524949359, + "grad_norm": 0.10149842500686646, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 72400 + }, + { + "epoch": 0.27991680969831917, + "grad_norm": 0.11325753480195999, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 72410 + }, + { + "epoch": 0.2799554669017025, + "grad_norm": 0.11837144941091537, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 72420 + }, + { + "epoch": 0.27999412410508573, + "grad_norm": 0.10435636341571808, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 72430 + }, + { + "epoch": 0.28003278130846904, + "grad_norm": 0.129971444606781, + "learning_rate": 0.002, + "loss": 2.363, + "step": 72440 + }, + { + "epoch": 0.2800714385118523, + "grad_norm": 0.10650801658630371, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 72450 + }, + { + "epoch": 0.2801100957152356, + "grad_norm": 0.11678191274404526, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 72460 + }, + { + "epoch": 0.28014875291861885, + "grad_norm": 0.11875230073928833, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 72470 + }, + { + "epoch": 0.28018741012200216, + "grad_norm": 0.10601239651441574, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 72480 + }, + { + "epoch": 0.2802260673253854, + "grad_norm": 0.12163223326206207, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 72490 + }, + { + "epoch": 0.2802647245287687, + "grad_norm": 0.11279140412807465, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 72500 + }, + { + "epoch": 0.28030338173215197, + "grad_norm": 0.12758687138557434, + "learning_rate": 0.002, + "loss": 2.3725, + "step": 72510 + }, + { + "epoch": 0.2803420389355352, + "grad_norm": 0.10631423443555832, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 72520 + }, + { + "epoch": 0.28038069613891853, + "grad_norm": 0.12531808018684387, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 72530 + }, + { + "epoch": 0.2804193533423018, + "grad_norm": 0.1248190626502037, + "learning_rate": 0.002, + "loss": 2.351, + "step": 72540 + }, + { + "epoch": 0.2804580105456851, + "grad_norm": 0.12048540264368057, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 72550 + }, + { + "epoch": 0.28049666774906834, + "grad_norm": 0.11301138252019882, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 72560 + }, + { + "epoch": 0.28053532495245165, + "grad_norm": 0.10371150821447372, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 72570 + }, + { + "epoch": 0.2805739821558349, + "grad_norm": 0.10555297136306763, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 72580 + }, + { + "epoch": 0.2806126393592182, + "grad_norm": 0.12766166031360626, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 72590 + }, + { + "epoch": 0.28065129656260146, + "grad_norm": 0.0994403064250946, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 72600 + }, + { + "epoch": 0.28068995376598477, + "grad_norm": 0.1184573695063591, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 72610 + }, + { + "epoch": 0.280728610969368, + "grad_norm": 0.1224546879529953, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 72620 + }, + { + "epoch": 0.2807672681727513, + "grad_norm": 0.11548443138599396, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 72630 + }, + { + "epoch": 0.2808059253761346, + "grad_norm": 0.095535509288311, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 72640 + }, + { + "epoch": 0.2808445825795179, + "grad_norm": 0.11959680914878845, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 72650 + }, + { + "epoch": 0.28088323978290114, + "grad_norm": 0.11694733053445816, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 72660 + }, + { + "epoch": 0.28092189698628445, + "grad_norm": 0.09618599712848663, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 72670 + }, + { + "epoch": 0.2809605541896677, + "grad_norm": 0.11327013373374939, + "learning_rate": 0.002, + "loss": 2.347, + "step": 72680 + }, + { + "epoch": 0.280999211393051, + "grad_norm": 0.10539939999580383, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 72690 + }, + { + "epoch": 0.28103786859643426, + "grad_norm": 0.11680128425359726, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 72700 + }, + { + "epoch": 0.2810765257998175, + "grad_norm": 0.1366237848997116, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 72710 + }, + { + "epoch": 0.2811151830032008, + "grad_norm": 0.12079530209302902, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 72720 + }, + { + "epoch": 0.28115384020658407, + "grad_norm": 0.1021251305937767, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 72730 + }, + { + "epoch": 0.2811924974099674, + "grad_norm": 0.1378641277551651, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 72740 + }, + { + "epoch": 0.28123115461335063, + "grad_norm": 0.11904937773942947, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 72750 + }, + { + "epoch": 0.28126981181673394, + "grad_norm": 0.11487920582294464, + "learning_rate": 0.002, + "loss": 2.349, + "step": 72760 + }, + { + "epoch": 0.2813084690201172, + "grad_norm": 0.11200712621212006, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 72770 + }, + { + "epoch": 0.2813471262235005, + "grad_norm": 0.1166585236787796, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 72780 + }, + { + "epoch": 0.28138578342688375, + "grad_norm": 0.11063521355390549, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 72790 + }, + { + "epoch": 0.28142444063026706, + "grad_norm": 0.11898882687091827, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 72800 + }, + { + "epoch": 0.2814630978336503, + "grad_norm": 0.13146457076072693, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 72810 + }, + { + "epoch": 0.2815017550370336, + "grad_norm": 0.11028890311717987, + "learning_rate": 0.002, + "loss": 2.357, + "step": 72820 + }, + { + "epoch": 0.28154041224041687, + "grad_norm": 0.10138233006000519, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 72830 + }, + { + "epoch": 0.2815790694438002, + "grad_norm": 0.11174673587083817, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 72840 + }, + { + "epoch": 0.28161772664718343, + "grad_norm": 0.11663304269313812, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 72850 + }, + { + "epoch": 0.28165638385056674, + "grad_norm": 0.11266373097896576, + "learning_rate": 0.002, + "loss": 2.346, + "step": 72860 + }, + { + "epoch": 0.28169504105395, + "grad_norm": 0.1342191845178604, + "learning_rate": 0.002, + "loss": 2.349, + "step": 72870 + }, + { + "epoch": 0.2817336982573333, + "grad_norm": 0.1129799410700798, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 72880 + }, + { + "epoch": 0.28177235546071655, + "grad_norm": 0.11189255118370056, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 72890 + }, + { + "epoch": 0.2818110126640998, + "grad_norm": 0.13493746519088745, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 72900 + }, + { + "epoch": 0.2818496698674831, + "grad_norm": 0.10434828698635101, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 72910 + }, + { + "epoch": 0.28188832707086636, + "grad_norm": 0.10769256204366684, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 72920 + }, + { + "epoch": 0.28192698427424967, + "grad_norm": 0.1049484834074974, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 72930 + }, + { + "epoch": 0.2819656414776329, + "grad_norm": 0.12519758939743042, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 72940 + }, + { + "epoch": 0.28200429868101623, + "grad_norm": 0.10872354358434677, + "learning_rate": 0.002, + "loss": 2.3724, + "step": 72950 + }, + { + "epoch": 0.2820429558843995, + "grad_norm": 0.11857183277606964, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 72960 + }, + { + "epoch": 0.2820816130877828, + "grad_norm": 0.12134796380996704, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 72970 + }, + { + "epoch": 0.28212027029116604, + "grad_norm": 0.10698603838682175, + "learning_rate": 0.002, + "loss": 2.346, + "step": 72980 + }, + { + "epoch": 0.28215892749454935, + "grad_norm": 0.10357688367366791, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 72990 + }, + { + "epoch": 0.2821975846979326, + "grad_norm": 0.13212862610816956, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 73000 + }, + { + "epoch": 0.2822362419013159, + "grad_norm": 0.08346768468618393, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 73010 + }, + { + "epoch": 0.28227489910469916, + "grad_norm": 0.10529458522796631, + "learning_rate": 0.002, + "loss": 2.355, + "step": 73020 + }, + { + "epoch": 0.28231355630808247, + "grad_norm": 0.10463078320026398, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 73030 + }, + { + "epoch": 0.2823522135114657, + "grad_norm": 0.1002931147813797, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 73040 + }, + { + "epoch": 0.28239087071484903, + "grad_norm": 0.10691235959529877, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 73050 + }, + { + "epoch": 0.2824295279182323, + "grad_norm": 0.1013958677649498, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 73060 + }, + { + "epoch": 0.2824681851216156, + "grad_norm": 0.12213055044412613, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 73070 + }, + { + "epoch": 0.28250684232499884, + "grad_norm": 0.11261018365621567, + "learning_rate": 0.002, + "loss": 2.374, + "step": 73080 + }, + { + "epoch": 0.2825454995283821, + "grad_norm": 0.10393129289150238, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 73090 + }, + { + "epoch": 0.2825841567317654, + "grad_norm": 0.10489436984062195, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 73100 + }, + { + "epoch": 0.28262281393514865, + "grad_norm": 0.12036898732185364, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 73110 + }, + { + "epoch": 0.28266147113853196, + "grad_norm": 0.10465455055236816, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 73120 + }, + { + "epoch": 0.2827001283419152, + "grad_norm": 0.10107477009296417, + "learning_rate": 0.002, + "loss": 2.353, + "step": 73130 + }, + { + "epoch": 0.2827387855452985, + "grad_norm": 0.10472358018159866, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 73140 + }, + { + "epoch": 0.2827774427486818, + "grad_norm": 0.13089285790920258, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 73150 + }, + { + "epoch": 0.2828160999520651, + "grad_norm": 0.11773528903722763, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 73160 + }, + { + "epoch": 0.28285475715544833, + "grad_norm": 0.10916437208652496, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 73170 + }, + { + "epoch": 0.28289341435883164, + "grad_norm": 0.10333863645792007, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 73180 + }, + { + "epoch": 0.2829320715622149, + "grad_norm": 0.11203617602586746, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 73190 + }, + { + "epoch": 0.2829707287655982, + "grad_norm": 0.12673601508140564, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 73200 + }, + { + "epoch": 0.28300938596898145, + "grad_norm": 0.1323099583387375, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 73210 + }, + { + "epoch": 0.28304804317236476, + "grad_norm": 0.1025305837392807, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 73220 + }, + { + "epoch": 0.283086700375748, + "grad_norm": 0.10774783790111542, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 73230 + }, + { + "epoch": 0.2831253575791313, + "grad_norm": 0.12477708607912064, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 73240 + }, + { + "epoch": 0.2831640147825146, + "grad_norm": 0.09175287932157516, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 73250 + }, + { + "epoch": 0.2832026719858978, + "grad_norm": 0.18007692694664001, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 73260 + }, + { + "epoch": 0.28324132918928113, + "grad_norm": 0.10512850433588028, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 73270 + }, + { + "epoch": 0.2832799863926644, + "grad_norm": 0.11133573949337006, + "learning_rate": 0.002, + "loss": 2.352, + "step": 73280 + }, + { + "epoch": 0.2833186435960477, + "grad_norm": 0.09325826168060303, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 73290 + }, + { + "epoch": 0.28335730079943094, + "grad_norm": 0.14559954404830933, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 73300 + }, + { + "epoch": 0.28339595800281425, + "grad_norm": 0.11711502075195312, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 73310 + }, + { + "epoch": 0.2834346152061975, + "grad_norm": 0.10935034602880478, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 73320 + }, + { + "epoch": 0.2834732724095808, + "grad_norm": 0.11247728019952774, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 73330 + }, + { + "epoch": 0.28351192961296406, + "grad_norm": 0.10688342154026031, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 73340 + }, + { + "epoch": 0.28355058681634737, + "grad_norm": 0.11608456075191498, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 73350 + }, + { + "epoch": 0.2835892440197306, + "grad_norm": 0.09863826632499695, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 73360 + }, + { + "epoch": 0.28362790122311393, + "grad_norm": 0.11395720392465591, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 73370 + }, + { + "epoch": 0.2836665584264972, + "grad_norm": 0.11138497292995453, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 73380 + }, + { + "epoch": 0.2837052156298805, + "grad_norm": 0.10729915648698807, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 73390 + }, + { + "epoch": 0.28374387283326374, + "grad_norm": 0.11885261535644531, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 73400 + }, + { + "epoch": 0.28378253003664705, + "grad_norm": 0.11109798401594162, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 73410 + }, + { + "epoch": 0.2838211872400303, + "grad_norm": 0.11580563336610794, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 73420 + }, + { + "epoch": 0.2838598444434136, + "grad_norm": 0.12981678545475006, + "learning_rate": 0.002, + "loss": 2.3804, + "step": 73430 + }, + { + "epoch": 0.28389850164679686, + "grad_norm": 0.10222245007753372, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 73440 + }, + { + "epoch": 0.2839371588501801, + "grad_norm": 0.12582792341709137, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 73450 + }, + { + "epoch": 0.2839758160535634, + "grad_norm": 0.10031749308109283, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 73460 + }, + { + "epoch": 0.2840144732569467, + "grad_norm": 0.12029723078012466, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 73470 + }, + { + "epoch": 0.28405313046033, + "grad_norm": 0.1084158644080162, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 73480 + }, + { + "epoch": 0.28409178766371324, + "grad_norm": 0.1025131493806839, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 73490 + }, + { + "epoch": 0.28413044486709654, + "grad_norm": 0.09864922612905502, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 73500 + }, + { + "epoch": 0.2841691020704798, + "grad_norm": 0.10022611916065216, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 73510 + }, + { + "epoch": 0.2842077592738631, + "grad_norm": 0.27158042788505554, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 73520 + }, + { + "epoch": 0.28424641647724636, + "grad_norm": 0.10503194481134415, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 73530 + }, + { + "epoch": 0.28428507368062966, + "grad_norm": 0.10179532319307327, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 73540 + }, + { + "epoch": 0.2843237308840129, + "grad_norm": 0.12431447207927704, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 73550 + }, + { + "epoch": 0.2843623880873962, + "grad_norm": 0.12010517716407776, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 73560 + }, + { + "epoch": 0.2844010452907795, + "grad_norm": 0.15335272252559662, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 73570 + }, + { + "epoch": 0.2844397024941628, + "grad_norm": 0.10576717555522919, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 73580 + }, + { + "epoch": 0.28447835969754603, + "grad_norm": 0.10361874103546143, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 73590 + }, + { + "epoch": 0.28451701690092934, + "grad_norm": 0.10449161380529404, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 73600 + }, + { + "epoch": 0.2845556741043126, + "grad_norm": 0.09918791055679321, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 73610 + }, + { + "epoch": 0.2845943313076959, + "grad_norm": 0.11333408206701279, + "learning_rate": 0.002, + "loss": 2.345, + "step": 73620 + }, + { + "epoch": 0.28463298851107915, + "grad_norm": 0.11512020975351334, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 73630 + }, + { + "epoch": 0.2846716457144624, + "grad_norm": 0.10907591879367828, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 73640 + }, + { + "epoch": 0.2847103029178457, + "grad_norm": 0.10631921887397766, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 73650 + }, + { + "epoch": 0.28474896012122897, + "grad_norm": 0.10795870423316956, + "learning_rate": 0.002, + "loss": 2.355, + "step": 73660 + }, + { + "epoch": 0.2847876173246123, + "grad_norm": 0.12246907502412796, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 73670 + }, + { + "epoch": 0.2848262745279955, + "grad_norm": 0.11917459219694138, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 73680 + }, + { + "epoch": 0.28486493173137883, + "grad_norm": 0.11256936937570572, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 73690 + }, + { + "epoch": 0.2849035889347621, + "grad_norm": 0.1294250190258026, + "learning_rate": 0.002, + "loss": 2.334, + "step": 73700 + }, + { + "epoch": 0.2849422461381454, + "grad_norm": 0.10935357213020325, + "learning_rate": 0.002, + "loss": 2.3733, + "step": 73710 + }, + { + "epoch": 0.28498090334152865, + "grad_norm": 0.10090257972478867, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 73720 + }, + { + "epoch": 0.28501956054491195, + "grad_norm": 0.13061776757240295, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 73730 + }, + { + "epoch": 0.2850582177482952, + "grad_norm": 0.11142712831497192, + "learning_rate": 0.002, + "loss": 2.348, + "step": 73740 + }, + { + "epoch": 0.2850968749516785, + "grad_norm": 0.09850049018859863, + "learning_rate": 0.002, + "loss": 2.3802, + "step": 73750 + }, + { + "epoch": 0.28513553215506177, + "grad_norm": 0.12158714979887009, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 73760 + }, + { + "epoch": 0.2851741893584451, + "grad_norm": 0.09792877733707428, + "learning_rate": 0.002, + "loss": 2.369, + "step": 73770 + }, + { + "epoch": 0.2852128465618283, + "grad_norm": 0.1119491308927536, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 73780 + }, + { + "epoch": 0.28525150376521163, + "grad_norm": 0.11141660809516907, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 73790 + }, + { + "epoch": 0.2852901609685949, + "grad_norm": 0.10876152664422989, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 73800 + }, + { + "epoch": 0.2853288181719782, + "grad_norm": 0.10910064727067947, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 73810 + }, + { + "epoch": 0.28536747537536145, + "grad_norm": 0.12243952602148056, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 73820 + }, + { + "epoch": 0.2854061325787447, + "grad_norm": 0.09542898833751678, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 73830 + }, + { + "epoch": 0.285444789782128, + "grad_norm": 0.09667696058750153, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 73840 + }, + { + "epoch": 0.28548344698551126, + "grad_norm": 0.09974554181098938, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 73850 + }, + { + "epoch": 0.28552210418889457, + "grad_norm": 0.11236509680747986, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 73860 + }, + { + "epoch": 0.2855607613922778, + "grad_norm": 0.11773736029863358, + "learning_rate": 0.002, + "loss": 2.355, + "step": 73870 + }, + { + "epoch": 0.2855994185956611, + "grad_norm": 0.1734328418970108, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 73880 + }, + { + "epoch": 0.2856380757990444, + "grad_norm": 0.11204519867897034, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 73890 + }, + { + "epoch": 0.2856767330024277, + "grad_norm": 0.11577246338129044, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 73900 + }, + { + "epoch": 0.28571539020581094, + "grad_norm": 0.10974831134080887, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 73910 + }, + { + "epoch": 0.28575404740919425, + "grad_norm": 0.11790665239095688, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 73920 + }, + { + "epoch": 0.2857927046125775, + "grad_norm": 0.12227218598127365, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 73930 + }, + { + "epoch": 0.2858313618159608, + "grad_norm": 0.11108889430761337, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 73940 + }, + { + "epoch": 0.28587001901934406, + "grad_norm": 0.14347556233406067, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 73950 + }, + { + "epoch": 0.28590867622272736, + "grad_norm": 0.09942697733640671, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 73960 + }, + { + "epoch": 0.2859473334261106, + "grad_norm": 0.10189233720302582, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 73970 + }, + { + "epoch": 0.2859859906294939, + "grad_norm": 0.10416730493307114, + "learning_rate": 0.002, + "loss": 2.353, + "step": 73980 + }, + { + "epoch": 0.2860246478328772, + "grad_norm": 0.11276213079690933, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 73990 + }, + { + "epoch": 0.28606330503626043, + "grad_norm": 0.11061752587556839, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 74000 + }, + { + "epoch": 0.28610196223964374, + "grad_norm": 0.10207962244749069, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 74010 + }, + { + "epoch": 0.286140619443027, + "grad_norm": 0.12822595238685608, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 74020 + }, + { + "epoch": 0.2861792766464103, + "grad_norm": 0.11346330493688583, + "learning_rate": 0.002, + "loss": 2.3779, + "step": 74030 + }, + { + "epoch": 0.28621793384979355, + "grad_norm": 0.1152123212814331, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 74040 + }, + { + "epoch": 0.28625659105317686, + "grad_norm": 0.10732946544885635, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 74050 + }, + { + "epoch": 0.2862952482565601, + "grad_norm": 0.11564039438962936, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 74060 + }, + { + "epoch": 0.2863339054599434, + "grad_norm": 0.10278456658124924, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 74070 + }, + { + "epoch": 0.28637256266332667, + "grad_norm": 0.1271258443593979, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 74080 + }, + { + "epoch": 0.28641121986671, + "grad_norm": 0.12053419649600983, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 74090 + }, + { + "epoch": 0.28644987707009323, + "grad_norm": 0.1091466099023819, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 74100 + }, + { + "epoch": 0.28648853427347654, + "grad_norm": 0.12219161540269852, + "learning_rate": 0.002, + "loss": 2.345, + "step": 74110 + }, + { + "epoch": 0.2865271914768598, + "grad_norm": 0.11780962347984314, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 74120 + }, + { + "epoch": 0.2865658486802431, + "grad_norm": 0.11488896608352661, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 74130 + }, + { + "epoch": 0.28660450588362635, + "grad_norm": 0.11384780704975128, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 74140 + }, + { + "epoch": 0.28664316308700966, + "grad_norm": 0.11270700395107269, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 74150 + }, + { + "epoch": 0.2866818202903929, + "grad_norm": 0.10677814483642578, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 74160 + }, + { + "epoch": 0.2867204774937762, + "grad_norm": 0.1030648723244667, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 74170 + }, + { + "epoch": 0.28675913469715947, + "grad_norm": 0.11373554915189743, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 74180 + }, + { + "epoch": 0.2867977919005427, + "grad_norm": 0.10422752797603607, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 74190 + }, + { + "epoch": 0.28683644910392603, + "grad_norm": 0.13142213225364685, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 74200 + }, + { + "epoch": 0.2868751063073093, + "grad_norm": 0.11340751498937607, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 74210 + }, + { + "epoch": 0.2869137635106926, + "grad_norm": 0.13458040356636047, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 74220 + }, + { + "epoch": 0.28695242071407584, + "grad_norm": 0.10151758790016174, + "learning_rate": 0.002, + "loss": 2.35, + "step": 74230 + }, + { + "epoch": 0.28699107791745915, + "grad_norm": 0.09405501186847687, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 74240 + }, + { + "epoch": 0.2870297351208424, + "grad_norm": 0.10942229628562927, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 74250 + }, + { + "epoch": 0.2870683923242257, + "grad_norm": 0.10689356923103333, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 74260 + }, + { + "epoch": 0.28710704952760896, + "grad_norm": 0.11839873343706131, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 74270 + }, + { + "epoch": 0.28714570673099227, + "grad_norm": 0.10678387433290482, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 74280 + }, + { + "epoch": 0.2871843639343755, + "grad_norm": 0.11635982990264893, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 74290 + }, + { + "epoch": 0.2872230211377588, + "grad_norm": 0.11184539645910263, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 74300 + }, + { + "epoch": 0.2872616783411421, + "grad_norm": 0.10967237502336502, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 74310 + }, + { + "epoch": 0.2873003355445254, + "grad_norm": 0.12310586124658585, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 74320 + }, + { + "epoch": 0.28733899274790864, + "grad_norm": 0.11185023933649063, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 74330 + }, + { + "epoch": 0.28737764995129195, + "grad_norm": 0.09555628895759583, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 74340 + }, + { + "epoch": 0.2874163071546752, + "grad_norm": 0.1086990013718605, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 74350 + }, + { + "epoch": 0.2874549643580585, + "grad_norm": 0.12187491357326508, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 74360 + }, + { + "epoch": 0.28749362156144176, + "grad_norm": 0.10133063793182373, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 74370 + }, + { + "epoch": 0.287532278764825, + "grad_norm": 0.11328078806400299, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 74380 + }, + { + "epoch": 0.2875709359682083, + "grad_norm": 0.10969198495149612, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 74390 + }, + { + "epoch": 0.28760959317159157, + "grad_norm": 0.09657199680805206, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 74400 + }, + { + "epoch": 0.2876482503749749, + "grad_norm": 0.1048688217997551, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 74410 + }, + { + "epoch": 0.28768690757835813, + "grad_norm": 0.10406801849603653, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 74420 + }, + { + "epoch": 0.28772556478174144, + "grad_norm": 0.10882709175348282, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 74430 + }, + { + "epoch": 0.2877642219851247, + "grad_norm": 0.11413436383008957, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 74440 + }, + { + "epoch": 0.287802879188508, + "grad_norm": 0.10573131591081619, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 74450 + }, + { + "epoch": 0.28784153639189125, + "grad_norm": 0.12862974405288696, + "learning_rate": 0.002, + "loss": 2.369, + "step": 74460 + }, + { + "epoch": 0.28788019359527456, + "grad_norm": 0.11314857006072998, + "learning_rate": 0.002, + "loss": 2.347, + "step": 74470 + }, + { + "epoch": 0.2879188507986578, + "grad_norm": 0.11113854497671127, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 74480 + }, + { + "epoch": 0.2879575080020411, + "grad_norm": 0.11759510636329651, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 74490 + }, + { + "epoch": 0.28799616520542437, + "grad_norm": 0.12655659019947052, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 74500 + }, + { + "epoch": 0.2880348224088077, + "grad_norm": 0.10255160182714462, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 74510 + }, + { + "epoch": 0.28807347961219093, + "grad_norm": 0.10705683380365372, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 74520 + }, + { + "epoch": 0.28811213681557424, + "grad_norm": 0.12066076695919037, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 74530 + }, + { + "epoch": 0.2881507940189575, + "grad_norm": 0.10226041078567505, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 74540 + }, + { + "epoch": 0.2881894512223408, + "grad_norm": 0.1151876375079155, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 74550 + }, + { + "epoch": 0.28822810842572405, + "grad_norm": 0.10210239142179489, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 74560 + }, + { + "epoch": 0.2882667656291073, + "grad_norm": 0.1542600393295288, + "learning_rate": 0.002, + "loss": 2.351, + "step": 74570 + }, + { + "epoch": 0.2883054228324906, + "grad_norm": 0.10761580616235733, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 74580 + }, + { + "epoch": 0.28834408003587386, + "grad_norm": 0.12081687897443771, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 74590 + }, + { + "epoch": 0.28838273723925717, + "grad_norm": 0.4860503673553467, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 74600 + }, + { + "epoch": 0.2884213944426404, + "grad_norm": 0.11622172594070435, + "learning_rate": 0.002, + "loss": 2.348, + "step": 74610 + }, + { + "epoch": 0.28846005164602373, + "grad_norm": 0.10896652936935425, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 74620 + }, + { + "epoch": 0.288498708849407, + "grad_norm": 0.11146285384893417, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 74630 + }, + { + "epoch": 0.2885373660527903, + "grad_norm": 0.1089526042342186, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 74640 + }, + { + "epoch": 0.28857602325617354, + "grad_norm": 0.09288636595010757, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 74650 + }, + { + "epoch": 0.28861468045955685, + "grad_norm": 0.1133343055844307, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 74660 + }, + { + "epoch": 0.2886533376629401, + "grad_norm": 0.1155557855963707, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 74670 + }, + { + "epoch": 0.2886919948663234, + "grad_norm": 0.10346411168575287, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 74680 + }, + { + "epoch": 0.28873065206970666, + "grad_norm": 0.1326557844877243, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 74690 + }, + { + "epoch": 0.28876930927308997, + "grad_norm": 0.10779032111167908, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 74700 + }, + { + "epoch": 0.2888079664764732, + "grad_norm": 0.10755519568920135, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 74710 + }, + { + "epoch": 0.28884662367985653, + "grad_norm": 0.11523166298866272, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 74720 + }, + { + "epoch": 0.2888852808832398, + "grad_norm": 0.12562212347984314, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 74730 + }, + { + "epoch": 0.2889239380866231, + "grad_norm": 0.12147145718336105, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 74740 + }, + { + "epoch": 0.28896259529000634, + "grad_norm": 0.10682717710733414, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 74750 + }, + { + "epoch": 0.2890012524933896, + "grad_norm": 0.11858783662319183, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 74760 + }, + { + "epoch": 0.2890399096967729, + "grad_norm": 0.10563915967941284, + "learning_rate": 0.002, + "loss": 2.3794, + "step": 74770 + }, + { + "epoch": 0.28907856690015615, + "grad_norm": 0.13456431031227112, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 74780 + }, + { + "epoch": 0.28911722410353946, + "grad_norm": 0.11352846771478653, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 74790 + }, + { + "epoch": 0.2891558813069227, + "grad_norm": 0.10515779256820679, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 74800 + }, + { + "epoch": 0.289194538510306, + "grad_norm": 0.11092042177915573, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 74810 + }, + { + "epoch": 0.2892331957136893, + "grad_norm": 0.1329876035451889, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 74820 + }, + { + "epoch": 0.2892718529170726, + "grad_norm": 0.09356291592121124, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 74830 + }, + { + "epoch": 0.28931051012045583, + "grad_norm": 0.09958864748477936, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 74840 + }, + { + "epoch": 0.28934916732383914, + "grad_norm": 0.09781919419765472, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 74850 + }, + { + "epoch": 0.2893878245272224, + "grad_norm": 0.10494308173656464, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 74860 + }, + { + "epoch": 0.2894264817306057, + "grad_norm": 0.09888094663619995, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 74870 + }, + { + "epoch": 0.28946513893398895, + "grad_norm": 0.13520100712776184, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 74880 + }, + { + "epoch": 0.28950379613737226, + "grad_norm": 0.11058992147445679, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 74890 + }, + { + "epoch": 0.2895424533407555, + "grad_norm": 0.11469922959804535, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 74900 + }, + { + "epoch": 0.2895811105441388, + "grad_norm": 0.12745223939418793, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 74910 + }, + { + "epoch": 0.28961976774752207, + "grad_norm": 0.13199283182621002, + "learning_rate": 0.002, + "loss": 2.367, + "step": 74920 + }, + { + "epoch": 0.2896584249509053, + "grad_norm": 0.11187552660703659, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 74930 + }, + { + "epoch": 0.28969708215428863, + "grad_norm": 0.1066499799489975, + "learning_rate": 0.002, + "loss": 2.353, + "step": 74940 + }, + { + "epoch": 0.2897357393576719, + "grad_norm": 0.10952769219875336, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 74950 + }, + { + "epoch": 0.2897743965610552, + "grad_norm": 0.11855336278676987, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 74960 + }, + { + "epoch": 0.28981305376443844, + "grad_norm": 0.11813953518867493, + "learning_rate": 0.002, + "loss": 2.356, + "step": 74970 + }, + { + "epoch": 0.28985171096782175, + "grad_norm": 0.12099150568246841, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 74980 + }, + { + "epoch": 0.289890368171205, + "grad_norm": 0.09434831887483597, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 74990 + }, + { + "epoch": 0.2899290253745883, + "grad_norm": 0.11710929870605469, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 75000 + }, + { + "epoch": 0.28996768257797156, + "grad_norm": 0.11767060309648514, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 75010 + }, + { + "epoch": 0.29000633978135487, + "grad_norm": 0.11021557450294495, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 75020 + }, + { + "epoch": 0.2900449969847381, + "grad_norm": 0.11392875760793686, + "learning_rate": 0.002, + "loss": 2.365, + "step": 75030 + }, + { + "epoch": 0.29008365418812143, + "grad_norm": 0.1143617182970047, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 75040 + }, + { + "epoch": 0.2901223113915047, + "grad_norm": 0.10159295052289963, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 75050 + }, + { + "epoch": 0.290160968594888, + "grad_norm": 0.10352852940559387, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 75060 + }, + { + "epoch": 0.29019962579827124, + "grad_norm": 0.11853168159723282, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 75070 + }, + { + "epoch": 0.29023828300165455, + "grad_norm": 0.11491332948207855, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 75080 + }, + { + "epoch": 0.2902769402050378, + "grad_norm": 0.10985317081212997, + "learning_rate": 0.002, + "loss": 2.362, + "step": 75090 + }, + { + "epoch": 0.2903155974084211, + "grad_norm": 0.11014048010110855, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 75100 + }, + { + "epoch": 0.29035425461180436, + "grad_norm": 0.1153755933046341, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 75110 + }, + { + "epoch": 0.2903929118151876, + "grad_norm": 0.12989766895771027, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 75120 + }, + { + "epoch": 0.2904315690185709, + "grad_norm": 0.10513586550951004, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 75130 + }, + { + "epoch": 0.2904702262219542, + "grad_norm": 0.10572315007448196, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 75140 + }, + { + "epoch": 0.2905088834253375, + "grad_norm": 0.1259058564901352, + "learning_rate": 0.002, + "loss": 2.345, + "step": 75150 + }, + { + "epoch": 0.29054754062872074, + "grad_norm": 0.10508497804403305, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 75160 + }, + { + "epoch": 0.29058619783210404, + "grad_norm": 0.11335578560829163, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 75170 + }, + { + "epoch": 0.2906248550354873, + "grad_norm": 0.12427894026041031, + "learning_rate": 0.002, + "loss": 2.36, + "step": 75180 + }, + { + "epoch": 0.2906635122388706, + "grad_norm": 0.10209350287914276, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 75190 + }, + { + "epoch": 0.29070216944225385, + "grad_norm": 0.11700747907161713, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 75200 + }, + { + "epoch": 0.29074082664563716, + "grad_norm": 0.10781227797269821, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 75210 + }, + { + "epoch": 0.2907794838490204, + "grad_norm": 0.13687002658843994, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 75220 + }, + { + "epoch": 0.2908181410524037, + "grad_norm": 0.11583052575588226, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 75230 + }, + { + "epoch": 0.290856798255787, + "grad_norm": 0.10628215968608856, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 75240 + }, + { + "epoch": 0.2908954554591703, + "grad_norm": 0.11679268628358841, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 75250 + }, + { + "epoch": 0.29093411266255353, + "grad_norm": 0.10749790072441101, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 75260 + }, + { + "epoch": 0.29097276986593684, + "grad_norm": 0.10714562982320786, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 75270 + }, + { + "epoch": 0.2910114270693201, + "grad_norm": 0.12559176981449127, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 75280 + }, + { + "epoch": 0.2910500842727034, + "grad_norm": 0.10603532195091248, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 75290 + }, + { + "epoch": 0.29108874147608665, + "grad_norm": 0.1143733486533165, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 75300 + }, + { + "epoch": 0.2911273986794699, + "grad_norm": 0.10944660753011703, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 75310 + }, + { + "epoch": 0.2911660558828532, + "grad_norm": 0.10007858276367188, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 75320 + }, + { + "epoch": 0.29120471308623647, + "grad_norm": 0.12037455290555954, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 75330 + }, + { + "epoch": 0.2912433702896198, + "grad_norm": 0.44239410758018494, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 75340 + }, + { + "epoch": 0.291282027493003, + "grad_norm": 0.13063888251781464, + "learning_rate": 0.002, + "loss": 2.3893, + "step": 75350 + }, + { + "epoch": 0.29132068469638633, + "grad_norm": 0.1133558601140976, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 75360 + }, + { + "epoch": 0.2913593418997696, + "grad_norm": 0.094174325466156, + "learning_rate": 0.002, + "loss": 2.355, + "step": 75370 + }, + { + "epoch": 0.2913979991031529, + "grad_norm": 0.13268128037452698, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 75380 + }, + { + "epoch": 0.29143665630653615, + "grad_norm": 0.11000560224056244, + "learning_rate": 0.002, + "loss": 2.35, + "step": 75390 + }, + { + "epoch": 0.29147531350991945, + "grad_norm": 0.10913138091564178, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 75400 + }, + { + "epoch": 0.2915139707133027, + "grad_norm": 0.10128472745418549, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 75410 + }, + { + "epoch": 0.291552627916686, + "grad_norm": 0.11382602900266647, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 75420 + }, + { + "epoch": 0.29159128512006927, + "grad_norm": 0.11891323328018188, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 75430 + }, + { + "epoch": 0.2916299423234526, + "grad_norm": 0.10780160874128342, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 75440 + }, + { + "epoch": 0.2916685995268358, + "grad_norm": 0.10481828451156616, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 75450 + }, + { + "epoch": 0.29170725673021913, + "grad_norm": 0.10003086179494858, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 75460 + }, + { + "epoch": 0.2917459139336024, + "grad_norm": 0.12280956655740738, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 75470 + }, + { + "epoch": 0.2917845711369857, + "grad_norm": 0.10013121366500854, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 75480 + }, + { + "epoch": 0.29182322834036895, + "grad_norm": 0.10595941543579102, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 75490 + }, + { + "epoch": 0.2918618855437522, + "grad_norm": 0.14050036668777466, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 75500 + }, + { + "epoch": 0.2919005427471355, + "grad_norm": 0.11663522571325302, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 75510 + }, + { + "epoch": 0.29193919995051876, + "grad_norm": 0.10302326083183289, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 75520 + }, + { + "epoch": 0.29197785715390207, + "grad_norm": 0.10631691664457321, + "learning_rate": 0.002, + "loss": 2.356, + "step": 75530 + }, + { + "epoch": 0.2920165143572853, + "grad_norm": 0.09877592325210571, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 75540 + }, + { + "epoch": 0.2920551715606686, + "grad_norm": 0.13082407414913177, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 75550 + }, + { + "epoch": 0.2920938287640519, + "grad_norm": 0.1210227981209755, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 75560 + }, + { + "epoch": 0.2921324859674352, + "grad_norm": 0.10815666615962982, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 75570 + }, + { + "epoch": 0.29217114317081844, + "grad_norm": 0.10120754688978195, + "learning_rate": 0.002, + "loss": 2.354, + "step": 75580 + }, + { + "epoch": 0.29220980037420174, + "grad_norm": 0.1269613355398178, + "learning_rate": 0.002, + "loss": 2.3739, + "step": 75590 + }, + { + "epoch": 0.292248457577585, + "grad_norm": 0.14352715015411377, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 75600 + }, + { + "epoch": 0.2922871147809683, + "grad_norm": 0.10628818720579147, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 75610 + }, + { + "epoch": 0.29232577198435156, + "grad_norm": 0.12074145674705505, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 75620 + }, + { + "epoch": 0.29236442918773486, + "grad_norm": 0.11499188095331192, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 75630 + }, + { + "epoch": 0.2924030863911181, + "grad_norm": 0.10540103167295456, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 75640 + }, + { + "epoch": 0.2924417435945014, + "grad_norm": 0.10639649629592896, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 75650 + }, + { + "epoch": 0.2924804007978847, + "grad_norm": 0.12016444653272629, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 75660 + }, + { + "epoch": 0.29251905800126793, + "grad_norm": 0.11414739489555359, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 75670 + }, + { + "epoch": 0.29255771520465124, + "grad_norm": 0.1184229627251625, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 75680 + }, + { + "epoch": 0.2925963724080345, + "grad_norm": 0.12431208789348602, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 75690 + }, + { + "epoch": 0.2926350296114178, + "grad_norm": 0.1032339408993721, + "learning_rate": 0.002, + "loss": 2.367, + "step": 75700 + }, + { + "epoch": 0.29267368681480105, + "grad_norm": 0.11731734126806259, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 75710 + }, + { + "epoch": 0.29271234401818436, + "grad_norm": 0.12077368795871735, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 75720 + }, + { + "epoch": 0.2927510012215676, + "grad_norm": 0.1214771568775177, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 75730 + }, + { + "epoch": 0.2927896584249509, + "grad_norm": 0.09876122325658798, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 75740 + }, + { + "epoch": 0.29282831562833417, + "grad_norm": 0.12899163365364075, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 75750 + }, + { + "epoch": 0.2928669728317175, + "grad_norm": 0.13751502335071564, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 75760 + }, + { + "epoch": 0.29290563003510073, + "grad_norm": 0.1476529836654663, + "learning_rate": 0.002, + "loss": 2.351, + "step": 75770 + }, + { + "epoch": 0.29294428723848404, + "grad_norm": 0.10541699826717377, + "learning_rate": 0.002, + "loss": 2.353, + "step": 75780 + }, + { + "epoch": 0.2929829444418673, + "grad_norm": 0.13030430674552917, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 75790 + }, + { + "epoch": 0.2930216016452506, + "grad_norm": 0.1042986512184143, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 75800 + }, + { + "epoch": 0.29306025884863385, + "grad_norm": 0.12977388501167297, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 75810 + }, + { + "epoch": 0.29309891605201716, + "grad_norm": 0.10096988081932068, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 75820 + }, + { + "epoch": 0.2931375732554004, + "grad_norm": 0.1118810772895813, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 75830 + }, + { + "epoch": 0.2931762304587837, + "grad_norm": 0.10725447535514832, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 75840 + }, + { + "epoch": 0.29321488766216697, + "grad_norm": 0.11546574532985687, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 75850 + }, + { + "epoch": 0.2932535448655502, + "grad_norm": 0.09685637801885605, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 75860 + }, + { + "epoch": 0.2932922020689335, + "grad_norm": 0.11934518814086914, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 75870 + }, + { + "epoch": 0.2933308592723168, + "grad_norm": 0.10132249444723129, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 75880 + }, + { + "epoch": 0.2933695164757001, + "grad_norm": 0.12421329319477081, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 75890 + }, + { + "epoch": 0.29340817367908334, + "grad_norm": 0.09948378801345825, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 75900 + }, + { + "epoch": 0.29344683088246665, + "grad_norm": 0.1021517962217331, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 75910 + }, + { + "epoch": 0.2934854880858499, + "grad_norm": 0.12443681061267853, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 75920 + }, + { + "epoch": 0.2935241452892332, + "grad_norm": 0.1347322016954422, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 75930 + }, + { + "epoch": 0.29356280249261646, + "grad_norm": 0.12235191464424133, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 75940 + }, + { + "epoch": 0.29360145969599977, + "grad_norm": 0.1328326165676117, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 75950 + }, + { + "epoch": 0.293640116899383, + "grad_norm": 0.103482685983181, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 75960 + }, + { + "epoch": 0.2936787741027663, + "grad_norm": 0.12685489654541016, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 75970 + }, + { + "epoch": 0.2937174313061496, + "grad_norm": 0.10713542252779007, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 75980 + }, + { + "epoch": 0.2937560885095329, + "grad_norm": 0.11853060871362686, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 75990 + }, + { + "epoch": 0.29379474571291614, + "grad_norm": 0.13181206583976746, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 76000 + }, + { + "epoch": 0.29383340291629945, + "grad_norm": 0.09293057769536972, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 76010 + }, + { + "epoch": 0.2938720601196827, + "grad_norm": 0.09713239222764969, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 76020 + }, + { + "epoch": 0.293910717323066, + "grad_norm": 0.11791348457336426, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 76030 + }, + { + "epoch": 0.29394937452644926, + "grad_norm": 0.10660435259342194, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 76040 + }, + { + "epoch": 0.2939880317298325, + "grad_norm": 0.09991808235645294, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 76050 + }, + { + "epoch": 0.2940266889332158, + "grad_norm": 0.10632813721895218, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 76060 + }, + { + "epoch": 0.29406534613659907, + "grad_norm": 0.16934417188167572, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 76070 + }, + { + "epoch": 0.2941040033399824, + "grad_norm": 0.09975051879882812, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 76080 + }, + { + "epoch": 0.29414266054336563, + "grad_norm": 0.08869673311710358, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 76090 + }, + { + "epoch": 0.29418131774674894, + "grad_norm": 0.11789096891880035, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 76100 + }, + { + "epoch": 0.2942199749501322, + "grad_norm": 0.09168128669261932, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 76110 + }, + { + "epoch": 0.2942586321535155, + "grad_norm": 0.10032834857702255, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 76120 + }, + { + "epoch": 0.29429728935689875, + "grad_norm": 0.12767310440540314, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 76130 + }, + { + "epoch": 0.29433594656028206, + "grad_norm": 0.10768149048089981, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 76140 + }, + { + "epoch": 0.2943746037636653, + "grad_norm": 0.11018497496843338, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 76150 + }, + { + "epoch": 0.2944132609670486, + "grad_norm": 0.10596516728401184, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 76160 + }, + { + "epoch": 0.29445191817043187, + "grad_norm": 0.12382093816995621, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 76170 + }, + { + "epoch": 0.2944905753738152, + "grad_norm": 0.0991792306303978, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 76180 + }, + { + "epoch": 0.29452923257719843, + "grad_norm": 0.11784415692090988, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 76190 + }, + { + "epoch": 0.29456788978058174, + "grad_norm": 0.11032547801733017, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 76200 + }, + { + "epoch": 0.294606546983965, + "grad_norm": 0.11199129372835159, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 76210 + }, + { + "epoch": 0.2946452041873483, + "grad_norm": 0.0953453853726387, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 76220 + }, + { + "epoch": 0.29468386139073155, + "grad_norm": 0.10188492387533188, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 76230 + }, + { + "epoch": 0.2947225185941148, + "grad_norm": 0.1126917153596878, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 76240 + }, + { + "epoch": 0.2947611757974981, + "grad_norm": 0.10451949387788773, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 76250 + }, + { + "epoch": 0.29479983300088136, + "grad_norm": 0.11269045621156693, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 76260 + }, + { + "epoch": 0.29483849020426467, + "grad_norm": 0.10896584391593933, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 76270 + }, + { + "epoch": 0.2948771474076479, + "grad_norm": 0.10747255384922028, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 76280 + }, + { + "epoch": 0.29491580461103123, + "grad_norm": 0.12370602786540985, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 76290 + }, + { + "epoch": 0.2949544618144145, + "grad_norm": 0.12151787430047989, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 76300 + }, + { + "epoch": 0.2949931190177978, + "grad_norm": 0.1202084943652153, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 76310 + }, + { + "epoch": 0.29503177622118104, + "grad_norm": 0.22490330040454865, + "learning_rate": 0.002, + "loss": 2.362, + "step": 76320 + }, + { + "epoch": 0.29507043342456435, + "grad_norm": 0.12307056784629822, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 76330 + }, + { + "epoch": 0.2951090906279476, + "grad_norm": 0.11710674315690994, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 76340 + }, + { + "epoch": 0.2951477478313309, + "grad_norm": 0.12242462486028671, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 76350 + }, + { + "epoch": 0.29518640503471416, + "grad_norm": 0.10505883395671844, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 76360 + }, + { + "epoch": 0.29522506223809747, + "grad_norm": 0.13543276488780975, + "learning_rate": 0.002, + "loss": 2.3714, + "step": 76370 + }, + { + "epoch": 0.2952637194414807, + "grad_norm": 0.11254122108221054, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 76380 + }, + { + "epoch": 0.29530237664486403, + "grad_norm": 0.10214526206254959, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 76390 + }, + { + "epoch": 0.2953410338482473, + "grad_norm": 0.1244417205452919, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 76400 + }, + { + "epoch": 0.2953796910516306, + "grad_norm": 0.1231013685464859, + "learning_rate": 0.002, + "loss": 2.349, + "step": 76410 + }, + { + "epoch": 0.29541834825501384, + "grad_norm": 0.11223754286766052, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 76420 + }, + { + "epoch": 0.2954570054583971, + "grad_norm": 0.09990517050027847, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 76430 + }, + { + "epoch": 0.2954956626617804, + "grad_norm": 0.0930066928267479, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 76440 + }, + { + "epoch": 0.29553431986516365, + "grad_norm": 0.10342702269554138, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 76450 + }, + { + "epoch": 0.29557297706854696, + "grad_norm": 0.12185720354318619, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 76460 + }, + { + "epoch": 0.2956116342719302, + "grad_norm": 0.11930191516876221, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 76470 + }, + { + "epoch": 0.2956502914753135, + "grad_norm": 0.09556443989276886, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 76480 + }, + { + "epoch": 0.2956889486786968, + "grad_norm": 0.12179828435182571, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 76490 + }, + { + "epoch": 0.2957276058820801, + "grad_norm": 0.11521507054567337, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 76500 + }, + { + "epoch": 0.29576626308546333, + "grad_norm": 0.10643960535526276, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 76510 + }, + { + "epoch": 0.29580492028884664, + "grad_norm": 0.11946967989206314, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 76520 + }, + { + "epoch": 0.2958435774922299, + "grad_norm": 0.12031736969947815, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 76530 + }, + { + "epoch": 0.2958822346956132, + "grad_norm": 0.11797958612442017, + "learning_rate": 0.002, + "loss": 2.359, + "step": 76540 + }, + { + "epoch": 0.29592089189899645, + "grad_norm": 0.09968064725399017, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 76550 + }, + { + "epoch": 0.29595954910237976, + "grad_norm": 0.09612449258565903, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 76560 + }, + { + "epoch": 0.295998206305763, + "grad_norm": 0.14262156188488007, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 76570 + }, + { + "epoch": 0.2960368635091463, + "grad_norm": 0.1107003390789032, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 76580 + }, + { + "epoch": 0.29607552071252957, + "grad_norm": 0.11529079079627991, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 76590 + }, + { + "epoch": 0.2961141779159128, + "grad_norm": 0.1252748817205429, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 76600 + }, + { + "epoch": 0.29615283511929613, + "grad_norm": 0.09772691130638123, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 76610 + }, + { + "epoch": 0.2961914923226794, + "grad_norm": 0.10354173183441162, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 76620 + }, + { + "epoch": 0.2962301495260627, + "grad_norm": 0.12318793684244156, + "learning_rate": 0.002, + "loss": 2.356, + "step": 76630 + }, + { + "epoch": 0.29626880672944594, + "grad_norm": 0.11388937383890152, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 76640 + }, + { + "epoch": 0.29630746393282925, + "grad_norm": 0.14349432289600372, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 76650 + }, + { + "epoch": 0.2963461211362125, + "grad_norm": 0.09921904653310776, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 76660 + }, + { + "epoch": 0.2963847783395958, + "grad_norm": 0.1507887989282608, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 76670 + }, + { + "epoch": 0.29642343554297906, + "grad_norm": 0.10376685857772827, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 76680 + }, + { + "epoch": 0.29646209274636237, + "grad_norm": 0.12126835435628891, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 76690 + }, + { + "epoch": 0.2965007499497456, + "grad_norm": 0.10878845304250717, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 76700 + }, + { + "epoch": 0.29653940715312893, + "grad_norm": 0.12074249237775803, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 76710 + }, + { + "epoch": 0.2965780643565122, + "grad_norm": 0.11226803064346313, + "learning_rate": 0.002, + "loss": 2.349, + "step": 76720 + }, + { + "epoch": 0.2966167215598955, + "grad_norm": 0.09781412780284882, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 76730 + }, + { + "epoch": 0.29665537876327874, + "grad_norm": 0.11269398033618927, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 76740 + }, + { + "epoch": 0.29669403596666205, + "grad_norm": 0.11323588341474533, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 76750 + }, + { + "epoch": 0.2967326931700453, + "grad_norm": 0.09751637279987335, + "learning_rate": 0.002, + "loss": 2.347, + "step": 76760 + }, + { + "epoch": 0.2967713503734286, + "grad_norm": 0.10050305724143982, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 76770 + }, + { + "epoch": 0.29681000757681186, + "grad_norm": 0.10856390744447708, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 76780 + }, + { + "epoch": 0.2968486647801951, + "grad_norm": 0.12790358066558838, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 76790 + }, + { + "epoch": 0.2968873219835784, + "grad_norm": 0.1087295264005661, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 76800 + }, + { + "epoch": 0.2969259791869617, + "grad_norm": 0.11008793860673904, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 76810 + }, + { + "epoch": 0.296964636390345, + "grad_norm": 0.11568048596382141, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 76820 + }, + { + "epoch": 0.29700329359372823, + "grad_norm": 0.2546520531177521, + "learning_rate": 0.002, + "loss": 2.348, + "step": 76830 + }, + { + "epoch": 0.29704195079711154, + "grad_norm": 0.10498040169477463, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 76840 + }, + { + "epoch": 0.2970806080004948, + "grad_norm": 0.10412923991680145, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 76850 + }, + { + "epoch": 0.2971192652038781, + "grad_norm": 0.10490905493497849, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 76860 + }, + { + "epoch": 0.29715792240726135, + "grad_norm": 0.11890088021755219, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 76870 + }, + { + "epoch": 0.29719657961064466, + "grad_norm": 0.1103493794798851, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 76880 + }, + { + "epoch": 0.2972352368140279, + "grad_norm": 0.1093481257557869, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 76890 + }, + { + "epoch": 0.2972738940174112, + "grad_norm": 0.11652281880378723, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 76900 + }, + { + "epoch": 0.2973125512207945, + "grad_norm": 0.10535074770450592, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 76910 + }, + { + "epoch": 0.2973512084241778, + "grad_norm": 0.11236576735973358, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 76920 + }, + { + "epoch": 0.29738986562756103, + "grad_norm": 0.11530499160289764, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 76930 + }, + { + "epoch": 0.29742852283094434, + "grad_norm": 0.15051789581775665, + "learning_rate": 0.002, + "loss": 2.376, + "step": 76940 + }, + { + "epoch": 0.2974671800343276, + "grad_norm": 0.10237208753824234, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 76950 + }, + { + "epoch": 0.2975058372377109, + "grad_norm": 0.11303669959306717, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 76960 + }, + { + "epoch": 0.29754449444109415, + "grad_norm": 0.12454624474048615, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 76970 + }, + { + "epoch": 0.2975831516444774, + "grad_norm": 0.12306322157382965, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 76980 + }, + { + "epoch": 0.2976218088478607, + "grad_norm": 0.10339666157960892, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 76990 + }, + { + "epoch": 0.29766046605124397, + "grad_norm": 0.09617898613214493, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 77000 + }, + { + "epoch": 0.2976991232546273, + "grad_norm": 0.11983972787857056, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 77010 + }, + { + "epoch": 0.2977377804580105, + "grad_norm": 0.13938377797603607, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 77020 + }, + { + "epoch": 0.29777643766139383, + "grad_norm": 0.10169366747140884, + "learning_rate": 0.002, + "loss": 2.348, + "step": 77030 + }, + { + "epoch": 0.2978150948647771, + "grad_norm": 0.1186128631234169, + "learning_rate": 0.002, + "loss": 2.346, + "step": 77040 + }, + { + "epoch": 0.2978537520681604, + "grad_norm": 0.0991736575961113, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 77050 + }, + { + "epoch": 0.29789240927154365, + "grad_norm": 0.11100050061941147, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 77060 + }, + { + "epoch": 0.29793106647492695, + "grad_norm": 0.1000758484005928, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 77070 + }, + { + "epoch": 0.2979697236783102, + "grad_norm": 0.11656024307012558, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 77080 + }, + { + "epoch": 0.2980083808816935, + "grad_norm": 0.11905937641859055, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 77090 + }, + { + "epoch": 0.29804703808507677, + "grad_norm": 0.12053173780441284, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 77100 + }, + { + "epoch": 0.2980856952884601, + "grad_norm": 0.11425994336605072, + "learning_rate": 0.002, + "loss": 2.354, + "step": 77110 + }, + { + "epoch": 0.2981243524918433, + "grad_norm": 0.1111239418387413, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 77120 + }, + { + "epoch": 0.29816300969522663, + "grad_norm": 0.11100706458091736, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 77130 + }, + { + "epoch": 0.2982016668986099, + "grad_norm": 0.10714425891637802, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 77140 + }, + { + "epoch": 0.2982403241019932, + "grad_norm": 0.10456814616918564, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 77150 + }, + { + "epoch": 0.29827898130537644, + "grad_norm": 0.10369396954774857, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 77160 + }, + { + "epoch": 0.2983176385087597, + "grad_norm": 0.12451314181089401, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 77170 + }, + { + "epoch": 0.298356295712143, + "grad_norm": 0.10822725296020508, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 77180 + }, + { + "epoch": 0.29839495291552626, + "grad_norm": 0.13343235850334167, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 77190 + }, + { + "epoch": 0.29843361011890956, + "grad_norm": 0.10224916785955429, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 77200 + }, + { + "epoch": 0.2984722673222928, + "grad_norm": 0.11366762965917587, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 77210 + }, + { + "epoch": 0.2985109245256761, + "grad_norm": 0.11645630747079849, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 77220 + }, + { + "epoch": 0.2985495817290594, + "grad_norm": 0.09775960445404053, + "learning_rate": 0.002, + "loss": 2.364, + "step": 77230 + }, + { + "epoch": 0.2985882389324427, + "grad_norm": 0.11187317967414856, + "learning_rate": 0.002, + "loss": 2.349, + "step": 77240 + }, + { + "epoch": 0.29862689613582594, + "grad_norm": 0.11537643522024155, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 77250 + }, + { + "epoch": 0.29866555333920924, + "grad_norm": 0.10838068276643753, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 77260 + }, + { + "epoch": 0.2987042105425925, + "grad_norm": 0.12092362344264984, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 77270 + }, + { + "epoch": 0.2987428677459758, + "grad_norm": 0.11384254693984985, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 77280 + }, + { + "epoch": 0.29878152494935906, + "grad_norm": 0.11899744719266891, + "learning_rate": 0.002, + "loss": 2.35, + "step": 77290 + }, + { + "epoch": 0.29882018215274236, + "grad_norm": 0.12392907589673996, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 77300 + }, + { + "epoch": 0.2988588393561256, + "grad_norm": 0.1090821698307991, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 77310 + }, + { + "epoch": 0.2988974965595089, + "grad_norm": 0.13105599582195282, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 77320 + }, + { + "epoch": 0.2989361537628922, + "grad_norm": 0.11010351777076721, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 77330 + }, + { + "epoch": 0.29897481096627543, + "grad_norm": 0.10212308168411255, + "learning_rate": 0.002, + "loss": 2.34, + "step": 77340 + }, + { + "epoch": 0.29901346816965874, + "grad_norm": 0.12034210562705994, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 77350 + }, + { + "epoch": 0.299052125373042, + "grad_norm": 0.10495106130838394, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 77360 + }, + { + "epoch": 0.2990907825764253, + "grad_norm": 0.10167660564184189, + "learning_rate": 0.002, + "loss": 2.347, + "step": 77370 + }, + { + "epoch": 0.29912943977980855, + "grad_norm": 0.11317375302314758, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 77380 + }, + { + "epoch": 0.29916809698319186, + "grad_norm": 0.10915170609951019, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 77390 + }, + { + "epoch": 0.2992067541865751, + "grad_norm": 0.10342042148113251, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 77400 + }, + { + "epoch": 0.2992454113899584, + "grad_norm": 0.11812546849250793, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 77410 + }, + { + "epoch": 0.29928406859334167, + "grad_norm": 0.10676705092191696, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 77420 + }, + { + "epoch": 0.299322725796725, + "grad_norm": 0.11633370071649551, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 77430 + }, + { + "epoch": 0.2993613830001082, + "grad_norm": 0.10170099139213562, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 77440 + }, + { + "epoch": 0.29940004020349154, + "grad_norm": 0.13479268550872803, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 77450 + }, + { + "epoch": 0.2994386974068748, + "grad_norm": 0.11963546276092529, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 77460 + }, + { + "epoch": 0.2994773546102581, + "grad_norm": 0.12442842125892639, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 77470 + }, + { + "epoch": 0.29951601181364135, + "grad_norm": 0.1305210441350937, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 77480 + }, + { + "epoch": 0.29955466901702466, + "grad_norm": 0.10134751349687576, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 77490 + }, + { + "epoch": 0.2995933262204079, + "grad_norm": 0.11258494108915329, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 77500 + }, + { + "epoch": 0.2996319834237912, + "grad_norm": 0.11224709451198578, + "learning_rate": 0.002, + "loss": 2.346, + "step": 77510 + }, + { + "epoch": 0.29967064062717447, + "grad_norm": 0.09768752753734589, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 77520 + }, + { + "epoch": 0.2997092978305577, + "grad_norm": 0.1073933094739914, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 77530 + }, + { + "epoch": 0.299747955033941, + "grad_norm": 0.10424616187810898, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 77540 + }, + { + "epoch": 0.2997866122373243, + "grad_norm": 0.12393969297409058, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 77550 + }, + { + "epoch": 0.2998252694407076, + "grad_norm": 0.09931936860084534, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 77560 + }, + { + "epoch": 0.29986392664409084, + "grad_norm": 0.09614407271146774, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 77570 + }, + { + "epoch": 0.29990258384747415, + "grad_norm": 0.12007300555706024, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 77580 + }, + { + "epoch": 0.2999412410508574, + "grad_norm": 0.14052484929561615, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 77590 + }, + { + "epoch": 0.2999798982542407, + "grad_norm": 0.12478945404291153, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 77600 + }, + { + "epoch": 0.30001855545762396, + "grad_norm": 0.09192482382059097, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 77610 + }, + { + "epoch": 0.30005721266100727, + "grad_norm": 0.13805346190929413, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 77620 + }, + { + "epoch": 0.3000958698643905, + "grad_norm": 0.10263418406248093, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 77630 + }, + { + "epoch": 0.3001345270677738, + "grad_norm": 0.11766941100358963, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 77640 + }, + { + "epoch": 0.3001731842711571, + "grad_norm": 0.1011272743344307, + "learning_rate": 0.002, + "loss": 2.3752, + "step": 77650 + }, + { + "epoch": 0.3002118414745404, + "grad_norm": 0.11512536555528641, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 77660 + }, + { + "epoch": 0.30025049867792364, + "grad_norm": 0.11931298673152924, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 77670 + }, + { + "epoch": 0.30028915588130695, + "grad_norm": 0.11683052778244019, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 77680 + }, + { + "epoch": 0.3003278130846902, + "grad_norm": 0.10903342068195343, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 77690 + }, + { + "epoch": 0.3003664702880735, + "grad_norm": 0.11838795244693756, + "learning_rate": 0.002, + "loss": 2.357, + "step": 77700 + }, + { + "epoch": 0.30040512749145676, + "grad_norm": 0.1060962900519371, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 77710 + }, + { + "epoch": 0.30044378469484, + "grad_norm": 0.09655400365591049, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 77720 + }, + { + "epoch": 0.3004824418982233, + "grad_norm": 0.12824280560016632, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 77730 + }, + { + "epoch": 0.30052109910160657, + "grad_norm": 0.10142549872398376, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 77740 + }, + { + "epoch": 0.3005597563049899, + "grad_norm": 0.10544409602880478, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 77750 + }, + { + "epoch": 0.30059841350837313, + "grad_norm": 0.10959829390048981, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 77760 + }, + { + "epoch": 0.30063707071175644, + "grad_norm": 0.11233651638031006, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 77770 + }, + { + "epoch": 0.3006757279151397, + "grad_norm": 0.14760273694992065, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 77780 + }, + { + "epoch": 0.300714385118523, + "grad_norm": 0.12075531482696533, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 77790 + }, + { + "epoch": 0.30075304232190625, + "grad_norm": 0.11051961779594421, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 77800 + }, + { + "epoch": 0.30079169952528956, + "grad_norm": 0.10270072519779205, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 77810 + }, + { + "epoch": 0.3008303567286728, + "grad_norm": 0.13468053936958313, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 77820 + }, + { + "epoch": 0.3008690139320561, + "grad_norm": 0.11314025521278381, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 77830 + }, + { + "epoch": 0.30090767113543937, + "grad_norm": 0.09416459500789642, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 77840 + }, + { + "epoch": 0.3009463283388227, + "grad_norm": 0.09706717729568481, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 77850 + }, + { + "epoch": 0.30098498554220593, + "grad_norm": 0.1195613220334053, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 77860 + }, + { + "epoch": 0.30102364274558924, + "grad_norm": 0.10404033958911896, + "learning_rate": 0.002, + "loss": 2.374, + "step": 77870 + }, + { + "epoch": 0.3010622999489725, + "grad_norm": 0.10307558625936508, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 77880 + }, + { + "epoch": 0.3011009571523558, + "grad_norm": 0.10437768697738647, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 77890 + }, + { + "epoch": 0.30113961435573905, + "grad_norm": 0.11560779064893723, + "learning_rate": 0.002, + "loss": 2.344, + "step": 77900 + }, + { + "epoch": 0.3011782715591223, + "grad_norm": 0.10063166171312332, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 77910 + }, + { + "epoch": 0.3012169287625056, + "grad_norm": 0.1272033303976059, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 77920 + }, + { + "epoch": 0.30125558596588886, + "grad_norm": 0.11612261831760406, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 77930 + }, + { + "epoch": 0.30129424316927217, + "grad_norm": 0.12208713591098785, + "learning_rate": 0.002, + "loss": 2.358, + "step": 77940 + }, + { + "epoch": 0.3013329003726554, + "grad_norm": 0.10828813910484314, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 77950 + }, + { + "epoch": 0.30137155757603873, + "grad_norm": 0.11465118825435638, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 77960 + }, + { + "epoch": 0.301410214779422, + "grad_norm": 0.11285607516765594, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 77970 + }, + { + "epoch": 0.3014488719828053, + "grad_norm": 0.09235849976539612, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 77980 + }, + { + "epoch": 0.30148752918618854, + "grad_norm": 0.1082046627998352, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 77990 + }, + { + "epoch": 0.30152618638957185, + "grad_norm": 0.09929527342319489, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 78000 + }, + { + "epoch": 0.3015648435929551, + "grad_norm": 0.12402932345867157, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 78010 + }, + { + "epoch": 0.3016035007963384, + "grad_norm": 0.11547496914863586, + "learning_rate": 0.002, + "loss": 2.355, + "step": 78020 + }, + { + "epoch": 0.30164215799972166, + "grad_norm": 0.10738544911146164, + "learning_rate": 0.002, + "loss": 2.346, + "step": 78030 + }, + { + "epoch": 0.30168081520310497, + "grad_norm": 0.09993022680282593, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 78040 + }, + { + "epoch": 0.3017194724064882, + "grad_norm": 0.10725497454404831, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 78050 + }, + { + "epoch": 0.30175812960987153, + "grad_norm": 0.12580275535583496, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 78060 + }, + { + "epoch": 0.3017967868132548, + "grad_norm": 0.10700584203004837, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 78070 + }, + { + "epoch": 0.30183544401663803, + "grad_norm": 0.11347413808107376, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 78080 + }, + { + "epoch": 0.30187410122002134, + "grad_norm": 0.11956752836704254, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 78090 + }, + { + "epoch": 0.3019127584234046, + "grad_norm": 0.10778584331274033, + "learning_rate": 0.002, + "loss": 2.362, + "step": 78100 + }, + { + "epoch": 0.3019514156267879, + "grad_norm": 0.11137451976537704, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 78110 + }, + { + "epoch": 0.30199007283017115, + "grad_norm": 0.12869513034820557, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 78120 + }, + { + "epoch": 0.30202873003355446, + "grad_norm": 0.10465627908706665, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 78130 + }, + { + "epoch": 0.3020673872369377, + "grad_norm": 0.11821458488702774, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 78140 + }, + { + "epoch": 0.302106044440321, + "grad_norm": 0.11543087661266327, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 78150 + }, + { + "epoch": 0.30214470164370427, + "grad_norm": 0.12336330860853195, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 78160 + }, + { + "epoch": 0.3021833588470876, + "grad_norm": 0.10734038800001144, + "learning_rate": 0.002, + "loss": 2.362, + "step": 78170 + }, + { + "epoch": 0.30222201605047083, + "grad_norm": 0.10224812477827072, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 78180 + }, + { + "epoch": 0.30226067325385414, + "grad_norm": 0.11591426283121109, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 78190 + }, + { + "epoch": 0.3022993304572374, + "grad_norm": 0.09956828504800797, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 78200 + }, + { + "epoch": 0.3023379876606207, + "grad_norm": 0.1258390098810196, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 78210 + }, + { + "epoch": 0.30237664486400395, + "grad_norm": 0.1432776004076004, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 78220 + }, + { + "epoch": 0.30241530206738726, + "grad_norm": 0.1217392161488533, + "learning_rate": 0.002, + "loss": 2.363, + "step": 78230 + }, + { + "epoch": 0.3024539592707705, + "grad_norm": 0.1125762015581131, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 78240 + }, + { + "epoch": 0.3024926164741538, + "grad_norm": 0.11079467087984085, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 78250 + }, + { + "epoch": 0.30253127367753707, + "grad_norm": 0.14208978414535522, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 78260 + }, + { + "epoch": 0.3025699308809203, + "grad_norm": 0.10825013369321823, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 78270 + }, + { + "epoch": 0.30260858808430363, + "grad_norm": 0.12011104822158813, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 78280 + }, + { + "epoch": 0.3026472452876869, + "grad_norm": 0.1183653324842453, + "learning_rate": 0.002, + "loss": 2.357, + "step": 78290 + }, + { + "epoch": 0.3026859024910702, + "grad_norm": 0.1488329917192459, + "learning_rate": 0.002, + "loss": 2.34, + "step": 78300 + }, + { + "epoch": 0.30272455969445344, + "grad_norm": 0.10774101316928864, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 78310 + }, + { + "epoch": 0.30276321689783675, + "grad_norm": 0.11320872604846954, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 78320 + }, + { + "epoch": 0.30280187410122, + "grad_norm": 0.12253223359584808, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 78330 + }, + { + "epoch": 0.3028405313046033, + "grad_norm": 0.1480216532945633, + "learning_rate": 0.002, + "loss": 2.368, + "step": 78340 + }, + { + "epoch": 0.30287918850798656, + "grad_norm": 0.11408364027738571, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 78350 + }, + { + "epoch": 0.30291784571136987, + "grad_norm": 0.09980117529630661, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 78360 + }, + { + "epoch": 0.3029565029147531, + "grad_norm": 0.1159496158361435, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 78370 + }, + { + "epoch": 0.30299516011813643, + "grad_norm": 0.10608360916376114, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 78380 + }, + { + "epoch": 0.3030338173215197, + "grad_norm": 0.09666457027196884, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 78390 + }, + { + "epoch": 0.303072474524903, + "grad_norm": 0.1286255568265915, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 78400 + }, + { + "epoch": 0.30311113172828624, + "grad_norm": 0.14180733263492584, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 78410 + }, + { + "epoch": 0.30314978893166955, + "grad_norm": 0.12679146230220795, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 78420 + }, + { + "epoch": 0.3031884461350528, + "grad_norm": 0.09461666643619537, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 78430 + }, + { + "epoch": 0.3032271033384361, + "grad_norm": 0.17039163410663605, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 78440 + }, + { + "epoch": 0.30326576054181936, + "grad_norm": 0.11002831906080246, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 78450 + }, + { + "epoch": 0.3033044177452026, + "grad_norm": 0.10490234941244125, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 78460 + }, + { + "epoch": 0.3033430749485859, + "grad_norm": 0.10305456072092056, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 78470 + }, + { + "epoch": 0.3033817321519692, + "grad_norm": 0.13403716683387756, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 78480 + }, + { + "epoch": 0.3034203893553525, + "grad_norm": 0.09878389537334442, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 78490 + }, + { + "epoch": 0.30345904655873573, + "grad_norm": 0.10970155149698257, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 78500 + }, + { + "epoch": 0.30349770376211904, + "grad_norm": 0.10358286648988724, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 78510 + }, + { + "epoch": 0.3035363609655023, + "grad_norm": 0.1220836266875267, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 78520 + }, + { + "epoch": 0.3035750181688856, + "grad_norm": 0.10189653187990189, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 78530 + }, + { + "epoch": 0.30361367537226885, + "grad_norm": 0.1015162542462349, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 78540 + }, + { + "epoch": 0.30365233257565216, + "grad_norm": 0.11576369404792786, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 78550 + }, + { + "epoch": 0.3036909897790354, + "grad_norm": 0.18061217665672302, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 78560 + }, + { + "epoch": 0.3037296469824187, + "grad_norm": 0.1377808004617691, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 78570 + }, + { + "epoch": 0.303768304185802, + "grad_norm": 0.10999388247728348, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 78580 + }, + { + "epoch": 0.3038069613891853, + "grad_norm": 0.11197017133235931, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 78590 + }, + { + "epoch": 0.30384561859256853, + "grad_norm": 0.10789023339748383, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 78600 + }, + { + "epoch": 0.30388427579595184, + "grad_norm": 0.1047627180814743, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 78610 + }, + { + "epoch": 0.3039229329993351, + "grad_norm": 0.12693625688552856, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 78620 + }, + { + "epoch": 0.3039615902027184, + "grad_norm": 0.10265807062387466, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 78630 + }, + { + "epoch": 0.30400024740610165, + "grad_norm": 0.12135670334100723, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 78640 + }, + { + "epoch": 0.3040389046094849, + "grad_norm": 0.11370059847831726, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 78650 + }, + { + "epoch": 0.3040775618128682, + "grad_norm": 0.10926605761051178, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 78660 + }, + { + "epoch": 0.30411621901625147, + "grad_norm": 0.11264129728078842, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 78670 + }, + { + "epoch": 0.3041548762196348, + "grad_norm": 0.11051646620035172, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 78680 + }, + { + "epoch": 0.304193533423018, + "grad_norm": 0.11824481189250946, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 78690 + }, + { + "epoch": 0.30423219062640133, + "grad_norm": 0.11840855330228806, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 78700 + }, + { + "epoch": 0.3042708478297846, + "grad_norm": 0.12107060849666595, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 78710 + }, + { + "epoch": 0.3043095050331679, + "grad_norm": 0.11350583285093307, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 78720 + }, + { + "epoch": 0.30434816223655115, + "grad_norm": 0.11291664838790894, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 78730 + }, + { + "epoch": 0.30438681943993445, + "grad_norm": 0.1264103800058365, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 78740 + }, + { + "epoch": 0.3044254766433177, + "grad_norm": 0.10281990468502045, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 78750 + }, + { + "epoch": 0.304464133846701, + "grad_norm": 0.13915292918682098, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 78760 + }, + { + "epoch": 0.30450279105008426, + "grad_norm": 0.1522568017244339, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 78770 + }, + { + "epoch": 0.3045414482534676, + "grad_norm": 0.09680493921041489, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 78780 + }, + { + "epoch": 0.3045801054568508, + "grad_norm": 0.0962231457233429, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 78790 + }, + { + "epoch": 0.30461876266023413, + "grad_norm": 0.12869805097579956, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 78800 + }, + { + "epoch": 0.3046574198636174, + "grad_norm": 0.0894971638917923, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 78810 + }, + { + "epoch": 0.3046960770670007, + "grad_norm": 0.11182959377765656, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 78820 + }, + { + "epoch": 0.30473473427038394, + "grad_norm": 0.1098380908370018, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 78830 + }, + { + "epoch": 0.3047733914737672, + "grad_norm": 0.13078713417053223, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 78840 + }, + { + "epoch": 0.3048120486771505, + "grad_norm": 0.21705102920532227, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 78850 + }, + { + "epoch": 0.30485070588053376, + "grad_norm": 0.1491706222295761, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 78860 + }, + { + "epoch": 0.30488936308391706, + "grad_norm": 0.12133818864822388, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 78870 + }, + { + "epoch": 0.3049280202873003, + "grad_norm": 0.11912494152784348, + "learning_rate": 0.002, + "loss": 2.344, + "step": 78880 + }, + { + "epoch": 0.3049666774906836, + "grad_norm": 0.11220871657133102, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 78890 + }, + { + "epoch": 0.3050053346940669, + "grad_norm": 0.13244259357452393, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 78900 + }, + { + "epoch": 0.3050439918974502, + "grad_norm": 0.11833749711513519, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 78910 + }, + { + "epoch": 0.30508264910083344, + "grad_norm": 0.1003163605928421, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 78920 + }, + { + "epoch": 0.30512130630421674, + "grad_norm": 0.11988737434148788, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 78930 + }, + { + "epoch": 0.3051599635076, + "grad_norm": 0.11213191598653793, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 78940 + }, + { + "epoch": 0.3051986207109833, + "grad_norm": 0.11382569372653961, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 78950 + }, + { + "epoch": 0.30523727791436656, + "grad_norm": 0.1199234127998352, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 78960 + }, + { + "epoch": 0.30527593511774986, + "grad_norm": 0.11544682085514069, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 78970 + }, + { + "epoch": 0.3053145923211331, + "grad_norm": 0.09563816338777542, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 78980 + }, + { + "epoch": 0.3053532495245164, + "grad_norm": 0.1085505336523056, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 78990 + }, + { + "epoch": 0.3053919067278997, + "grad_norm": 0.11609254777431488, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 79000 + }, + { + "epoch": 0.30543056393128293, + "grad_norm": 0.10157139599323273, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 79010 + }, + { + "epoch": 0.30546922113466624, + "grad_norm": 0.11894809454679489, + "learning_rate": 0.002, + "loss": 2.3742, + "step": 79020 + }, + { + "epoch": 0.3055078783380495, + "grad_norm": 0.10765823721885681, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 79030 + }, + { + "epoch": 0.3055465355414328, + "grad_norm": 0.10459846258163452, + "learning_rate": 0.002, + "loss": 2.3758, + "step": 79040 + }, + { + "epoch": 0.30558519274481605, + "grad_norm": 0.09812411665916443, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 79050 + }, + { + "epoch": 0.30562384994819936, + "grad_norm": 0.1044439822435379, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 79060 + }, + { + "epoch": 0.3056625071515826, + "grad_norm": 0.11629011482000351, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 79070 + }, + { + "epoch": 0.3057011643549659, + "grad_norm": 0.09759361296892166, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 79080 + }, + { + "epoch": 0.30573982155834917, + "grad_norm": 0.12417322397232056, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 79090 + }, + { + "epoch": 0.3057784787617325, + "grad_norm": 0.12513114511966705, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 79100 + }, + { + "epoch": 0.3058171359651157, + "grad_norm": 0.10083790868520737, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 79110 + }, + { + "epoch": 0.30585579316849903, + "grad_norm": 0.13127751648426056, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 79120 + }, + { + "epoch": 0.3058944503718823, + "grad_norm": 0.1046181470155716, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 79130 + }, + { + "epoch": 0.3059331075752656, + "grad_norm": 0.11963056027889252, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 79140 + }, + { + "epoch": 0.30597176477864885, + "grad_norm": 0.10972888022661209, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 79150 + }, + { + "epoch": 0.30601042198203215, + "grad_norm": 0.103045254945755, + "learning_rate": 0.002, + "loss": 2.3729, + "step": 79160 + }, + { + "epoch": 0.3060490791854154, + "grad_norm": 0.12305113673210144, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 79170 + }, + { + "epoch": 0.3060877363887987, + "grad_norm": 0.11825665831565857, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 79180 + }, + { + "epoch": 0.30612639359218197, + "grad_norm": 0.10759798437356949, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 79190 + }, + { + "epoch": 0.3061650507955652, + "grad_norm": 0.14260214567184448, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 79200 + }, + { + "epoch": 0.3062037079989485, + "grad_norm": 0.10587218403816223, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 79210 + }, + { + "epoch": 0.3062423652023318, + "grad_norm": 0.11592812836170197, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 79220 + }, + { + "epoch": 0.3062810224057151, + "grad_norm": 0.11225926131010056, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 79230 + }, + { + "epoch": 0.30631967960909834, + "grad_norm": 0.10934487730264664, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 79240 + }, + { + "epoch": 0.30635833681248165, + "grad_norm": 0.10247712582349777, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 79250 + }, + { + "epoch": 0.3063969940158649, + "grad_norm": 0.10369041562080383, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 79260 + }, + { + "epoch": 0.3064356512192482, + "grad_norm": 0.10814987868070602, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 79270 + }, + { + "epoch": 0.30647430842263146, + "grad_norm": 0.11166869848966599, + "learning_rate": 0.002, + "loss": 2.3722, + "step": 79280 + }, + { + "epoch": 0.30651296562601477, + "grad_norm": 0.11336232721805573, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 79290 + }, + { + "epoch": 0.306551622829398, + "grad_norm": 0.0989774540066719, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 79300 + }, + { + "epoch": 0.3065902800327813, + "grad_norm": 0.14258378744125366, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 79310 + }, + { + "epoch": 0.3066289372361646, + "grad_norm": 0.10717329382896423, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 79320 + }, + { + "epoch": 0.3066675944395479, + "grad_norm": 0.10588625818490982, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 79330 + }, + { + "epoch": 0.30670625164293114, + "grad_norm": 0.1122480183839798, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 79340 + }, + { + "epoch": 0.30674490884631445, + "grad_norm": 0.11568798869848251, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 79350 + }, + { + "epoch": 0.3067835660496977, + "grad_norm": 0.11585716903209686, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 79360 + }, + { + "epoch": 0.306822223253081, + "grad_norm": 0.10288020968437195, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 79370 + }, + { + "epoch": 0.30686088045646426, + "grad_norm": 0.09953874349594116, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 79380 + }, + { + "epoch": 0.3068995376598475, + "grad_norm": 0.10273531079292297, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 79390 + }, + { + "epoch": 0.3069381948632308, + "grad_norm": 0.12214646488428116, + "learning_rate": 0.002, + "loss": 2.363, + "step": 79400 + }, + { + "epoch": 0.30697685206661407, + "grad_norm": 0.11876621097326279, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 79410 + }, + { + "epoch": 0.3070155092699974, + "grad_norm": 0.11528085172176361, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 79420 + }, + { + "epoch": 0.30705416647338063, + "grad_norm": 0.10709972679615021, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 79430 + }, + { + "epoch": 0.30709282367676394, + "grad_norm": 0.11976133286952972, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 79440 + }, + { + "epoch": 0.3071314808801472, + "grad_norm": 0.1332187056541443, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 79450 + }, + { + "epoch": 0.3071701380835305, + "grad_norm": 0.1383569836616516, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 79460 + }, + { + "epoch": 0.30720879528691375, + "grad_norm": 0.1024542897939682, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 79470 + }, + { + "epoch": 0.30724745249029706, + "grad_norm": 0.10686483234167099, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 79480 + }, + { + "epoch": 0.3072861096936803, + "grad_norm": 0.11085661500692368, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 79490 + }, + { + "epoch": 0.3073247668970636, + "grad_norm": 0.11389243602752686, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 79500 + }, + { + "epoch": 0.30736342410044687, + "grad_norm": 0.10158166289329529, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 79510 + }, + { + "epoch": 0.3074020813038302, + "grad_norm": 0.12603077292442322, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 79520 + }, + { + "epoch": 0.30744073850721343, + "grad_norm": 0.1034865453839302, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 79530 + }, + { + "epoch": 0.30747939571059674, + "grad_norm": 0.12243582308292389, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 79540 + }, + { + "epoch": 0.30751805291398, + "grad_norm": 0.09311690926551819, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 79550 + }, + { + "epoch": 0.3075567101173633, + "grad_norm": 0.12692442536354065, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 79560 + }, + { + "epoch": 0.30759536732074655, + "grad_norm": 0.12448505312204361, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 79570 + }, + { + "epoch": 0.3076340245241298, + "grad_norm": 0.12252768129110336, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 79580 + }, + { + "epoch": 0.3076726817275131, + "grad_norm": 0.09933305531740189, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 79590 + }, + { + "epoch": 0.30771133893089636, + "grad_norm": 0.12263075262308121, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 79600 + }, + { + "epoch": 0.30774999613427967, + "grad_norm": 0.10927719622850418, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 79610 + }, + { + "epoch": 0.3077886533376629, + "grad_norm": 0.10547421872615814, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 79620 + }, + { + "epoch": 0.30782731054104623, + "grad_norm": 0.10980977863073349, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 79630 + }, + { + "epoch": 0.3078659677444295, + "grad_norm": 0.10171563178300858, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 79640 + }, + { + "epoch": 0.3079046249478128, + "grad_norm": 0.11577558517456055, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 79650 + }, + { + "epoch": 0.30794328215119604, + "grad_norm": 0.10839863121509552, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 79660 + }, + { + "epoch": 0.30798193935457935, + "grad_norm": 0.11198613047599792, + "learning_rate": 0.002, + "loss": 2.359, + "step": 79670 + }, + { + "epoch": 0.3080205965579626, + "grad_norm": 0.10426493734121323, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 79680 + }, + { + "epoch": 0.3080592537613459, + "grad_norm": 0.11114238947629929, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 79690 + }, + { + "epoch": 0.30809791096472916, + "grad_norm": 0.11620768904685974, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 79700 + }, + { + "epoch": 0.30813656816811247, + "grad_norm": 0.10813305526971817, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 79710 + }, + { + "epoch": 0.3081752253714957, + "grad_norm": 0.11177055537700653, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 79720 + }, + { + "epoch": 0.30821388257487903, + "grad_norm": 0.12193883955478668, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 79730 + }, + { + "epoch": 0.3082525397782623, + "grad_norm": 0.10940881073474884, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 79740 + }, + { + "epoch": 0.30829119698164553, + "grad_norm": 0.09680374711751938, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 79750 + }, + { + "epoch": 0.30832985418502884, + "grad_norm": 0.14382275938987732, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 79760 + }, + { + "epoch": 0.3083685113884121, + "grad_norm": 0.10033523291349411, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 79770 + }, + { + "epoch": 0.3084071685917954, + "grad_norm": 0.13805530965328217, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 79780 + }, + { + "epoch": 0.30844582579517865, + "grad_norm": 0.11651264876127243, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 79790 + }, + { + "epoch": 0.30848448299856196, + "grad_norm": 0.10696911066770554, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 79800 + }, + { + "epoch": 0.3085231402019452, + "grad_norm": 0.13329538702964783, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 79810 + }, + { + "epoch": 0.3085617974053285, + "grad_norm": 0.09899991005659103, + "learning_rate": 0.002, + "loss": 2.355, + "step": 79820 + }, + { + "epoch": 0.30860045460871177, + "grad_norm": 0.11121714860200882, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 79830 + }, + { + "epoch": 0.3086391118120951, + "grad_norm": 0.10053054988384247, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 79840 + }, + { + "epoch": 0.30867776901547833, + "grad_norm": 0.13535206019878387, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 79850 + }, + { + "epoch": 0.30871642621886164, + "grad_norm": 0.10976517200469971, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 79860 + }, + { + "epoch": 0.3087550834222449, + "grad_norm": 0.12436148524284363, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 79870 + }, + { + "epoch": 0.3087937406256282, + "grad_norm": 0.10850168764591217, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 79880 + }, + { + "epoch": 0.30883239782901145, + "grad_norm": 0.1158076673746109, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 79890 + }, + { + "epoch": 0.30887105503239476, + "grad_norm": 0.10798767954111099, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 79900 + }, + { + "epoch": 0.308909712235778, + "grad_norm": 0.28459790349006653, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 79910 + }, + { + "epoch": 0.3089483694391613, + "grad_norm": 0.12084505707025528, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 79920 + }, + { + "epoch": 0.30898702664254457, + "grad_norm": 0.0929480493068695, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 79930 + }, + { + "epoch": 0.3090256838459278, + "grad_norm": 0.11760727316141129, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 79940 + }, + { + "epoch": 0.30906434104931113, + "grad_norm": 0.11596400290727615, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 79950 + }, + { + "epoch": 0.3091029982526944, + "grad_norm": 0.12344510108232498, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 79960 + }, + { + "epoch": 0.3091416554560777, + "grad_norm": 0.11655568331480026, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 79970 + }, + { + "epoch": 0.30918031265946094, + "grad_norm": 0.11264428496360779, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 79980 + }, + { + "epoch": 0.30921896986284425, + "grad_norm": 0.11132732033729553, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 79990 + }, + { + "epoch": 0.3092576270662275, + "grad_norm": 0.11722130328416824, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 80000 + }, + { + "epoch": 0.3092962842696108, + "grad_norm": 0.12687674164772034, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 80010 + }, + { + "epoch": 0.30933494147299406, + "grad_norm": 0.0991467535495758, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 80020 + }, + { + "epoch": 0.30937359867637737, + "grad_norm": 0.11307033151388168, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 80030 + }, + { + "epoch": 0.3094122558797606, + "grad_norm": 0.10980504751205444, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 80040 + }, + { + "epoch": 0.30945091308314393, + "grad_norm": 0.15489093959331512, + "learning_rate": 0.002, + "loss": 2.374, + "step": 80050 + }, + { + "epoch": 0.3094895702865272, + "grad_norm": 0.10452447086572647, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 80060 + }, + { + "epoch": 0.3095282274899105, + "grad_norm": 0.12048021703958511, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 80070 + }, + { + "epoch": 0.30956688469329374, + "grad_norm": 0.10281501710414886, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 80080 + }, + { + "epoch": 0.30960554189667705, + "grad_norm": 0.10218989104032516, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 80090 + }, + { + "epoch": 0.3096441991000603, + "grad_norm": 0.10425220429897308, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 80100 + }, + { + "epoch": 0.3096828563034436, + "grad_norm": 0.11669370532035828, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 80110 + }, + { + "epoch": 0.30972151350682686, + "grad_norm": 0.10010499507188797, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 80120 + }, + { + "epoch": 0.3097601707102101, + "grad_norm": 0.12721529603004456, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 80130 + }, + { + "epoch": 0.3097988279135934, + "grad_norm": 0.10680372267961502, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 80140 + }, + { + "epoch": 0.3098374851169767, + "grad_norm": 0.12047622352838516, + "learning_rate": 0.002, + "loss": 2.371, + "step": 80150 + }, + { + "epoch": 0.30987614232036, + "grad_norm": 0.11787277460098267, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 80160 + }, + { + "epoch": 0.30991479952374323, + "grad_norm": 0.1253037452697754, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 80170 + }, + { + "epoch": 0.30995345672712654, + "grad_norm": 0.11033058166503906, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 80180 + }, + { + "epoch": 0.3099921139305098, + "grad_norm": 0.10820924490690231, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 80190 + }, + { + "epoch": 0.3100307711338931, + "grad_norm": 0.10344351083040237, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 80200 + }, + { + "epoch": 0.31006942833727635, + "grad_norm": 0.13518910109996796, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 80210 + }, + { + "epoch": 0.31010808554065966, + "grad_norm": 0.11561016738414764, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 80220 + }, + { + "epoch": 0.3101467427440429, + "grad_norm": 0.11577355861663818, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 80230 + }, + { + "epoch": 0.3101853999474262, + "grad_norm": 0.1060025542974472, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 80240 + }, + { + "epoch": 0.3102240571508095, + "grad_norm": 0.11184864491224289, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 80250 + }, + { + "epoch": 0.3102627143541928, + "grad_norm": 0.11953794211149216, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 80260 + }, + { + "epoch": 0.31030137155757603, + "grad_norm": 0.11260048300027847, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 80270 + }, + { + "epoch": 0.31034002876095934, + "grad_norm": 0.11399795114994049, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 80280 + }, + { + "epoch": 0.3103786859643426, + "grad_norm": 0.10122570395469666, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 80290 + }, + { + "epoch": 0.3104173431677259, + "grad_norm": 0.10667862743139267, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 80300 + }, + { + "epoch": 0.31045600037110915, + "grad_norm": 0.10589800029993057, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 80310 + }, + { + "epoch": 0.3104946575744924, + "grad_norm": 0.10616010427474976, + "learning_rate": 0.002, + "loss": 2.358, + "step": 80320 + }, + { + "epoch": 0.3105333147778757, + "grad_norm": 0.13653241097927094, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 80330 + }, + { + "epoch": 0.31057197198125897, + "grad_norm": 0.11566481739282608, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 80340 + }, + { + "epoch": 0.3106106291846423, + "grad_norm": 0.12117500603199005, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 80350 + }, + { + "epoch": 0.3106492863880255, + "grad_norm": 0.11408274620771408, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 80360 + }, + { + "epoch": 0.31068794359140883, + "grad_norm": 0.09207921475172043, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 80370 + }, + { + "epoch": 0.3107266007947921, + "grad_norm": 0.11657524853944778, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 80380 + }, + { + "epoch": 0.3107652579981754, + "grad_norm": 0.10842855274677277, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 80390 + }, + { + "epoch": 0.31080391520155864, + "grad_norm": 0.10754778236150742, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 80400 + }, + { + "epoch": 0.31084257240494195, + "grad_norm": 0.12526315450668335, + "learning_rate": 0.002, + "loss": 2.352, + "step": 80410 + }, + { + "epoch": 0.3108812296083252, + "grad_norm": 0.10728145390748978, + "learning_rate": 0.002, + "loss": 2.366, + "step": 80420 + }, + { + "epoch": 0.3109198868117085, + "grad_norm": 0.11626534909009933, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 80430 + }, + { + "epoch": 0.31095854401509176, + "grad_norm": 0.10921600461006165, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 80440 + }, + { + "epoch": 0.3109972012184751, + "grad_norm": 0.10860076546669006, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 80450 + }, + { + "epoch": 0.3110358584218583, + "grad_norm": 0.10265084356069565, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 80460 + }, + { + "epoch": 0.31107451562524163, + "grad_norm": 0.11555089056491852, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 80470 + }, + { + "epoch": 0.3111131728286249, + "grad_norm": 0.11266104876995087, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 80480 + }, + { + "epoch": 0.3111518300320082, + "grad_norm": 0.11351647228002548, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 80490 + }, + { + "epoch": 0.31119048723539144, + "grad_norm": 0.10889441519975662, + "learning_rate": 0.002, + "loss": 2.364, + "step": 80500 + }, + { + "epoch": 0.3112291444387747, + "grad_norm": 0.11522519588470459, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 80510 + }, + { + "epoch": 0.311267801642158, + "grad_norm": 0.11582613736391068, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 80520 + }, + { + "epoch": 0.31130645884554126, + "grad_norm": 0.10472138971090317, + "learning_rate": 0.002, + "loss": 2.352, + "step": 80530 + }, + { + "epoch": 0.31134511604892456, + "grad_norm": 0.0929357036948204, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 80540 + }, + { + "epoch": 0.3113837732523078, + "grad_norm": 0.12064792960882187, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 80550 + }, + { + "epoch": 0.3114224304556911, + "grad_norm": 0.09501705318689346, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 80560 + }, + { + "epoch": 0.3114610876590744, + "grad_norm": 0.10537790507078171, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 80570 + }, + { + "epoch": 0.3114997448624577, + "grad_norm": 0.1048625260591507, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 80580 + }, + { + "epoch": 0.31153840206584094, + "grad_norm": 0.114802785217762, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 80590 + }, + { + "epoch": 0.31157705926922424, + "grad_norm": 0.10885383188724518, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 80600 + }, + { + "epoch": 0.3116157164726075, + "grad_norm": 0.10919132828712463, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 80610 + }, + { + "epoch": 0.3116543736759908, + "grad_norm": 0.12491604685783386, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 80620 + }, + { + "epoch": 0.31169303087937406, + "grad_norm": 0.1278710514307022, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 80630 + }, + { + "epoch": 0.31173168808275736, + "grad_norm": 0.1066141352057457, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 80640 + }, + { + "epoch": 0.3117703452861406, + "grad_norm": 0.10080710053443909, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 80650 + }, + { + "epoch": 0.3118090024895239, + "grad_norm": 0.12186428159475327, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 80660 + }, + { + "epoch": 0.3118476596929072, + "grad_norm": 0.11064880341291428, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 80670 + }, + { + "epoch": 0.3118863168962904, + "grad_norm": 0.09742892533540726, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 80680 + }, + { + "epoch": 0.31192497409967374, + "grad_norm": 0.11213352531194687, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 80690 + }, + { + "epoch": 0.311963631303057, + "grad_norm": 0.10177134722471237, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 80700 + }, + { + "epoch": 0.3120022885064403, + "grad_norm": 0.11354486644268036, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 80710 + }, + { + "epoch": 0.31204094570982355, + "grad_norm": 0.12874093651771545, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 80720 + }, + { + "epoch": 0.31207960291320685, + "grad_norm": 0.12072081863880157, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 80730 + }, + { + "epoch": 0.3121182601165901, + "grad_norm": 0.10747833549976349, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 80740 + }, + { + "epoch": 0.3121569173199734, + "grad_norm": 0.1078917533159256, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 80750 + }, + { + "epoch": 0.31219557452335667, + "grad_norm": 0.11337679624557495, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 80760 + }, + { + "epoch": 0.31223423172674, + "grad_norm": 0.11525086313486099, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 80770 + }, + { + "epoch": 0.3122728889301232, + "grad_norm": 0.11412528157234192, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 80780 + }, + { + "epoch": 0.31231154613350653, + "grad_norm": 0.12456825375556946, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 80790 + }, + { + "epoch": 0.3123502033368898, + "grad_norm": 0.10382735729217529, + "learning_rate": 0.002, + "loss": 2.357, + "step": 80800 + }, + { + "epoch": 0.3123888605402731, + "grad_norm": 0.12227677553892136, + "learning_rate": 0.002, + "loss": 2.349, + "step": 80810 + }, + { + "epoch": 0.31242751774365635, + "grad_norm": 0.112001933157444, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 80820 + }, + { + "epoch": 0.31246617494703965, + "grad_norm": 0.13644681870937347, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 80830 + }, + { + "epoch": 0.3125048321504229, + "grad_norm": 0.10916776955127716, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 80840 + }, + { + "epoch": 0.3125434893538062, + "grad_norm": 0.11666324734687805, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 80850 + }, + { + "epoch": 0.31258214655718947, + "grad_norm": 0.12494746595621109, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 80860 + }, + { + "epoch": 0.3126208037605727, + "grad_norm": 0.10945124179124832, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 80870 + }, + { + "epoch": 0.312659460963956, + "grad_norm": 0.09687016904354095, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 80880 + }, + { + "epoch": 0.3126981181673393, + "grad_norm": 0.11021928489208221, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 80890 + }, + { + "epoch": 0.3127367753707226, + "grad_norm": 0.13564597070217133, + "learning_rate": 0.002, + "loss": 2.349, + "step": 80900 + }, + { + "epoch": 0.31277543257410584, + "grad_norm": 0.10439195483922958, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 80910 + }, + { + "epoch": 0.31281408977748915, + "grad_norm": 0.10311180353164673, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 80920 + }, + { + "epoch": 0.3128527469808724, + "grad_norm": 0.12455517053604126, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 80930 + }, + { + "epoch": 0.3128914041842557, + "grad_norm": 0.11676601320505142, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 80940 + }, + { + "epoch": 0.31293006138763896, + "grad_norm": 0.14396095275878906, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 80950 + }, + { + "epoch": 0.31296871859102227, + "grad_norm": 0.11289811134338379, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 80960 + }, + { + "epoch": 0.3130073757944055, + "grad_norm": 0.1121041402220726, + "learning_rate": 0.002, + "loss": 2.359, + "step": 80970 + }, + { + "epoch": 0.3130460329977888, + "grad_norm": 0.10463961213827133, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 80980 + }, + { + "epoch": 0.3130846902011721, + "grad_norm": 0.12142655998468399, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 80990 + }, + { + "epoch": 0.3131233474045554, + "grad_norm": 0.09865577518939972, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 81000 + }, + { + "epoch": 0.31316200460793864, + "grad_norm": 0.10284759849309921, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 81010 + }, + { + "epoch": 0.31320066181132195, + "grad_norm": 0.1375407725572586, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 81020 + }, + { + "epoch": 0.3132393190147052, + "grad_norm": 0.10573754459619522, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 81030 + }, + { + "epoch": 0.3132779762180885, + "grad_norm": 0.12139849364757538, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 81040 + }, + { + "epoch": 0.31331663342147176, + "grad_norm": 0.10832665115594864, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 81050 + }, + { + "epoch": 0.313355290624855, + "grad_norm": 0.11262708157300949, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 81060 + }, + { + "epoch": 0.3133939478282383, + "grad_norm": 0.1364460289478302, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 81070 + }, + { + "epoch": 0.31343260503162157, + "grad_norm": 0.10935191810131073, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 81080 + }, + { + "epoch": 0.3134712622350049, + "grad_norm": 0.10730627179145813, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 81090 + }, + { + "epoch": 0.31350991943838813, + "grad_norm": 0.13448749482631683, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 81100 + }, + { + "epoch": 0.31354857664177144, + "grad_norm": 0.11057820171117783, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 81110 + }, + { + "epoch": 0.3135872338451547, + "grad_norm": 0.12507446110248566, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 81120 + }, + { + "epoch": 0.313625891048538, + "grad_norm": 0.13825474679470062, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 81130 + }, + { + "epoch": 0.31366454825192125, + "grad_norm": 0.11821473389863968, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 81140 + }, + { + "epoch": 0.31370320545530456, + "grad_norm": 0.12365194410085678, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 81150 + }, + { + "epoch": 0.3137418626586878, + "grad_norm": 0.10776308923959732, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 81160 + }, + { + "epoch": 0.3137805198620711, + "grad_norm": 0.1093917042016983, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 81170 + }, + { + "epoch": 0.31381917706545437, + "grad_norm": 0.11749949306249619, + "learning_rate": 0.002, + "loss": 2.355, + "step": 81180 + }, + { + "epoch": 0.3138578342688377, + "grad_norm": 0.10888249427080154, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 81190 + }, + { + "epoch": 0.31389649147222093, + "grad_norm": 0.12317828834056854, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 81200 + }, + { + "epoch": 0.31393514867560424, + "grad_norm": 0.10111136734485626, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 81210 + }, + { + "epoch": 0.3139738058789875, + "grad_norm": 0.11437378078699112, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 81220 + }, + { + "epoch": 0.3140124630823708, + "grad_norm": 0.11509440839290619, + "learning_rate": 0.002, + "loss": 2.361, + "step": 81230 + }, + { + "epoch": 0.31405112028575405, + "grad_norm": 0.11991222202777863, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 81240 + }, + { + "epoch": 0.3140897774891373, + "grad_norm": 0.16248053312301636, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 81250 + }, + { + "epoch": 0.3141284346925206, + "grad_norm": 0.1027098223567009, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 81260 + }, + { + "epoch": 0.31416709189590386, + "grad_norm": 0.12584351003170013, + "learning_rate": 0.002, + "loss": 2.357, + "step": 81270 + }, + { + "epoch": 0.31420574909928717, + "grad_norm": 0.10472863912582397, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 81280 + }, + { + "epoch": 0.3142444063026704, + "grad_norm": 0.112847700715065, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 81290 + }, + { + "epoch": 0.31428306350605373, + "grad_norm": 0.15817201137542725, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 81300 + }, + { + "epoch": 0.314321720709437, + "grad_norm": 0.09936563670635223, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 81310 + }, + { + "epoch": 0.3143603779128203, + "grad_norm": 0.10678784549236298, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 81320 + }, + { + "epoch": 0.31439903511620354, + "grad_norm": 0.10541539639234543, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 81330 + }, + { + "epoch": 0.31443769231958685, + "grad_norm": 0.1101844385266304, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 81340 + }, + { + "epoch": 0.3144763495229701, + "grad_norm": 0.1240110918879509, + "learning_rate": 0.002, + "loss": 2.3746, + "step": 81350 + }, + { + "epoch": 0.3145150067263534, + "grad_norm": 0.09983433783054352, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 81360 + }, + { + "epoch": 0.31455366392973666, + "grad_norm": 0.1422368884086609, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 81370 + }, + { + "epoch": 0.31459232113311997, + "grad_norm": 0.10908190160989761, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 81380 + }, + { + "epoch": 0.3146309783365032, + "grad_norm": 0.09860409051179886, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 81390 + }, + { + "epoch": 0.3146696355398865, + "grad_norm": 0.111362025141716, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 81400 + }, + { + "epoch": 0.3147082927432698, + "grad_norm": 0.10219592601060867, + "learning_rate": 0.002, + "loss": 2.344, + "step": 81410 + }, + { + "epoch": 0.31474694994665303, + "grad_norm": 0.12257792055606842, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 81420 + }, + { + "epoch": 0.31478560715003634, + "grad_norm": 0.216417133808136, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 81430 + }, + { + "epoch": 0.3148242643534196, + "grad_norm": 0.1161540076136589, + "learning_rate": 0.002, + "loss": 2.351, + "step": 81440 + }, + { + "epoch": 0.3148629215568029, + "grad_norm": 0.10609345138072968, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 81450 + }, + { + "epoch": 0.31490157876018615, + "grad_norm": 0.10580547153949738, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 81460 + }, + { + "epoch": 0.31494023596356946, + "grad_norm": 0.11898103356361389, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 81470 + }, + { + "epoch": 0.3149788931669527, + "grad_norm": 0.10301675647497177, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 81480 + }, + { + "epoch": 0.315017550370336, + "grad_norm": 0.09303360432386398, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 81490 + }, + { + "epoch": 0.31505620757371927, + "grad_norm": 0.10423095524311066, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 81500 + }, + { + "epoch": 0.3150948647771026, + "grad_norm": 0.09868019819259644, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 81510 + }, + { + "epoch": 0.31513352198048583, + "grad_norm": 0.10548140853643417, + "learning_rate": 0.002, + "loss": 2.335, + "step": 81520 + }, + { + "epoch": 0.31517217918386914, + "grad_norm": 0.16394978761672974, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 81530 + }, + { + "epoch": 0.3152108363872524, + "grad_norm": 0.12324749678373337, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 81540 + }, + { + "epoch": 0.3152494935906357, + "grad_norm": 0.10595546662807465, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 81550 + }, + { + "epoch": 0.31528815079401895, + "grad_norm": 0.12034602463245392, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 81560 + }, + { + "epoch": 0.31532680799740226, + "grad_norm": 0.11614970117807388, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 81570 + }, + { + "epoch": 0.3153654652007855, + "grad_norm": 0.1015036553144455, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 81580 + }, + { + "epoch": 0.3154041224041688, + "grad_norm": 0.10688374936580658, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 81590 + }, + { + "epoch": 0.31544277960755207, + "grad_norm": 0.10944467782974243, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 81600 + }, + { + "epoch": 0.3154814368109353, + "grad_norm": 0.10430245846509933, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 81610 + }, + { + "epoch": 0.31552009401431863, + "grad_norm": 0.09625860303640366, + "learning_rate": 0.002, + "loss": 2.348, + "step": 81620 + }, + { + "epoch": 0.3155587512177019, + "grad_norm": 0.12270866334438324, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 81630 + }, + { + "epoch": 0.3155974084210852, + "grad_norm": 0.11860179901123047, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 81640 + }, + { + "epoch": 0.31563606562446844, + "grad_norm": 0.10587330907583237, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 81650 + }, + { + "epoch": 0.31567472282785175, + "grad_norm": 0.0979042649269104, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 81660 + }, + { + "epoch": 0.315713380031235, + "grad_norm": 0.11213138699531555, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 81670 + }, + { + "epoch": 0.3157520372346183, + "grad_norm": 0.11067654192447662, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 81680 + }, + { + "epoch": 0.31579069443800156, + "grad_norm": 0.1135677695274353, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 81690 + }, + { + "epoch": 0.31582935164138487, + "grad_norm": 0.10148513317108154, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 81700 + }, + { + "epoch": 0.3158680088447681, + "grad_norm": 0.09532175213098526, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 81710 + }, + { + "epoch": 0.31590666604815143, + "grad_norm": 0.10355016589164734, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 81720 + }, + { + "epoch": 0.3159453232515347, + "grad_norm": 0.11893083155155182, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 81730 + }, + { + "epoch": 0.315983980454918, + "grad_norm": 0.1218695119023323, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 81740 + }, + { + "epoch": 0.31602263765830124, + "grad_norm": 0.13430169224739075, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 81750 + }, + { + "epoch": 0.31606129486168455, + "grad_norm": 0.10158354789018631, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 81760 + }, + { + "epoch": 0.3160999520650678, + "grad_norm": 0.10698272287845612, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 81770 + }, + { + "epoch": 0.3161386092684511, + "grad_norm": 0.11060472577810287, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 81780 + }, + { + "epoch": 0.31617726647183436, + "grad_norm": 0.1195729449391365, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 81790 + }, + { + "epoch": 0.3162159236752176, + "grad_norm": 0.10561022907495499, + "learning_rate": 0.002, + "loss": 2.3736, + "step": 81800 + }, + { + "epoch": 0.3162545808786009, + "grad_norm": 0.10653810948133469, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 81810 + }, + { + "epoch": 0.3162932380819842, + "grad_norm": 0.14653615653514862, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 81820 + }, + { + "epoch": 0.3163318952853675, + "grad_norm": 0.12520267069339752, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 81830 + }, + { + "epoch": 0.31637055248875073, + "grad_norm": 0.10568630695343018, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 81840 + }, + { + "epoch": 0.31640920969213404, + "grad_norm": 0.1051153689622879, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 81850 + }, + { + "epoch": 0.3164478668955173, + "grad_norm": 0.13551384210586548, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 81860 + }, + { + "epoch": 0.3164865240989006, + "grad_norm": 0.10480249673128128, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 81870 + }, + { + "epoch": 0.31652518130228385, + "grad_norm": 0.09846390038728714, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 81880 + }, + { + "epoch": 0.31656383850566716, + "grad_norm": 0.12012314796447754, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 81890 + }, + { + "epoch": 0.3166024957090504, + "grad_norm": 0.10799364000558853, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 81900 + }, + { + "epoch": 0.3166411529124337, + "grad_norm": 0.1147104874253273, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 81910 + }, + { + "epoch": 0.316679810115817, + "grad_norm": 0.09747300297021866, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 81920 + }, + { + "epoch": 0.3167184673192003, + "grad_norm": 0.11662698537111282, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 81930 + }, + { + "epoch": 0.31675712452258353, + "grad_norm": 0.10373350977897644, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 81940 + }, + { + "epoch": 0.31679578172596684, + "grad_norm": 0.10500016063451767, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 81950 + }, + { + "epoch": 0.3168344389293501, + "grad_norm": 0.10469872504472733, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 81960 + }, + { + "epoch": 0.3168730961327334, + "grad_norm": 0.11039718240499496, + "learning_rate": 0.002, + "loss": 2.347, + "step": 81970 + }, + { + "epoch": 0.31691175333611665, + "grad_norm": 0.13921773433685303, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 81980 + }, + { + "epoch": 0.3169504105394999, + "grad_norm": 0.12752720713615417, + "learning_rate": 0.002, + "loss": 2.372, + "step": 81990 + }, + { + "epoch": 0.3169890677428832, + "grad_norm": 0.10381372272968292, + "learning_rate": 0.002, + "loss": 2.349, + "step": 82000 + }, + { + "epoch": 0.31702772494626646, + "grad_norm": 0.09850414842367172, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 82010 + }, + { + "epoch": 0.3170663821496498, + "grad_norm": 0.12372690439224243, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 82020 + }, + { + "epoch": 0.317105039353033, + "grad_norm": 0.10919710248708725, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 82030 + }, + { + "epoch": 0.31714369655641633, + "grad_norm": 0.10725739598274231, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 82040 + }, + { + "epoch": 0.3171823537597996, + "grad_norm": 0.11192844808101654, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 82050 + }, + { + "epoch": 0.3172210109631829, + "grad_norm": 0.10848018527030945, + "learning_rate": 0.002, + "loss": 2.345, + "step": 82060 + }, + { + "epoch": 0.31725966816656614, + "grad_norm": 0.11807937920093536, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 82070 + }, + { + "epoch": 0.31729832536994945, + "grad_norm": 0.09818805009126663, + "learning_rate": 0.002, + "loss": 2.344, + "step": 82080 + }, + { + "epoch": 0.3173369825733327, + "grad_norm": 0.1123327910900116, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 82090 + }, + { + "epoch": 0.317375639776716, + "grad_norm": 0.114064522087574, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 82100 + }, + { + "epoch": 0.31741429698009926, + "grad_norm": 0.1075097844004631, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 82110 + }, + { + "epoch": 0.31745295418348257, + "grad_norm": 0.1298528015613556, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 82120 + }, + { + "epoch": 0.3174916113868658, + "grad_norm": 0.10509567707777023, + "learning_rate": 0.002, + "loss": 2.3704, + "step": 82130 + }, + { + "epoch": 0.31753026859024913, + "grad_norm": 0.09571907669305801, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 82140 + }, + { + "epoch": 0.3175689257936324, + "grad_norm": 0.11480700224637985, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 82150 + }, + { + "epoch": 0.3176075829970157, + "grad_norm": 0.10877734422683716, + "learning_rate": 0.002, + "loss": 2.343, + "step": 82160 + }, + { + "epoch": 0.31764624020039894, + "grad_norm": 0.10204993933439255, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 82170 + }, + { + "epoch": 0.3176848974037822, + "grad_norm": 0.128915473818779, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 82180 + }, + { + "epoch": 0.3177235546071655, + "grad_norm": 0.11168088018894196, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 82190 + }, + { + "epoch": 0.31776221181054876, + "grad_norm": 0.11000935733318329, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 82200 + }, + { + "epoch": 0.31780086901393206, + "grad_norm": 0.10184872150421143, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 82210 + }, + { + "epoch": 0.3178395262173153, + "grad_norm": 0.11816468834877014, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 82220 + }, + { + "epoch": 0.3178781834206986, + "grad_norm": 0.11335434019565582, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 82230 + }, + { + "epoch": 0.3179168406240819, + "grad_norm": 0.10691909492015839, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 82240 + }, + { + "epoch": 0.3179554978274652, + "grad_norm": 0.1318056583404541, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 82250 + }, + { + "epoch": 0.31799415503084844, + "grad_norm": 0.11883347481489182, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 82260 + }, + { + "epoch": 0.31803281223423174, + "grad_norm": 0.10550377517938614, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 82270 + }, + { + "epoch": 0.318071469437615, + "grad_norm": 0.11388055980205536, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 82280 + }, + { + "epoch": 0.3181101266409983, + "grad_norm": 0.10221295803785324, + "learning_rate": 0.002, + "loss": 2.344, + "step": 82290 + }, + { + "epoch": 0.31814878384438156, + "grad_norm": 0.09508053958415985, + "learning_rate": 0.002, + "loss": 2.355, + "step": 82300 + }, + { + "epoch": 0.31818744104776486, + "grad_norm": 0.14823344349861145, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 82310 + }, + { + "epoch": 0.3182260982511481, + "grad_norm": 0.11601465940475464, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 82320 + }, + { + "epoch": 0.3182647554545314, + "grad_norm": 0.10855911672115326, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 82330 + }, + { + "epoch": 0.3183034126579147, + "grad_norm": 0.11325045675039291, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 82340 + }, + { + "epoch": 0.3183420698612979, + "grad_norm": 0.12187658250331879, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 82350 + }, + { + "epoch": 0.31838072706468123, + "grad_norm": 0.10567446798086166, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 82360 + }, + { + "epoch": 0.3184193842680645, + "grad_norm": 0.119540274143219, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 82370 + }, + { + "epoch": 0.3184580414714478, + "grad_norm": 0.10615716874599457, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 82380 + }, + { + "epoch": 0.31849669867483105, + "grad_norm": 0.1109447181224823, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 82390 + }, + { + "epoch": 0.31853535587821435, + "grad_norm": 0.12240978330373764, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 82400 + }, + { + "epoch": 0.3185740130815976, + "grad_norm": 0.1345326006412506, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 82410 + }, + { + "epoch": 0.3186126702849809, + "grad_norm": 0.12959595024585724, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 82420 + }, + { + "epoch": 0.31865132748836417, + "grad_norm": 0.12424161285161972, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 82430 + }, + { + "epoch": 0.3186899846917475, + "grad_norm": 0.11147624999284744, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 82440 + }, + { + "epoch": 0.3187286418951307, + "grad_norm": 0.11418869346380234, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 82450 + }, + { + "epoch": 0.31876729909851403, + "grad_norm": 0.112371526658535, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 82460 + }, + { + "epoch": 0.3188059563018973, + "grad_norm": 0.13134798407554626, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 82470 + }, + { + "epoch": 0.3188446135052806, + "grad_norm": 0.11740058660507202, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 82480 + }, + { + "epoch": 0.31888327070866385, + "grad_norm": 0.11432835459709167, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 82490 + }, + { + "epoch": 0.31892192791204715, + "grad_norm": 0.10555962473154068, + "learning_rate": 0.002, + "loss": 2.356, + "step": 82500 + }, + { + "epoch": 0.3189605851154304, + "grad_norm": 0.10396882146596909, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 82510 + }, + { + "epoch": 0.3189992423188137, + "grad_norm": 0.10428617149591446, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 82520 + }, + { + "epoch": 0.31903789952219697, + "grad_norm": 0.12038405984640121, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 82530 + }, + { + "epoch": 0.3190765567255802, + "grad_norm": 0.10019529610872269, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 82540 + }, + { + "epoch": 0.3191152139289635, + "grad_norm": 0.1231880858540535, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 82550 + }, + { + "epoch": 0.3191538711323468, + "grad_norm": 0.09879335016012192, + "learning_rate": 0.002, + "loss": 2.348, + "step": 82560 + }, + { + "epoch": 0.3191925283357301, + "grad_norm": 0.1070961058139801, + "learning_rate": 0.002, + "loss": 2.345, + "step": 82570 + }, + { + "epoch": 0.31923118553911334, + "grad_norm": 0.14890055358409882, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 82580 + }, + { + "epoch": 0.31926984274249665, + "grad_norm": 0.11926935613155365, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 82590 + }, + { + "epoch": 0.3193084999458799, + "grad_norm": 0.10481767356395721, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 82600 + }, + { + "epoch": 0.3193471571492632, + "grad_norm": 0.09248179197311401, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 82610 + }, + { + "epoch": 0.31938581435264646, + "grad_norm": 0.11034086346626282, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 82620 + }, + { + "epoch": 0.31942447155602977, + "grad_norm": 0.15140187740325928, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 82630 + }, + { + "epoch": 0.319463128759413, + "grad_norm": 0.12119888514280319, + "learning_rate": 0.002, + "loss": 2.344, + "step": 82640 + }, + { + "epoch": 0.3195017859627963, + "grad_norm": 0.12400045245885849, + "learning_rate": 0.002, + "loss": 2.349, + "step": 82650 + }, + { + "epoch": 0.3195404431661796, + "grad_norm": 0.10955554991960526, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 82660 + }, + { + "epoch": 0.3195791003695629, + "grad_norm": 0.1160394623875618, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 82670 + }, + { + "epoch": 0.31961775757294614, + "grad_norm": 0.1217651292681694, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 82680 + }, + { + "epoch": 0.31965641477632945, + "grad_norm": 0.11645574867725372, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 82690 + }, + { + "epoch": 0.3196950719797127, + "grad_norm": 0.10777121782302856, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 82700 + }, + { + "epoch": 0.319733729183096, + "grad_norm": 0.10144933313131332, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 82710 + }, + { + "epoch": 0.31977238638647926, + "grad_norm": 0.13218508660793304, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 82720 + }, + { + "epoch": 0.3198110435898625, + "grad_norm": 0.125457301735878, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 82730 + }, + { + "epoch": 0.3198497007932458, + "grad_norm": 0.11357942223548889, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 82740 + }, + { + "epoch": 0.31988835799662907, + "grad_norm": 0.11013224720954895, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 82750 + }, + { + "epoch": 0.3199270152000124, + "grad_norm": 0.11626985669136047, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 82760 + }, + { + "epoch": 0.31996567240339563, + "grad_norm": 0.11156153678894043, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 82770 + }, + { + "epoch": 0.32000432960677894, + "grad_norm": 0.14560703933238983, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 82780 + }, + { + "epoch": 0.3200429868101622, + "grad_norm": 0.10113231837749481, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 82790 + }, + { + "epoch": 0.3200816440135455, + "grad_norm": 0.10319309681653976, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 82800 + }, + { + "epoch": 0.32012030121692875, + "grad_norm": 0.12566417455673218, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 82810 + }, + { + "epoch": 0.32015895842031206, + "grad_norm": 0.0948578342795372, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 82820 + }, + { + "epoch": 0.3201976156236953, + "grad_norm": 0.11071591079235077, + "learning_rate": 0.002, + "loss": 2.358, + "step": 82830 + }, + { + "epoch": 0.3202362728270786, + "grad_norm": 0.09550026059150696, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 82840 + }, + { + "epoch": 0.32027493003046187, + "grad_norm": 0.10873312503099442, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 82850 + }, + { + "epoch": 0.3203135872338452, + "grad_norm": 0.09585629403591156, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 82860 + }, + { + "epoch": 0.32035224443722843, + "grad_norm": 0.1324733942747116, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 82870 + }, + { + "epoch": 0.32039090164061174, + "grad_norm": 0.10929979383945465, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 82880 + }, + { + "epoch": 0.320429558843995, + "grad_norm": 0.09126242995262146, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 82890 + }, + { + "epoch": 0.3204682160473783, + "grad_norm": 0.12022841721773148, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 82900 + }, + { + "epoch": 0.32050687325076155, + "grad_norm": 0.12190208584070206, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 82910 + }, + { + "epoch": 0.3205455304541448, + "grad_norm": 0.11144687980413437, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 82920 + }, + { + "epoch": 0.3205841876575281, + "grad_norm": 0.12261205166578293, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 82930 + }, + { + "epoch": 0.32062284486091136, + "grad_norm": 0.10348153114318848, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 82940 + }, + { + "epoch": 0.32066150206429467, + "grad_norm": 0.10360660403966904, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 82950 + }, + { + "epoch": 0.3207001592676779, + "grad_norm": 0.1277213990688324, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 82960 + }, + { + "epoch": 0.3207388164710612, + "grad_norm": 0.11334872245788574, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 82970 + }, + { + "epoch": 0.3207774736744445, + "grad_norm": 0.22948279976844788, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 82980 + }, + { + "epoch": 0.3208161308778278, + "grad_norm": 0.09425721317529678, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 82990 + }, + { + "epoch": 0.32085478808121104, + "grad_norm": 0.13531698286533356, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 83000 + }, + { + "epoch": 0.32089344528459435, + "grad_norm": 0.0998660996556282, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 83010 + }, + { + "epoch": 0.3209321024879776, + "grad_norm": 0.1346549391746521, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 83020 + }, + { + "epoch": 0.3209707596913609, + "grad_norm": 0.11035139858722687, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 83030 + }, + { + "epoch": 0.32100941689474416, + "grad_norm": 0.11539337784051895, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 83040 + }, + { + "epoch": 0.32104807409812747, + "grad_norm": 0.11156875640153885, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 83050 + }, + { + "epoch": 0.3210867313015107, + "grad_norm": 0.11881548166275024, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 83060 + }, + { + "epoch": 0.321125388504894, + "grad_norm": 0.12837591767311096, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 83070 + }, + { + "epoch": 0.3211640457082773, + "grad_norm": 0.10233955830335617, + "learning_rate": 0.002, + "loss": 2.359, + "step": 83080 + }, + { + "epoch": 0.32120270291166053, + "grad_norm": 0.11211925745010376, + "learning_rate": 0.002, + "loss": 2.354, + "step": 83090 + }, + { + "epoch": 0.32124136011504384, + "grad_norm": 0.10675840824842453, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 83100 + }, + { + "epoch": 0.3212800173184271, + "grad_norm": 0.12227018922567368, + "learning_rate": 0.002, + "loss": 2.349, + "step": 83110 + }, + { + "epoch": 0.3213186745218104, + "grad_norm": 0.10975679010152817, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 83120 + }, + { + "epoch": 0.32135733172519365, + "grad_norm": 0.11084916442632675, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 83130 + }, + { + "epoch": 0.32139598892857696, + "grad_norm": 0.12091030925512314, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 83140 + }, + { + "epoch": 0.3214346461319602, + "grad_norm": 0.1246369257569313, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 83150 + }, + { + "epoch": 0.3214733033353435, + "grad_norm": 0.10367751866579056, + "learning_rate": 0.002, + "loss": 2.343, + "step": 83160 + }, + { + "epoch": 0.32151196053872677, + "grad_norm": 0.11552299559116364, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 83170 + }, + { + "epoch": 0.3215506177421101, + "grad_norm": 0.12590712308883667, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 83180 + }, + { + "epoch": 0.32158927494549333, + "grad_norm": 0.1412360519170761, + "learning_rate": 0.002, + "loss": 2.349, + "step": 83190 + }, + { + "epoch": 0.32162793214887664, + "grad_norm": 0.11818281561136246, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 83200 + }, + { + "epoch": 0.3216665893522599, + "grad_norm": 0.11225637048482895, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 83210 + }, + { + "epoch": 0.3217052465556432, + "grad_norm": 0.10759352892637253, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 83220 + }, + { + "epoch": 0.32174390375902645, + "grad_norm": 0.11591193825006485, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 83230 + }, + { + "epoch": 0.32178256096240976, + "grad_norm": 0.10086407512426376, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 83240 + }, + { + "epoch": 0.321821218165793, + "grad_norm": 0.12272131443023682, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 83250 + }, + { + "epoch": 0.3218598753691763, + "grad_norm": 0.12350817024707794, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 83260 + }, + { + "epoch": 0.32189853257255957, + "grad_norm": 0.10002472251653671, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 83270 + }, + { + "epoch": 0.3219371897759428, + "grad_norm": 0.10490340739488602, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 83280 + }, + { + "epoch": 0.32197584697932613, + "grad_norm": 0.1162208840250969, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 83290 + }, + { + "epoch": 0.3220145041827094, + "grad_norm": 0.12758538126945496, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 83300 + }, + { + "epoch": 0.3220531613860927, + "grad_norm": 0.10897503793239594, + "learning_rate": 0.002, + "loss": 2.357, + "step": 83310 + }, + { + "epoch": 0.32209181858947594, + "grad_norm": 0.10656184703111649, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 83320 + }, + { + "epoch": 0.32213047579285925, + "grad_norm": 0.10997114330530167, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 83330 + }, + { + "epoch": 0.3221691329962425, + "grad_norm": 0.1076001301407814, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 83340 + }, + { + "epoch": 0.3222077901996258, + "grad_norm": 0.11565978080034256, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 83350 + }, + { + "epoch": 0.32224644740300906, + "grad_norm": 0.09923960268497467, + "learning_rate": 0.002, + "loss": 2.345, + "step": 83360 + }, + { + "epoch": 0.32228510460639237, + "grad_norm": 0.10747136175632477, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 83370 + }, + { + "epoch": 0.3223237618097756, + "grad_norm": 0.119499072432518, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 83380 + }, + { + "epoch": 0.32236241901315893, + "grad_norm": 0.11400345712900162, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 83390 + }, + { + "epoch": 0.3224010762165422, + "grad_norm": 0.10465895384550095, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 83400 + }, + { + "epoch": 0.3224397334199255, + "grad_norm": 0.10290923714637756, + "learning_rate": 0.002, + "loss": 2.327, + "step": 83410 + }, + { + "epoch": 0.32247839062330874, + "grad_norm": 0.13437430560588837, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 83420 + }, + { + "epoch": 0.32251704782669205, + "grad_norm": 0.10731037706136703, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 83430 + }, + { + "epoch": 0.3225557050300753, + "grad_norm": 0.12197359651327133, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 83440 + }, + { + "epoch": 0.3225943622334586, + "grad_norm": 0.11455868929624557, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 83450 + }, + { + "epoch": 0.32263301943684186, + "grad_norm": 0.11100361496210098, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 83460 + }, + { + "epoch": 0.3226716766402251, + "grad_norm": 0.10418584197759628, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 83470 + }, + { + "epoch": 0.3227103338436084, + "grad_norm": 0.1303698569536209, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 83480 + }, + { + "epoch": 0.3227489910469917, + "grad_norm": 0.1358424574136734, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 83490 + }, + { + "epoch": 0.322787648250375, + "grad_norm": 0.10467436909675598, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 83500 + }, + { + "epoch": 0.32282630545375823, + "grad_norm": 0.12298433482646942, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 83510 + }, + { + "epoch": 0.32286496265714154, + "grad_norm": 0.12430752068758011, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 83520 + }, + { + "epoch": 0.3229036198605248, + "grad_norm": 0.11504051834344864, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 83530 + }, + { + "epoch": 0.3229422770639081, + "grad_norm": 0.11451207846403122, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 83540 + }, + { + "epoch": 0.32298093426729135, + "grad_norm": 0.11278517544269562, + "learning_rate": 0.002, + "loss": 2.365, + "step": 83550 + }, + { + "epoch": 0.32301959147067466, + "grad_norm": 0.11106080561876297, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 83560 + }, + { + "epoch": 0.3230582486740579, + "grad_norm": 0.09784231334924698, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 83570 + }, + { + "epoch": 0.3230969058774412, + "grad_norm": 0.12368303537368774, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 83580 + }, + { + "epoch": 0.3231355630808245, + "grad_norm": 0.11726155877113342, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 83590 + }, + { + "epoch": 0.3231742202842078, + "grad_norm": 0.11166765540838242, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 83600 + }, + { + "epoch": 0.32321287748759103, + "grad_norm": 0.11560188978910446, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 83610 + }, + { + "epoch": 0.32325153469097434, + "grad_norm": 0.12689295411109924, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 83620 + }, + { + "epoch": 0.3232901918943576, + "grad_norm": 0.12828336656093597, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 83630 + }, + { + "epoch": 0.3233288490977409, + "grad_norm": 0.1015634834766388, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 83640 + }, + { + "epoch": 0.32336750630112415, + "grad_norm": 0.10310135781764984, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 83650 + }, + { + "epoch": 0.3234061635045074, + "grad_norm": 0.11620552092790604, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 83660 + }, + { + "epoch": 0.3234448207078907, + "grad_norm": 0.11301601678133011, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 83670 + }, + { + "epoch": 0.32348347791127396, + "grad_norm": 0.12127354741096497, + "learning_rate": 0.002, + "loss": 2.373, + "step": 83680 + }, + { + "epoch": 0.32352213511465727, + "grad_norm": 0.10062091797590256, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 83690 + }, + { + "epoch": 0.3235607923180405, + "grad_norm": 0.11745966225862503, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 83700 + }, + { + "epoch": 0.32359944952142383, + "grad_norm": 0.10977525264024734, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 83710 + }, + { + "epoch": 0.3236381067248071, + "grad_norm": 0.10556995123624802, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 83720 + }, + { + "epoch": 0.3236767639281904, + "grad_norm": 0.10026714950799942, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 83730 + }, + { + "epoch": 0.32371542113157364, + "grad_norm": 0.1397588849067688, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 83740 + }, + { + "epoch": 0.32375407833495695, + "grad_norm": 0.10103439539670944, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 83750 + }, + { + "epoch": 0.3237927355383402, + "grad_norm": 0.10274723917245865, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 83760 + }, + { + "epoch": 0.3238313927417235, + "grad_norm": 0.10949409008026123, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 83770 + }, + { + "epoch": 0.32387004994510676, + "grad_norm": 0.10061365365982056, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 83780 + }, + { + "epoch": 0.32390870714849007, + "grad_norm": 0.11748245358467102, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 83790 + }, + { + "epoch": 0.3239473643518733, + "grad_norm": 0.11854736506938934, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 83800 + }, + { + "epoch": 0.32398602155525663, + "grad_norm": 0.10991797596216202, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 83810 + }, + { + "epoch": 0.3240246787586399, + "grad_norm": 0.12242380529642105, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 83820 + }, + { + "epoch": 0.32406333596202314, + "grad_norm": 0.10684510320425034, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 83830 + }, + { + "epoch": 0.32410199316540644, + "grad_norm": 0.10553453117609024, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 83840 + }, + { + "epoch": 0.3241406503687897, + "grad_norm": 0.13125893473625183, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 83850 + }, + { + "epoch": 0.324179307572173, + "grad_norm": 0.10782486200332642, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 83860 + }, + { + "epoch": 0.32421796477555626, + "grad_norm": 0.12534086406230927, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 83870 + }, + { + "epoch": 0.32425662197893956, + "grad_norm": 0.13595376908779144, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 83880 + }, + { + "epoch": 0.3242952791823228, + "grad_norm": 0.09643189609050751, + "learning_rate": 0.002, + "loss": 2.365, + "step": 83890 + }, + { + "epoch": 0.3243339363857061, + "grad_norm": 0.10394199937582016, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 83900 + }, + { + "epoch": 0.3243725935890894, + "grad_norm": 0.12716563045978546, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 83910 + }, + { + "epoch": 0.3244112507924727, + "grad_norm": 0.09696212410926819, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 83920 + }, + { + "epoch": 0.32444990799585594, + "grad_norm": 0.11893827468156815, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 83930 + }, + { + "epoch": 0.32448856519923924, + "grad_norm": 0.1292070746421814, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 83940 + }, + { + "epoch": 0.3245272224026225, + "grad_norm": 0.09975294023752213, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 83950 + }, + { + "epoch": 0.3245658796060058, + "grad_norm": 0.09658270329236984, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 83960 + }, + { + "epoch": 0.32460453680938905, + "grad_norm": 0.1587447077035904, + "learning_rate": 0.002, + "loss": 2.358, + "step": 83970 + }, + { + "epoch": 0.32464319401277236, + "grad_norm": 0.14374689757823944, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 83980 + }, + { + "epoch": 0.3246818512161556, + "grad_norm": 0.1280277520418167, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 83990 + }, + { + "epoch": 0.3247205084195389, + "grad_norm": 0.10204432159662247, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 84000 + }, + { + "epoch": 0.3247591656229222, + "grad_norm": 0.11131812632083893, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 84010 + }, + { + "epoch": 0.3247978228263054, + "grad_norm": 0.1131935566663742, + "learning_rate": 0.002, + "loss": 2.36, + "step": 84020 + }, + { + "epoch": 0.32483648002968873, + "grad_norm": 0.12737193703651428, + "learning_rate": 0.002, + "loss": 2.356, + "step": 84030 + }, + { + "epoch": 0.324875137233072, + "grad_norm": 0.10414043068885803, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 84040 + }, + { + "epoch": 0.3249137944364553, + "grad_norm": 0.12210876494646072, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 84050 + }, + { + "epoch": 0.32495245163983855, + "grad_norm": 0.13212950527668, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 84060 + }, + { + "epoch": 0.32499110884322185, + "grad_norm": 0.1294020563364029, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 84070 + }, + { + "epoch": 0.3250297660466051, + "grad_norm": 0.11428891867399216, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 84080 + }, + { + "epoch": 0.3250684232499884, + "grad_norm": 0.10889285057783127, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 84090 + }, + { + "epoch": 0.32510708045337167, + "grad_norm": 0.11247077584266663, + "learning_rate": 0.002, + "loss": 2.362, + "step": 84100 + }, + { + "epoch": 0.325145737656755, + "grad_norm": 0.10409799963235855, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 84110 + }, + { + "epoch": 0.3251843948601382, + "grad_norm": 0.1056474819779396, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 84120 + }, + { + "epoch": 0.32522305206352153, + "grad_norm": 0.1274235099554062, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 84130 + }, + { + "epoch": 0.3252617092669048, + "grad_norm": 0.13003771007061005, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 84140 + }, + { + "epoch": 0.3253003664702881, + "grad_norm": 0.11398117244243622, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 84150 + }, + { + "epoch": 0.32533902367367135, + "grad_norm": 0.10530584305524826, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 84160 + }, + { + "epoch": 0.32537768087705465, + "grad_norm": 0.11318954825401306, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 84170 + }, + { + "epoch": 0.3254163380804379, + "grad_norm": 0.11111290007829666, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 84180 + }, + { + "epoch": 0.3254549952838212, + "grad_norm": 0.10164378583431244, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 84190 + }, + { + "epoch": 0.32549365248720447, + "grad_norm": 0.13917332887649536, + "learning_rate": 0.002, + "loss": 2.332, + "step": 84200 + }, + { + "epoch": 0.3255323096905877, + "grad_norm": 0.10684075951576233, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 84210 + }, + { + "epoch": 0.325570966893971, + "grad_norm": 0.10433298349380493, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 84220 + }, + { + "epoch": 0.3256096240973543, + "grad_norm": 0.09909933060407639, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 84230 + }, + { + "epoch": 0.3256482813007376, + "grad_norm": 0.10950423032045364, + "learning_rate": 0.002, + "loss": 2.351, + "step": 84240 + }, + { + "epoch": 0.32568693850412084, + "grad_norm": 0.09400998055934906, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 84250 + }, + { + "epoch": 0.32572559570750415, + "grad_norm": 0.09943398833274841, + "learning_rate": 0.002, + "loss": 2.36, + "step": 84260 + }, + { + "epoch": 0.3257642529108874, + "grad_norm": 0.13087767362594604, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 84270 + }, + { + "epoch": 0.3258029101142707, + "grad_norm": 0.11305786669254303, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 84280 + }, + { + "epoch": 0.32584156731765396, + "grad_norm": 0.11657200753688812, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 84290 + }, + { + "epoch": 0.32588022452103727, + "grad_norm": 0.09545467048883438, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 84300 + }, + { + "epoch": 0.3259188817244205, + "grad_norm": 0.12025730311870575, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 84310 + }, + { + "epoch": 0.3259575389278038, + "grad_norm": 0.10664772987365723, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 84320 + }, + { + "epoch": 0.3259961961311871, + "grad_norm": 0.11544671654701233, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 84330 + }, + { + "epoch": 0.3260348533345704, + "grad_norm": 0.1060868427157402, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 84340 + }, + { + "epoch": 0.32607351053795364, + "grad_norm": 0.11616748571395874, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 84350 + }, + { + "epoch": 0.32611216774133694, + "grad_norm": 0.10780932754278183, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 84360 + }, + { + "epoch": 0.3261508249447202, + "grad_norm": 0.10857664793729782, + "learning_rate": 0.002, + "loss": 2.3749, + "step": 84370 + }, + { + "epoch": 0.3261894821481035, + "grad_norm": 0.12012740224599838, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 84380 + }, + { + "epoch": 0.32622813935148676, + "grad_norm": 0.10782299190759659, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 84390 + }, + { + "epoch": 0.32626679655487, + "grad_norm": 0.13160741329193115, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 84400 + }, + { + "epoch": 0.3263054537582533, + "grad_norm": 0.1270124465227127, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 84410 + }, + { + "epoch": 0.32634411096163657, + "grad_norm": 0.1109585240483284, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 84420 + }, + { + "epoch": 0.3263827681650199, + "grad_norm": 0.11671182513237, + "learning_rate": 0.002, + "loss": 2.364, + "step": 84430 + }, + { + "epoch": 0.32642142536840313, + "grad_norm": 0.11640793085098267, + "learning_rate": 0.002, + "loss": 2.352, + "step": 84440 + }, + { + "epoch": 0.32646008257178644, + "grad_norm": 0.1214011162519455, + "learning_rate": 0.002, + "loss": 2.359, + "step": 84450 + }, + { + "epoch": 0.3264987397751697, + "grad_norm": 0.10922209918498993, + "learning_rate": 0.002, + "loss": 2.353, + "step": 84460 + }, + { + "epoch": 0.326537396978553, + "grad_norm": 0.10862427949905396, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 84470 + }, + { + "epoch": 0.32657605418193625, + "grad_norm": 0.11239778250455856, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 84480 + }, + { + "epoch": 0.32661471138531956, + "grad_norm": 0.11458240449428558, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 84490 + }, + { + "epoch": 0.3266533685887028, + "grad_norm": 0.11729695647954941, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 84500 + }, + { + "epoch": 0.3266920257920861, + "grad_norm": 0.09423644095659256, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 84510 + }, + { + "epoch": 0.32673068299546937, + "grad_norm": 0.09924380481243134, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 84520 + }, + { + "epoch": 0.3267693401988527, + "grad_norm": 0.126951664686203, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 84530 + }, + { + "epoch": 0.32680799740223593, + "grad_norm": 0.10472346097230911, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 84540 + }, + { + "epoch": 0.32684665460561924, + "grad_norm": 0.1225130558013916, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 84550 + }, + { + "epoch": 0.3268853118090025, + "grad_norm": 0.12059375643730164, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 84560 + }, + { + "epoch": 0.3269239690123858, + "grad_norm": 0.11787009984254837, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 84570 + }, + { + "epoch": 0.32696262621576905, + "grad_norm": 0.10444694757461548, + "learning_rate": 0.002, + "loss": 2.338, + "step": 84580 + }, + { + "epoch": 0.3270012834191523, + "grad_norm": 0.12187785655260086, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 84590 + }, + { + "epoch": 0.3270399406225356, + "grad_norm": 0.10000617057085037, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 84600 + }, + { + "epoch": 0.32707859782591886, + "grad_norm": 0.10169254243373871, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 84610 + }, + { + "epoch": 0.32711725502930217, + "grad_norm": 0.11734003573656082, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 84620 + }, + { + "epoch": 0.3271559122326854, + "grad_norm": 0.11790499836206436, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 84630 + }, + { + "epoch": 0.3271945694360687, + "grad_norm": 0.11842221766710281, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 84640 + }, + { + "epoch": 0.327233226639452, + "grad_norm": 0.10803110897541046, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 84650 + }, + { + "epoch": 0.3272718838428353, + "grad_norm": 0.11215823143720627, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 84660 + }, + { + "epoch": 0.32731054104621854, + "grad_norm": 0.0982016995549202, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 84670 + }, + { + "epoch": 0.32734919824960185, + "grad_norm": 0.10206621140241623, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 84680 + }, + { + "epoch": 0.3273878554529851, + "grad_norm": 0.11638012528419495, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 84690 + }, + { + "epoch": 0.3274265126563684, + "grad_norm": 0.10438395291566849, + "learning_rate": 0.002, + "loss": 2.365, + "step": 84700 + }, + { + "epoch": 0.32746516985975166, + "grad_norm": 0.15618744492530823, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 84710 + }, + { + "epoch": 0.32750382706313497, + "grad_norm": 0.10686315596103668, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 84720 + }, + { + "epoch": 0.3275424842665182, + "grad_norm": 0.10776001960039139, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 84730 + }, + { + "epoch": 0.3275811414699015, + "grad_norm": 0.1202462762594223, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 84740 + }, + { + "epoch": 0.3276197986732848, + "grad_norm": 0.11879925429821014, + "learning_rate": 0.002, + "loss": 2.35, + "step": 84750 + }, + { + "epoch": 0.32765845587666803, + "grad_norm": 0.10598697513341904, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 84760 + }, + { + "epoch": 0.32769711308005134, + "grad_norm": 0.09640336781740189, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 84770 + }, + { + "epoch": 0.3277357702834346, + "grad_norm": 0.13301733136177063, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 84780 + }, + { + "epoch": 0.3277744274868179, + "grad_norm": 0.10526037961244583, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 84790 + }, + { + "epoch": 0.32781308469020115, + "grad_norm": 0.1060631051659584, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 84800 + }, + { + "epoch": 0.32785174189358446, + "grad_norm": 0.09776698797941208, + "learning_rate": 0.002, + "loss": 2.361, + "step": 84810 + }, + { + "epoch": 0.3278903990969677, + "grad_norm": 0.11571425944566727, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 84820 + }, + { + "epoch": 0.327929056300351, + "grad_norm": 0.12359791249036789, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 84830 + }, + { + "epoch": 0.32796771350373427, + "grad_norm": 0.11640583723783493, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 84840 + }, + { + "epoch": 0.3280063707071176, + "grad_norm": 0.12452420592308044, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 84850 + }, + { + "epoch": 0.32804502791050083, + "grad_norm": 0.11438850313425064, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 84860 + }, + { + "epoch": 0.32808368511388414, + "grad_norm": 0.1265454739332199, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 84870 + }, + { + "epoch": 0.3281223423172674, + "grad_norm": 0.1110704094171524, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 84880 + }, + { + "epoch": 0.3281609995206507, + "grad_norm": 0.11462464183568954, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 84890 + }, + { + "epoch": 0.32819965672403395, + "grad_norm": 0.10768561065196991, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 84900 + }, + { + "epoch": 0.32823831392741726, + "grad_norm": 0.11579468846321106, + "learning_rate": 0.002, + "loss": 2.333, + "step": 84910 + }, + { + "epoch": 0.3282769711308005, + "grad_norm": 0.11966416984796524, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 84920 + }, + { + "epoch": 0.3283156283341838, + "grad_norm": 0.11327365785837173, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 84930 + }, + { + "epoch": 0.32835428553756707, + "grad_norm": 0.1026577427983284, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 84940 + }, + { + "epoch": 0.3283929427409503, + "grad_norm": 0.10104644298553467, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 84950 + }, + { + "epoch": 0.32843159994433363, + "grad_norm": 0.10899697989225388, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 84960 + }, + { + "epoch": 0.3284702571477169, + "grad_norm": 0.1111191138625145, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 84970 + }, + { + "epoch": 0.3285089143511002, + "grad_norm": 0.11514212936162949, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 84980 + }, + { + "epoch": 0.32854757155448344, + "grad_norm": 0.09942366182804108, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 84990 + }, + { + "epoch": 0.32858622875786675, + "grad_norm": 0.10499731451272964, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 85000 + }, + { + "epoch": 0.32862488596125, + "grad_norm": 0.1169121041893959, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 85010 + }, + { + "epoch": 0.3286635431646333, + "grad_norm": 0.10293654352426529, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 85020 + }, + { + "epoch": 0.32870220036801656, + "grad_norm": 0.13729114830493927, + "learning_rate": 0.002, + "loss": 2.345, + "step": 85030 + }, + { + "epoch": 0.32874085757139987, + "grad_norm": 0.12647894024848938, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 85040 + }, + { + "epoch": 0.3287795147747831, + "grad_norm": 0.11063408851623535, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 85050 + }, + { + "epoch": 0.32881817197816643, + "grad_norm": 0.12294568866491318, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 85060 + }, + { + "epoch": 0.3288568291815497, + "grad_norm": 0.117158442735672, + "learning_rate": 0.002, + "loss": 2.342, + "step": 85070 + }, + { + "epoch": 0.328895486384933, + "grad_norm": 0.10392977297306061, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 85080 + }, + { + "epoch": 0.32893414358831624, + "grad_norm": 0.11240453273057938, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 85090 + }, + { + "epoch": 0.32897280079169955, + "grad_norm": 0.11931589990854263, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 85100 + }, + { + "epoch": 0.3290114579950828, + "grad_norm": 0.12070564180612564, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 85110 + }, + { + "epoch": 0.3290501151984661, + "grad_norm": 0.09947573393583298, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 85120 + }, + { + "epoch": 0.32908877240184936, + "grad_norm": 0.13053150475025177, + "learning_rate": 0.002, + "loss": 2.368, + "step": 85130 + }, + { + "epoch": 0.3291274296052326, + "grad_norm": 0.1006920337677002, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 85140 + }, + { + "epoch": 0.3291660868086159, + "grad_norm": 0.1335725635290146, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 85150 + }, + { + "epoch": 0.3292047440119992, + "grad_norm": 0.10223272442817688, + "learning_rate": 0.002, + "loss": 2.359, + "step": 85160 + }, + { + "epoch": 0.3292434012153825, + "grad_norm": 0.10168513655662537, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 85170 + }, + { + "epoch": 0.32928205841876573, + "grad_norm": 0.11560312658548355, + "learning_rate": 0.002, + "loss": 2.351, + "step": 85180 + }, + { + "epoch": 0.32932071562214904, + "grad_norm": 0.10953016579151154, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 85190 + }, + { + "epoch": 0.3293593728255323, + "grad_norm": 0.10393237322568893, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 85200 + }, + { + "epoch": 0.3293980300289156, + "grad_norm": 0.11780615150928497, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 85210 + }, + { + "epoch": 0.32943668723229885, + "grad_norm": 0.1172422245144844, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 85220 + }, + { + "epoch": 0.32947534443568216, + "grad_norm": 0.10669662058353424, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 85230 + }, + { + "epoch": 0.3295140016390654, + "grad_norm": 0.1318225860595703, + "learning_rate": 0.002, + "loss": 2.373, + "step": 85240 + }, + { + "epoch": 0.3295526588424487, + "grad_norm": 0.09088876098394394, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 85250 + }, + { + "epoch": 0.329591316045832, + "grad_norm": 0.11682692170143127, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 85260 + }, + { + "epoch": 0.3296299732492153, + "grad_norm": 0.10686168074607849, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 85270 + }, + { + "epoch": 0.32966863045259853, + "grad_norm": 0.0991005152463913, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 85280 + }, + { + "epoch": 0.32970728765598184, + "grad_norm": 0.11931562423706055, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 85290 + }, + { + "epoch": 0.3297459448593651, + "grad_norm": 0.11068251729011536, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 85300 + }, + { + "epoch": 0.3297846020627484, + "grad_norm": 0.12620867788791656, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 85310 + }, + { + "epoch": 0.32982325926613165, + "grad_norm": 0.12862162292003632, + "learning_rate": 0.002, + "loss": 2.351, + "step": 85320 + }, + { + "epoch": 0.3298619164695149, + "grad_norm": 0.11188545823097229, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 85330 + }, + { + "epoch": 0.3299005736728982, + "grad_norm": 0.10457547008991241, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 85340 + }, + { + "epoch": 0.32993923087628146, + "grad_norm": 0.11040732264518738, + "learning_rate": 0.002, + "loss": 2.357, + "step": 85350 + }, + { + "epoch": 0.32997788807966477, + "grad_norm": 0.11926726251840591, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 85360 + }, + { + "epoch": 0.330016545283048, + "grad_norm": 0.10826878994703293, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 85370 + }, + { + "epoch": 0.33005520248643133, + "grad_norm": 0.11095569282770157, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 85380 + }, + { + "epoch": 0.3300938596898146, + "grad_norm": 0.12638893723487854, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 85390 + }, + { + "epoch": 0.3301325168931979, + "grad_norm": 0.11446123570203781, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 85400 + }, + { + "epoch": 0.33017117409658114, + "grad_norm": 0.12310647964477539, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 85410 + }, + { + "epoch": 0.33020983129996445, + "grad_norm": 0.10424927622079849, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 85420 + }, + { + "epoch": 0.3302484885033477, + "grad_norm": 0.1351475864648819, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 85430 + }, + { + "epoch": 0.330287145706731, + "grad_norm": 0.12132125347852707, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 85440 + }, + { + "epoch": 0.33032580291011426, + "grad_norm": 0.10493456572294235, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 85450 + }, + { + "epoch": 0.33036446011349757, + "grad_norm": 0.10808337479829788, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 85460 + }, + { + "epoch": 0.3304031173168808, + "grad_norm": 0.15387500822544098, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 85470 + }, + { + "epoch": 0.33044177452026413, + "grad_norm": 0.1107737347483635, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 85480 + }, + { + "epoch": 0.3304804317236474, + "grad_norm": 0.1131087988615036, + "learning_rate": 0.002, + "loss": 2.361, + "step": 85490 + }, + { + "epoch": 0.33051908892703064, + "grad_norm": 0.11487334221601486, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 85500 + }, + { + "epoch": 0.33055774613041394, + "grad_norm": 0.11559855937957764, + "learning_rate": 0.002, + "loss": 2.339, + "step": 85510 + }, + { + "epoch": 0.3305964033337972, + "grad_norm": 0.1240239292383194, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 85520 + }, + { + "epoch": 0.3306350605371805, + "grad_norm": 0.13296452164649963, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 85530 + }, + { + "epoch": 0.33067371774056376, + "grad_norm": 0.11823895573616028, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 85540 + }, + { + "epoch": 0.33071237494394706, + "grad_norm": 0.10527436435222626, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 85550 + }, + { + "epoch": 0.3307510321473303, + "grad_norm": 0.10656841099262238, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 85560 + }, + { + "epoch": 0.3307896893507136, + "grad_norm": 0.11199898272752762, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 85570 + }, + { + "epoch": 0.3308283465540969, + "grad_norm": 0.1321028769016266, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 85580 + }, + { + "epoch": 0.3308670037574802, + "grad_norm": 0.1020321249961853, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 85590 + }, + { + "epoch": 0.33090566096086343, + "grad_norm": 0.11091432720422745, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 85600 + }, + { + "epoch": 0.33094431816424674, + "grad_norm": 0.12364307790994644, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 85610 + }, + { + "epoch": 0.33098297536763, + "grad_norm": 0.10222646594047546, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 85620 + }, + { + "epoch": 0.3310216325710133, + "grad_norm": 0.10115206241607666, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 85630 + }, + { + "epoch": 0.33106028977439655, + "grad_norm": 0.10824427008628845, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 85640 + }, + { + "epoch": 0.33109894697777986, + "grad_norm": 0.13098442554473877, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 85650 + }, + { + "epoch": 0.3311376041811631, + "grad_norm": 0.10785630345344543, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 85660 + }, + { + "epoch": 0.3311762613845464, + "grad_norm": 0.11627553403377533, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 85670 + }, + { + "epoch": 0.3312149185879297, + "grad_norm": 0.13079549372196198, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 85680 + }, + { + "epoch": 0.3312535757913129, + "grad_norm": 0.1064203754067421, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 85690 + }, + { + "epoch": 0.33129223299469623, + "grad_norm": 0.11663869023323059, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 85700 + }, + { + "epoch": 0.3313308901980795, + "grad_norm": 0.10477962344884872, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 85710 + }, + { + "epoch": 0.3313695474014628, + "grad_norm": 0.12362811714410782, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 85720 + }, + { + "epoch": 0.33140820460484605, + "grad_norm": 0.1224687248468399, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 85730 + }, + { + "epoch": 0.33144686180822935, + "grad_norm": 0.11334341019392014, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 85740 + }, + { + "epoch": 0.3314855190116126, + "grad_norm": 0.10793153941631317, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 85750 + }, + { + "epoch": 0.3315241762149959, + "grad_norm": 0.13817518949508667, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 85760 + }, + { + "epoch": 0.33156283341837917, + "grad_norm": 0.10457106679677963, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 85770 + }, + { + "epoch": 0.3316014906217625, + "grad_norm": 0.11196237057447433, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 85780 + }, + { + "epoch": 0.3316401478251457, + "grad_norm": 0.11701985448598862, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 85790 + }, + { + "epoch": 0.33167880502852903, + "grad_norm": 0.0993732213973999, + "learning_rate": 0.002, + "loss": 2.352, + "step": 85800 + }, + { + "epoch": 0.3317174622319123, + "grad_norm": 0.1316983848810196, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 85810 + }, + { + "epoch": 0.3317561194352956, + "grad_norm": 0.12413479387760162, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 85820 + }, + { + "epoch": 0.33179477663867885, + "grad_norm": 0.12227200716733932, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 85830 + }, + { + "epoch": 0.33183343384206215, + "grad_norm": 0.09567388147115707, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 85840 + }, + { + "epoch": 0.3318720910454454, + "grad_norm": 0.10324221104383469, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 85850 + }, + { + "epoch": 0.3319107482488287, + "grad_norm": 0.11741742491722107, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 85860 + }, + { + "epoch": 0.33194940545221197, + "grad_norm": 0.11068105697631836, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 85870 + }, + { + "epoch": 0.3319880626555952, + "grad_norm": 0.10848050564527512, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 85880 + }, + { + "epoch": 0.3320267198589785, + "grad_norm": 0.10607170313596725, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 85890 + }, + { + "epoch": 0.3320653770623618, + "grad_norm": 0.11788028478622437, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 85900 + }, + { + "epoch": 0.3321040342657451, + "grad_norm": 0.10743343830108643, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 85910 + }, + { + "epoch": 0.33214269146912834, + "grad_norm": 0.11620127409696579, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 85920 + }, + { + "epoch": 0.33218134867251164, + "grad_norm": 0.0988125130534172, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 85930 + }, + { + "epoch": 0.3322200058758949, + "grad_norm": 0.11408448219299316, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 85940 + }, + { + "epoch": 0.3322586630792782, + "grad_norm": 0.1145530492067337, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 85950 + }, + { + "epoch": 0.33229732028266146, + "grad_norm": 0.11560901999473572, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 85960 + }, + { + "epoch": 0.33233597748604476, + "grad_norm": 0.11214718222618103, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 85970 + }, + { + "epoch": 0.332374634689428, + "grad_norm": 0.09654606878757477, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 85980 + }, + { + "epoch": 0.3324132918928113, + "grad_norm": 0.098363496363163, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 85990 + }, + { + "epoch": 0.3324519490961946, + "grad_norm": 0.10288723558187485, + "learning_rate": 0.002, + "loss": 2.354, + "step": 86000 + }, + { + "epoch": 0.3324906062995779, + "grad_norm": 0.10426101833581924, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 86010 + }, + { + "epoch": 0.33252926350296114, + "grad_norm": 0.11998550593852997, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 86020 + }, + { + "epoch": 0.33256792070634444, + "grad_norm": 0.12091478705406189, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 86030 + }, + { + "epoch": 0.3326065779097277, + "grad_norm": 0.09535927325487137, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 86040 + }, + { + "epoch": 0.332645235113111, + "grad_norm": 0.11487176269292831, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 86050 + }, + { + "epoch": 0.33268389231649426, + "grad_norm": 0.11748003959655762, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 86060 + }, + { + "epoch": 0.3327225495198775, + "grad_norm": 0.108509361743927, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 86070 + }, + { + "epoch": 0.3327612067232608, + "grad_norm": 0.10883913934230804, + "learning_rate": 0.002, + "loss": 2.354, + "step": 86080 + }, + { + "epoch": 0.33279986392664407, + "grad_norm": 0.11875820904970169, + "learning_rate": 0.002, + "loss": 2.353, + "step": 86090 + }, + { + "epoch": 0.3328385211300274, + "grad_norm": 0.10039224475622177, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 86100 + }, + { + "epoch": 0.33287717833341063, + "grad_norm": 0.11563944071531296, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 86110 + }, + { + "epoch": 0.33291583553679394, + "grad_norm": 0.12889650464057922, + "learning_rate": 0.002, + "loss": 2.361, + "step": 86120 + }, + { + "epoch": 0.3329544927401772, + "grad_norm": 0.1014760285615921, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 86130 + }, + { + "epoch": 0.3329931499435605, + "grad_norm": 0.11509194225072861, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 86140 + }, + { + "epoch": 0.33303180714694375, + "grad_norm": 0.1407669335603714, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 86150 + }, + { + "epoch": 0.33307046435032706, + "grad_norm": 0.10478154569864273, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 86160 + }, + { + "epoch": 0.3331091215537103, + "grad_norm": 0.12240909785032272, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 86170 + }, + { + "epoch": 0.3331477787570936, + "grad_norm": 0.09189334511756897, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 86180 + }, + { + "epoch": 0.33318643596047687, + "grad_norm": 0.12059196829795837, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 86190 + }, + { + "epoch": 0.3332250931638602, + "grad_norm": 0.13283218443393707, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 86200 + }, + { + "epoch": 0.3332637503672434, + "grad_norm": 0.11821234226226807, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 86210 + }, + { + "epoch": 0.33330240757062674, + "grad_norm": 0.106609046459198, + "learning_rate": 0.002, + "loss": 2.353, + "step": 86220 + }, + { + "epoch": 0.33334106477401, + "grad_norm": 0.10692576318979263, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 86230 + }, + { + "epoch": 0.3333797219773933, + "grad_norm": 0.10893997550010681, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 86240 + }, + { + "epoch": 0.33341837918077655, + "grad_norm": 0.12112563848495483, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 86250 + }, + { + "epoch": 0.3334570363841598, + "grad_norm": 0.09665486216545105, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 86260 + }, + { + "epoch": 0.3334956935875431, + "grad_norm": 0.12508168816566467, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 86270 + }, + { + "epoch": 0.33353435079092636, + "grad_norm": 0.12956631183624268, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 86280 + }, + { + "epoch": 0.33357300799430967, + "grad_norm": 0.11325182765722275, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 86290 + }, + { + "epoch": 0.3336116651976929, + "grad_norm": 0.11303581297397614, + "learning_rate": 0.002, + "loss": 2.352, + "step": 86300 + }, + { + "epoch": 0.3336503224010762, + "grad_norm": 0.12054084241390228, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 86310 + }, + { + "epoch": 0.3336889796044595, + "grad_norm": 0.10581088066101074, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 86320 + }, + { + "epoch": 0.3337276368078428, + "grad_norm": 0.1152268648147583, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 86330 + }, + { + "epoch": 0.33376629401122604, + "grad_norm": 0.1067994236946106, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 86340 + }, + { + "epoch": 0.33380495121460935, + "grad_norm": 0.10510335117578506, + "learning_rate": 0.002, + "loss": 2.363, + "step": 86350 + }, + { + "epoch": 0.3338436084179926, + "grad_norm": 0.09770243614912033, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 86360 + }, + { + "epoch": 0.3338822656213759, + "grad_norm": 0.11518805474042892, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 86370 + }, + { + "epoch": 0.33392092282475916, + "grad_norm": 0.10489653050899506, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 86380 + }, + { + "epoch": 0.33395958002814247, + "grad_norm": 0.1306338906288147, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 86390 + }, + { + "epoch": 0.3339982372315257, + "grad_norm": 0.10293161869049072, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 86400 + }, + { + "epoch": 0.334036894434909, + "grad_norm": 0.10411642491817474, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 86410 + }, + { + "epoch": 0.3340755516382923, + "grad_norm": 0.1078294888138771, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 86420 + }, + { + "epoch": 0.33411420884167553, + "grad_norm": 0.10516948252916336, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 86430 + }, + { + "epoch": 0.33415286604505884, + "grad_norm": 0.12160937488079071, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 86440 + }, + { + "epoch": 0.3341915232484421, + "grad_norm": 0.10858595371246338, + "learning_rate": 0.002, + "loss": 2.343, + "step": 86450 + }, + { + "epoch": 0.3342301804518254, + "grad_norm": 0.12583279609680176, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 86460 + }, + { + "epoch": 0.33426883765520865, + "grad_norm": 0.10968731343746185, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 86470 + }, + { + "epoch": 0.33430749485859196, + "grad_norm": 0.10739333182573318, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 86480 + }, + { + "epoch": 0.3343461520619752, + "grad_norm": 0.09372472763061523, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 86490 + }, + { + "epoch": 0.3343848092653585, + "grad_norm": 0.10811075568199158, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 86500 + }, + { + "epoch": 0.33442346646874177, + "grad_norm": 0.10535812377929688, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 86510 + }, + { + "epoch": 0.3344621236721251, + "grad_norm": 0.0991438776254654, + "learning_rate": 0.002, + "loss": 2.341, + "step": 86520 + }, + { + "epoch": 0.33450078087550833, + "grad_norm": 0.09942366927862167, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 86530 + }, + { + "epoch": 0.33453943807889164, + "grad_norm": 0.10737288743257523, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 86540 + }, + { + "epoch": 0.3345780952822749, + "grad_norm": 0.10016623139381409, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 86550 + }, + { + "epoch": 0.3346167524856582, + "grad_norm": 0.1013760194182396, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 86560 + }, + { + "epoch": 0.33465540968904145, + "grad_norm": 0.09310334175825119, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 86570 + }, + { + "epoch": 0.33469406689242476, + "grad_norm": 0.13864712417125702, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 86580 + }, + { + "epoch": 0.334732724095808, + "grad_norm": 0.10814253240823746, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 86590 + }, + { + "epoch": 0.3347713812991913, + "grad_norm": 0.11107059568166733, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 86600 + }, + { + "epoch": 0.33481003850257457, + "grad_norm": 0.10960592329502106, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 86610 + }, + { + "epoch": 0.3348486957059578, + "grad_norm": 0.12890039384365082, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 86620 + }, + { + "epoch": 0.33488735290934113, + "grad_norm": 0.11162910610437393, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 86630 + }, + { + "epoch": 0.3349260101127244, + "grad_norm": 0.11151379346847534, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 86640 + }, + { + "epoch": 0.3349646673161077, + "grad_norm": 0.0964745432138443, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 86650 + }, + { + "epoch": 0.33500332451949094, + "grad_norm": 0.11261545866727829, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 86660 + }, + { + "epoch": 0.33504198172287425, + "grad_norm": 0.10220064222812653, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 86670 + }, + { + "epoch": 0.3350806389262575, + "grad_norm": 0.12906724214553833, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 86680 + }, + { + "epoch": 0.3351192961296408, + "grad_norm": 0.10271503031253815, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 86690 + }, + { + "epoch": 0.33515795333302406, + "grad_norm": 0.11177756637334824, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 86700 + }, + { + "epoch": 0.33519661053640737, + "grad_norm": 0.10489747673273087, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 86710 + }, + { + "epoch": 0.3352352677397906, + "grad_norm": 0.10679716616868973, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 86720 + }, + { + "epoch": 0.33527392494317393, + "grad_norm": 0.09327898919582367, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 86730 + }, + { + "epoch": 0.3353125821465572, + "grad_norm": 0.10325710475444794, + "learning_rate": 0.002, + "loss": 2.354, + "step": 86740 + }, + { + "epoch": 0.3353512393499405, + "grad_norm": 0.11265038698911667, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 86750 + }, + { + "epoch": 0.33538989655332374, + "grad_norm": 0.12953822314739227, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 86760 + }, + { + "epoch": 0.33542855375670705, + "grad_norm": 0.1095634251832962, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 86770 + }, + { + "epoch": 0.3354672109600903, + "grad_norm": 0.10133285075426102, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 86780 + }, + { + "epoch": 0.3355058681634736, + "grad_norm": 0.1050833910703659, + "learning_rate": 0.002, + "loss": 2.356, + "step": 86790 + }, + { + "epoch": 0.33554452536685686, + "grad_norm": 0.11059989780187607, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 86800 + }, + { + "epoch": 0.3355831825702401, + "grad_norm": 0.10115838050842285, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 86810 + }, + { + "epoch": 0.3356218397736234, + "grad_norm": 0.1260637491941452, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 86820 + }, + { + "epoch": 0.3356604969770067, + "grad_norm": 0.10474376380443573, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 86830 + }, + { + "epoch": 0.33569915418039, + "grad_norm": 0.10274680703878403, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 86840 + }, + { + "epoch": 0.33573781138377323, + "grad_norm": 0.10135255008935928, + "learning_rate": 0.002, + "loss": 2.365, + "step": 86850 + }, + { + "epoch": 0.33577646858715654, + "grad_norm": 0.12209215760231018, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 86860 + }, + { + "epoch": 0.3358151257905398, + "grad_norm": 0.0951184555888176, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 86870 + }, + { + "epoch": 0.3358537829939231, + "grad_norm": 0.10640900582075119, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 86880 + }, + { + "epoch": 0.33589244019730635, + "grad_norm": 0.08872721344232559, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 86890 + }, + { + "epoch": 0.33593109740068966, + "grad_norm": 0.12645526230335236, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 86900 + }, + { + "epoch": 0.3359697546040729, + "grad_norm": 0.127633735537529, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 86910 + }, + { + "epoch": 0.3360084118074562, + "grad_norm": 0.10818655043840408, + "learning_rate": 0.002, + "loss": 2.352, + "step": 86920 + }, + { + "epoch": 0.33604706901083947, + "grad_norm": 0.13198406994342804, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 86930 + }, + { + "epoch": 0.3360857262142228, + "grad_norm": 0.1011178269982338, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 86940 + }, + { + "epoch": 0.33612438341760603, + "grad_norm": 0.1177314817905426, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 86950 + }, + { + "epoch": 0.33616304062098934, + "grad_norm": 0.11771221458911896, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 86960 + }, + { + "epoch": 0.3362016978243726, + "grad_norm": 0.11730633676052094, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 86970 + }, + { + "epoch": 0.3362403550277559, + "grad_norm": 0.10943958163261414, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 86980 + }, + { + "epoch": 0.33627901223113915, + "grad_norm": 0.10826060175895691, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 86990 + }, + { + "epoch": 0.3363176694345224, + "grad_norm": 0.10730686783790588, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 87000 + }, + { + "epoch": 0.3363563266379057, + "grad_norm": 0.1155748963356018, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 87010 + }, + { + "epoch": 0.33639498384128896, + "grad_norm": 0.10520046949386597, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 87020 + }, + { + "epoch": 0.33643364104467227, + "grad_norm": 0.11502497643232346, + "learning_rate": 0.002, + "loss": 2.3794, + "step": 87030 + }, + { + "epoch": 0.3364722982480555, + "grad_norm": 0.11109548062086105, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 87040 + }, + { + "epoch": 0.33651095545143883, + "grad_norm": 0.10664211958646774, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 87050 + }, + { + "epoch": 0.3365496126548221, + "grad_norm": 0.10760653764009476, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 87060 + }, + { + "epoch": 0.3365882698582054, + "grad_norm": 0.12719734013080597, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 87070 + }, + { + "epoch": 0.33662692706158864, + "grad_norm": 0.12140123546123505, + "learning_rate": 0.002, + "loss": 2.363, + "step": 87080 + }, + { + "epoch": 0.33666558426497195, + "grad_norm": 0.1130446195602417, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 87090 + }, + { + "epoch": 0.3367042414683552, + "grad_norm": 0.10486172884702682, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 87100 + }, + { + "epoch": 0.3367428986717385, + "grad_norm": 0.11093088239431381, + "learning_rate": 0.002, + "loss": 2.347, + "step": 87110 + }, + { + "epoch": 0.33678155587512176, + "grad_norm": 0.10533785820007324, + "learning_rate": 0.002, + "loss": 2.356, + "step": 87120 + }, + { + "epoch": 0.33682021307850507, + "grad_norm": 0.10832514613866806, + "learning_rate": 0.002, + "loss": 2.35, + "step": 87130 + }, + { + "epoch": 0.3368588702818883, + "grad_norm": 0.11467445641756058, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 87140 + }, + { + "epoch": 0.33689752748527163, + "grad_norm": 0.10939925909042358, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 87150 + }, + { + "epoch": 0.3369361846886549, + "grad_norm": 0.12103261053562164, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 87160 + }, + { + "epoch": 0.33697484189203813, + "grad_norm": 0.0894695296883583, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 87170 + }, + { + "epoch": 0.33701349909542144, + "grad_norm": 0.10069043189287186, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 87180 + }, + { + "epoch": 0.3370521562988047, + "grad_norm": 0.10153687745332718, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 87190 + }, + { + "epoch": 0.337090813502188, + "grad_norm": 0.1355798840522766, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 87200 + }, + { + "epoch": 0.33712947070557125, + "grad_norm": 0.11290738731622696, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 87210 + }, + { + "epoch": 0.33716812790895456, + "grad_norm": 0.12131005525588989, + "learning_rate": 0.002, + "loss": 2.355, + "step": 87220 + }, + { + "epoch": 0.3372067851123378, + "grad_norm": 0.10573593527078629, + "learning_rate": 0.002, + "loss": 2.35, + "step": 87230 + }, + { + "epoch": 0.3372454423157211, + "grad_norm": 0.1039854884147644, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 87240 + }, + { + "epoch": 0.3372840995191044, + "grad_norm": 0.1108429804444313, + "learning_rate": 0.002, + "loss": 2.349, + "step": 87250 + }, + { + "epoch": 0.3373227567224877, + "grad_norm": 0.10919930040836334, + "learning_rate": 0.002, + "loss": 2.358, + "step": 87260 + }, + { + "epoch": 0.33736141392587093, + "grad_norm": 0.12618575990200043, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 87270 + }, + { + "epoch": 0.33740007112925424, + "grad_norm": 0.10285606980323792, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 87280 + }, + { + "epoch": 0.3374387283326375, + "grad_norm": 0.10287448018789291, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 87290 + }, + { + "epoch": 0.3374773855360208, + "grad_norm": 0.11148592084646225, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 87300 + }, + { + "epoch": 0.33751604273940405, + "grad_norm": 0.10517746210098267, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 87310 + }, + { + "epoch": 0.33755469994278736, + "grad_norm": 0.11742986738681793, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 87320 + }, + { + "epoch": 0.3375933571461706, + "grad_norm": 0.10818766802549362, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 87330 + }, + { + "epoch": 0.3376320143495539, + "grad_norm": 0.1280340552330017, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 87340 + }, + { + "epoch": 0.3376706715529372, + "grad_norm": 0.131245419383049, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 87350 + }, + { + "epoch": 0.3377093287563204, + "grad_norm": 0.10057683289051056, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 87360 + }, + { + "epoch": 0.33774798595970373, + "grad_norm": 0.11801260709762573, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 87370 + }, + { + "epoch": 0.337786643163087, + "grad_norm": 0.10263387113809586, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 87380 + }, + { + "epoch": 0.3378253003664703, + "grad_norm": 0.10963422060012817, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 87390 + }, + { + "epoch": 0.33786395756985355, + "grad_norm": 0.10239773243665695, + "learning_rate": 0.002, + "loss": 2.354, + "step": 87400 + }, + { + "epoch": 0.33790261477323685, + "grad_norm": 0.14015544950962067, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 87410 + }, + { + "epoch": 0.3379412719766201, + "grad_norm": 0.11772413551807404, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 87420 + }, + { + "epoch": 0.3379799291800034, + "grad_norm": 0.09805656224489212, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 87430 + }, + { + "epoch": 0.33801858638338667, + "grad_norm": 0.12166161090135574, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 87440 + }, + { + "epoch": 0.33805724358677, + "grad_norm": 0.09780203551054001, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 87450 + }, + { + "epoch": 0.3380959007901532, + "grad_norm": 0.1183888167142868, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 87460 + }, + { + "epoch": 0.33813455799353653, + "grad_norm": 0.1062530130147934, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 87470 + }, + { + "epoch": 0.3381732151969198, + "grad_norm": 0.09859632700681686, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 87480 + }, + { + "epoch": 0.3382118724003031, + "grad_norm": 0.10625289380550385, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 87490 + }, + { + "epoch": 0.33825052960368635, + "grad_norm": 0.1421256959438324, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 87500 + }, + { + "epoch": 0.33828918680706965, + "grad_norm": 0.11129707843065262, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 87510 + }, + { + "epoch": 0.3383278440104529, + "grad_norm": 0.1156439557671547, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 87520 + }, + { + "epoch": 0.3383665012138362, + "grad_norm": 0.10404882580041885, + "learning_rate": 0.002, + "loss": 2.335, + "step": 87530 + }, + { + "epoch": 0.33840515841721946, + "grad_norm": 0.10040571540594101, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 87540 + }, + { + "epoch": 0.3384438156206027, + "grad_norm": 0.10529720783233643, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 87550 + }, + { + "epoch": 0.338482472823986, + "grad_norm": 0.1062098890542984, + "learning_rate": 0.002, + "loss": 2.364, + "step": 87560 + }, + { + "epoch": 0.3385211300273693, + "grad_norm": 0.15115439891815186, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 87570 + }, + { + "epoch": 0.3385597872307526, + "grad_norm": 0.10261274874210358, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 87580 + }, + { + "epoch": 0.33859844443413584, + "grad_norm": 0.10737352818250656, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 87590 + }, + { + "epoch": 0.33863710163751914, + "grad_norm": 0.12016598880290985, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 87600 + }, + { + "epoch": 0.3386757588409024, + "grad_norm": 0.11309704929590225, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 87610 + }, + { + "epoch": 0.3387144160442857, + "grad_norm": 0.12417756021022797, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 87620 + }, + { + "epoch": 0.33875307324766896, + "grad_norm": 0.10050065070390701, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 87630 + }, + { + "epoch": 0.33879173045105226, + "grad_norm": 0.11637856811285019, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 87640 + }, + { + "epoch": 0.3388303876544355, + "grad_norm": 0.12719853222370148, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 87650 + }, + { + "epoch": 0.3388690448578188, + "grad_norm": 0.34996548295021057, + "learning_rate": 0.002, + "loss": 2.338, + "step": 87660 + }, + { + "epoch": 0.3389077020612021, + "grad_norm": 0.100518599152565, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 87670 + }, + { + "epoch": 0.3389463592645854, + "grad_norm": 0.11817089468240738, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 87680 + }, + { + "epoch": 0.33898501646796864, + "grad_norm": 0.1029106006026268, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 87690 + }, + { + "epoch": 0.33902367367135194, + "grad_norm": 0.11708049476146698, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 87700 + }, + { + "epoch": 0.3390623308747352, + "grad_norm": 0.1058112159371376, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 87710 + }, + { + "epoch": 0.3391009880781185, + "grad_norm": 0.11289533972740173, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 87720 + }, + { + "epoch": 0.33913964528150176, + "grad_norm": 0.12030244618654251, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 87730 + }, + { + "epoch": 0.339178302484885, + "grad_norm": 0.10465981066226959, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 87740 + }, + { + "epoch": 0.3392169596882683, + "grad_norm": 0.11341659724712372, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 87750 + }, + { + "epoch": 0.33925561689165157, + "grad_norm": 0.10221397876739502, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 87760 + }, + { + "epoch": 0.3392942740950349, + "grad_norm": 0.11447005718946457, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 87770 + }, + { + "epoch": 0.33933293129841813, + "grad_norm": 0.10479959100484848, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 87780 + }, + { + "epoch": 0.33937158850180144, + "grad_norm": 0.14646686613559723, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 87790 + }, + { + "epoch": 0.3394102457051847, + "grad_norm": 0.11034227907657623, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 87800 + }, + { + "epoch": 0.339448902908568, + "grad_norm": 0.116272933781147, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 87810 + }, + { + "epoch": 0.33948756011195125, + "grad_norm": 0.12207002937793732, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 87820 + }, + { + "epoch": 0.33952621731533456, + "grad_norm": 0.1189301460981369, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 87830 + }, + { + "epoch": 0.3395648745187178, + "grad_norm": 0.115324467420578, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 87840 + }, + { + "epoch": 0.3396035317221011, + "grad_norm": 0.11082760244607925, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 87850 + }, + { + "epoch": 0.33964218892548437, + "grad_norm": 0.13625425100326538, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 87860 + }, + { + "epoch": 0.3396808461288677, + "grad_norm": 0.10276339948177338, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 87870 + }, + { + "epoch": 0.3397195033322509, + "grad_norm": 0.10634419322013855, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 87880 + }, + { + "epoch": 0.33975816053563423, + "grad_norm": 0.11744745820760727, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 87890 + }, + { + "epoch": 0.3397968177390175, + "grad_norm": 0.10343390703201294, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 87900 + }, + { + "epoch": 0.33983547494240074, + "grad_norm": 0.10963012278079987, + "learning_rate": 0.002, + "loss": 2.348, + "step": 87910 + }, + { + "epoch": 0.33987413214578405, + "grad_norm": 0.11228656768798828, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 87920 + }, + { + "epoch": 0.3399127893491673, + "grad_norm": 0.1294107437133789, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 87930 + }, + { + "epoch": 0.3399514465525506, + "grad_norm": 0.13598628342151642, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 87940 + }, + { + "epoch": 0.33999010375593386, + "grad_norm": 0.11039962619543076, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 87950 + }, + { + "epoch": 0.34002876095931717, + "grad_norm": 0.1061733067035675, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 87960 + }, + { + "epoch": 0.3400674181627004, + "grad_norm": 0.12140204012393951, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 87970 + }, + { + "epoch": 0.3401060753660837, + "grad_norm": 0.09958139806985855, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 87980 + }, + { + "epoch": 0.340144732569467, + "grad_norm": 0.10445655882358551, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 87990 + }, + { + "epoch": 0.3401833897728503, + "grad_norm": 0.12749019265174866, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 88000 + }, + { + "epoch": 0.34022204697623354, + "grad_norm": 0.10322877019643784, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 88010 + }, + { + "epoch": 0.34026070417961685, + "grad_norm": 0.12513993680477142, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 88020 + }, + { + "epoch": 0.3402993613830001, + "grad_norm": 0.10777510702610016, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 88030 + }, + { + "epoch": 0.3403380185863834, + "grad_norm": 0.11263205856084824, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 88040 + }, + { + "epoch": 0.34037667578976666, + "grad_norm": 0.11279111355543137, + "learning_rate": 0.002, + "loss": 2.358, + "step": 88050 + }, + { + "epoch": 0.34041533299314997, + "grad_norm": 0.10997018218040466, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 88060 + }, + { + "epoch": 0.3404539901965332, + "grad_norm": 0.12844707071781158, + "learning_rate": 0.002, + "loss": 2.357, + "step": 88070 + }, + { + "epoch": 0.3404926473999165, + "grad_norm": 0.10197935253381729, + "learning_rate": 0.002, + "loss": 2.3763, + "step": 88080 + }, + { + "epoch": 0.3405313046032998, + "grad_norm": 0.09389302879571915, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 88090 + }, + { + "epoch": 0.34056996180668303, + "grad_norm": 0.10149596631526947, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 88100 + }, + { + "epoch": 0.34060861901006634, + "grad_norm": 0.10434433817863464, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 88110 + }, + { + "epoch": 0.3406472762134496, + "grad_norm": 0.11394469439983368, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 88120 + }, + { + "epoch": 0.3406859334168329, + "grad_norm": 0.1021641194820404, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 88130 + }, + { + "epoch": 0.34072459062021615, + "grad_norm": 0.11137614399194717, + "learning_rate": 0.002, + "loss": 2.336, + "step": 88140 + }, + { + "epoch": 0.34076324782359946, + "grad_norm": 0.1125793531537056, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 88150 + }, + { + "epoch": 0.3408019050269827, + "grad_norm": 0.11630513519048691, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 88160 + }, + { + "epoch": 0.340840562230366, + "grad_norm": 0.1093737930059433, + "learning_rate": 0.002, + "loss": 2.3708, + "step": 88170 + }, + { + "epoch": 0.34087921943374927, + "grad_norm": 0.10950718820095062, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 88180 + }, + { + "epoch": 0.3409178766371326, + "grad_norm": 0.12360899150371552, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 88190 + }, + { + "epoch": 0.34095653384051583, + "grad_norm": 0.0962943509221077, + "learning_rate": 0.002, + "loss": 2.343, + "step": 88200 + }, + { + "epoch": 0.34099519104389914, + "grad_norm": 0.10609276592731476, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 88210 + }, + { + "epoch": 0.3410338482472824, + "grad_norm": 0.10679814964532852, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 88220 + }, + { + "epoch": 0.3410725054506657, + "grad_norm": 0.1043362095952034, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 88230 + }, + { + "epoch": 0.34111116265404895, + "grad_norm": 0.1262073963880539, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 88240 + }, + { + "epoch": 0.34114981985743226, + "grad_norm": 0.13010995090007782, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 88250 + }, + { + "epoch": 0.3411884770608155, + "grad_norm": 0.10564157366752625, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 88260 + }, + { + "epoch": 0.3412271342641988, + "grad_norm": 0.1198716014623642, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 88270 + }, + { + "epoch": 0.34126579146758207, + "grad_norm": 0.11889446526765823, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 88280 + }, + { + "epoch": 0.3413044486709653, + "grad_norm": 0.10770682245492935, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 88290 + }, + { + "epoch": 0.34134310587434863, + "grad_norm": 0.09319154173135757, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 88300 + }, + { + "epoch": 0.3413817630777319, + "grad_norm": 0.11082503944635391, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 88310 + }, + { + "epoch": 0.3414204202811152, + "grad_norm": 0.12003304809331894, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 88320 + }, + { + "epoch": 0.34145907748449844, + "grad_norm": 0.10606896877288818, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 88330 + }, + { + "epoch": 0.34149773468788175, + "grad_norm": 0.11933515220880508, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 88340 + }, + { + "epoch": 0.341536391891265, + "grad_norm": 0.11698786169290543, + "learning_rate": 0.002, + "loss": 2.344, + "step": 88350 + }, + { + "epoch": 0.3415750490946483, + "grad_norm": 0.12531299889087677, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 88360 + }, + { + "epoch": 0.34161370629803156, + "grad_norm": 0.12377554923295975, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 88370 + }, + { + "epoch": 0.34165236350141487, + "grad_norm": 0.10473481565713882, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 88380 + }, + { + "epoch": 0.3416910207047981, + "grad_norm": 0.12325559556484222, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 88390 + }, + { + "epoch": 0.34172967790818143, + "grad_norm": 0.09602730721235275, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 88400 + }, + { + "epoch": 0.3417683351115647, + "grad_norm": 0.1111978143453598, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 88410 + }, + { + "epoch": 0.341806992314948, + "grad_norm": 0.1265242099761963, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 88420 + }, + { + "epoch": 0.34184564951833124, + "grad_norm": 0.10700623691082001, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 88430 + }, + { + "epoch": 0.34188430672171455, + "grad_norm": 0.10761568695306778, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 88440 + }, + { + "epoch": 0.3419229639250978, + "grad_norm": 0.10627592355012894, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 88450 + }, + { + "epoch": 0.3419616211284811, + "grad_norm": 0.09872213751077652, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 88460 + }, + { + "epoch": 0.34200027833186436, + "grad_norm": 0.1186567172408104, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 88470 + }, + { + "epoch": 0.3420389355352476, + "grad_norm": 0.11164540797472, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 88480 + }, + { + "epoch": 0.3420775927386309, + "grad_norm": 0.09923987090587616, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 88490 + }, + { + "epoch": 0.3421162499420142, + "grad_norm": 0.12012343853712082, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 88500 + }, + { + "epoch": 0.3421549071453975, + "grad_norm": 0.13408435881137848, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 88510 + }, + { + "epoch": 0.34219356434878073, + "grad_norm": 0.11128830909729004, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 88520 + }, + { + "epoch": 0.34223222155216404, + "grad_norm": 0.13357031345367432, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 88530 + }, + { + "epoch": 0.3422708787555473, + "grad_norm": 0.1076250970363617, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 88540 + }, + { + "epoch": 0.3423095359589306, + "grad_norm": 0.1268393099308014, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 88550 + }, + { + "epoch": 0.34234819316231385, + "grad_norm": 0.14251774549484253, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 88560 + }, + { + "epoch": 0.34238685036569716, + "grad_norm": 0.10408865660429001, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 88570 + }, + { + "epoch": 0.3424255075690804, + "grad_norm": 0.10077012330293655, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 88580 + }, + { + "epoch": 0.3424641647724637, + "grad_norm": 0.11243431270122528, + "learning_rate": 0.002, + "loss": 2.349, + "step": 88590 + }, + { + "epoch": 0.34250282197584697, + "grad_norm": 0.11361908912658691, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 88600 + }, + { + "epoch": 0.3425414791792303, + "grad_norm": 0.10903273522853851, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 88610 + }, + { + "epoch": 0.34258013638261353, + "grad_norm": 0.14040608704090118, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 88620 + }, + { + "epoch": 0.34261879358599684, + "grad_norm": 0.12683793902397156, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 88630 + }, + { + "epoch": 0.3426574507893801, + "grad_norm": 0.11451121419668198, + "learning_rate": 0.002, + "loss": 2.352, + "step": 88640 + }, + { + "epoch": 0.3426961079927634, + "grad_norm": 0.10799139738082886, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 88650 + }, + { + "epoch": 0.34273476519614665, + "grad_norm": 0.11651720106601715, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 88660 + }, + { + "epoch": 0.3427734223995299, + "grad_norm": 0.10717178881168365, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 88670 + }, + { + "epoch": 0.3428120796029132, + "grad_norm": 0.11072812229394913, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 88680 + }, + { + "epoch": 0.34285073680629646, + "grad_norm": 0.14107677340507507, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 88690 + }, + { + "epoch": 0.34288939400967977, + "grad_norm": 0.11483429372310638, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 88700 + }, + { + "epoch": 0.342928051213063, + "grad_norm": 0.10340628027915955, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 88710 + }, + { + "epoch": 0.34296670841644633, + "grad_norm": 0.10923565924167633, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 88720 + }, + { + "epoch": 0.3430053656198296, + "grad_norm": 0.10067480057477951, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 88730 + }, + { + "epoch": 0.3430440228232129, + "grad_norm": 0.10290886461734772, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 88740 + }, + { + "epoch": 0.34308268002659614, + "grad_norm": 0.11461234837770462, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 88750 + }, + { + "epoch": 0.34312133722997945, + "grad_norm": 0.10550641268491745, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 88760 + }, + { + "epoch": 0.3431599944333627, + "grad_norm": 0.12442265450954437, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 88770 + }, + { + "epoch": 0.343198651636746, + "grad_norm": 0.10196845233440399, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 88780 + }, + { + "epoch": 0.34323730884012926, + "grad_norm": 0.11056846380233765, + "learning_rate": 0.002, + "loss": 2.36, + "step": 88790 + }, + { + "epoch": 0.34327596604351257, + "grad_norm": 0.09794145077466965, + "learning_rate": 0.002, + "loss": 2.36, + "step": 88800 + }, + { + "epoch": 0.3433146232468958, + "grad_norm": 0.10579000413417816, + "learning_rate": 0.002, + "loss": 2.342, + "step": 88810 + }, + { + "epoch": 0.34335328045027913, + "grad_norm": 0.09847233444452286, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 88820 + }, + { + "epoch": 0.3433919376536624, + "grad_norm": 0.1011461690068245, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 88830 + }, + { + "epoch": 0.34343059485704563, + "grad_norm": 0.09516558051109314, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 88840 + }, + { + "epoch": 0.34346925206042894, + "grad_norm": 0.12700580060482025, + "learning_rate": 0.002, + "loss": 2.35, + "step": 88850 + }, + { + "epoch": 0.3435079092638122, + "grad_norm": 0.10590207576751709, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 88860 + }, + { + "epoch": 0.3435465664671955, + "grad_norm": 0.11522980779409409, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 88870 + }, + { + "epoch": 0.34358522367057875, + "grad_norm": 0.11492440104484558, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 88880 + }, + { + "epoch": 0.34362388087396206, + "grad_norm": 0.10037875920534134, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 88890 + }, + { + "epoch": 0.3436625380773453, + "grad_norm": 0.10939756035804749, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 88900 + }, + { + "epoch": 0.3437011952807286, + "grad_norm": 0.10372549295425415, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 88910 + }, + { + "epoch": 0.3437398524841119, + "grad_norm": 0.2707937955856323, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 88920 + }, + { + "epoch": 0.3437785096874952, + "grad_norm": 0.13005994260311127, + "learning_rate": 0.002, + "loss": 2.3827, + "step": 88930 + }, + { + "epoch": 0.34381716689087843, + "grad_norm": 0.20814430713653564, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 88940 + }, + { + "epoch": 0.34385582409426174, + "grad_norm": 0.09909890592098236, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 88950 + }, + { + "epoch": 0.343894481297645, + "grad_norm": 0.12622849643230438, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 88960 + }, + { + "epoch": 0.3439331385010283, + "grad_norm": 0.10739883780479431, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 88970 + }, + { + "epoch": 0.34397179570441155, + "grad_norm": 0.11089082062244415, + "learning_rate": 0.002, + "loss": 2.354, + "step": 88980 + }, + { + "epoch": 0.34401045290779486, + "grad_norm": 0.10172960162162781, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 88990 + }, + { + "epoch": 0.3440491101111781, + "grad_norm": 0.11513733118772507, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 89000 + }, + { + "epoch": 0.3440877673145614, + "grad_norm": 0.1210126280784607, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 89010 + }, + { + "epoch": 0.3441264245179447, + "grad_norm": 0.1345120519399643, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 89020 + }, + { + "epoch": 0.3441650817213279, + "grad_norm": 0.11055257171392441, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 89030 + }, + { + "epoch": 0.34420373892471123, + "grad_norm": 0.11494717001914978, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 89040 + }, + { + "epoch": 0.3442423961280945, + "grad_norm": 0.11111489683389664, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 89050 + }, + { + "epoch": 0.3442810533314778, + "grad_norm": 0.10929816216230392, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 89060 + }, + { + "epoch": 0.34431971053486105, + "grad_norm": 0.10010305047035217, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 89070 + }, + { + "epoch": 0.34435836773824435, + "grad_norm": 0.11715249717235565, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 89080 + }, + { + "epoch": 0.3443970249416276, + "grad_norm": 0.11301356554031372, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 89090 + }, + { + "epoch": 0.3444356821450109, + "grad_norm": 0.12198659777641296, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 89100 + }, + { + "epoch": 0.34447433934839417, + "grad_norm": 0.4755155146121979, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 89110 + }, + { + "epoch": 0.3445129965517775, + "grad_norm": 0.11209255456924438, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 89120 + }, + { + "epoch": 0.3445516537551607, + "grad_norm": 0.11255837976932526, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 89130 + }, + { + "epoch": 0.34459031095854403, + "grad_norm": 0.12965063750743866, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 89140 + }, + { + "epoch": 0.3446289681619273, + "grad_norm": 0.1032148078083992, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 89150 + }, + { + "epoch": 0.3446676253653106, + "grad_norm": 0.09087871760129929, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 89160 + }, + { + "epoch": 0.34470628256869384, + "grad_norm": 0.11577743291854858, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 89170 + }, + { + "epoch": 0.34474493977207715, + "grad_norm": 0.10745842754840851, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 89180 + }, + { + "epoch": 0.3447835969754604, + "grad_norm": 0.10638607293367386, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 89190 + }, + { + "epoch": 0.3448222541788437, + "grad_norm": 0.15563400089740753, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 89200 + }, + { + "epoch": 0.34486091138222696, + "grad_norm": 0.11064454168081284, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 89210 + }, + { + "epoch": 0.3448995685856102, + "grad_norm": 0.1113789826631546, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 89220 + }, + { + "epoch": 0.3449382257889935, + "grad_norm": 0.10496684908866882, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 89230 + }, + { + "epoch": 0.3449768829923768, + "grad_norm": 0.11125270277261734, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 89240 + }, + { + "epoch": 0.3450155401957601, + "grad_norm": 0.10501991957426071, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 89250 + }, + { + "epoch": 0.34505419739914334, + "grad_norm": 0.10074132680892944, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 89260 + }, + { + "epoch": 0.34509285460252664, + "grad_norm": 0.1131800040602684, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 89270 + }, + { + "epoch": 0.3451315118059099, + "grad_norm": 0.11575585603713989, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 89280 + }, + { + "epoch": 0.3451701690092932, + "grad_norm": 0.1217227578163147, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 89290 + }, + { + "epoch": 0.34520882621267646, + "grad_norm": 0.10264372825622559, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 89300 + }, + { + "epoch": 0.34524748341605976, + "grad_norm": 0.11179578304290771, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 89310 + }, + { + "epoch": 0.345286140619443, + "grad_norm": 0.12197361886501312, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 89320 + }, + { + "epoch": 0.3453247978228263, + "grad_norm": 0.1156570091843605, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 89330 + }, + { + "epoch": 0.3453634550262096, + "grad_norm": 0.12871617078781128, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 89340 + }, + { + "epoch": 0.3454021122295929, + "grad_norm": 0.11505485326051712, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 89350 + }, + { + "epoch": 0.34544076943297614, + "grad_norm": 0.1145724356174469, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 89360 + }, + { + "epoch": 0.34547942663635944, + "grad_norm": 0.11084239184856415, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 89370 + }, + { + "epoch": 0.3455180838397427, + "grad_norm": 0.12926331162452698, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 89380 + }, + { + "epoch": 0.345556741043126, + "grad_norm": 0.10337939858436584, + "learning_rate": 0.002, + "loss": 2.361, + "step": 89390 + }, + { + "epoch": 0.34559539824650926, + "grad_norm": 0.09303661435842514, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 89400 + }, + { + "epoch": 0.3456340554498925, + "grad_norm": 0.10564062744379044, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 89410 + }, + { + "epoch": 0.3456727126532758, + "grad_norm": 0.11615221202373505, + "learning_rate": 0.002, + "loss": 2.344, + "step": 89420 + }, + { + "epoch": 0.34571136985665907, + "grad_norm": 0.11174018681049347, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 89430 + }, + { + "epoch": 0.3457500270600424, + "grad_norm": 0.09725714474916458, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 89440 + }, + { + "epoch": 0.3457886842634256, + "grad_norm": 0.22712481021881104, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 89450 + }, + { + "epoch": 0.34582734146680894, + "grad_norm": 0.09932104498147964, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 89460 + }, + { + "epoch": 0.3458659986701922, + "grad_norm": 0.103012815117836, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 89470 + }, + { + "epoch": 0.3459046558735755, + "grad_norm": 0.09817779809236526, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 89480 + }, + { + "epoch": 0.34594331307695875, + "grad_norm": 0.11735199391841888, + "learning_rate": 0.002, + "loss": 2.347, + "step": 89490 + }, + { + "epoch": 0.34598197028034205, + "grad_norm": 0.11065150797367096, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 89500 + }, + { + "epoch": 0.3460206274837253, + "grad_norm": 0.10216960310935974, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 89510 + }, + { + "epoch": 0.3460592846871086, + "grad_norm": 0.1042240783572197, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 89520 + }, + { + "epoch": 0.34609794189049187, + "grad_norm": 0.11820337921380997, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 89530 + }, + { + "epoch": 0.3461365990938752, + "grad_norm": 0.10278623551130295, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 89540 + }, + { + "epoch": 0.3461752562972584, + "grad_norm": 0.10534875094890594, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 89550 + }, + { + "epoch": 0.34621391350064173, + "grad_norm": 0.10304959118366241, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 89560 + }, + { + "epoch": 0.346252570704025, + "grad_norm": 0.10242988914251328, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 89570 + }, + { + "epoch": 0.34629122790740824, + "grad_norm": 0.11503741890192032, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 89580 + }, + { + "epoch": 0.34632988511079155, + "grad_norm": 0.1033119484782219, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 89590 + }, + { + "epoch": 0.3463685423141748, + "grad_norm": 0.11573673039674759, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 89600 + }, + { + "epoch": 0.3464071995175581, + "grad_norm": 0.10920635610818863, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 89610 + }, + { + "epoch": 0.34644585672094136, + "grad_norm": 0.10660285502672195, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 89620 + }, + { + "epoch": 0.34648451392432467, + "grad_norm": 0.106463722884655, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 89630 + }, + { + "epoch": 0.3465231711277079, + "grad_norm": 0.13084150850772858, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 89640 + }, + { + "epoch": 0.3465618283310912, + "grad_norm": 0.13473057746887207, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 89650 + }, + { + "epoch": 0.3466004855344745, + "grad_norm": 0.10127367079257965, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 89660 + }, + { + "epoch": 0.3466391427378578, + "grad_norm": 0.10174165666103363, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 89670 + }, + { + "epoch": 0.34667779994124104, + "grad_norm": 0.102707639336586, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 89680 + }, + { + "epoch": 0.34671645714462435, + "grad_norm": 0.10903124511241913, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 89690 + }, + { + "epoch": 0.3467551143480076, + "grad_norm": 0.10413419455289841, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 89700 + }, + { + "epoch": 0.3467937715513909, + "grad_norm": 0.11480054259300232, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 89710 + }, + { + "epoch": 0.34683242875477416, + "grad_norm": 0.1044558733701706, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 89720 + }, + { + "epoch": 0.34687108595815747, + "grad_norm": 0.1191057562828064, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 89730 + }, + { + "epoch": 0.3469097431615407, + "grad_norm": 0.11219650506973267, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 89740 + }, + { + "epoch": 0.346948400364924, + "grad_norm": 0.11830902099609375, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 89750 + }, + { + "epoch": 0.3469870575683073, + "grad_norm": 0.09900355339050293, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 89760 + }, + { + "epoch": 0.34702571477169053, + "grad_norm": 0.10634540766477585, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 89770 + }, + { + "epoch": 0.34706437197507384, + "grad_norm": 0.11318907886743546, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 89780 + }, + { + "epoch": 0.3471030291784571, + "grad_norm": 0.10234291106462479, + "learning_rate": 0.002, + "loss": 2.355, + "step": 89790 + }, + { + "epoch": 0.3471416863818404, + "grad_norm": 0.11988101899623871, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 89800 + }, + { + "epoch": 0.34718034358522365, + "grad_norm": 0.10497704148292542, + "learning_rate": 0.002, + "loss": 2.35, + "step": 89810 + }, + { + "epoch": 0.34721900078860696, + "grad_norm": 0.10977409780025482, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 89820 + }, + { + "epoch": 0.3472576579919902, + "grad_norm": 0.10474051535129547, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 89830 + }, + { + "epoch": 0.3472963151953735, + "grad_norm": 0.10059578716754913, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 89840 + }, + { + "epoch": 0.34733497239875677, + "grad_norm": 0.13399332761764526, + "learning_rate": 0.002, + "loss": 2.35, + "step": 89850 + }, + { + "epoch": 0.3473736296021401, + "grad_norm": 0.09893032908439636, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 89860 + }, + { + "epoch": 0.34741228680552333, + "grad_norm": 0.10207752883434296, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 89870 + }, + { + "epoch": 0.34745094400890664, + "grad_norm": 0.10035461187362671, + "learning_rate": 0.002, + "loss": 2.367, + "step": 89880 + }, + { + "epoch": 0.3474896012122899, + "grad_norm": 0.1164204478263855, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 89890 + }, + { + "epoch": 0.3475282584156732, + "grad_norm": 0.11555982381105423, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 89900 + }, + { + "epoch": 0.34756691561905645, + "grad_norm": 0.13107863068580627, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 89910 + }, + { + "epoch": 0.34760557282243976, + "grad_norm": 0.0959523543715477, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 89920 + }, + { + "epoch": 0.347644230025823, + "grad_norm": 0.1214970275759697, + "learning_rate": 0.002, + "loss": 2.357, + "step": 89930 + }, + { + "epoch": 0.3476828872292063, + "grad_norm": 0.1260724663734436, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 89940 + }, + { + "epoch": 0.34772154443258957, + "grad_norm": 0.10034012794494629, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 89950 + }, + { + "epoch": 0.3477602016359728, + "grad_norm": 0.11203020066022873, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 89960 + }, + { + "epoch": 0.34779885883935613, + "grad_norm": 0.12978847324848175, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 89970 + }, + { + "epoch": 0.3478375160427394, + "grad_norm": 0.09564714133739471, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 89980 + }, + { + "epoch": 0.3478761732461227, + "grad_norm": 0.10480118542909622, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 89990 + }, + { + "epoch": 0.34791483044950594, + "grad_norm": 0.09294721484184265, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 90000 + }, + { + "epoch": 0.34795348765288925, + "grad_norm": 0.12380876392126083, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 90010 + }, + { + "epoch": 0.3479921448562725, + "grad_norm": 0.1039481908082962, + "learning_rate": 0.002, + "loss": 2.339, + "step": 90020 + }, + { + "epoch": 0.3480308020596558, + "grad_norm": 0.10823719948530197, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 90030 + }, + { + "epoch": 0.34806945926303906, + "grad_norm": 0.09384870529174805, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 90040 + }, + { + "epoch": 0.34810811646642237, + "grad_norm": 0.0991966649889946, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 90050 + }, + { + "epoch": 0.3481467736698056, + "grad_norm": 0.11492978036403656, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 90060 + }, + { + "epoch": 0.34818543087318893, + "grad_norm": 0.11849114298820496, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 90070 + }, + { + "epoch": 0.3482240880765722, + "grad_norm": 0.10737583786249161, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 90080 + }, + { + "epoch": 0.3482627452799555, + "grad_norm": 0.09581415355205536, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 90090 + }, + { + "epoch": 0.34830140248333874, + "grad_norm": 0.11195375770330429, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 90100 + }, + { + "epoch": 0.34834005968672205, + "grad_norm": 0.11681769043207169, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 90110 + }, + { + "epoch": 0.3483787168901053, + "grad_norm": 0.13141022622585297, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 90120 + }, + { + "epoch": 0.3484173740934886, + "grad_norm": 0.10962757468223572, + "learning_rate": 0.002, + "loss": 2.352, + "step": 90130 + }, + { + "epoch": 0.34845603129687186, + "grad_norm": 0.11447460949420929, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 90140 + }, + { + "epoch": 0.3484946885002551, + "grad_norm": 0.09598016738891602, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 90150 + }, + { + "epoch": 0.3485333457036384, + "grad_norm": 0.1261773258447647, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 90160 + }, + { + "epoch": 0.34857200290702167, + "grad_norm": 0.11035182327032089, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 90170 + }, + { + "epoch": 0.348610660110405, + "grad_norm": 0.10185796022415161, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 90180 + }, + { + "epoch": 0.34864931731378823, + "grad_norm": 0.11506524682044983, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 90190 + }, + { + "epoch": 0.34868797451717154, + "grad_norm": 0.12680265307426453, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 90200 + }, + { + "epoch": 0.3487266317205548, + "grad_norm": 0.10287638753652573, + "learning_rate": 0.002, + "loss": 2.363, + "step": 90210 + }, + { + "epoch": 0.3487652889239381, + "grad_norm": 0.1062975525856018, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 90220 + }, + { + "epoch": 0.34880394612732135, + "grad_norm": 0.09651152789592743, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 90230 + }, + { + "epoch": 0.34884260333070466, + "grad_norm": 0.40010392665863037, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 90240 + }, + { + "epoch": 0.3488812605340879, + "grad_norm": 0.12321013957262039, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 90250 + }, + { + "epoch": 0.3489199177374712, + "grad_norm": 0.1322363018989563, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 90260 + }, + { + "epoch": 0.34895857494085447, + "grad_norm": 0.11214771866798401, + "learning_rate": 0.002, + "loss": 2.355, + "step": 90270 + }, + { + "epoch": 0.3489972321442378, + "grad_norm": 0.10390046238899231, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 90280 + }, + { + "epoch": 0.34903588934762103, + "grad_norm": 0.0927385538816452, + "learning_rate": 0.002, + "loss": 2.3767, + "step": 90290 + }, + { + "epoch": 0.34907454655100434, + "grad_norm": 0.12240613996982574, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 90300 + }, + { + "epoch": 0.3491132037543876, + "grad_norm": 0.10265913605690002, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 90310 + }, + { + "epoch": 0.3491518609577709, + "grad_norm": 0.1110464334487915, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 90320 + }, + { + "epoch": 0.34919051816115415, + "grad_norm": 0.11584184318780899, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 90330 + }, + { + "epoch": 0.3492291753645374, + "grad_norm": 0.1128770038485527, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 90340 + }, + { + "epoch": 0.3492678325679207, + "grad_norm": 0.09741054475307465, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 90350 + }, + { + "epoch": 0.34930648977130396, + "grad_norm": 0.09995657205581665, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 90360 + }, + { + "epoch": 0.34934514697468727, + "grad_norm": 0.11530742049217224, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 90370 + }, + { + "epoch": 0.3493838041780705, + "grad_norm": 0.11915342509746552, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 90380 + }, + { + "epoch": 0.34942246138145383, + "grad_norm": 0.10356609523296356, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 90390 + }, + { + "epoch": 0.3494611185848371, + "grad_norm": 0.10015129297971725, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 90400 + }, + { + "epoch": 0.3494997757882204, + "grad_norm": 0.11576341837644577, + "learning_rate": 0.002, + "loss": 2.355, + "step": 90410 + }, + { + "epoch": 0.34953843299160364, + "grad_norm": 0.10489057749509811, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 90420 + }, + { + "epoch": 0.34957709019498695, + "grad_norm": 0.11370553821325302, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 90430 + }, + { + "epoch": 0.3496157473983702, + "grad_norm": 0.10937584191560745, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 90440 + }, + { + "epoch": 0.3496544046017535, + "grad_norm": 0.11915868520736694, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 90450 + }, + { + "epoch": 0.34969306180513676, + "grad_norm": 0.09813954681158066, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 90460 + }, + { + "epoch": 0.34973171900852007, + "grad_norm": 0.11999206244945526, + "learning_rate": 0.002, + "loss": 2.34, + "step": 90470 + }, + { + "epoch": 0.3497703762119033, + "grad_norm": 0.14466355741024017, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 90480 + }, + { + "epoch": 0.34980903341528663, + "grad_norm": 0.09629233181476593, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 90490 + }, + { + "epoch": 0.3498476906186699, + "grad_norm": 0.10470282286405563, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 90500 + }, + { + "epoch": 0.34988634782205313, + "grad_norm": 0.1057397648692131, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 90510 + }, + { + "epoch": 0.34992500502543644, + "grad_norm": 0.1281382441520691, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 90520 + }, + { + "epoch": 0.3499636622288197, + "grad_norm": 0.11314621567726135, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 90530 + }, + { + "epoch": 0.350002319432203, + "grad_norm": 0.11300663650035858, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 90540 + }, + { + "epoch": 0.35004097663558625, + "grad_norm": 0.10833021998405457, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 90550 + }, + { + "epoch": 0.35007963383896956, + "grad_norm": 0.09223330020904541, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 90560 + }, + { + "epoch": 0.3501182910423528, + "grad_norm": 0.1024753525853157, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 90570 + }, + { + "epoch": 0.3501569482457361, + "grad_norm": 0.09533923119306564, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 90580 + }, + { + "epoch": 0.3501956054491194, + "grad_norm": 0.11984021961688995, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 90590 + }, + { + "epoch": 0.3502342626525027, + "grad_norm": 0.10845741629600525, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 90600 + }, + { + "epoch": 0.35027291985588593, + "grad_norm": 0.09411056339740753, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 90610 + }, + { + "epoch": 0.35031157705926924, + "grad_norm": 0.10180425643920898, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 90620 + }, + { + "epoch": 0.3503502342626525, + "grad_norm": 0.24714742600917816, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 90630 + }, + { + "epoch": 0.3503888914660358, + "grad_norm": 0.10621386766433716, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 90640 + }, + { + "epoch": 0.35042754866941905, + "grad_norm": 0.09178213775157928, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 90650 + }, + { + "epoch": 0.35046620587280236, + "grad_norm": 0.10342089831829071, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 90660 + }, + { + "epoch": 0.3505048630761856, + "grad_norm": 0.11341840028762817, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 90670 + }, + { + "epoch": 0.3505435202795689, + "grad_norm": 0.18417048454284668, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 90680 + }, + { + "epoch": 0.3505821774829522, + "grad_norm": 0.10052413493394852, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 90690 + }, + { + "epoch": 0.3506208346863354, + "grad_norm": 0.1067756861448288, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 90700 + }, + { + "epoch": 0.35065949188971873, + "grad_norm": 0.10531529039144516, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 90710 + }, + { + "epoch": 0.350698149093102, + "grad_norm": 0.09352817386388779, + "learning_rate": 0.002, + "loss": 2.3768, + "step": 90720 + }, + { + "epoch": 0.3507368062964853, + "grad_norm": 0.1080123707652092, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 90730 + }, + { + "epoch": 0.35077546349986855, + "grad_norm": 0.10450432449579239, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 90740 + }, + { + "epoch": 0.35081412070325185, + "grad_norm": 0.11024756729602814, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 90750 + }, + { + "epoch": 0.3508527779066351, + "grad_norm": 0.10455089062452316, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 90760 + }, + { + "epoch": 0.3508914351100184, + "grad_norm": 0.10989495366811752, + "learning_rate": 0.002, + "loss": 2.34, + "step": 90770 + }, + { + "epoch": 0.35093009231340166, + "grad_norm": 0.25357547402381897, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 90780 + }, + { + "epoch": 0.350968749516785, + "grad_norm": 0.11106446385383606, + "learning_rate": 0.002, + "loss": 2.368, + "step": 90790 + }, + { + "epoch": 0.3510074067201682, + "grad_norm": 0.10034093260765076, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 90800 + }, + { + "epoch": 0.35104606392355153, + "grad_norm": 0.10782653838396072, + "learning_rate": 0.002, + "loss": 2.367, + "step": 90810 + }, + { + "epoch": 0.3510847211269348, + "grad_norm": 0.1085231602191925, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 90820 + }, + { + "epoch": 0.3511233783303181, + "grad_norm": 0.11838402599096298, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 90830 + }, + { + "epoch": 0.35116203553370134, + "grad_norm": 0.11609060317277908, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 90840 + }, + { + "epoch": 0.35120069273708465, + "grad_norm": 0.12586942315101624, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 90850 + }, + { + "epoch": 0.3512393499404679, + "grad_norm": 0.10402925312519073, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 90860 + }, + { + "epoch": 0.3512780071438512, + "grad_norm": 0.128147691488266, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 90870 + }, + { + "epoch": 0.35131666434723446, + "grad_norm": 0.11510083824396133, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 90880 + }, + { + "epoch": 0.3513553215506177, + "grad_norm": 0.10362055897712708, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 90890 + }, + { + "epoch": 0.351393978754001, + "grad_norm": 0.09541057795286179, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 90900 + }, + { + "epoch": 0.3514326359573843, + "grad_norm": 0.12204904854297638, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 90910 + }, + { + "epoch": 0.3514712931607676, + "grad_norm": 0.11423654854297638, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 90920 + }, + { + "epoch": 0.35150995036415084, + "grad_norm": 0.11182309687137604, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 90930 + }, + { + "epoch": 0.35154860756753414, + "grad_norm": 0.11528758704662323, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 90940 + }, + { + "epoch": 0.3515872647709174, + "grad_norm": 0.11533716320991516, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 90950 + }, + { + "epoch": 0.3516259219743007, + "grad_norm": 0.103913314640522, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 90960 + }, + { + "epoch": 0.35166457917768396, + "grad_norm": 0.10832104831933975, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 90970 + }, + { + "epoch": 0.35170323638106726, + "grad_norm": 0.12107133865356445, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 90980 + }, + { + "epoch": 0.3517418935844505, + "grad_norm": 0.10678855329751968, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 90990 + }, + { + "epoch": 0.3517805507878338, + "grad_norm": 0.10596594214439392, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 91000 + }, + { + "epoch": 0.3518192079912171, + "grad_norm": 0.11420460045337677, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 91010 + }, + { + "epoch": 0.3518578651946004, + "grad_norm": 0.38878318667411804, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 91020 + }, + { + "epoch": 0.35189652239798364, + "grad_norm": 0.0983964204788208, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 91030 + }, + { + "epoch": 0.35193517960136694, + "grad_norm": 0.10400891304016113, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 91040 + }, + { + "epoch": 0.3519738368047502, + "grad_norm": 0.11215876042842865, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 91050 + }, + { + "epoch": 0.3520124940081335, + "grad_norm": 0.12313003838062286, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 91060 + }, + { + "epoch": 0.35205115121151676, + "grad_norm": 0.11094030737876892, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 91070 + }, + { + "epoch": 0.3520898084149, + "grad_norm": 0.0991196483373642, + "learning_rate": 0.002, + "loss": 2.3686, + "step": 91080 + }, + { + "epoch": 0.3521284656182833, + "grad_norm": 0.10023964941501617, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 91090 + }, + { + "epoch": 0.35216712282166657, + "grad_norm": 0.10561109334230423, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 91100 + }, + { + "epoch": 0.3522057800250499, + "grad_norm": 0.11194334179162979, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 91110 + }, + { + "epoch": 0.3522444372284331, + "grad_norm": 0.11674809455871582, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 91120 + }, + { + "epoch": 0.35228309443181643, + "grad_norm": 0.09680789709091187, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 91130 + }, + { + "epoch": 0.3523217516351997, + "grad_norm": 0.11687670648097992, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 91140 + }, + { + "epoch": 0.352360408838583, + "grad_norm": 0.11844884604215622, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 91150 + }, + { + "epoch": 0.35239906604196625, + "grad_norm": 0.13373969495296478, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 91160 + }, + { + "epoch": 0.35243772324534955, + "grad_norm": 0.10809897631406784, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 91170 + }, + { + "epoch": 0.3524763804487328, + "grad_norm": 0.10605595260858536, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 91180 + }, + { + "epoch": 0.3525150376521161, + "grad_norm": 0.10121627897024155, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 91190 + }, + { + "epoch": 0.35255369485549937, + "grad_norm": 0.11170769482851028, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 91200 + }, + { + "epoch": 0.3525923520588827, + "grad_norm": 0.12599629163742065, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 91210 + }, + { + "epoch": 0.3526310092622659, + "grad_norm": 0.09302575886249542, + "learning_rate": 0.002, + "loss": 2.35, + "step": 91220 + }, + { + "epoch": 0.35266966646564923, + "grad_norm": 0.13696546852588654, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 91230 + }, + { + "epoch": 0.3527083236690325, + "grad_norm": 0.09611720591783524, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 91240 + }, + { + "epoch": 0.35274698087241574, + "grad_norm": 0.10214751213788986, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 91250 + }, + { + "epoch": 0.35278563807579905, + "grad_norm": 0.10894928127527237, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 91260 + }, + { + "epoch": 0.3528242952791823, + "grad_norm": 0.10802995413541794, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 91270 + }, + { + "epoch": 0.3528629524825656, + "grad_norm": 0.1036207526922226, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 91280 + }, + { + "epoch": 0.35290160968594886, + "grad_norm": 0.11407134681940079, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 91290 + }, + { + "epoch": 0.35294026688933217, + "grad_norm": 0.10703326016664505, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 91300 + }, + { + "epoch": 0.3529789240927154, + "grad_norm": 0.1012360006570816, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 91310 + }, + { + "epoch": 0.3530175812960987, + "grad_norm": 0.11459990590810776, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 91320 + }, + { + "epoch": 0.353056238499482, + "grad_norm": 0.1016920953989029, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 91330 + }, + { + "epoch": 0.3530948957028653, + "grad_norm": 0.11714129894971848, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 91340 + }, + { + "epoch": 0.35313355290624854, + "grad_norm": 0.10821188241243362, + "learning_rate": 0.002, + "loss": 2.364, + "step": 91350 + }, + { + "epoch": 0.35317221010963185, + "grad_norm": 0.11678121238946915, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 91360 + }, + { + "epoch": 0.3532108673130151, + "grad_norm": 0.11290333420038223, + "learning_rate": 0.002, + "loss": 2.347, + "step": 91370 + }, + { + "epoch": 0.3532495245163984, + "grad_norm": 0.10532896965742111, + "learning_rate": 0.002, + "loss": 2.356, + "step": 91380 + }, + { + "epoch": 0.35328818171978166, + "grad_norm": 0.10636654496192932, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 91390 + }, + { + "epoch": 0.35332683892316497, + "grad_norm": 0.09490149468183517, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 91400 + }, + { + "epoch": 0.3533654961265482, + "grad_norm": 0.09918931126594543, + "learning_rate": 0.002, + "loss": 2.362, + "step": 91410 + }, + { + "epoch": 0.3534041533299315, + "grad_norm": 0.10238192975521088, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 91420 + }, + { + "epoch": 0.3534428105333148, + "grad_norm": 0.12480217218399048, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 91430 + }, + { + "epoch": 0.35348146773669803, + "grad_norm": 0.10058391094207764, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 91440 + }, + { + "epoch": 0.35352012494008134, + "grad_norm": 0.10655000805854797, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 91450 + }, + { + "epoch": 0.3535587821434646, + "grad_norm": 0.10399568825960159, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 91460 + }, + { + "epoch": 0.3535974393468479, + "grad_norm": 0.1104901060461998, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 91470 + }, + { + "epoch": 0.35363609655023115, + "grad_norm": 0.11655103415250778, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 91480 + }, + { + "epoch": 0.35367475375361446, + "grad_norm": 0.11704276502132416, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 91490 + }, + { + "epoch": 0.3537134109569977, + "grad_norm": 0.10143926739692688, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 91500 + }, + { + "epoch": 0.353752068160381, + "grad_norm": 0.11378287523984909, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 91510 + }, + { + "epoch": 0.35379072536376427, + "grad_norm": 0.11769437789916992, + "learning_rate": 0.002, + "loss": 2.358, + "step": 91520 + }, + { + "epoch": 0.3538293825671476, + "grad_norm": 0.10199916362762451, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 91530 + }, + { + "epoch": 0.35386803977053083, + "grad_norm": 0.12485165148973465, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 91540 + }, + { + "epoch": 0.35390669697391414, + "grad_norm": 0.12636171281337738, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 91550 + }, + { + "epoch": 0.3539453541772974, + "grad_norm": 0.10836539417505264, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 91560 + }, + { + "epoch": 0.3539840113806807, + "grad_norm": 0.12219023704528809, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 91570 + }, + { + "epoch": 0.35402266858406395, + "grad_norm": 0.1115482971072197, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 91580 + }, + { + "epoch": 0.35406132578744726, + "grad_norm": 0.09615295380353928, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 91590 + }, + { + "epoch": 0.3540999829908305, + "grad_norm": 0.11683586239814758, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 91600 + }, + { + "epoch": 0.3541386401942138, + "grad_norm": 0.10799077898263931, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 91610 + }, + { + "epoch": 0.35417729739759707, + "grad_norm": 0.1008571982383728, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 91620 + }, + { + "epoch": 0.3542159546009803, + "grad_norm": 0.12394775450229645, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 91630 + }, + { + "epoch": 0.35425461180436363, + "grad_norm": 0.0992874875664711, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 91640 + }, + { + "epoch": 0.3542932690077469, + "grad_norm": 0.10617563128471375, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 91650 + }, + { + "epoch": 0.3543319262111302, + "grad_norm": 0.11367245763540268, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 91660 + }, + { + "epoch": 0.35437058341451344, + "grad_norm": 0.10529954731464386, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 91670 + }, + { + "epoch": 0.35440924061789675, + "grad_norm": 0.10327640175819397, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 91680 + }, + { + "epoch": 0.35444789782128, + "grad_norm": 0.12126418948173523, + "learning_rate": 0.002, + "loss": 2.362, + "step": 91690 + }, + { + "epoch": 0.3544865550246633, + "grad_norm": 0.10129111260175705, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 91700 + }, + { + "epoch": 0.35452521222804656, + "grad_norm": 0.11337399482727051, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 91710 + }, + { + "epoch": 0.35456386943142987, + "grad_norm": 0.1090293824672699, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 91720 + }, + { + "epoch": 0.3546025266348131, + "grad_norm": 0.11311867833137512, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 91730 + }, + { + "epoch": 0.3546411838381964, + "grad_norm": 0.11192495375871658, + "learning_rate": 0.002, + "loss": 2.351, + "step": 91740 + }, + { + "epoch": 0.3546798410415797, + "grad_norm": 0.11630705744028091, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 91750 + }, + { + "epoch": 0.354718498244963, + "grad_norm": 0.09618744254112244, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 91760 + }, + { + "epoch": 0.35475715544834624, + "grad_norm": 0.10563170164823532, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 91770 + }, + { + "epoch": 0.35479581265172955, + "grad_norm": 0.10414636135101318, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 91780 + }, + { + "epoch": 0.3548344698551128, + "grad_norm": 0.10870873928070068, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 91790 + }, + { + "epoch": 0.3548731270584961, + "grad_norm": 0.1033695712685585, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 91800 + }, + { + "epoch": 0.35491178426187936, + "grad_norm": 0.10390590131282806, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 91810 + }, + { + "epoch": 0.3549504414652626, + "grad_norm": 0.11860582232475281, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 91820 + }, + { + "epoch": 0.3549890986686459, + "grad_norm": 0.10430839657783508, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 91830 + }, + { + "epoch": 0.35502775587202917, + "grad_norm": 0.10912937670946121, + "learning_rate": 0.002, + "loss": 2.342, + "step": 91840 + }, + { + "epoch": 0.3550664130754125, + "grad_norm": 0.11180725693702698, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 91850 + }, + { + "epoch": 0.35510507027879573, + "grad_norm": 0.1123897135257721, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 91860 + }, + { + "epoch": 0.35514372748217904, + "grad_norm": 0.10792334377765656, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 91870 + }, + { + "epoch": 0.3551823846855623, + "grad_norm": 0.12448374181985855, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 91880 + }, + { + "epoch": 0.3552210418889456, + "grad_norm": 0.11364062875509262, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 91890 + }, + { + "epoch": 0.35525969909232885, + "grad_norm": 0.10941498726606369, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 91900 + }, + { + "epoch": 0.35529835629571216, + "grad_norm": 0.10665152221918106, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 91910 + }, + { + "epoch": 0.3553370134990954, + "grad_norm": 0.10886060446500778, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 91920 + }, + { + "epoch": 0.3553756707024787, + "grad_norm": 0.113824762403965, + "learning_rate": 0.002, + "loss": 2.3775, + "step": 91930 + }, + { + "epoch": 0.35541432790586197, + "grad_norm": 0.11197502166032791, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 91940 + }, + { + "epoch": 0.3554529851092453, + "grad_norm": 0.11675450950860977, + "learning_rate": 0.002, + "loss": 2.353, + "step": 91950 + }, + { + "epoch": 0.35549164231262853, + "grad_norm": 0.12733623385429382, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 91960 + }, + { + "epoch": 0.35553029951601184, + "grad_norm": 0.12649857997894287, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 91970 + }, + { + "epoch": 0.3555689567193951, + "grad_norm": 0.10965298861265182, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 91980 + }, + { + "epoch": 0.35560761392277834, + "grad_norm": 0.11883385479450226, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 91990 + }, + { + "epoch": 0.35564627112616165, + "grad_norm": 0.10928390920162201, + "learning_rate": 0.002, + "loss": 2.332, + "step": 92000 + }, + { + "epoch": 0.3556849283295449, + "grad_norm": 0.10265020281076431, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 92010 + }, + { + "epoch": 0.3557235855329282, + "grad_norm": 0.11968455463647842, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 92020 + }, + { + "epoch": 0.35576224273631146, + "grad_norm": 0.1736990213394165, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 92030 + }, + { + "epoch": 0.35580089993969477, + "grad_norm": 0.10316959768533707, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 92040 + }, + { + "epoch": 0.355839557143078, + "grad_norm": 0.10016027837991714, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 92050 + }, + { + "epoch": 0.35587821434646133, + "grad_norm": 0.11082405596971512, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 92060 + }, + { + "epoch": 0.3559168715498446, + "grad_norm": 0.11873820424079895, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 92070 + }, + { + "epoch": 0.3559555287532279, + "grad_norm": 0.13147886097431183, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 92080 + }, + { + "epoch": 0.35599418595661114, + "grad_norm": 0.29662472009658813, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 92090 + }, + { + "epoch": 0.35603284315999445, + "grad_norm": 0.1175631433725357, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 92100 + }, + { + "epoch": 0.3560715003633777, + "grad_norm": 0.10309901833534241, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 92110 + }, + { + "epoch": 0.356110157566761, + "grad_norm": 0.0919211357831955, + "learning_rate": 0.002, + "loss": 2.34, + "step": 92120 + }, + { + "epoch": 0.35614881477014426, + "grad_norm": 0.09981719404459, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 92130 + }, + { + "epoch": 0.35618747197352757, + "grad_norm": 0.1206381544470787, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 92140 + }, + { + "epoch": 0.3562261291769108, + "grad_norm": 0.12230429798364639, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 92150 + }, + { + "epoch": 0.35626478638029413, + "grad_norm": 0.10440311580896378, + "learning_rate": 0.002, + "loss": 2.357, + "step": 92160 + }, + { + "epoch": 0.3563034435836774, + "grad_norm": 0.11227347701787949, + "learning_rate": 0.002, + "loss": 2.349, + "step": 92170 + }, + { + "epoch": 0.35634210078706063, + "grad_norm": 0.1011897549033165, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 92180 + }, + { + "epoch": 0.35638075799044394, + "grad_norm": 0.11445823311805725, + "learning_rate": 0.002, + "loss": 2.366, + "step": 92190 + }, + { + "epoch": 0.3564194151938272, + "grad_norm": 0.0916280522942543, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 92200 + }, + { + "epoch": 0.3564580723972105, + "grad_norm": 0.1168455183506012, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 92210 + }, + { + "epoch": 0.35649672960059375, + "grad_norm": 0.12422209978103638, + "learning_rate": 0.002, + "loss": 2.348, + "step": 92220 + }, + { + "epoch": 0.35653538680397706, + "grad_norm": 0.10890578478574753, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 92230 + }, + { + "epoch": 0.3565740440073603, + "grad_norm": 0.11009981483221054, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 92240 + }, + { + "epoch": 0.3566127012107436, + "grad_norm": 0.0971464216709137, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 92250 + }, + { + "epoch": 0.3566513584141269, + "grad_norm": 0.10866507887840271, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 92260 + }, + { + "epoch": 0.3566900156175102, + "grad_norm": 0.10748264193534851, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 92270 + }, + { + "epoch": 0.35672867282089343, + "grad_norm": 0.12036815285682678, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 92280 + }, + { + "epoch": 0.35676733002427674, + "grad_norm": 0.1034461110830307, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 92290 + }, + { + "epoch": 0.35680598722766, + "grad_norm": 0.13951030373573303, + "learning_rate": 0.002, + "loss": 2.3743, + "step": 92300 + }, + { + "epoch": 0.3568446444310433, + "grad_norm": 0.0967438593506813, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 92310 + }, + { + "epoch": 0.35688330163442655, + "grad_norm": 0.10530319064855576, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 92320 + }, + { + "epoch": 0.35692195883780986, + "grad_norm": 0.10578849166631699, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 92330 + }, + { + "epoch": 0.3569606160411931, + "grad_norm": 0.10831478238105774, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 92340 + }, + { + "epoch": 0.3569992732445764, + "grad_norm": 0.10773373395204544, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 92350 + }, + { + "epoch": 0.3570379304479597, + "grad_norm": 0.11242599040269852, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 92360 + }, + { + "epoch": 0.3570765876513429, + "grad_norm": 0.12345028668642044, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 92370 + }, + { + "epoch": 0.35711524485472623, + "grad_norm": 0.10369107127189636, + "learning_rate": 0.002, + "loss": 2.354, + "step": 92380 + }, + { + "epoch": 0.3571539020581095, + "grad_norm": 0.1744854897260666, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 92390 + }, + { + "epoch": 0.3571925592614928, + "grad_norm": 0.09829877316951752, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 92400 + }, + { + "epoch": 0.35723121646487604, + "grad_norm": 0.12784981727600098, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 92410 + }, + { + "epoch": 0.35726987366825935, + "grad_norm": 0.10792503505945206, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 92420 + }, + { + "epoch": 0.3573085308716426, + "grad_norm": 0.10034052282571793, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 92430 + }, + { + "epoch": 0.3573471880750259, + "grad_norm": 0.13357554376125336, + "learning_rate": 0.002, + "loss": 2.357, + "step": 92440 + }, + { + "epoch": 0.35738584527840916, + "grad_norm": 0.28310683369636536, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 92450 + }, + { + "epoch": 0.35742450248179247, + "grad_norm": 0.1128840446472168, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 92460 + }, + { + "epoch": 0.3574631596851757, + "grad_norm": 0.14639827609062195, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 92470 + }, + { + "epoch": 0.35750181688855903, + "grad_norm": 0.11009307205677032, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 92480 + }, + { + "epoch": 0.3575404740919423, + "grad_norm": 0.10203824937343597, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 92490 + }, + { + "epoch": 0.3575791312953256, + "grad_norm": 0.11197981238365173, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 92500 + }, + { + "epoch": 0.35761778849870884, + "grad_norm": 0.1011284589767456, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 92510 + }, + { + "epoch": 0.35765644570209215, + "grad_norm": 0.11905599385499954, + "learning_rate": 0.002, + "loss": 2.354, + "step": 92520 + }, + { + "epoch": 0.3576951029054754, + "grad_norm": 0.10511179268360138, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 92530 + }, + { + "epoch": 0.3577337601088587, + "grad_norm": 0.10166637599468231, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 92540 + }, + { + "epoch": 0.35777241731224196, + "grad_norm": 0.11526691913604736, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 92550 + }, + { + "epoch": 0.3578110745156252, + "grad_norm": 0.12250806391239166, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 92560 + }, + { + "epoch": 0.3578497317190085, + "grad_norm": 0.10860671103000641, + "learning_rate": 0.002, + "loss": 2.371, + "step": 92570 + }, + { + "epoch": 0.3578883889223918, + "grad_norm": 0.11007049679756165, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 92580 + }, + { + "epoch": 0.3579270461257751, + "grad_norm": 0.11075068265199661, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 92590 + }, + { + "epoch": 0.35796570332915834, + "grad_norm": 0.09433702379465103, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 92600 + }, + { + "epoch": 0.35800436053254164, + "grad_norm": 0.09903281182050705, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 92610 + }, + { + "epoch": 0.3580430177359249, + "grad_norm": 0.12536199390888214, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 92620 + }, + { + "epoch": 0.3580816749393082, + "grad_norm": 0.10655872523784637, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 92630 + }, + { + "epoch": 0.35812033214269146, + "grad_norm": 0.09485611319541931, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 92640 + }, + { + "epoch": 0.35815898934607476, + "grad_norm": 0.11137279123067856, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 92650 + }, + { + "epoch": 0.358197646549458, + "grad_norm": 0.11436645686626434, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 92660 + }, + { + "epoch": 0.3582363037528413, + "grad_norm": 0.12434104084968567, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 92670 + }, + { + "epoch": 0.3582749609562246, + "grad_norm": 0.10417872667312622, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 92680 + }, + { + "epoch": 0.3583136181596079, + "grad_norm": 0.10744566470384598, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 92690 + }, + { + "epoch": 0.35835227536299114, + "grad_norm": 0.09762832522392273, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 92700 + }, + { + "epoch": 0.35839093256637444, + "grad_norm": 0.12386251240968704, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 92710 + }, + { + "epoch": 0.3584295897697577, + "grad_norm": 0.09850963950157166, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 92720 + }, + { + "epoch": 0.358468246973141, + "grad_norm": 0.12494229525327682, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 92730 + }, + { + "epoch": 0.35850690417652425, + "grad_norm": 0.1048731803894043, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 92740 + }, + { + "epoch": 0.3585455613799075, + "grad_norm": 0.11765920370817184, + "learning_rate": 0.002, + "loss": 2.3748, + "step": 92750 + }, + { + "epoch": 0.3585842185832908, + "grad_norm": 0.11133372783660889, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 92760 + }, + { + "epoch": 0.35862287578667407, + "grad_norm": 0.10941710323095322, + "learning_rate": 0.002, + "loss": 2.3691, + "step": 92770 + }, + { + "epoch": 0.3586615329900574, + "grad_norm": 0.10292758792638779, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 92780 + }, + { + "epoch": 0.3587001901934406, + "grad_norm": 0.09566858410835266, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 92790 + }, + { + "epoch": 0.35873884739682393, + "grad_norm": 0.1483062505722046, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 92800 + }, + { + "epoch": 0.3587775046002072, + "grad_norm": 0.11509834229946136, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 92810 + }, + { + "epoch": 0.3588161618035905, + "grad_norm": 0.09733151644468307, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 92820 + }, + { + "epoch": 0.35885481900697375, + "grad_norm": 0.09017828851938248, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 92830 + }, + { + "epoch": 0.35889347621035705, + "grad_norm": 0.10793974995613098, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 92840 + }, + { + "epoch": 0.3589321334137403, + "grad_norm": 0.10240405797958374, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 92850 + }, + { + "epoch": 0.3589707906171236, + "grad_norm": 0.09181945770978928, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 92860 + }, + { + "epoch": 0.35900944782050687, + "grad_norm": 0.1125139519572258, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 92870 + }, + { + "epoch": 0.3590481050238902, + "grad_norm": 0.09814155846834183, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 92880 + }, + { + "epoch": 0.3590867622272734, + "grad_norm": 0.12032115459442139, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 92890 + }, + { + "epoch": 0.35912541943065673, + "grad_norm": 0.10703227669000626, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 92900 + }, + { + "epoch": 0.35916407663404, + "grad_norm": 0.11409148573875427, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 92910 + }, + { + "epoch": 0.35920273383742324, + "grad_norm": 0.11414051055908203, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 92920 + }, + { + "epoch": 0.35924139104080655, + "grad_norm": 0.10449480265378952, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 92930 + }, + { + "epoch": 0.3592800482441898, + "grad_norm": 0.1115059182047844, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 92940 + }, + { + "epoch": 0.3593187054475731, + "grad_norm": 0.11202369630336761, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 92950 + }, + { + "epoch": 0.35935736265095636, + "grad_norm": 0.10093695670366287, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 92960 + }, + { + "epoch": 0.35939601985433967, + "grad_norm": 0.10710848122835159, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 92970 + }, + { + "epoch": 0.3594346770577229, + "grad_norm": 0.11425944417715073, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 92980 + }, + { + "epoch": 0.3594733342611062, + "grad_norm": 0.12227421253919601, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 92990 + }, + { + "epoch": 0.3595119914644895, + "grad_norm": 0.10208730399608612, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 93000 + }, + { + "epoch": 0.3595506486678728, + "grad_norm": 0.1007249504327774, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 93010 + }, + { + "epoch": 0.35958930587125604, + "grad_norm": 0.10328896343708038, + "learning_rate": 0.002, + "loss": 2.352, + "step": 93020 + }, + { + "epoch": 0.35962796307463935, + "grad_norm": 0.10905642807483673, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 93030 + }, + { + "epoch": 0.3596666202780226, + "grad_norm": 0.11170773953199387, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 93040 + }, + { + "epoch": 0.3597052774814059, + "grad_norm": 0.09574315696954727, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 93050 + }, + { + "epoch": 0.35974393468478916, + "grad_norm": 0.1102205142378807, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 93060 + }, + { + "epoch": 0.35978259188817246, + "grad_norm": 0.10546736419200897, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 93070 + }, + { + "epoch": 0.3598212490915557, + "grad_norm": 0.11542915552854538, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 93080 + }, + { + "epoch": 0.359859906294939, + "grad_norm": 0.11506014317274094, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 93090 + }, + { + "epoch": 0.3598985634983223, + "grad_norm": 0.09840735793113708, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 93100 + }, + { + "epoch": 0.35993722070170553, + "grad_norm": 0.09480399638414383, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 93110 + }, + { + "epoch": 0.35997587790508884, + "grad_norm": 0.12192349880933762, + "learning_rate": 0.002, + "loss": 2.336, + "step": 93120 + }, + { + "epoch": 0.3600145351084721, + "grad_norm": 0.103517085313797, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 93130 + }, + { + "epoch": 0.3600531923118554, + "grad_norm": 0.10403070598840714, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 93140 + }, + { + "epoch": 0.36009184951523865, + "grad_norm": 0.09937585890293121, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 93150 + }, + { + "epoch": 0.36013050671862196, + "grad_norm": 0.10563259571790695, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 93160 + }, + { + "epoch": 0.3601691639220052, + "grad_norm": 0.11061891913414001, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 93170 + }, + { + "epoch": 0.3602078211253885, + "grad_norm": 0.11007753014564514, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 93180 + }, + { + "epoch": 0.36024647832877177, + "grad_norm": 0.111729197204113, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 93190 + }, + { + "epoch": 0.3602851355321551, + "grad_norm": 0.09627048671245575, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 93200 + }, + { + "epoch": 0.36032379273553833, + "grad_norm": 0.12040142714977264, + "learning_rate": 0.002, + "loss": 2.349, + "step": 93210 + }, + { + "epoch": 0.36036244993892164, + "grad_norm": 0.11313582956790924, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 93220 + }, + { + "epoch": 0.3604011071423049, + "grad_norm": 0.11437612771987915, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 93230 + }, + { + "epoch": 0.3604397643456882, + "grad_norm": 0.10665919631719589, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 93240 + }, + { + "epoch": 0.36047842154907145, + "grad_norm": 0.10124839842319489, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 93250 + }, + { + "epoch": 0.36051707875245476, + "grad_norm": 0.11167119443416595, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 93260 + }, + { + "epoch": 0.360555735955838, + "grad_norm": 0.09860999882221222, + "learning_rate": 0.002, + "loss": 2.352, + "step": 93270 + }, + { + "epoch": 0.3605943931592213, + "grad_norm": 0.11736667901277542, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 93280 + }, + { + "epoch": 0.36063305036260457, + "grad_norm": 0.10274846106767654, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 93290 + }, + { + "epoch": 0.3606717075659878, + "grad_norm": 0.10463331639766693, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 93300 + }, + { + "epoch": 0.36071036476937113, + "grad_norm": 0.10493109375238419, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 93310 + }, + { + "epoch": 0.3607490219727544, + "grad_norm": 0.09283468127250671, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 93320 + }, + { + "epoch": 0.3607876791761377, + "grad_norm": 0.11488457024097443, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 93330 + }, + { + "epoch": 0.36082633637952094, + "grad_norm": 0.11543713510036469, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 93340 + }, + { + "epoch": 0.36086499358290425, + "grad_norm": 0.10279402881860733, + "learning_rate": 0.002, + "loss": 2.338, + "step": 93350 + }, + { + "epoch": 0.3609036507862875, + "grad_norm": 0.09951184689998627, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 93360 + }, + { + "epoch": 0.3609423079896708, + "grad_norm": 0.12271188944578171, + "learning_rate": 0.002, + "loss": 2.344, + "step": 93370 + }, + { + "epoch": 0.36098096519305406, + "grad_norm": 0.09874992072582245, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 93380 + }, + { + "epoch": 0.36101962239643737, + "grad_norm": 0.09354733675718307, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 93390 + }, + { + "epoch": 0.3610582795998206, + "grad_norm": 0.1220618411898613, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 93400 + }, + { + "epoch": 0.3610969368032039, + "grad_norm": 0.1061968058347702, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 93410 + }, + { + "epoch": 0.3611355940065872, + "grad_norm": 0.10884738713502884, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 93420 + }, + { + "epoch": 0.3611742512099705, + "grad_norm": 0.11903372406959534, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 93430 + }, + { + "epoch": 0.36121290841335374, + "grad_norm": 0.11023060977458954, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 93440 + }, + { + "epoch": 0.36125156561673705, + "grad_norm": 0.10786955803632736, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 93450 + }, + { + "epoch": 0.3612902228201203, + "grad_norm": 0.14083674550056458, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 93460 + }, + { + "epoch": 0.3613288800235036, + "grad_norm": 0.10121428966522217, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 93470 + }, + { + "epoch": 0.36136753722688686, + "grad_norm": 0.1078413799405098, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 93480 + }, + { + "epoch": 0.3614061944302701, + "grad_norm": 0.1193716898560524, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 93490 + }, + { + "epoch": 0.3614448516336534, + "grad_norm": 0.10673259943723679, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 93500 + }, + { + "epoch": 0.36148350883703667, + "grad_norm": 0.10572133958339691, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 93510 + }, + { + "epoch": 0.36152216604042, + "grad_norm": 0.10543181002140045, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 93520 + }, + { + "epoch": 0.36156082324380323, + "grad_norm": 0.09912768006324768, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 93530 + }, + { + "epoch": 0.36159948044718654, + "grad_norm": 0.09820456057786942, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 93540 + }, + { + "epoch": 0.3616381376505698, + "grad_norm": 0.10524751991033554, + "learning_rate": 0.002, + "loss": 2.354, + "step": 93550 + }, + { + "epoch": 0.3616767948539531, + "grad_norm": 0.09994732588529587, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 93560 + }, + { + "epoch": 0.36171545205733635, + "grad_norm": 0.10982286185026169, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 93570 + }, + { + "epoch": 0.36175410926071966, + "grad_norm": 0.09946732968091965, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 93580 + }, + { + "epoch": 0.3617927664641029, + "grad_norm": 0.09362184256315231, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 93590 + }, + { + "epoch": 0.3618314236674862, + "grad_norm": 0.10326167941093445, + "learning_rate": 0.002, + "loss": 2.349, + "step": 93600 + }, + { + "epoch": 0.36187008087086947, + "grad_norm": 0.12770886719226837, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 93610 + }, + { + "epoch": 0.3619087380742528, + "grad_norm": 0.10822170972824097, + "learning_rate": 0.002, + "loss": 2.344, + "step": 93620 + }, + { + "epoch": 0.36194739527763603, + "grad_norm": 0.09560711681842804, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 93630 + }, + { + "epoch": 0.36198605248101934, + "grad_norm": 0.1178121343255043, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 93640 + }, + { + "epoch": 0.3620247096844026, + "grad_norm": 0.11329429596662521, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 93650 + }, + { + "epoch": 0.36206336688778584, + "grad_norm": 0.11173105239868164, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 93660 + }, + { + "epoch": 0.36210202409116915, + "grad_norm": 0.09975706785917282, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 93670 + }, + { + "epoch": 0.3621406812945524, + "grad_norm": 0.10781252384185791, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 93680 + }, + { + "epoch": 0.3621793384979357, + "grad_norm": 0.11460427939891815, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 93690 + }, + { + "epoch": 0.36221799570131896, + "grad_norm": 0.10809577256441116, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 93700 + }, + { + "epoch": 0.36225665290470227, + "grad_norm": 0.1188875287771225, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 93710 + }, + { + "epoch": 0.3622953101080855, + "grad_norm": 0.128997340798378, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 93720 + }, + { + "epoch": 0.36233396731146883, + "grad_norm": 0.1061416044831276, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 93730 + }, + { + "epoch": 0.3623726245148521, + "grad_norm": 0.10427704453468323, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 93740 + }, + { + "epoch": 0.3624112817182354, + "grad_norm": 0.10611186176538467, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 93750 + }, + { + "epoch": 0.36244993892161864, + "grad_norm": 0.10901859402656555, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 93760 + }, + { + "epoch": 0.36248859612500195, + "grad_norm": 0.13348504900932312, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 93770 + }, + { + "epoch": 0.3625272533283852, + "grad_norm": 0.09349879622459412, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 93780 + }, + { + "epoch": 0.3625659105317685, + "grad_norm": 0.10900308191776276, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 93790 + }, + { + "epoch": 0.36260456773515176, + "grad_norm": 0.12062409520149231, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 93800 + }, + { + "epoch": 0.36264322493853507, + "grad_norm": 0.12152603268623352, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 93810 + }, + { + "epoch": 0.3626818821419183, + "grad_norm": 0.09724075347185135, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 93820 + }, + { + "epoch": 0.36272053934530163, + "grad_norm": 0.10395587980747223, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 93830 + }, + { + "epoch": 0.3627591965486849, + "grad_norm": 0.0977049246430397, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 93840 + }, + { + "epoch": 0.36279785375206813, + "grad_norm": 0.10224491357803345, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 93850 + }, + { + "epoch": 0.36283651095545144, + "grad_norm": 0.1509615033864975, + "learning_rate": 0.002, + "loss": 2.347, + "step": 93860 + }, + { + "epoch": 0.3628751681588347, + "grad_norm": 0.1311606913805008, + "learning_rate": 0.002, + "loss": 2.365, + "step": 93870 + }, + { + "epoch": 0.362913825362218, + "grad_norm": 0.11439214646816254, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 93880 + }, + { + "epoch": 0.36295248256560125, + "grad_norm": 0.1027848944067955, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 93890 + }, + { + "epoch": 0.36299113976898456, + "grad_norm": 0.11457744240760803, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 93900 + }, + { + "epoch": 0.3630297969723678, + "grad_norm": 0.1078697144985199, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 93910 + }, + { + "epoch": 0.3630684541757511, + "grad_norm": 0.09145821630954742, + "learning_rate": 0.002, + "loss": 2.372, + "step": 93920 + }, + { + "epoch": 0.3631071113791344, + "grad_norm": 0.1060747280716896, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 93930 + }, + { + "epoch": 0.3631457685825177, + "grad_norm": 0.1224551796913147, + "learning_rate": 0.002, + "loss": 2.368, + "step": 93940 + }, + { + "epoch": 0.36318442578590093, + "grad_norm": 0.10747087746858597, + "learning_rate": 0.002, + "loss": 2.363, + "step": 93950 + }, + { + "epoch": 0.36322308298928424, + "grad_norm": 0.1146809458732605, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 93960 + }, + { + "epoch": 0.3632617401926675, + "grad_norm": 0.12076695263385773, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 93970 + }, + { + "epoch": 0.3633003973960508, + "grad_norm": 0.10855092853307724, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 93980 + }, + { + "epoch": 0.36333905459943405, + "grad_norm": 0.10646027326583862, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 93990 + }, + { + "epoch": 0.36337771180281736, + "grad_norm": 0.11337734013795853, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 94000 + }, + { + "epoch": 0.3634163690062006, + "grad_norm": 0.10778078436851501, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 94010 + }, + { + "epoch": 0.3634550262095839, + "grad_norm": 0.10317351669073105, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 94020 + }, + { + "epoch": 0.3634936834129672, + "grad_norm": 0.12111659348011017, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 94030 + }, + { + "epoch": 0.3635323406163504, + "grad_norm": 0.10069271922111511, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 94040 + }, + { + "epoch": 0.36357099781973373, + "grad_norm": 0.09737294912338257, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 94050 + }, + { + "epoch": 0.363609655023117, + "grad_norm": 0.09887672960758209, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 94060 + }, + { + "epoch": 0.3636483122265003, + "grad_norm": 0.0944209098815918, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 94070 + }, + { + "epoch": 0.36368696942988354, + "grad_norm": 0.11983916163444519, + "learning_rate": 0.002, + "loss": 2.3717, + "step": 94080 + }, + { + "epoch": 0.36372562663326685, + "grad_norm": 0.12243504822254181, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 94090 + }, + { + "epoch": 0.3637642838366501, + "grad_norm": 0.12480605393648148, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 94100 + }, + { + "epoch": 0.3638029410400334, + "grad_norm": 0.09351043403148651, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 94110 + }, + { + "epoch": 0.36384159824341666, + "grad_norm": 0.09889024496078491, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 94120 + }, + { + "epoch": 0.36388025544679997, + "grad_norm": 0.11944614350795746, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 94130 + }, + { + "epoch": 0.3639189126501832, + "grad_norm": 0.12215811014175415, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 94140 + }, + { + "epoch": 0.36395756985356653, + "grad_norm": 0.11237761378288269, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 94150 + }, + { + "epoch": 0.3639962270569498, + "grad_norm": 0.09652663767337799, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 94160 + }, + { + "epoch": 0.3640348842603331, + "grad_norm": 0.11002325266599655, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 94170 + }, + { + "epoch": 0.36407354146371634, + "grad_norm": 0.10044171661138535, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 94180 + }, + { + "epoch": 0.36411219866709965, + "grad_norm": 0.12201712280511856, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 94190 + }, + { + "epoch": 0.3641508558704829, + "grad_norm": 0.0983533188700676, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 94200 + }, + { + "epoch": 0.3641895130738662, + "grad_norm": 0.11045312136411667, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 94210 + }, + { + "epoch": 0.36422817027724946, + "grad_norm": 0.10729973763227463, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 94220 + }, + { + "epoch": 0.3642668274806327, + "grad_norm": 0.11313141137361526, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 94230 + }, + { + "epoch": 0.364305484684016, + "grad_norm": 0.11352071911096573, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 94240 + }, + { + "epoch": 0.3643441418873993, + "grad_norm": 0.11024627834558487, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 94250 + }, + { + "epoch": 0.3643827990907826, + "grad_norm": 0.12387119978666306, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 94260 + }, + { + "epoch": 0.36442145629416584, + "grad_norm": 0.1023462638258934, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 94270 + }, + { + "epoch": 0.36446011349754914, + "grad_norm": 0.11274252831935883, + "learning_rate": 0.002, + "loss": 2.338, + "step": 94280 + }, + { + "epoch": 0.3644987707009324, + "grad_norm": 0.09633929282426834, + "learning_rate": 0.002, + "loss": 2.35, + "step": 94290 + }, + { + "epoch": 0.3645374279043157, + "grad_norm": 0.10494520515203476, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 94300 + }, + { + "epoch": 0.36457608510769896, + "grad_norm": 0.11165018379688263, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 94310 + }, + { + "epoch": 0.36461474231108226, + "grad_norm": 0.10920723527669907, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 94320 + }, + { + "epoch": 0.3646533995144655, + "grad_norm": 0.12387372553348541, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 94330 + }, + { + "epoch": 0.3646920567178488, + "grad_norm": 0.11350051313638687, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 94340 + }, + { + "epoch": 0.3647307139212321, + "grad_norm": 0.11592475324869156, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 94350 + }, + { + "epoch": 0.3647693711246154, + "grad_norm": 0.12059365212917328, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 94360 + }, + { + "epoch": 0.36480802832799863, + "grad_norm": 0.10482364147901535, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 94370 + }, + { + "epoch": 0.36484668553138194, + "grad_norm": 0.10916123539209366, + "learning_rate": 0.002, + "loss": 2.34, + "step": 94380 + }, + { + "epoch": 0.3648853427347652, + "grad_norm": 0.10865961015224457, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 94390 + }, + { + "epoch": 0.3649239999381485, + "grad_norm": 0.11367253959178925, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 94400 + }, + { + "epoch": 0.36496265714153175, + "grad_norm": 0.10054226964712143, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 94410 + }, + { + "epoch": 0.365001314344915, + "grad_norm": 0.1019655168056488, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 94420 + }, + { + "epoch": 0.3650399715482983, + "grad_norm": 0.12992607057094574, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 94430 + }, + { + "epoch": 0.36507862875168157, + "grad_norm": 0.10929842293262482, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 94440 + }, + { + "epoch": 0.3651172859550649, + "grad_norm": 0.12828373908996582, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 94450 + }, + { + "epoch": 0.3651559431584481, + "grad_norm": 0.12689423561096191, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 94460 + }, + { + "epoch": 0.36519460036183143, + "grad_norm": 0.1337389349937439, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 94470 + }, + { + "epoch": 0.3652332575652147, + "grad_norm": 0.09126801043748856, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 94480 + }, + { + "epoch": 0.365271914768598, + "grad_norm": 0.118788942694664, + "learning_rate": 0.002, + "loss": 2.3693, + "step": 94490 + }, + { + "epoch": 0.36531057197198125, + "grad_norm": 0.09804191440343857, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 94500 + }, + { + "epoch": 0.36534922917536455, + "grad_norm": 0.10007265210151672, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 94510 + }, + { + "epoch": 0.3653878863787478, + "grad_norm": 0.11969096213579178, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 94520 + }, + { + "epoch": 0.3654265435821311, + "grad_norm": 0.1275913268327713, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 94530 + }, + { + "epoch": 0.36546520078551437, + "grad_norm": 0.1127488911151886, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 94540 + }, + { + "epoch": 0.3655038579888977, + "grad_norm": 0.10983943939208984, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 94550 + }, + { + "epoch": 0.3655425151922809, + "grad_norm": 0.11403996497392654, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 94560 + }, + { + "epoch": 0.36558117239566423, + "grad_norm": 0.10751233249902725, + "learning_rate": 0.002, + "loss": 2.3744, + "step": 94570 + }, + { + "epoch": 0.3656198295990475, + "grad_norm": 0.10286374390125275, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 94580 + }, + { + "epoch": 0.36565848680243074, + "grad_norm": 0.10275772213935852, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 94590 + }, + { + "epoch": 0.36569714400581405, + "grad_norm": 0.12379030883312225, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 94600 + }, + { + "epoch": 0.3657358012091973, + "grad_norm": 0.10697554796934128, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 94610 + }, + { + "epoch": 0.3657744584125806, + "grad_norm": 0.1183970719575882, + "learning_rate": 0.002, + "loss": 2.366, + "step": 94620 + }, + { + "epoch": 0.36581311561596386, + "grad_norm": 0.10810043662786484, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 94630 + }, + { + "epoch": 0.36585177281934717, + "grad_norm": 0.08997925370931625, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 94640 + }, + { + "epoch": 0.3658904300227304, + "grad_norm": 0.12436822801828384, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 94650 + }, + { + "epoch": 0.3659290872261137, + "grad_norm": 0.11453136801719666, + "learning_rate": 0.002, + "loss": 2.3772, + "step": 94660 + }, + { + "epoch": 0.365967744429497, + "grad_norm": 0.09779436886310577, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 94670 + }, + { + "epoch": 0.3660064016328803, + "grad_norm": 0.1175207644701004, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 94680 + }, + { + "epoch": 0.36604505883626354, + "grad_norm": 0.09444441646337509, + "learning_rate": 0.002, + "loss": 2.335, + "step": 94690 + }, + { + "epoch": 0.36608371603964684, + "grad_norm": 0.11687050014734268, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 94700 + }, + { + "epoch": 0.3661223732430301, + "grad_norm": 0.11369027197360992, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 94710 + }, + { + "epoch": 0.3661610304464134, + "grad_norm": 0.09931481629610062, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 94720 + }, + { + "epoch": 0.36619968764979666, + "grad_norm": 0.12874948978424072, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 94730 + }, + { + "epoch": 0.36623834485317996, + "grad_norm": 0.1392941176891327, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 94740 + }, + { + "epoch": 0.3662770020565632, + "grad_norm": 0.1356460601091385, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 94750 + }, + { + "epoch": 0.3663156592599465, + "grad_norm": 0.10736415535211563, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 94760 + }, + { + "epoch": 0.3663543164633298, + "grad_norm": 0.11838210374116898, + "learning_rate": 0.002, + "loss": 2.352, + "step": 94770 + }, + { + "epoch": 0.36639297366671303, + "grad_norm": 0.1028166115283966, + "learning_rate": 0.002, + "loss": 2.352, + "step": 94780 + }, + { + "epoch": 0.36643163087009634, + "grad_norm": 0.1099080815911293, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 94790 + }, + { + "epoch": 0.3664702880734796, + "grad_norm": 0.12541049718856812, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 94800 + }, + { + "epoch": 0.3665089452768629, + "grad_norm": 0.11674153059720993, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 94810 + }, + { + "epoch": 0.36654760248024615, + "grad_norm": 0.10327841341495514, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 94820 + }, + { + "epoch": 0.36658625968362946, + "grad_norm": 0.13201890885829926, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 94830 + }, + { + "epoch": 0.3666249168870127, + "grad_norm": 0.09061075001955032, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 94840 + }, + { + "epoch": 0.366663574090396, + "grad_norm": 0.13644355535507202, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 94850 + }, + { + "epoch": 0.36670223129377927, + "grad_norm": 0.11915605515241623, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 94860 + }, + { + "epoch": 0.3667408884971626, + "grad_norm": 0.11097453534603119, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 94870 + }, + { + "epoch": 0.36677954570054583, + "grad_norm": 0.13492131233215332, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 94880 + }, + { + "epoch": 0.36681820290392914, + "grad_norm": 0.10164101421833038, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 94890 + }, + { + "epoch": 0.3668568601073124, + "grad_norm": 0.09530888497829437, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 94900 + }, + { + "epoch": 0.3668955173106957, + "grad_norm": 0.12775802612304688, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 94910 + }, + { + "epoch": 0.36693417451407895, + "grad_norm": 0.10944028943777084, + "learning_rate": 0.002, + "loss": 2.344, + "step": 94920 + }, + { + "epoch": 0.36697283171746226, + "grad_norm": 0.1615549772977829, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 94930 + }, + { + "epoch": 0.3670114889208455, + "grad_norm": 0.117291659116745, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 94940 + }, + { + "epoch": 0.3670501461242288, + "grad_norm": 0.11271423101425171, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 94950 + }, + { + "epoch": 0.36708880332761207, + "grad_norm": 0.09867943078279495, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 94960 + }, + { + "epoch": 0.3671274605309953, + "grad_norm": 0.12499125301837921, + "learning_rate": 0.002, + "loss": 2.357, + "step": 94970 + }, + { + "epoch": 0.3671661177343786, + "grad_norm": 0.11344917863607407, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 94980 + }, + { + "epoch": 0.3672047749377619, + "grad_norm": 0.11792797595262527, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 94990 + }, + { + "epoch": 0.3672434321411452, + "grad_norm": 0.11155661940574646, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 95000 + }, + { + "epoch": 0.36728208934452844, + "grad_norm": 0.09080757200717926, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 95010 + }, + { + "epoch": 0.36732074654791175, + "grad_norm": 0.10427747666835785, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 95020 + }, + { + "epoch": 0.367359403751295, + "grad_norm": 0.10497792810201645, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 95030 + }, + { + "epoch": 0.3673980609546783, + "grad_norm": 0.09480317682027817, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 95040 + }, + { + "epoch": 0.36743671815806156, + "grad_norm": 0.11391110718250275, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 95050 + }, + { + "epoch": 0.36747537536144487, + "grad_norm": 0.10386598110198975, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 95060 + }, + { + "epoch": 0.3675140325648281, + "grad_norm": 0.1177930235862732, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 95070 + }, + { + "epoch": 0.3675526897682114, + "grad_norm": 0.11678066849708557, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 95080 + }, + { + "epoch": 0.3675913469715947, + "grad_norm": 0.09995070099830627, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 95090 + }, + { + "epoch": 0.367630004174978, + "grad_norm": 0.10244379937648773, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 95100 + }, + { + "epoch": 0.36766866137836124, + "grad_norm": 0.10045316070318222, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 95110 + }, + { + "epoch": 0.36770731858174455, + "grad_norm": 0.10228868573904037, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 95120 + }, + { + "epoch": 0.3677459757851278, + "grad_norm": 0.10760489106178284, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 95130 + }, + { + "epoch": 0.3677846329885111, + "grad_norm": 0.10703897476196289, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 95140 + }, + { + "epoch": 0.36782329019189436, + "grad_norm": 0.12178358435630798, + "learning_rate": 0.002, + "loss": 2.351, + "step": 95150 + }, + { + "epoch": 0.3678619473952776, + "grad_norm": 0.09344667941331863, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 95160 + }, + { + "epoch": 0.3679006045986609, + "grad_norm": 0.11168822646141052, + "learning_rate": 0.002, + "loss": 2.3727, + "step": 95170 + }, + { + "epoch": 0.36793926180204417, + "grad_norm": 0.1050405502319336, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 95180 + }, + { + "epoch": 0.3679779190054275, + "grad_norm": 0.12630388140678406, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 95190 + }, + { + "epoch": 0.36801657620881073, + "grad_norm": 0.11250712722539902, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 95200 + }, + { + "epoch": 0.36805523341219404, + "grad_norm": 0.104097880423069, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 95210 + }, + { + "epoch": 0.3680938906155773, + "grad_norm": 0.11244583129882812, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 95220 + }, + { + "epoch": 0.3681325478189606, + "grad_norm": 0.09972363710403442, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 95230 + }, + { + "epoch": 0.36817120502234385, + "grad_norm": 0.10571520775556564, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 95240 + }, + { + "epoch": 0.36820986222572716, + "grad_norm": 0.1095057874917984, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 95250 + }, + { + "epoch": 0.3682485194291104, + "grad_norm": 0.1093902736902237, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 95260 + }, + { + "epoch": 0.3682871766324937, + "grad_norm": 0.11586953699588776, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 95270 + }, + { + "epoch": 0.36832583383587697, + "grad_norm": 0.10942596942186356, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 95280 + }, + { + "epoch": 0.3683644910392603, + "grad_norm": 0.09717775136232376, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 95290 + }, + { + "epoch": 0.36840314824264353, + "grad_norm": 0.10185949504375458, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 95300 + }, + { + "epoch": 0.36844180544602684, + "grad_norm": 0.11331840604543686, + "learning_rate": 0.002, + "loss": 2.344, + "step": 95310 + }, + { + "epoch": 0.3684804626494101, + "grad_norm": 0.10007768869400024, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 95320 + }, + { + "epoch": 0.36851911985279334, + "grad_norm": 0.10249575227499008, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 95330 + }, + { + "epoch": 0.36855777705617665, + "grad_norm": 0.12019418925046921, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 95340 + }, + { + "epoch": 0.3685964342595599, + "grad_norm": 0.11633682996034622, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 95350 + }, + { + "epoch": 0.3686350914629432, + "grad_norm": 0.12141241133213043, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 95360 + }, + { + "epoch": 0.36867374866632646, + "grad_norm": 0.09715086966753006, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 95370 + }, + { + "epoch": 0.36871240586970977, + "grad_norm": 0.11704263091087341, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 95380 + }, + { + "epoch": 0.368751063073093, + "grad_norm": 0.11834775656461716, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 95390 + }, + { + "epoch": 0.36878972027647633, + "grad_norm": 0.105399951338768, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 95400 + }, + { + "epoch": 0.3688283774798596, + "grad_norm": 0.11063943058252335, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 95410 + }, + { + "epoch": 0.3688670346832429, + "grad_norm": 0.12205120921134949, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 95420 + }, + { + "epoch": 0.36890569188662614, + "grad_norm": 0.1160108670592308, + "learning_rate": 0.002, + "loss": 2.356, + "step": 95430 + }, + { + "epoch": 0.36894434909000945, + "grad_norm": 0.09953995794057846, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 95440 + }, + { + "epoch": 0.3689830062933927, + "grad_norm": 0.09703753143548965, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 95450 + }, + { + "epoch": 0.369021663496776, + "grad_norm": 0.11698596179485321, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 95460 + }, + { + "epoch": 0.36906032070015926, + "grad_norm": 0.11900828778743744, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 95470 + }, + { + "epoch": 0.36909897790354257, + "grad_norm": 0.11115144938230515, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 95480 + }, + { + "epoch": 0.3691376351069258, + "grad_norm": 0.10782202333211899, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 95490 + }, + { + "epoch": 0.36917629231030913, + "grad_norm": 0.11181306093931198, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 95500 + }, + { + "epoch": 0.3692149495136924, + "grad_norm": 0.11659783869981766, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 95510 + }, + { + "epoch": 0.36925360671707563, + "grad_norm": 0.11101959645748138, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 95520 + }, + { + "epoch": 0.36929226392045894, + "grad_norm": 0.11048574000597, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 95530 + }, + { + "epoch": 0.3693309211238422, + "grad_norm": 0.10598281770944595, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 95540 + }, + { + "epoch": 0.3693695783272255, + "grad_norm": 0.1005745604634285, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 95550 + }, + { + "epoch": 0.36940823553060875, + "grad_norm": 0.1077663004398346, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 95560 + }, + { + "epoch": 0.36944689273399206, + "grad_norm": 0.09539022296667099, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 95570 + }, + { + "epoch": 0.3694855499373753, + "grad_norm": 0.12170499563217163, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 95580 + }, + { + "epoch": 0.3695242071407586, + "grad_norm": 0.09968260675668716, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 95590 + }, + { + "epoch": 0.3695628643441419, + "grad_norm": 0.10713895410299301, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 95600 + }, + { + "epoch": 0.3696015215475252, + "grad_norm": 0.11493389308452606, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 95610 + }, + { + "epoch": 0.36964017875090843, + "grad_norm": 0.1029747948050499, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 95620 + }, + { + "epoch": 0.36967883595429174, + "grad_norm": 0.1195148453116417, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 95630 + }, + { + "epoch": 0.369717493157675, + "grad_norm": 0.10771917551755905, + "learning_rate": 0.002, + "loss": 2.353, + "step": 95640 + }, + { + "epoch": 0.3697561503610583, + "grad_norm": 0.12105977535247803, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 95650 + }, + { + "epoch": 0.36979480756444155, + "grad_norm": 0.10347715020179749, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 95660 + }, + { + "epoch": 0.36983346476782486, + "grad_norm": 0.10475093871355057, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 95670 + }, + { + "epoch": 0.3698721219712081, + "grad_norm": 0.12026255577802658, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 95680 + }, + { + "epoch": 0.3699107791745914, + "grad_norm": 0.11982334405183792, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 95690 + }, + { + "epoch": 0.36994943637797467, + "grad_norm": 0.10484941303730011, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 95700 + }, + { + "epoch": 0.3699880935813579, + "grad_norm": 0.10729681700468063, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 95710 + }, + { + "epoch": 0.37002675078474123, + "grad_norm": 0.09477420151233673, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 95720 + }, + { + "epoch": 0.3700654079881245, + "grad_norm": 0.1019349992275238, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 95730 + }, + { + "epoch": 0.3701040651915078, + "grad_norm": 0.10930332541465759, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 95740 + }, + { + "epoch": 0.37014272239489104, + "grad_norm": 0.09593915194272995, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 95750 + }, + { + "epoch": 0.37018137959827435, + "grad_norm": 0.11683354526758194, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 95760 + }, + { + "epoch": 0.3702200368016576, + "grad_norm": 0.10252895206212997, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 95770 + }, + { + "epoch": 0.3702586940050409, + "grad_norm": 0.10136279463768005, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 95780 + }, + { + "epoch": 0.37029735120842416, + "grad_norm": 0.13326722383499146, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 95790 + }, + { + "epoch": 0.37033600841180747, + "grad_norm": 0.14020223915576935, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 95800 + }, + { + "epoch": 0.3703746656151907, + "grad_norm": 0.09939390420913696, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 95810 + }, + { + "epoch": 0.37041332281857403, + "grad_norm": 0.10651414841413498, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 95820 + }, + { + "epoch": 0.3704519800219573, + "grad_norm": 0.10730873793363571, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 95830 + }, + { + "epoch": 0.3704906372253406, + "grad_norm": 0.10960599780082703, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 95840 + }, + { + "epoch": 0.37052929442872384, + "grad_norm": 0.11024501174688339, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 95850 + }, + { + "epoch": 0.37056795163210715, + "grad_norm": 0.11730218678712845, + "learning_rate": 0.002, + "loss": 2.356, + "step": 95860 + }, + { + "epoch": 0.3706066088354904, + "grad_norm": 0.10243767499923706, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 95870 + }, + { + "epoch": 0.3706452660388737, + "grad_norm": 0.11166820675134659, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 95880 + }, + { + "epoch": 0.37068392324225696, + "grad_norm": 0.120213583111763, + "learning_rate": 0.002, + "loss": 2.354, + "step": 95890 + }, + { + "epoch": 0.3707225804456402, + "grad_norm": 0.09946276247501373, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 95900 + }, + { + "epoch": 0.3707612376490235, + "grad_norm": 0.11692880094051361, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 95910 + }, + { + "epoch": 0.3707998948524068, + "grad_norm": 0.11046019941568375, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 95920 + }, + { + "epoch": 0.3708385520557901, + "grad_norm": 0.10743413865566254, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 95930 + }, + { + "epoch": 0.37087720925917333, + "grad_norm": 0.10488501936197281, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 95940 + }, + { + "epoch": 0.37091586646255664, + "grad_norm": 0.1552240252494812, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 95950 + }, + { + "epoch": 0.3709545236659399, + "grad_norm": 0.09831106662750244, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 95960 + }, + { + "epoch": 0.3709931808693232, + "grad_norm": 0.1153578832745552, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 95970 + }, + { + "epoch": 0.37103183807270645, + "grad_norm": 0.1132926493883133, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 95980 + }, + { + "epoch": 0.37107049527608976, + "grad_norm": 0.1189347356557846, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 95990 + }, + { + "epoch": 0.371109152479473, + "grad_norm": 0.10806018859148026, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 96000 + }, + { + "epoch": 0.3711478096828563, + "grad_norm": 0.11607592552900314, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 96010 + }, + { + "epoch": 0.3711864668862396, + "grad_norm": 0.0974685400724411, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 96020 + }, + { + "epoch": 0.3712251240896229, + "grad_norm": 0.22955001890659332, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 96030 + }, + { + "epoch": 0.37126378129300613, + "grad_norm": 0.10110989212989807, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 96040 + }, + { + "epoch": 0.37130243849638944, + "grad_norm": 0.10315662622451782, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 96050 + }, + { + "epoch": 0.3713410956997727, + "grad_norm": 0.09571203589439392, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 96060 + }, + { + "epoch": 0.37137975290315595, + "grad_norm": 0.10977359116077423, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 96070 + }, + { + "epoch": 0.37141841010653925, + "grad_norm": 0.27224186062812805, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 96080 + }, + { + "epoch": 0.3714570673099225, + "grad_norm": 0.11507481336593628, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 96090 + }, + { + "epoch": 0.3714957245133058, + "grad_norm": 0.1136784628033638, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 96100 + }, + { + "epoch": 0.37153438171668907, + "grad_norm": 0.12112376093864441, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 96110 + }, + { + "epoch": 0.3715730389200724, + "grad_norm": 0.10242711007595062, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 96120 + }, + { + "epoch": 0.3716116961234556, + "grad_norm": 0.1104845181107521, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 96130 + }, + { + "epoch": 0.37165035332683893, + "grad_norm": 0.10000521689653397, + "learning_rate": 0.002, + "loss": 2.358, + "step": 96140 + }, + { + "epoch": 0.3716890105302222, + "grad_norm": 0.10198475420475006, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 96150 + }, + { + "epoch": 0.3717276677336055, + "grad_norm": 0.1174432635307312, + "learning_rate": 0.002, + "loss": 2.337, + "step": 96160 + }, + { + "epoch": 0.37176632493698875, + "grad_norm": 0.1013440415263176, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 96170 + }, + { + "epoch": 0.37180498214037205, + "grad_norm": 0.10320638865232468, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 96180 + }, + { + "epoch": 0.3718436393437553, + "grad_norm": 0.11725156009197235, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 96190 + }, + { + "epoch": 0.3718822965471386, + "grad_norm": 0.10073649883270264, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 96200 + }, + { + "epoch": 0.37192095375052187, + "grad_norm": 0.09045589715242386, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 96210 + }, + { + "epoch": 0.3719596109539052, + "grad_norm": 0.11545506864786148, + "learning_rate": 0.002, + "loss": 2.364, + "step": 96220 + }, + { + "epoch": 0.3719982681572884, + "grad_norm": 0.1079561710357666, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 96230 + }, + { + "epoch": 0.37203692536067173, + "grad_norm": 0.1202242448925972, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 96240 + }, + { + "epoch": 0.372075582564055, + "grad_norm": 0.09573031216859818, + "learning_rate": 0.002, + "loss": 2.342, + "step": 96250 + }, + { + "epoch": 0.37211423976743824, + "grad_norm": 0.11349551379680634, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 96260 + }, + { + "epoch": 0.37215289697082155, + "grad_norm": 0.11149785667657852, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 96270 + }, + { + "epoch": 0.3721915541742048, + "grad_norm": 0.09543846547603607, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 96280 + }, + { + "epoch": 0.3722302113775881, + "grad_norm": 0.10158144682645798, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 96290 + }, + { + "epoch": 0.37226886858097136, + "grad_norm": 0.13857035338878632, + "learning_rate": 0.002, + "loss": 2.369, + "step": 96300 + }, + { + "epoch": 0.37230752578435466, + "grad_norm": 0.11624547839164734, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 96310 + }, + { + "epoch": 0.3723461829877379, + "grad_norm": 0.10812583565711975, + "learning_rate": 0.002, + "loss": 2.3781, + "step": 96320 + }, + { + "epoch": 0.3723848401911212, + "grad_norm": 0.09926456212997437, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 96330 + }, + { + "epoch": 0.3724234973945045, + "grad_norm": 0.10067030787467957, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 96340 + }, + { + "epoch": 0.3724621545978878, + "grad_norm": 0.09444724768400192, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 96350 + }, + { + "epoch": 0.37250081180127104, + "grad_norm": 0.1284225434064865, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 96360 + }, + { + "epoch": 0.37253946900465434, + "grad_norm": 0.09563818573951721, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 96370 + }, + { + "epoch": 0.3725781262080376, + "grad_norm": 0.10342410206794739, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 96380 + }, + { + "epoch": 0.3726167834114209, + "grad_norm": 0.1257498413324356, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 96390 + }, + { + "epoch": 0.37265544061480416, + "grad_norm": 0.10211283713579178, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 96400 + }, + { + "epoch": 0.37269409781818746, + "grad_norm": 0.10875606536865234, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 96410 + }, + { + "epoch": 0.3727327550215707, + "grad_norm": 0.11335258930921555, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 96420 + }, + { + "epoch": 0.372771412224954, + "grad_norm": 0.1230175718665123, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 96430 + }, + { + "epoch": 0.3728100694283373, + "grad_norm": 0.10346999764442444, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 96440 + }, + { + "epoch": 0.37284872663172053, + "grad_norm": 0.1104331836104393, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 96450 + }, + { + "epoch": 0.37288738383510384, + "grad_norm": 0.11010728776454926, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 96460 + }, + { + "epoch": 0.3729260410384871, + "grad_norm": 0.09920106828212738, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 96470 + }, + { + "epoch": 0.3729646982418704, + "grad_norm": 0.1061813235282898, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 96480 + }, + { + "epoch": 0.37300335544525365, + "grad_norm": 0.12769728899002075, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 96490 + }, + { + "epoch": 0.37304201264863696, + "grad_norm": 0.11152181774377823, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 96500 + }, + { + "epoch": 0.3730806698520202, + "grad_norm": 0.10381684452295303, + "learning_rate": 0.002, + "loss": 2.346, + "step": 96510 + }, + { + "epoch": 0.3731193270554035, + "grad_norm": 0.09493334591388702, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 96520 + }, + { + "epoch": 0.37315798425878677, + "grad_norm": 0.10465577989816666, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 96530 + }, + { + "epoch": 0.3731966414621701, + "grad_norm": 0.11680374294519424, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 96540 + }, + { + "epoch": 0.37323529866555333, + "grad_norm": 0.10015437752008438, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 96550 + }, + { + "epoch": 0.37327395586893664, + "grad_norm": 0.10573911666870117, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 96560 + }, + { + "epoch": 0.3733126130723199, + "grad_norm": 0.10745342075824738, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 96570 + }, + { + "epoch": 0.3733512702757032, + "grad_norm": 0.10386376827955246, + "learning_rate": 0.002, + "loss": 2.35, + "step": 96580 + }, + { + "epoch": 0.37338992747908645, + "grad_norm": 0.10528868436813354, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 96590 + }, + { + "epoch": 0.37342858468246976, + "grad_norm": 0.10415124893188477, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 96600 + }, + { + "epoch": 0.373467241885853, + "grad_norm": 0.12020357698202133, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 96610 + }, + { + "epoch": 0.3735058990892363, + "grad_norm": 0.10863015800714493, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 96620 + }, + { + "epoch": 0.37354455629261957, + "grad_norm": 0.09396566450595856, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 96630 + }, + { + "epoch": 0.3735832134960028, + "grad_norm": 0.11533330380916595, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 96640 + }, + { + "epoch": 0.3736218706993861, + "grad_norm": 0.10272518545389175, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 96650 + }, + { + "epoch": 0.3736605279027694, + "grad_norm": 0.08855467289686203, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 96660 + }, + { + "epoch": 0.3736991851061527, + "grad_norm": 0.12185222655534744, + "learning_rate": 0.002, + "loss": 2.3716, + "step": 96670 + }, + { + "epoch": 0.37373784230953594, + "grad_norm": 0.09124171733856201, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 96680 + }, + { + "epoch": 0.37377649951291925, + "grad_norm": 0.10895459353923798, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 96690 + }, + { + "epoch": 0.3738151567163025, + "grad_norm": 0.1075105369091034, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 96700 + }, + { + "epoch": 0.3738538139196858, + "grad_norm": 0.08626224845647812, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 96710 + }, + { + "epoch": 0.37389247112306906, + "grad_norm": 0.0954337790608406, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 96720 + }, + { + "epoch": 0.37393112832645237, + "grad_norm": 0.10353606939315796, + "learning_rate": 0.002, + "loss": 2.347, + "step": 96730 + }, + { + "epoch": 0.3739697855298356, + "grad_norm": 0.10286654531955719, + "learning_rate": 0.002, + "loss": 2.353, + "step": 96740 + }, + { + "epoch": 0.3740084427332189, + "grad_norm": 0.09858877211809158, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 96750 + }, + { + "epoch": 0.3740470999366022, + "grad_norm": 0.11728513985872269, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 96760 + }, + { + "epoch": 0.3740857571399855, + "grad_norm": 0.13198749721050262, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 96770 + }, + { + "epoch": 0.37412441434336874, + "grad_norm": 0.10547465831041336, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 96780 + }, + { + "epoch": 0.37416307154675205, + "grad_norm": 0.1052117720246315, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 96790 + }, + { + "epoch": 0.3742017287501353, + "grad_norm": 0.10624329000711441, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 96800 + }, + { + "epoch": 0.3742403859535186, + "grad_norm": 0.10447991639375687, + "learning_rate": 0.002, + "loss": 2.35, + "step": 96810 + }, + { + "epoch": 0.37427904315690186, + "grad_norm": 0.10218934714794159, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 96820 + }, + { + "epoch": 0.3743177003602851, + "grad_norm": 0.12363198399543762, + "learning_rate": 0.002, + "loss": 2.342, + "step": 96830 + }, + { + "epoch": 0.3743563575636684, + "grad_norm": 0.1127898246049881, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 96840 + }, + { + "epoch": 0.37439501476705167, + "grad_norm": 0.11404630541801453, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 96850 + }, + { + "epoch": 0.374433671970435, + "grad_norm": 0.10721984505653381, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 96860 + }, + { + "epoch": 0.37447232917381823, + "grad_norm": 0.10237617790699005, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 96870 + }, + { + "epoch": 0.37451098637720154, + "grad_norm": 0.11818091571331024, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 96880 + }, + { + "epoch": 0.3745496435805848, + "grad_norm": 0.12402646988630295, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 96890 + }, + { + "epoch": 0.3745883007839681, + "grad_norm": 0.10971760004758835, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 96900 + }, + { + "epoch": 0.37462695798735135, + "grad_norm": 0.11073453724384308, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 96910 + }, + { + "epoch": 0.37466561519073466, + "grad_norm": 0.5414279103279114, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 96920 + }, + { + "epoch": 0.3747042723941179, + "grad_norm": 0.12150266021490097, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 96930 + }, + { + "epoch": 0.3747429295975012, + "grad_norm": 0.09668447822332382, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 96940 + }, + { + "epoch": 0.37478158680088447, + "grad_norm": 0.11486414819955826, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 96950 + }, + { + "epoch": 0.3748202440042678, + "grad_norm": 0.13404735922813416, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 96960 + }, + { + "epoch": 0.37485890120765103, + "grad_norm": 0.11293788254261017, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 96970 + }, + { + "epoch": 0.37489755841103434, + "grad_norm": 0.0979723185300827, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 96980 + }, + { + "epoch": 0.3749362156144176, + "grad_norm": 0.10279171913862228, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 96990 + }, + { + "epoch": 0.37497487281780084, + "grad_norm": 0.11414069682359695, + "learning_rate": 0.002, + "loss": 2.33, + "step": 97000 + }, + { + "epoch": 0.37501353002118415, + "grad_norm": 0.11273612082004547, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 97010 + }, + { + "epoch": 0.3750521872245674, + "grad_norm": 0.10220120847225189, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 97020 + }, + { + "epoch": 0.3750908444279507, + "grad_norm": 0.1266205608844757, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 97030 + }, + { + "epoch": 0.37512950163133396, + "grad_norm": 0.09308404475450516, + "learning_rate": 0.002, + "loss": 2.347, + "step": 97040 + }, + { + "epoch": 0.37516815883471727, + "grad_norm": 0.11220557242631912, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 97050 + }, + { + "epoch": 0.3752068160381005, + "grad_norm": 0.12289377301931381, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 97060 + }, + { + "epoch": 0.37524547324148383, + "grad_norm": 0.11952238529920578, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 97070 + }, + { + "epoch": 0.3752841304448671, + "grad_norm": 0.0957283079624176, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 97080 + }, + { + "epoch": 0.3753227876482504, + "grad_norm": 0.11071771383285522, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 97090 + }, + { + "epoch": 0.37536144485163364, + "grad_norm": 0.10729561746120453, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 97100 + }, + { + "epoch": 0.37540010205501695, + "grad_norm": 0.25892820954322815, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 97110 + }, + { + "epoch": 0.3754387592584002, + "grad_norm": 0.09794148802757263, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 97120 + }, + { + "epoch": 0.3754774164617835, + "grad_norm": 0.09345117211341858, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 97130 + }, + { + "epoch": 0.37551607366516676, + "grad_norm": 0.11832363903522491, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 97140 + }, + { + "epoch": 0.37555473086855007, + "grad_norm": 0.1080976128578186, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 97150 + }, + { + "epoch": 0.3755933880719333, + "grad_norm": 0.09897181391716003, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 97160 + }, + { + "epoch": 0.37563204527531663, + "grad_norm": 0.0907154381275177, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 97170 + }, + { + "epoch": 0.3756707024786999, + "grad_norm": 0.13835717737674713, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 97180 + }, + { + "epoch": 0.37570935968208313, + "grad_norm": 0.10165028274059296, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 97190 + }, + { + "epoch": 0.37574801688546644, + "grad_norm": 0.11934266984462738, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 97200 + }, + { + "epoch": 0.3757866740888497, + "grad_norm": 0.11578059196472168, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 97210 + }, + { + "epoch": 0.375825331292233, + "grad_norm": 0.12091308832168579, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 97220 + }, + { + "epoch": 0.37586398849561625, + "grad_norm": 0.12344370037317276, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 97230 + }, + { + "epoch": 0.37590264569899956, + "grad_norm": 0.12931908667087555, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 97240 + }, + { + "epoch": 0.3759413029023828, + "grad_norm": 0.10830745846033096, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 97250 + }, + { + "epoch": 0.3759799601057661, + "grad_norm": 0.10483557730913162, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 97260 + }, + { + "epoch": 0.37601861730914937, + "grad_norm": 0.10018157958984375, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 97270 + }, + { + "epoch": 0.3760572745125327, + "grad_norm": 0.10008195042610168, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 97280 + }, + { + "epoch": 0.37609593171591593, + "grad_norm": 0.09947829693555832, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 97290 + }, + { + "epoch": 0.37613458891929924, + "grad_norm": 0.10209108889102936, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 97300 + }, + { + "epoch": 0.3761732461226825, + "grad_norm": 0.1302970051765442, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 97310 + }, + { + "epoch": 0.3762119033260658, + "grad_norm": 0.1283721923828125, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 97320 + }, + { + "epoch": 0.37625056052944905, + "grad_norm": 0.11452938616275787, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 97330 + }, + { + "epoch": 0.37628921773283236, + "grad_norm": 0.11943821609020233, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 97340 + }, + { + "epoch": 0.3763278749362156, + "grad_norm": 0.09788057953119278, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 97350 + }, + { + "epoch": 0.3763665321395989, + "grad_norm": 0.10299163311719894, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 97360 + }, + { + "epoch": 0.37640518934298217, + "grad_norm": 0.0997253954410553, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 97370 + }, + { + "epoch": 0.3764438465463654, + "grad_norm": 0.10356666147708893, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 97380 + }, + { + "epoch": 0.37648250374974873, + "grad_norm": 0.12059180438518524, + "learning_rate": 0.002, + "loss": 2.354, + "step": 97390 + }, + { + "epoch": 0.376521160953132, + "grad_norm": 0.10693185776472092, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 97400 + }, + { + "epoch": 0.3765598181565153, + "grad_norm": 0.12091413140296936, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 97410 + }, + { + "epoch": 0.37659847535989854, + "grad_norm": 0.1141573116183281, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 97420 + }, + { + "epoch": 0.37663713256328185, + "grad_norm": 0.10382256656885147, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 97430 + }, + { + "epoch": 0.3766757897666651, + "grad_norm": 0.11503864079713821, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 97440 + }, + { + "epoch": 0.3767144469700484, + "grad_norm": 0.11092833429574966, + "learning_rate": 0.002, + "loss": 2.337, + "step": 97450 + }, + { + "epoch": 0.37675310417343166, + "grad_norm": 0.09959018975496292, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 97460 + }, + { + "epoch": 0.37679176137681497, + "grad_norm": 0.11717210710048676, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 97470 + }, + { + "epoch": 0.3768304185801982, + "grad_norm": 0.11616809666156769, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 97480 + }, + { + "epoch": 0.37686907578358153, + "grad_norm": 0.09150540083646774, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 97490 + }, + { + "epoch": 0.3769077329869648, + "grad_norm": 0.09806869924068451, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 97500 + }, + { + "epoch": 0.3769463901903481, + "grad_norm": 0.12199116498231888, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 97510 + }, + { + "epoch": 0.37698504739373134, + "grad_norm": 0.10591573268175125, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 97520 + }, + { + "epoch": 0.37702370459711465, + "grad_norm": 0.145875483751297, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 97530 + }, + { + "epoch": 0.3770623618004979, + "grad_norm": 0.10147275775671005, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 97540 + }, + { + "epoch": 0.3771010190038812, + "grad_norm": 0.11346987634897232, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 97550 + }, + { + "epoch": 0.37713967620726446, + "grad_norm": 0.10580125451087952, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 97560 + }, + { + "epoch": 0.3771783334106477, + "grad_norm": 0.10647939890623093, + "learning_rate": 0.002, + "loss": 2.368, + "step": 97570 + }, + { + "epoch": 0.377216990614031, + "grad_norm": 0.11025557667016983, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 97580 + }, + { + "epoch": 0.3772556478174143, + "grad_norm": 0.099567711353302, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 97590 + }, + { + "epoch": 0.3772943050207976, + "grad_norm": 0.10200417786836624, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 97600 + }, + { + "epoch": 0.37733296222418083, + "grad_norm": 0.10742084681987762, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 97610 + }, + { + "epoch": 0.37737161942756414, + "grad_norm": 0.08838263899087906, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 97620 + }, + { + "epoch": 0.3774102766309474, + "grad_norm": 0.12436746060848236, + "learning_rate": 0.002, + "loss": 2.352, + "step": 97630 + }, + { + "epoch": 0.3774489338343307, + "grad_norm": 0.10953371971845627, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 97640 + }, + { + "epoch": 0.37748759103771395, + "grad_norm": 0.08956970274448395, + "learning_rate": 0.002, + "loss": 2.337, + "step": 97650 + }, + { + "epoch": 0.37752624824109726, + "grad_norm": 0.11406704038381577, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 97660 + }, + { + "epoch": 0.3775649054444805, + "grad_norm": 0.10393359512090683, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 97670 + }, + { + "epoch": 0.3776035626478638, + "grad_norm": 0.11221380531787872, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 97680 + }, + { + "epoch": 0.3776422198512471, + "grad_norm": 0.09847605228424072, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 97690 + }, + { + "epoch": 0.3776808770546304, + "grad_norm": 0.12601011991500854, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 97700 + }, + { + "epoch": 0.37771953425801363, + "grad_norm": 0.11960670351982117, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 97710 + }, + { + "epoch": 0.37775819146139694, + "grad_norm": 0.09933385998010635, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 97720 + }, + { + "epoch": 0.3777968486647802, + "grad_norm": 0.11257601529359818, + "learning_rate": 0.002, + "loss": 2.358, + "step": 97730 + }, + { + "epoch": 0.37783550586816345, + "grad_norm": 0.11432036757469177, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 97740 + }, + { + "epoch": 0.37787416307154675, + "grad_norm": 0.12867693603038788, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 97750 + }, + { + "epoch": 0.37791282027493, + "grad_norm": 0.10882619023323059, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 97760 + }, + { + "epoch": 0.3779514774783133, + "grad_norm": 0.10990484058856964, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 97770 + }, + { + "epoch": 0.37799013468169657, + "grad_norm": 0.11563392728567123, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 97780 + }, + { + "epoch": 0.3780287918850799, + "grad_norm": 0.10057725757360458, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 97790 + }, + { + "epoch": 0.3780674490884631, + "grad_norm": 0.10217918455600739, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 97800 + }, + { + "epoch": 0.37810610629184643, + "grad_norm": 0.11344746500253677, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 97810 + }, + { + "epoch": 0.3781447634952297, + "grad_norm": 0.11891037225723267, + "learning_rate": 0.002, + "loss": 2.355, + "step": 97820 + }, + { + "epoch": 0.378183420698613, + "grad_norm": 0.10988009721040726, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 97830 + }, + { + "epoch": 0.37822207790199625, + "grad_norm": 0.11649468541145325, + "learning_rate": 0.002, + "loss": 2.357, + "step": 97840 + }, + { + "epoch": 0.37826073510537955, + "grad_norm": 0.09838244318962097, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 97850 + }, + { + "epoch": 0.3782993923087628, + "grad_norm": 0.100824736058712, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 97860 + }, + { + "epoch": 0.3783380495121461, + "grad_norm": 0.1081569567322731, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 97870 + }, + { + "epoch": 0.37837670671552937, + "grad_norm": 0.11926387250423431, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 97880 + }, + { + "epoch": 0.3784153639189127, + "grad_norm": 0.1074013039469719, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 97890 + }, + { + "epoch": 0.3784540211222959, + "grad_norm": 0.11106620728969574, + "learning_rate": 0.002, + "loss": 2.344, + "step": 97900 + }, + { + "epoch": 0.37849267832567923, + "grad_norm": 0.12460718303918839, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 97910 + }, + { + "epoch": 0.3785313355290625, + "grad_norm": 0.1140846386551857, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 97920 + }, + { + "epoch": 0.37856999273244574, + "grad_norm": 0.12423646450042725, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 97930 + }, + { + "epoch": 0.37860864993582904, + "grad_norm": 0.10674221813678741, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 97940 + }, + { + "epoch": 0.3786473071392123, + "grad_norm": 0.09802944213151932, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 97950 + }, + { + "epoch": 0.3786859643425956, + "grad_norm": 0.09465181082487106, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 97960 + }, + { + "epoch": 0.37872462154597886, + "grad_norm": 0.10393911600112915, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 97970 + }, + { + "epoch": 0.37876327874936216, + "grad_norm": 0.11896202713251114, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 97980 + }, + { + "epoch": 0.3788019359527454, + "grad_norm": 0.10339158028364182, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 97990 + }, + { + "epoch": 0.3788405931561287, + "grad_norm": 0.1181914359331131, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 98000 + }, + { + "epoch": 0.378879250359512, + "grad_norm": 0.11918921768665314, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 98010 + }, + { + "epoch": 0.3789179075628953, + "grad_norm": 0.10368076711893082, + "learning_rate": 0.002, + "loss": 2.347, + "step": 98020 + }, + { + "epoch": 0.37895656476627854, + "grad_norm": 0.10133563727140427, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 98030 + }, + { + "epoch": 0.37899522196966184, + "grad_norm": 0.11930687725543976, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 98040 + }, + { + "epoch": 0.3790338791730451, + "grad_norm": 0.12275891751050949, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 98050 + }, + { + "epoch": 0.3790725363764284, + "grad_norm": 0.10572463274002075, + "learning_rate": 0.002, + "loss": 2.352, + "step": 98060 + }, + { + "epoch": 0.37911119357981166, + "grad_norm": 0.10790557414293289, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 98070 + }, + { + "epoch": 0.37914985078319496, + "grad_norm": 0.11652061343193054, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 98080 + }, + { + "epoch": 0.3791885079865782, + "grad_norm": 0.10054761916399002, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 98090 + }, + { + "epoch": 0.3792271651899615, + "grad_norm": 0.10746383666992188, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 98100 + }, + { + "epoch": 0.3792658223933448, + "grad_norm": 0.12173549830913544, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 98110 + }, + { + "epoch": 0.37930447959672803, + "grad_norm": 0.11230204999446869, + "learning_rate": 0.002, + "loss": 2.359, + "step": 98120 + }, + { + "epoch": 0.37934313680011134, + "grad_norm": 0.09881207346916199, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 98130 + }, + { + "epoch": 0.3793817940034946, + "grad_norm": 1.9445171356201172, + "learning_rate": 0.002, + "loss": 2.354, + "step": 98140 + }, + { + "epoch": 0.3794204512068779, + "grad_norm": 0.11113037914037704, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 98150 + }, + { + "epoch": 0.37945910841026115, + "grad_norm": 0.11721598356962204, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 98160 + }, + { + "epoch": 0.37949776561364446, + "grad_norm": 0.1077088937163353, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 98170 + }, + { + "epoch": 0.3795364228170277, + "grad_norm": 0.10289087891578674, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 98180 + }, + { + "epoch": 0.379575080020411, + "grad_norm": 0.0997520238161087, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 98190 + }, + { + "epoch": 0.37961373722379427, + "grad_norm": 0.10366350412368774, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 98200 + }, + { + "epoch": 0.3796523944271776, + "grad_norm": 0.10660743713378906, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 98210 + }, + { + "epoch": 0.3796910516305608, + "grad_norm": 0.1255519837141037, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 98220 + }, + { + "epoch": 0.37972970883394414, + "grad_norm": 0.12160088866949081, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 98230 + }, + { + "epoch": 0.3797683660373274, + "grad_norm": 0.10024891793727875, + "learning_rate": 0.002, + "loss": 2.352, + "step": 98240 + }, + { + "epoch": 0.3798070232407107, + "grad_norm": 0.11600322276353836, + "learning_rate": 0.002, + "loss": 2.349, + "step": 98250 + }, + { + "epoch": 0.37984568044409395, + "grad_norm": 0.09131622314453125, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 98260 + }, + { + "epoch": 0.37988433764747725, + "grad_norm": 0.08803276717662811, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 98270 + }, + { + "epoch": 0.3799229948508605, + "grad_norm": 0.0936739444732666, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 98280 + }, + { + "epoch": 0.3799616520542438, + "grad_norm": 0.10160307586193085, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 98290 + }, + { + "epoch": 0.38000030925762707, + "grad_norm": 0.11466259509325027, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 98300 + }, + { + "epoch": 0.3800389664610103, + "grad_norm": 0.0952131450176239, + "learning_rate": 0.002, + "loss": 2.369, + "step": 98310 + }, + { + "epoch": 0.3800776236643936, + "grad_norm": 0.10741259157657623, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 98320 + }, + { + "epoch": 0.3801162808677769, + "grad_norm": 0.09484761208295822, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 98330 + }, + { + "epoch": 0.3801549380711602, + "grad_norm": 0.1288508027791977, + "learning_rate": 0.002, + "loss": 2.351, + "step": 98340 + }, + { + "epoch": 0.38019359527454344, + "grad_norm": 0.09343413263559341, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 98350 + }, + { + "epoch": 0.38023225247792675, + "grad_norm": 0.10034056752920151, + "learning_rate": 0.002, + "loss": 2.342, + "step": 98360 + }, + { + "epoch": 0.38027090968131, + "grad_norm": 0.10348794609308243, + "learning_rate": 0.002, + "loss": 2.366, + "step": 98370 + }, + { + "epoch": 0.3803095668846933, + "grad_norm": 0.09628686308860779, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 98380 + }, + { + "epoch": 0.38034822408807656, + "grad_norm": 0.1153346374630928, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 98390 + }, + { + "epoch": 0.38038688129145987, + "grad_norm": 0.1036255732178688, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 98400 + }, + { + "epoch": 0.3804255384948431, + "grad_norm": 0.12434010207653046, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 98410 + }, + { + "epoch": 0.3804641956982264, + "grad_norm": 0.09980366379022598, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 98420 + }, + { + "epoch": 0.3805028529016097, + "grad_norm": 0.09620928019285202, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 98430 + }, + { + "epoch": 0.380541510104993, + "grad_norm": 0.11719270050525665, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 98440 + }, + { + "epoch": 0.38058016730837624, + "grad_norm": 0.0989764928817749, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 98450 + }, + { + "epoch": 0.38061882451175955, + "grad_norm": 0.11647021770477295, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 98460 + }, + { + "epoch": 0.3806574817151428, + "grad_norm": 0.11862388253211975, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 98470 + }, + { + "epoch": 0.3806961389185261, + "grad_norm": 0.09813261777162552, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 98480 + }, + { + "epoch": 0.38073479612190936, + "grad_norm": 0.10625061392784119, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 98490 + }, + { + "epoch": 0.3807734533252926, + "grad_norm": 0.11874415725469589, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 98500 + }, + { + "epoch": 0.3808121105286759, + "grad_norm": 0.12056658416986465, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 98510 + }, + { + "epoch": 0.38085076773205917, + "grad_norm": 0.10916418582201004, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 98520 + }, + { + "epoch": 0.3808894249354425, + "grad_norm": 0.13108888268470764, + "learning_rate": 0.002, + "loss": 2.353, + "step": 98530 + }, + { + "epoch": 0.38092808213882573, + "grad_norm": 0.10709444433450699, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 98540 + }, + { + "epoch": 0.38096673934220904, + "grad_norm": 0.09459146857261658, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 98550 + }, + { + "epoch": 0.3810053965455923, + "grad_norm": 0.10232074558734894, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 98560 + }, + { + "epoch": 0.3810440537489756, + "grad_norm": 0.12341094017028809, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 98570 + }, + { + "epoch": 0.38108271095235885, + "grad_norm": 0.10351862013339996, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 98580 + }, + { + "epoch": 0.38112136815574216, + "grad_norm": 0.12595586478710175, + "learning_rate": 0.002, + "loss": 2.352, + "step": 98590 + }, + { + "epoch": 0.3811600253591254, + "grad_norm": 0.08833569288253784, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 98600 + }, + { + "epoch": 0.3811986825625087, + "grad_norm": 0.09319626539945602, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 98610 + }, + { + "epoch": 0.38123733976589197, + "grad_norm": 0.1339443475008011, + "learning_rate": 0.002, + "loss": 2.3705, + "step": 98620 + }, + { + "epoch": 0.3812759969692753, + "grad_norm": 0.12461947649717331, + "learning_rate": 0.002, + "loss": 2.342, + "step": 98630 + }, + { + "epoch": 0.38131465417265853, + "grad_norm": 0.10771634429693222, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 98640 + }, + { + "epoch": 0.38135331137604184, + "grad_norm": 0.1092999279499054, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 98650 + }, + { + "epoch": 0.3813919685794251, + "grad_norm": 0.10708318650722504, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 98660 + }, + { + "epoch": 0.38143062578280834, + "grad_norm": 0.11872132867574692, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 98670 + }, + { + "epoch": 0.38146928298619165, + "grad_norm": 0.10649701207876205, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 98680 + }, + { + "epoch": 0.3815079401895749, + "grad_norm": 0.0995868369936943, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 98690 + }, + { + "epoch": 0.3815465973929582, + "grad_norm": 0.10507569462060928, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 98700 + }, + { + "epoch": 0.38158525459634146, + "grad_norm": 0.0972597673535347, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 98710 + }, + { + "epoch": 0.38162391179972477, + "grad_norm": 0.47026678919792175, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 98720 + }, + { + "epoch": 0.381662569003108, + "grad_norm": 0.10154223442077637, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 98730 + }, + { + "epoch": 0.38170122620649133, + "grad_norm": 0.10038376599550247, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 98740 + }, + { + "epoch": 0.3817398834098746, + "grad_norm": 0.11687665432691574, + "learning_rate": 0.002, + "loss": 2.351, + "step": 98750 + }, + { + "epoch": 0.3817785406132579, + "grad_norm": 0.09308470785617828, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 98760 + }, + { + "epoch": 0.38181719781664114, + "grad_norm": 0.11552231013774872, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 98770 + }, + { + "epoch": 0.38185585502002445, + "grad_norm": 0.13166840374469757, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 98780 + }, + { + "epoch": 0.3818945122234077, + "grad_norm": 0.13677500188350677, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 98790 + }, + { + "epoch": 0.381933169426791, + "grad_norm": 0.13627128303050995, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 98800 + }, + { + "epoch": 0.38197182663017426, + "grad_norm": 0.09993304312229156, + "learning_rate": 0.002, + "loss": 2.358, + "step": 98810 + }, + { + "epoch": 0.38201048383355757, + "grad_norm": 0.10988759994506836, + "learning_rate": 0.002, + "loss": 2.352, + "step": 98820 + }, + { + "epoch": 0.3820491410369408, + "grad_norm": 0.11517474800348282, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 98830 + }, + { + "epoch": 0.38208779824032413, + "grad_norm": 0.10560702532529831, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 98840 + }, + { + "epoch": 0.3821264554437074, + "grad_norm": 0.11196266114711761, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 98850 + }, + { + "epoch": 0.38216511264709063, + "grad_norm": 0.10116425156593323, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 98860 + }, + { + "epoch": 0.38220376985047394, + "grad_norm": 0.11830340325832367, + "learning_rate": 0.002, + "loss": 2.3666, + "step": 98870 + }, + { + "epoch": 0.3822424270538572, + "grad_norm": 0.11167599260807037, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 98880 + }, + { + "epoch": 0.3822810842572405, + "grad_norm": 0.11318913847208023, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 98890 + }, + { + "epoch": 0.38231974146062375, + "grad_norm": 0.12292078137397766, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 98900 + }, + { + "epoch": 0.38235839866400706, + "grad_norm": 0.11362916976213455, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 98910 + }, + { + "epoch": 0.3823970558673903, + "grad_norm": 0.13096804916858673, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 98920 + }, + { + "epoch": 0.3824357130707736, + "grad_norm": 0.1046454980969429, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 98930 + }, + { + "epoch": 0.38247437027415687, + "grad_norm": 0.11374887824058533, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 98940 + }, + { + "epoch": 0.3825130274775402, + "grad_norm": 0.1099177896976471, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 98950 + }, + { + "epoch": 0.38255168468092343, + "grad_norm": 0.10790438205003738, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 98960 + }, + { + "epoch": 0.38259034188430674, + "grad_norm": 0.2521016001701355, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 98970 + }, + { + "epoch": 0.38262899908769, + "grad_norm": 0.09931904077529907, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 98980 + }, + { + "epoch": 0.3826676562910733, + "grad_norm": 0.11894132196903229, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 98990 + }, + { + "epoch": 0.38270631349445655, + "grad_norm": 0.12821850180625916, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 99000 + }, + { + "epoch": 0.38274497069783986, + "grad_norm": 0.10607095062732697, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 99010 + }, + { + "epoch": 0.3827836279012231, + "grad_norm": 0.09565222263336182, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 99020 + }, + { + "epoch": 0.3828222851046064, + "grad_norm": 0.11812412738800049, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 99030 + }, + { + "epoch": 0.38286094230798967, + "grad_norm": 0.12434925884008408, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 99040 + }, + { + "epoch": 0.3828995995113729, + "grad_norm": 0.09686396270990372, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 99050 + }, + { + "epoch": 0.38293825671475623, + "grad_norm": 0.1269039660692215, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 99060 + }, + { + "epoch": 0.3829769139181395, + "grad_norm": 0.09965300559997559, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 99070 + }, + { + "epoch": 0.3830155711215228, + "grad_norm": 0.09235601127147675, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 99080 + }, + { + "epoch": 0.38305422832490604, + "grad_norm": 0.10772426426410675, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 99090 + }, + { + "epoch": 0.38309288552828935, + "grad_norm": 0.1444188356399536, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 99100 + }, + { + "epoch": 0.3831315427316726, + "grad_norm": 0.11507777869701385, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 99110 + }, + { + "epoch": 0.3831701999350559, + "grad_norm": 0.11316626518964767, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 99120 + }, + { + "epoch": 0.38320885713843916, + "grad_norm": 0.11571101099252701, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 99130 + }, + { + "epoch": 0.38324751434182247, + "grad_norm": 0.0992959663271904, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 99140 + }, + { + "epoch": 0.3832861715452057, + "grad_norm": 0.12370272725820541, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 99150 + }, + { + "epoch": 0.38332482874858903, + "grad_norm": 0.0909237340092659, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 99160 + }, + { + "epoch": 0.3833634859519723, + "grad_norm": 0.13060149550437927, + "learning_rate": 0.002, + "loss": 2.359, + "step": 99170 + }, + { + "epoch": 0.3834021431553556, + "grad_norm": 0.1097990944981575, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 99180 + }, + { + "epoch": 0.38344080035873884, + "grad_norm": 0.12019643187522888, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 99190 + }, + { + "epoch": 0.38347945756212215, + "grad_norm": 0.10726134479045868, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 99200 + }, + { + "epoch": 0.3835181147655054, + "grad_norm": 0.11693156510591507, + "learning_rate": 0.002, + "loss": 2.37, + "step": 99210 + }, + { + "epoch": 0.3835567719688887, + "grad_norm": 0.10005655139684677, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 99220 + }, + { + "epoch": 0.38359542917227196, + "grad_norm": 0.10546668618917465, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 99230 + }, + { + "epoch": 0.3836340863756552, + "grad_norm": 0.11036413162946701, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 99240 + }, + { + "epoch": 0.3836727435790385, + "grad_norm": 0.09516773372888565, + "learning_rate": 0.002, + "loss": 2.349, + "step": 99250 + }, + { + "epoch": 0.3837114007824218, + "grad_norm": 0.10512733459472656, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 99260 + }, + { + "epoch": 0.3837500579858051, + "grad_norm": 0.1009531170129776, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 99270 + }, + { + "epoch": 0.38378871518918833, + "grad_norm": 0.29204094409942627, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 99280 + }, + { + "epoch": 0.38382737239257164, + "grad_norm": 0.1206972673535347, + "learning_rate": 0.002, + "loss": 2.36, + "step": 99290 + }, + { + "epoch": 0.3838660295959549, + "grad_norm": 0.09276767075061798, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 99300 + }, + { + "epoch": 0.3839046867993382, + "grad_norm": 0.09996117651462555, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 99310 + }, + { + "epoch": 0.38394334400272145, + "grad_norm": 0.09887313842773438, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 99320 + }, + { + "epoch": 0.38398200120610476, + "grad_norm": 0.10435585677623749, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 99330 + }, + { + "epoch": 0.384020658409488, + "grad_norm": 0.10483123362064362, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 99340 + }, + { + "epoch": 0.3840593156128713, + "grad_norm": 0.10192303359508514, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 99350 + }, + { + "epoch": 0.3840979728162546, + "grad_norm": 0.1300593763589859, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 99360 + }, + { + "epoch": 0.3841366300196379, + "grad_norm": 0.10089082270860672, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 99370 + }, + { + "epoch": 0.38417528722302113, + "grad_norm": 0.11565294116735458, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 99380 + }, + { + "epoch": 0.38421394442640444, + "grad_norm": 0.10272479802370071, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 99390 + }, + { + "epoch": 0.3842526016297877, + "grad_norm": 0.12943372130393982, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 99400 + }, + { + "epoch": 0.38429125883317095, + "grad_norm": 0.11217237263917923, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 99410 + }, + { + "epoch": 0.38432991603655425, + "grad_norm": 0.12317653745412827, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 99420 + }, + { + "epoch": 0.3843685732399375, + "grad_norm": 0.10992909222841263, + "learning_rate": 0.002, + "loss": 2.365, + "step": 99430 + }, + { + "epoch": 0.3844072304433208, + "grad_norm": 0.11047405749559402, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 99440 + }, + { + "epoch": 0.38444588764670407, + "grad_norm": 0.10105373710393906, + "learning_rate": 0.002, + "loss": 2.353, + "step": 99450 + }, + { + "epoch": 0.3844845448500874, + "grad_norm": 0.12573686242103577, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 99460 + }, + { + "epoch": 0.3845232020534706, + "grad_norm": 0.12352735549211502, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 99470 + }, + { + "epoch": 0.38456185925685393, + "grad_norm": 0.1070810928940773, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 99480 + }, + { + "epoch": 0.3846005164602372, + "grad_norm": 0.10397505760192871, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 99490 + }, + { + "epoch": 0.3846391736636205, + "grad_norm": 0.1048097237944603, + "learning_rate": 0.002, + "loss": 2.375, + "step": 99500 + }, + { + "epoch": 0.38467783086700374, + "grad_norm": 0.10362658649682999, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 99510 + }, + { + "epoch": 0.38471648807038705, + "grad_norm": 0.1036435216665268, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 99520 + }, + { + "epoch": 0.3847551452737703, + "grad_norm": 0.11805278807878494, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 99530 + }, + { + "epoch": 0.3847938024771536, + "grad_norm": 0.11448401212692261, + "learning_rate": 0.002, + "loss": 2.34, + "step": 99540 + }, + { + "epoch": 0.38483245968053686, + "grad_norm": 0.10449942946434021, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 99550 + }, + { + "epoch": 0.3848711168839202, + "grad_norm": 0.08898783475160599, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 99560 + }, + { + "epoch": 0.3849097740873034, + "grad_norm": 0.12039284408092499, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 99570 + }, + { + "epoch": 0.38494843129068673, + "grad_norm": 0.09976094961166382, + "learning_rate": 0.002, + "loss": 2.339, + "step": 99580 + }, + { + "epoch": 0.38498708849407, + "grad_norm": 0.10564619302749634, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 99590 + }, + { + "epoch": 0.38502574569745324, + "grad_norm": 0.10846693813800812, + "learning_rate": 0.002, + "loss": 2.358, + "step": 99600 + }, + { + "epoch": 0.38506440290083654, + "grad_norm": 0.10320775210857391, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 99610 + }, + { + "epoch": 0.3851030601042198, + "grad_norm": 0.09724020957946777, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 99620 + }, + { + "epoch": 0.3851417173076031, + "grad_norm": 0.1234230324625969, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 99630 + }, + { + "epoch": 0.38518037451098636, + "grad_norm": 0.10737951844930649, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 99640 + }, + { + "epoch": 0.38521903171436966, + "grad_norm": 0.10597635805606842, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 99650 + }, + { + "epoch": 0.3852576889177529, + "grad_norm": 0.10188396275043488, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 99660 + }, + { + "epoch": 0.3852963461211362, + "grad_norm": 0.10658920556306839, + "learning_rate": 0.002, + "loss": 2.36, + "step": 99670 + }, + { + "epoch": 0.3853350033245195, + "grad_norm": 0.11044390499591827, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 99680 + }, + { + "epoch": 0.3853736605279028, + "grad_norm": 0.09951883554458618, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 99690 + }, + { + "epoch": 0.38541231773128604, + "grad_norm": 0.1322626769542694, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 99700 + }, + { + "epoch": 0.38545097493466934, + "grad_norm": 0.10381346940994263, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 99710 + }, + { + "epoch": 0.3854896321380526, + "grad_norm": 0.09962229430675507, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 99720 + }, + { + "epoch": 0.3855282893414359, + "grad_norm": 0.1005672961473465, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 99730 + }, + { + "epoch": 0.38556694654481916, + "grad_norm": 0.11470509320497513, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 99740 + }, + { + "epoch": 0.38560560374820246, + "grad_norm": 0.09590152651071548, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 99750 + }, + { + "epoch": 0.3856442609515857, + "grad_norm": 0.12328245490789413, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 99760 + }, + { + "epoch": 0.385682918154969, + "grad_norm": 0.10069391131401062, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 99770 + }, + { + "epoch": 0.3857215753583523, + "grad_norm": 0.11742331087589264, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 99780 + }, + { + "epoch": 0.3857602325617355, + "grad_norm": 0.10783084481954575, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 99790 + }, + { + "epoch": 0.38579888976511884, + "grad_norm": 0.10142330825328827, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 99800 + }, + { + "epoch": 0.3858375469685021, + "grad_norm": 0.10665824264287949, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 99810 + }, + { + "epoch": 0.3858762041718854, + "grad_norm": 0.1259702444076538, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 99820 + }, + { + "epoch": 0.38591486137526865, + "grad_norm": 0.08524170517921448, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 99830 + }, + { + "epoch": 0.38595351857865196, + "grad_norm": 0.10294031351804733, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 99840 + }, + { + "epoch": 0.3859921757820352, + "grad_norm": 0.091279037296772, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 99850 + }, + { + "epoch": 0.3860308329854185, + "grad_norm": 0.13400551676750183, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 99860 + }, + { + "epoch": 0.38606949018880177, + "grad_norm": 0.11633538454771042, + "learning_rate": 0.002, + "loss": 2.341, + "step": 99870 + }, + { + "epoch": 0.3861081473921851, + "grad_norm": 0.09606362134218216, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 99880 + }, + { + "epoch": 0.3861468045955683, + "grad_norm": 0.1037554070353508, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 99890 + }, + { + "epoch": 0.38618546179895163, + "grad_norm": 0.10212548822164536, + "learning_rate": 0.002, + "loss": 2.338, + "step": 99900 + }, + { + "epoch": 0.3862241190023349, + "grad_norm": 0.10313539206981659, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 99910 + }, + { + "epoch": 0.3862627762057182, + "grad_norm": 0.12302293628454208, + "learning_rate": 0.002, + "loss": 2.3687, + "step": 99920 + }, + { + "epoch": 0.38630143340910145, + "grad_norm": 0.12717850506305695, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 99930 + }, + { + "epoch": 0.38634009061248475, + "grad_norm": 0.11817540228366852, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 99940 + }, + { + "epoch": 0.386378747815868, + "grad_norm": 0.1065082773566246, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 99950 + }, + { + "epoch": 0.3864174050192513, + "grad_norm": 0.11863639950752258, + "learning_rate": 0.002, + "loss": 2.3699, + "step": 99960 + }, + { + "epoch": 0.38645606222263457, + "grad_norm": 0.10106471925973892, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 99970 + }, + { + "epoch": 0.3864947194260178, + "grad_norm": 0.10772629082202911, + "learning_rate": 0.002, + "loss": 2.357, + "step": 99980 + }, + { + "epoch": 0.3865333766294011, + "grad_norm": 0.11508717387914658, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 99990 + }, + { + "epoch": 0.3865720338327844, + "grad_norm": 0.12177982926368713, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 100000 + }, + { + "epoch": 0.3866106910361677, + "grad_norm": 0.10621000081300735, + "learning_rate": 0.002, + "loss": 2.355, + "step": 100010 + }, + { + "epoch": 0.38664934823955094, + "grad_norm": 0.09023228287696838, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 100020 + }, + { + "epoch": 0.38668800544293425, + "grad_norm": 0.13165788352489471, + "learning_rate": 0.002, + "loss": 2.353, + "step": 100030 + }, + { + "epoch": 0.3867266626463175, + "grad_norm": 0.10812313854694366, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 100040 + }, + { + "epoch": 0.3867653198497008, + "grad_norm": 0.1083117350935936, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 100050 + }, + { + "epoch": 0.38680397705308406, + "grad_norm": 0.12282432615756989, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 100060 + }, + { + "epoch": 0.38684263425646737, + "grad_norm": 0.10567369312047958, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 100070 + }, + { + "epoch": 0.3868812914598506, + "grad_norm": 0.13557782769203186, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 100080 + }, + { + "epoch": 0.3869199486632339, + "grad_norm": 0.0997626781463623, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 100090 + }, + { + "epoch": 0.3869586058666172, + "grad_norm": 0.11340288817882538, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 100100 + }, + { + "epoch": 0.3869972630700005, + "grad_norm": 0.11692352592945099, + "learning_rate": 0.002, + "loss": 2.365, + "step": 100110 + }, + { + "epoch": 0.38703592027338374, + "grad_norm": 0.10254278779029846, + "learning_rate": 0.002, + "loss": 2.349, + "step": 100120 + }, + { + "epoch": 0.38707457747676705, + "grad_norm": 0.1118793785572052, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 100130 + }, + { + "epoch": 0.3871132346801503, + "grad_norm": 0.11556681990623474, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 100140 + }, + { + "epoch": 0.3871518918835336, + "grad_norm": 0.12200041115283966, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 100150 + }, + { + "epoch": 0.38719054908691686, + "grad_norm": 0.125214621424675, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 100160 + }, + { + "epoch": 0.3872292062903001, + "grad_norm": 0.10922552645206451, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 100170 + }, + { + "epoch": 0.3872678634936834, + "grad_norm": 0.09976062923669815, + "learning_rate": 0.002, + "loss": 2.358, + "step": 100180 + }, + { + "epoch": 0.38730652069706667, + "grad_norm": 0.10824978351593018, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 100190 + }, + { + "epoch": 0.38734517790045, + "grad_norm": 0.09569326043128967, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 100200 + }, + { + "epoch": 0.38738383510383323, + "grad_norm": 0.10602735728025436, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 100210 + }, + { + "epoch": 0.38742249230721654, + "grad_norm": 0.09490755200386047, + "learning_rate": 0.002, + "loss": 2.344, + "step": 100220 + }, + { + "epoch": 0.3874611495105998, + "grad_norm": 0.100970558822155, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 100230 + }, + { + "epoch": 0.3874998067139831, + "grad_norm": 0.10085583478212357, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 100240 + }, + { + "epoch": 0.38753846391736635, + "grad_norm": 0.09461560845375061, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 100250 + }, + { + "epoch": 0.38757712112074966, + "grad_norm": 0.12680354714393616, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 100260 + }, + { + "epoch": 0.3876157783241329, + "grad_norm": 0.11424418538808823, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 100270 + }, + { + "epoch": 0.3876544355275162, + "grad_norm": 0.1029893308877945, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 100280 + }, + { + "epoch": 0.38769309273089947, + "grad_norm": 0.11332143098115921, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 100290 + }, + { + "epoch": 0.3877317499342828, + "grad_norm": 0.11435232311487198, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 100300 + }, + { + "epoch": 0.38777040713766603, + "grad_norm": 0.09768123924732208, + "learning_rate": 0.002, + "loss": 2.357, + "step": 100310 + }, + { + "epoch": 0.38780906434104934, + "grad_norm": 0.11493542790412903, + "learning_rate": 0.002, + "loss": 2.358, + "step": 100320 + }, + { + "epoch": 0.3878477215444326, + "grad_norm": 0.09657100588083267, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 100330 + }, + { + "epoch": 0.38788637874781584, + "grad_norm": 0.11080711334943771, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 100340 + }, + { + "epoch": 0.38792503595119915, + "grad_norm": 0.12045042961835861, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 100350 + }, + { + "epoch": 0.3879636931545824, + "grad_norm": 0.12552835047245026, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 100360 + }, + { + "epoch": 0.3880023503579657, + "grad_norm": 0.10142803192138672, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 100370 + }, + { + "epoch": 0.38804100756134896, + "grad_norm": 0.09850650280714035, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 100380 + }, + { + "epoch": 0.38807966476473227, + "grad_norm": 0.09998472779989243, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 100390 + }, + { + "epoch": 0.3881183219681155, + "grad_norm": 0.11177127808332443, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 100400 + }, + { + "epoch": 0.38815697917149883, + "grad_norm": 0.11975325644016266, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 100410 + }, + { + "epoch": 0.3881956363748821, + "grad_norm": 0.10338979959487915, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 100420 + }, + { + "epoch": 0.3882342935782654, + "grad_norm": 0.10348150879144669, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 100430 + }, + { + "epoch": 0.38827295078164864, + "grad_norm": 0.09385969489812851, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 100440 + }, + { + "epoch": 0.38831160798503195, + "grad_norm": 0.11644785851240158, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 100450 + }, + { + "epoch": 0.3883502651884152, + "grad_norm": 0.09925386309623718, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 100460 + }, + { + "epoch": 0.3883889223917985, + "grad_norm": 0.1058383509516716, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 100470 + }, + { + "epoch": 0.38842757959518176, + "grad_norm": 0.11195088922977448, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 100480 + }, + { + "epoch": 0.38846623679856507, + "grad_norm": 0.11256535351276398, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 100490 + }, + { + "epoch": 0.3885048940019483, + "grad_norm": 0.1299486607313156, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 100500 + }, + { + "epoch": 0.3885435512053316, + "grad_norm": 0.10605520009994507, + "learning_rate": 0.002, + "loss": 2.357, + "step": 100510 + }, + { + "epoch": 0.3885822084087149, + "grad_norm": 0.1053491160273552, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 100520 + }, + { + "epoch": 0.38862086561209813, + "grad_norm": 0.10584665834903717, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 100530 + }, + { + "epoch": 0.38865952281548144, + "grad_norm": 0.12226726114749908, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 100540 + }, + { + "epoch": 0.3886981800188647, + "grad_norm": 0.09885209053754807, + "learning_rate": 0.002, + "loss": 2.359, + "step": 100550 + }, + { + "epoch": 0.388736837222248, + "grad_norm": 0.11455842852592468, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 100560 + }, + { + "epoch": 0.38877549442563125, + "grad_norm": 0.10637041181325912, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 100570 + }, + { + "epoch": 0.38881415162901456, + "grad_norm": 0.12398865073919296, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 100580 + }, + { + "epoch": 0.3888528088323978, + "grad_norm": 0.9685719013214111, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 100590 + }, + { + "epoch": 0.3888914660357811, + "grad_norm": 0.21529839932918549, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 100600 + }, + { + "epoch": 0.38893012323916437, + "grad_norm": 0.10525954514741898, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 100610 + }, + { + "epoch": 0.3889687804425477, + "grad_norm": 0.0974649041891098, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 100620 + }, + { + "epoch": 0.38900743764593093, + "grad_norm": 0.10220787674188614, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 100630 + }, + { + "epoch": 0.38904609484931424, + "grad_norm": 0.11424384266138077, + "learning_rate": 0.002, + "loss": 2.338, + "step": 100640 + }, + { + "epoch": 0.3890847520526975, + "grad_norm": 0.09804636985063553, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 100650 + }, + { + "epoch": 0.3891234092560808, + "grad_norm": 0.10460495948791504, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 100660 + }, + { + "epoch": 0.38916206645946405, + "grad_norm": 0.09209802001714706, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 100670 + }, + { + "epoch": 0.38920072366284736, + "grad_norm": 0.1093793734908104, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 100680 + }, + { + "epoch": 0.3892393808662306, + "grad_norm": 0.1261608600616455, + "learning_rate": 0.002, + "loss": 2.363, + "step": 100690 + }, + { + "epoch": 0.3892780380696139, + "grad_norm": 0.1179034411907196, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 100700 + }, + { + "epoch": 0.38931669527299717, + "grad_norm": 0.10968802869319916, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 100710 + }, + { + "epoch": 0.3893553524763804, + "grad_norm": 0.10325589776039124, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 100720 + }, + { + "epoch": 0.38939400967976373, + "grad_norm": 0.1065792515873909, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 100730 + }, + { + "epoch": 0.389432666883147, + "grad_norm": 0.12430709600448608, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 100740 + }, + { + "epoch": 0.3894713240865303, + "grad_norm": 0.12091759592294693, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 100750 + }, + { + "epoch": 0.38950998128991354, + "grad_norm": 0.10765664279460907, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 100760 + }, + { + "epoch": 0.38954863849329685, + "grad_norm": 0.09883479028940201, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 100770 + }, + { + "epoch": 0.3895872956966801, + "grad_norm": 0.12051030993461609, + "learning_rate": 0.002, + "loss": 2.342, + "step": 100780 + }, + { + "epoch": 0.3896259529000634, + "grad_norm": 0.1364177167415619, + "learning_rate": 0.002, + "loss": 2.36, + "step": 100790 + }, + { + "epoch": 0.38966461010344666, + "grad_norm": 0.11860163509845734, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 100800 + }, + { + "epoch": 0.38970326730682997, + "grad_norm": 0.09373585134744644, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 100810 + }, + { + "epoch": 0.3897419245102132, + "grad_norm": 0.10079554468393326, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 100820 + }, + { + "epoch": 0.38978058171359653, + "grad_norm": 0.1616877168416977, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 100830 + }, + { + "epoch": 0.3898192389169798, + "grad_norm": 0.11990071833133698, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 100840 + }, + { + "epoch": 0.3898578961203631, + "grad_norm": 0.1280861496925354, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 100850 + }, + { + "epoch": 0.38989655332374634, + "grad_norm": 0.10243765264749527, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 100860 + }, + { + "epoch": 0.38993521052712965, + "grad_norm": 0.10121989250183105, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 100870 + }, + { + "epoch": 0.3899738677305129, + "grad_norm": 0.12439978867769241, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 100880 + }, + { + "epoch": 0.3900125249338962, + "grad_norm": 0.10615507513284683, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 100890 + }, + { + "epoch": 0.39005118213727946, + "grad_norm": 0.10463973879814148, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 100900 + }, + { + "epoch": 0.3900898393406627, + "grad_norm": 0.13091835379600525, + "learning_rate": 0.002, + "loss": 2.373, + "step": 100910 + }, + { + "epoch": 0.390128496544046, + "grad_norm": 0.11568377912044525, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 100920 + }, + { + "epoch": 0.3901671537474293, + "grad_norm": 0.11564431339502335, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 100930 + }, + { + "epoch": 0.3902058109508126, + "grad_norm": 0.11365380883216858, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 100940 + }, + { + "epoch": 0.39024446815419583, + "grad_norm": 0.11408404260873795, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 100950 + }, + { + "epoch": 0.39028312535757914, + "grad_norm": 0.10143405199050903, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 100960 + }, + { + "epoch": 0.3903217825609624, + "grad_norm": 0.11916525661945343, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 100970 + }, + { + "epoch": 0.3903604397643457, + "grad_norm": 0.11719028651714325, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 100980 + }, + { + "epoch": 0.39039909696772895, + "grad_norm": 0.1009218841791153, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 100990 + }, + { + "epoch": 0.39043775417111226, + "grad_norm": 0.17523200809955597, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 101000 + }, + { + "epoch": 0.3904764113744955, + "grad_norm": 0.11103260517120361, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 101010 + }, + { + "epoch": 0.3905150685778788, + "grad_norm": 0.09755750745534897, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 101020 + }, + { + "epoch": 0.3905537257812621, + "grad_norm": 0.09089493751525879, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 101030 + }, + { + "epoch": 0.3905923829846454, + "grad_norm": 0.09701880812644958, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 101040 + }, + { + "epoch": 0.39063104018802863, + "grad_norm": 0.10882467031478882, + "learning_rate": 0.002, + "loss": 2.339, + "step": 101050 + }, + { + "epoch": 0.39066969739141194, + "grad_norm": 0.11161111295223236, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 101060 + }, + { + "epoch": 0.3907083545947952, + "grad_norm": 0.11580555140972137, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 101070 + }, + { + "epoch": 0.39074701179817845, + "grad_norm": 0.10042113810777664, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 101080 + }, + { + "epoch": 0.39078566900156175, + "grad_norm": 0.12678062915802002, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 101090 + }, + { + "epoch": 0.390824326204945, + "grad_norm": 0.09452252835035324, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 101100 + }, + { + "epoch": 0.3908629834083283, + "grad_norm": 0.10759731382131577, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 101110 + }, + { + "epoch": 0.39090164061171156, + "grad_norm": 0.11984322965145111, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 101120 + }, + { + "epoch": 0.3909402978150949, + "grad_norm": 0.11291025578975677, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 101130 + }, + { + "epoch": 0.3909789550184781, + "grad_norm": 0.1867348998785019, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 101140 + }, + { + "epoch": 0.39101761222186143, + "grad_norm": 0.10820949822664261, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 101150 + }, + { + "epoch": 0.3910562694252447, + "grad_norm": 0.11150885373353958, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 101160 + }, + { + "epoch": 0.391094926628628, + "grad_norm": 0.12619318068027496, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 101170 + }, + { + "epoch": 0.39113358383201124, + "grad_norm": 0.09870398789644241, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 101180 + }, + { + "epoch": 0.39117224103539455, + "grad_norm": 0.1280161589384079, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 101190 + }, + { + "epoch": 0.3912108982387778, + "grad_norm": 0.10799254477024078, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 101200 + }, + { + "epoch": 0.3912495554421611, + "grad_norm": 0.10901007801294327, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 101210 + }, + { + "epoch": 0.39128821264554436, + "grad_norm": 0.10274651646614075, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 101220 + }, + { + "epoch": 0.39132686984892767, + "grad_norm": 0.11202777922153473, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 101230 + }, + { + "epoch": 0.3913655270523109, + "grad_norm": 0.10203932970762253, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 101240 + }, + { + "epoch": 0.39140418425569423, + "grad_norm": 0.10965485125780106, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 101250 + }, + { + "epoch": 0.3914428414590775, + "grad_norm": 0.10451711714267731, + "learning_rate": 0.002, + "loss": 2.342, + "step": 101260 + }, + { + "epoch": 0.39148149866246074, + "grad_norm": 0.10822159796953201, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 101270 + }, + { + "epoch": 0.39152015586584404, + "grad_norm": 0.10773058980703354, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 101280 + }, + { + "epoch": 0.3915588130692273, + "grad_norm": 0.11595533788204193, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 101290 + }, + { + "epoch": 0.3915974702726106, + "grad_norm": 0.0982164740562439, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 101300 + }, + { + "epoch": 0.39163612747599386, + "grad_norm": 0.11945968121290207, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 101310 + }, + { + "epoch": 0.39167478467937716, + "grad_norm": 0.09924999624490738, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 101320 + }, + { + "epoch": 0.3917134418827604, + "grad_norm": 0.13069695234298706, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 101330 + }, + { + "epoch": 0.3917520990861437, + "grad_norm": 0.10410667210817337, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 101340 + }, + { + "epoch": 0.391790756289527, + "grad_norm": 0.11061934381723404, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 101350 + }, + { + "epoch": 0.3918294134929103, + "grad_norm": 0.11253561824560165, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 101360 + }, + { + "epoch": 0.39186807069629354, + "grad_norm": 0.09618701040744781, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 101370 + }, + { + "epoch": 0.39190672789967684, + "grad_norm": 0.10598044097423553, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 101380 + }, + { + "epoch": 0.3919453851030601, + "grad_norm": 0.11163297295570374, + "learning_rate": 0.002, + "loss": 2.34, + "step": 101390 + }, + { + "epoch": 0.3919840423064434, + "grad_norm": 0.1058531403541565, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 101400 + }, + { + "epoch": 0.39202269950982666, + "grad_norm": 0.10144970566034317, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 101410 + }, + { + "epoch": 0.39206135671320996, + "grad_norm": 0.1309688538312912, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 101420 + }, + { + "epoch": 0.3921000139165932, + "grad_norm": 0.0972873792052269, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 101430 + }, + { + "epoch": 0.3921386711199765, + "grad_norm": 0.11298844963312149, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 101440 + }, + { + "epoch": 0.3921773283233598, + "grad_norm": 0.09406055510044098, + "learning_rate": 0.002, + "loss": 2.3698, + "step": 101450 + }, + { + "epoch": 0.392215985526743, + "grad_norm": 0.10956672579050064, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 101460 + }, + { + "epoch": 0.39225464273012633, + "grad_norm": 0.1047091856598854, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 101470 + }, + { + "epoch": 0.3922932999335096, + "grad_norm": 0.09906767308712006, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 101480 + }, + { + "epoch": 0.3923319571368929, + "grad_norm": 0.1101418063044548, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 101490 + }, + { + "epoch": 0.39237061434027615, + "grad_norm": 0.10937904566526413, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 101500 + }, + { + "epoch": 0.39240927154365945, + "grad_norm": 0.09774620085954666, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 101510 + }, + { + "epoch": 0.3924479287470427, + "grad_norm": 0.11182210594415665, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 101520 + }, + { + "epoch": 0.392486585950426, + "grad_norm": 0.11913948506116867, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 101530 + }, + { + "epoch": 0.39252524315380927, + "grad_norm": 0.1034182757139206, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 101540 + }, + { + "epoch": 0.3925639003571926, + "grad_norm": 0.11861073225736618, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 101550 + }, + { + "epoch": 0.3926025575605758, + "grad_norm": 0.10756992548704147, + "learning_rate": 0.002, + "loss": 2.344, + "step": 101560 + }, + { + "epoch": 0.39264121476395913, + "grad_norm": 0.10961413383483887, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 101570 + }, + { + "epoch": 0.3926798719673424, + "grad_norm": 0.10385117679834366, + "learning_rate": 0.002, + "loss": 2.355, + "step": 101580 + }, + { + "epoch": 0.3927185291707257, + "grad_norm": 0.10954749584197998, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 101590 + }, + { + "epoch": 0.39275718637410895, + "grad_norm": 0.10971171408891678, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 101600 + }, + { + "epoch": 0.39279584357749225, + "grad_norm": 0.10529763251543045, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 101610 + }, + { + "epoch": 0.3928345007808755, + "grad_norm": 0.10678326338529587, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 101620 + }, + { + "epoch": 0.3928731579842588, + "grad_norm": 0.1138482615351677, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 101630 + }, + { + "epoch": 0.39291181518764207, + "grad_norm": 0.10951939970254898, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 101640 + }, + { + "epoch": 0.3929504723910253, + "grad_norm": 0.10590513050556183, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 101650 + }, + { + "epoch": 0.3929891295944086, + "grad_norm": 0.10488978028297424, + "learning_rate": 0.002, + "loss": 2.357, + "step": 101660 + }, + { + "epoch": 0.3930277867977919, + "grad_norm": 0.1191963478922844, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 101670 + }, + { + "epoch": 0.3930664440011752, + "grad_norm": 0.10693497955799103, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 101680 + }, + { + "epoch": 0.39310510120455844, + "grad_norm": 0.1340564340353012, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 101690 + }, + { + "epoch": 0.39314375840794175, + "grad_norm": 0.13421568274497986, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 101700 + }, + { + "epoch": 0.393182415611325, + "grad_norm": 0.10439509898424149, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 101710 + }, + { + "epoch": 0.3932210728147083, + "grad_norm": 0.1057930588722229, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 101720 + }, + { + "epoch": 0.39325973001809156, + "grad_norm": 0.11698611825704575, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 101730 + }, + { + "epoch": 0.39329838722147487, + "grad_norm": 0.12837250530719757, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 101740 + }, + { + "epoch": 0.3933370444248581, + "grad_norm": 0.09668096154928207, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 101750 + }, + { + "epoch": 0.3933757016282414, + "grad_norm": 0.1062922403216362, + "learning_rate": 0.002, + "loss": 2.356, + "step": 101760 + }, + { + "epoch": 0.3934143588316247, + "grad_norm": 0.10767418146133423, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 101770 + }, + { + "epoch": 0.393453016035008, + "grad_norm": 0.1008111760020256, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 101780 + }, + { + "epoch": 0.39349167323839124, + "grad_norm": 0.1303732544183731, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 101790 + }, + { + "epoch": 0.39353033044177455, + "grad_norm": 0.10353969037532806, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 101800 + }, + { + "epoch": 0.3935689876451578, + "grad_norm": 0.0988667905330658, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 101810 + }, + { + "epoch": 0.39360764484854105, + "grad_norm": 0.10777206718921661, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 101820 + }, + { + "epoch": 0.39364630205192436, + "grad_norm": 0.14098067581653595, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 101830 + }, + { + "epoch": 0.3936849592553076, + "grad_norm": 0.10238955169916153, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 101840 + }, + { + "epoch": 0.3937236164586909, + "grad_norm": 0.1051107719540596, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 101850 + }, + { + "epoch": 0.39376227366207417, + "grad_norm": 0.1134108379483223, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 101860 + }, + { + "epoch": 0.3938009308654575, + "grad_norm": 0.11773721873760223, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 101870 + }, + { + "epoch": 0.39383958806884073, + "grad_norm": 0.11069432646036148, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 101880 + }, + { + "epoch": 0.39387824527222404, + "grad_norm": 0.10330471396446228, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 101890 + }, + { + "epoch": 0.3939169024756073, + "grad_norm": 0.09684577584266663, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 101900 + }, + { + "epoch": 0.3939555596789906, + "grad_norm": 0.11704834550619125, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 101910 + }, + { + "epoch": 0.39399421688237385, + "grad_norm": 0.109725721180439, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 101920 + }, + { + "epoch": 0.39403287408575716, + "grad_norm": 0.09688954055309296, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 101930 + }, + { + "epoch": 0.3940715312891404, + "grad_norm": 0.13549424707889557, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 101940 + }, + { + "epoch": 0.3941101884925237, + "grad_norm": 0.09275535494089127, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 101950 + }, + { + "epoch": 0.39414884569590697, + "grad_norm": 0.09769611805677414, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 101960 + }, + { + "epoch": 0.3941875028992903, + "grad_norm": 0.1042463481426239, + "learning_rate": 0.002, + "loss": 2.346, + "step": 101970 + }, + { + "epoch": 0.39422616010267353, + "grad_norm": 0.10281561315059662, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 101980 + }, + { + "epoch": 0.39426481730605684, + "grad_norm": 0.11353082209825516, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 101990 + }, + { + "epoch": 0.3943034745094401, + "grad_norm": 0.11525498330593109, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 102000 + }, + { + "epoch": 0.39434213171282334, + "grad_norm": 0.09364749491214752, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 102010 + }, + { + "epoch": 0.39438078891620665, + "grad_norm": 0.09514220058917999, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 102020 + }, + { + "epoch": 0.3944194461195899, + "grad_norm": 0.09396491944789886, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 102030 + }, + { + "epoch": 0.3944581033229732, + "grad_norm": 0.09882369637489319, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 102040 + }, + { + "epoch": 0.39449676052635646, + "grad_norm": 0.10531751811504364, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 102050 + }, + { + "epoch": 0.39453541772973977, + "grad_norm": 0.10298800468444824, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 102060 + }, + { + "epoch": 0.394574074933123, + "grad_norm": 0.10079105943441391, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 102070 + }, + { + "epoch": 0.39461273213650633, + "grad_norm": 0.1270875334739685, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 102080 + }, + { + "epoch": 0.3946513893398896, + "grad_norm": 0.1297849863767624, + "learning_rate": 0.002, + "loss": 2.3694, + "step": 102090 + }, + { + "epoch": 0.3946900465432729, + "grad_norm": 0.09605436772108078, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 102100 + }, + { + "epoch": 0.39472870374665614, + "grad_norm": 0.09594700485467911, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 102110 + }, + { + "epoch": 0.39476736095003945, + "grad_norm": 0.1143462061882019, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 102120 + }, + { + "epoch": 0.3948060181534227, + "grad_norm": 0.11425799131393433, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 102130 + }, + { + "epoch": 0.394844675356806, + "grad_norm": 0.10427649319171906, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 102140 + }, + { + "epoch": 0.39488333256018926, + "grad_norm": 0.09673214703798294, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 102150 + }, + { + "epoch": 0.39492198976357257, + "grad_norm": 0.10224836319684982, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 102160 + }, + { + "epoch": 0.3949606469669558, + "grad_norm": 0.09987068176269531, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 102170 + }, + { + "epoch": 0.3949993041703391, + "grad_norm": 0.10207951813936234, + "learning_rate": 0.002, + "loss": 2.351, + "step": 102180 + }, + { + "epoch": 0.3950379613737224, + "grad_norm": 0.11631299555301666, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 102190 + }, + { + "epoch": 0.39507661857710563, + "grad_norm": 0.10433251410722733, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 102200 + }, + { + "epoch": 0.39511527578048894, + "grad_norm": 0.10600210726261139, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 102210 + }, + { + "epoch": 0.3951539329838722, + "grad_norm": 0.09988721460103989, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 102220 + }, + { + "epoch": 0.3951925901872555, + "grad_norm": 0.10811429470777512, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 102230 + }, + { + "epoch": 0.39523124739063875, + "grad_norm": 0.10436130315065384, + "learning_rate": 0.002, + "loss": 2.368, + "step": 102240 + }, + { + "epoch": 0.39526990459402206, + "grad_norm": 0.10150985419750214, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 102250 + }, + { + "epoch": 0.3953085617974053, + "grad_norm": 0.10846556723117828, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 102260 + }, + { + "epoch": 0.3953472190007886, + "grad_norm": 0.10766957700252533, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 102270 + }, + { + "epoch": 0.39538587620417187, + "grad_norm": 0.12314067780971527, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 102280 + }, + { + "epoch": 0.3954245334075552, + "grad_norm": 0.1232890635728836, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 102290 + }, + { + "epoch": 0.39546319061093843, + "grad_norm": 0.11686611920595169, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 102300 + }, + { + "epoch": 0.39550184781432174, + "grad_norm": 0.093016117811203, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 102310 + }, + { + "epoch": 0.395540505017705, + "grad_norm": 0.11799125373363495, + "learning_rate": 0.002, + "loss": 2.349, + "step": 102320 + }, + { + "epoch": 0.3955791622210883, + "grad_norm": 0.10260337591171265, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 102330 + }, + { + "epoch": 0.39561781942447155, + "grad_norm": 0.12818340957164764, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 102340 + }, + { + "epoch": 0.39565647662785486, + "grad_norm": 0.09108906984329224, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 102350 + }, + { + "epoch": 0.3956951338312381, + "grad_norm": 0.10101552307605743, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 102360 + }, + { + "epoch": 0.3957337910346214, + "grad_norm": 0.1083957850933075, + "learning_rate": 0.002, + "loss": 2.342, + "step": 102370 + }, + { + "epoch": 0.39577244823800467, + "grad_norm": 0.10751020908355713, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 102380 + }, + { + "epoch": 0.3958111054413879, + "grad_norm": 0.12719103693962097, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 102390 + }, + { + "epoch": 0.39584976264477123, + "grad_norm": 0.09584097564220428, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 102400 + }, + { + "epoch": 0.3958884198481545, + "grad_norm": 0.11484284698963165, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 102410 + }, + { + "epoch": 0.3959270770515378, + "grad_norm": 0.10826011002063751, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 102420 + }, + { + "epoch": 0.39596573425492104, + "grad_norm": 0.10017026960849762, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 102430 + }, + { + "epoch": 0.39600439145830435, + "grad_norm": 0.10661051422357559, + "learning_rate": 0.002, + "loss": 2.334, + "step": 102440 + }, + { + "epoch": 0.3960430486616876, + "grad_norm": 0.10739060491323471, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 102450 + }, + { + "epoch": 0.3960817058650709, + "grad_norm": 0.1067962646484375, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 102460 + }, + { + "epoch": 0.39612036306845416, + "grad_norm": 0.11817949265241623, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 102470 + }, + { + "epoch": 0.39615902027183747, + "grad_norm": 0.11644504964351654, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 102480 + }, + { + "epoch": 0.3961976774752207, + "grad_norm": 0.11023860424757004, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 102490 + }, + { + "epoch": 0.39623633467860403, + "grad_norm": 0.3637651205062866, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 102500 + }, + { + "epoch": 0.3962749918819873, + "grad_norm": 0.0973176658153534, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 102510 + }, + { + "epoch": 0.3963136490853706, + "grad_norm": 0.10590552538633347, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 102520 + }, + { + "epoch": 0.39635230628875384, + "grad_norm": 0.10570419579744339, + "learning_rate": 0.002, + "loss": 2.355, + "step": 102530 + }, + { + "epoch": 0.39639096349213715, + "grad_norm": 0.09971023350954056, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 102540 + }, + { + "epoch": 0.3964296206955204, + "grad_norm": 0.10089641809463501, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 102550 + }, + { + "epoch": 0.3964682778989037, + "grad_norm": 0.12613315880298615, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 102560 + }, + { + "epoch": 0.39650693510228696, + "grad_norm": 0.10003890842199326, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 102570 + }, + { + "epoch": 0.3965455923056702, + "grad_norm": 0.12416119128465652, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 102580 + }, + { + "epoch": 0.3965842495090535, + "grad_norm": 0.10028557479381561, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 102590 + }, + { + "epoch": 0.3966229067124368, + "grad_norm": 0.11924396455287933, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 102600 + }, + { + "epoch": 0.3966615639158201, + "grad_norm": 0.09663531929254532, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 102610 + }, + { + "epoch": 0.39670022111920333, + "grad_norm": 0.09673736989498138, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 102620 + }, + { + "epoch": 0.39673887832258664, + "grad_norm": 0.10709283500909805, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 102630 + }, + { + "epoch": 0.3967775355259699, + "grad_norm": 0.104097880423069, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 102640 + }, + { + "epoch": 0.3968161927293532, + "grad_norm": 0.11647637188434601, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 102650 + }, + { + "epoch": 0.39685484993273645, + "grad_norm": 0.09082239121198654, + "learning_rate": 0.002, + "loss": 2.361, + "step": 102660 + }, + { + "epoch": 0.39689350713611976, + "grad_norm": 0.10260343551635742, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 102670 + }, + { + "epoch": 0.396932164339503, + "grad_norm": 0.09805385768413544, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 102680 + }, + { + "epoch": 0.3969708215428863, + "grad_norm": 0.11476589739322662, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 102690 + }, + { + "epoch": 0.3970094787462696, + "grad_norm": 0.13257639110088348, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 102700 + }, + { + "epoch": 0.3970481359496529, + "grad_norm": 0.12035766243934631, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 102710 + }, + { + "epoch": 0.39708679315303613, + "grad_norm": 0.09962354600429535, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 102720 + }, + { + "epoch": 0.39712545035641944, + "grad_norm": 0.11391833424568176, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 102730 + }, + { + "epoch": 0.3971641075598027, + "grad_norm": 0.09128336608409882, + "learning_rate": 0.002, + "loss": 2.333, + "step": 102740 + }, + { + "epoch": 0.39720276476318594, + "grad_norm": 0.10476367920637131, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 102750 + }, + { + "epoch": 0.39724142196656925, + "grad_norm": 0.11733968555927277, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 102760 + }, + { + "epoch": 0.3972800791699525, + "grad_norm": 0.0970330610871315, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 102770 + }, + { + "epoch": 0.3973187363733358, + "grad_norm": 0.10360578447580338, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 102780 + }, + { + "epoch": 0.39735739357671906, + "grad_norm": 0.11639165133237839, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 102790 + }, + { + "epoch": 0.3973960507801024, + "grad_norm": 0.1210801899433136, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 102800 + }, + { + "epoch": 0.3974347079834856, + "grad_norm": 0.09269449859857559, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 102810 + }, + { + "epoch": 0.39747336518686893, + "grad_norm": 0.14338408410549164, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 102820 + }, + { + "epoch": 0.3975120223902522, + "grad_norm": 0.09909947961568832, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 102830 + }, + { + "epoch": 0.3975506795936355, + "grad_norm": 0.1005893275141716, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 102840 + }, + { + "epoch": 0.39758933679701874, + "grad_norm": 0.11026202142238617, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 102850 + }, + { + "epoch": 0.39762799400040205, + "grad_norm": 0.12471762299537659, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 102860 + }, + { + "epoch": 0.3976666512037853, + "grad_norm": 0.10944908857345581, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 102870 + }, + { + "epoch": 0.3977053084071686, + "grad_norm": 0.10404592752456665, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 102880 + }, + { + "epoch": 0.39774396561055186, + "grad_norm": 0.12181949615478516, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 102890 + }, + { + "epoch": 0.39778262281393517, + "grad_norm": 0.10199355334043503, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 102900 + }, + { + "epoch": 0.3978212800173184, + "grad_norm": 0.10170946270227432, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 102910 + }, + { + "epoch": 0.39785993722070173, + "grad_norm": 0.12267200648784637, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 102920 + }, + { + "epoch": 0.397898594424085, + "grad_norm": 0.11017567664384842, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 102930 + }, + { + "epoch": 0.39793725162746824, + "grad_norm": 0.1563769429922104, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 102940 + }, + { + "epoch": 0.39797590883085154, + "grad_norm": 0.10438354313373566, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 102950 + }, + { + "epoch": 0.3980145660342348, + "grad_norm": 0.10634902864694595, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 102960 + }, + { + "epoch": 0.3980532232376181, + "grad_norm": 0.0988369733095169, + "learning_rate": 0.002, + "loss": 2.34, + "step": 102970 + }, + { + "epoch": 0.39809188044100136, + "grad_norm": 0.09615012258291245, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 102980 + }, + { + "epoch": 0.39813053764438466, + "grad_norm": 0.11945393681526184, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 102990 + }, + { + "epoch": 0.3981691948477679, + "grad_norm": 0.10153481364250183, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 103000 + }, + { + "epoch": 0.3982078520511512, + "grad_norm": 0.10002585500478745, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 103010 + }, + { + "epoch": 0.3982465092545345, + "grad_norm": 0.11555618792772293, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 103020 + }, + { + "epoch": 0.3982851664579178, + "grad_norm": 0.10872345417737961, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 103030 + }, + { + "epoch": 0.39832382366130104, + "grad_norm": 0.11528029292821884, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 103040 + }, + { + "epoch": 0.39836248086468434, + "grad_norm": 0.10003827512264252, + "learning_rate": 0.002, + "loss": 2.329, + "step": 103050 + }, + { + "epoch": 0.3984011380680676, + "grad_norm": 0.11331178992986679, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 103060 + }, + { + "epoch": 0.3984397952714509, + "grad_norm": 0.12022735923528671, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 103070 + }, + { + "epoch": 0.39847845247483415, + "grad_norm": 0.10411456227302551, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 103080 + }, + { + "epoch": 0.39851710967821746, + "grad_norm": 0.10312678664922714, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 103090 + }, + { + "epoch": 0.3985557668816007, + "grad_norm": 0.11242202669382095, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 103100 + }, + { + "epoch": 0.398594424084984, + "grad_norm": 0.10822474956512451, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 103110 + }, + { + "epoch": 0.3986330812883673, + "grad_norm": 0.10457213968038559, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 103120 + }, + { + "epoch": 0.3986717384917505, + "grad_norm": 0.11331282556056976, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 103130 + }, + { + "epoch": 0.39871039569513383, + "grad_norm": 0.10345172882080078, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 103140 + }, + { + "epoch": 0.3987490528985171, + "grad_norm": 0.09362413734197617, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 103150 + }, + { + "epoch": 0.3987877101019004, + "grad_norm": 0.09951414912939072, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 103160 + }, + { + "epoch": 0.39882636730528365, + "grad_norm": 0.10795580595731735, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 103170 + }, + { + "epoch": 0.39886502450866695, + "grad_norm": 0.10748409479856491, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 103180 + }, + { + "epoch": 0.3989036817120502, + "grad_norm": 0.10139096528291702, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 103190 + }, + { + "epoch": 0.3989423389154335, + "grad_norm": 0.10073640942573547, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 103200 + }, + { + "epoch": 0.39898099611881677, + "grad_norm": 0.09807011485099792, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 103210 + }, + { + "epoch": 0.3990196533222001, + "grad_norm": 0.10768158733844757, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 103220 + }, + { + "epoch": 0.3990583105255833, + "grad_norm": 0.10457302629947662, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 103230 + }, + { + "epoch": 0.39909696772896663, + "grad_norm": 0.12398704141378403, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 103240 + }, + { + "epoch": 0.3991356249323499, + "grad_norm": 0.0993892028927803, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 103250 + }, + { + "epoch": 0.3991742821357332, + "grad_norm": 0.10782822966575623, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 103260 + }, + { + "epoch": 0.39921293933911645, + "grad_norm": 0.10682012140750885, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 103270 + }, + { + "epoch": 0.39925159654249975, + "grad_norm": 0.10554670542478561, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 103280 + }, + { + "epoch": 0.399290253745883, + "grad_norm": 0.11510240286588669, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 103290 + }, + { + "epoch": 0.3993289109492663, + "grad_norm": 0.10856513679027557, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 103300 + }, + { + "epoch": 0.39936756815264957, + "grad_norm": 0.10347574204206467, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 103310 + }, + { + "epoch": 0.3994062253560328, + "grad_norm": 1.0058751106262207, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 103320 + }, + { + "epoch": 0.3994448825594161, + "grad_norm": 0.13041211664676666, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 103330 + }, + { + "epoch": 0.3994835397627994, + "grad_norm": 0.10741354525089264, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 103340 + }, + { + "epoch": 0.3995221969661827, + "grad_norm": 0.11092893034219742, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 103350 + }, + { + "epoch": 0.39956085416956594, + "grad_norm": 0.08905737102031708, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 103360 + }, + { + "epoch": 0.39959951137294925, + "grad_norm": 0.10552625358104706, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 103370 + }, + { + "epoch": 0.3996381685763325, + "grad_norm": 0.10402737557888031, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 103380 + }, + { + "epoch": 0.3996768257797158, + "grad_norm": 0.1106201559305191, + "learning_rate": 0.002, + "loss": 2.334, + "step": 103390 + }, + { + "epoch": 0.39971548298309906, + "grad_norm": 0.1093304455280304, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 103400 + }, + { + "epoch": 0.39975414018648237, + "grad_norm": 0.09958748519420624, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 103410 + }, + { + "epoch": 0.3997927973898656, + "grad_norm": 0.11973224580287933, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 103420 + }, + { + "epoch": 0.3998314545932489, + "grad_norm": 0.10258115828037262, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 103430 + }, + { + "epoch": 0.3998701117966322, + "grad_norm": 0.3839898705482483, + "learning_rate": 0.002, + "loss": 2.328, + "step": 103440 + }, + { + "epoch": 0.3999087690000155, + "grad_norm": 0.11322489380836487, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 103450 + }, + { + "epoch": 0.39994742620339874, + "grad_norm": 0.10540080815553665, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 103460 + }, + { + "epoch": 0.39998608340678204, + "grad_norm": 0.11972621828317642, + "learning_rate": 0.002, + "loss": 2.359, + "step": 103470 + }, + { + "epoch": 0.4000247406101653, + "grad_norm": 0.11651024967432022, + "learning_rate": 0.002, + "loss": 2.349, + "step": 103480 + }, + { + "epoch": 0.40006339781354855, + "grad_norm": 0.11100133508443832, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 103490 + }, + { + "epoch": 0.40010205501693186, + "grad_norm": 0.11071529984474182, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 103500 + }, + { + "epoch": 0.4001407122203151, + "grad_norm": 0.10063956677913666, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 103510 + }, + { + "epoch": 0.4001793694236984, + "grad_norm": 0.12929409742355347, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 103520 + }, + { + "epoch": 0.40021802662708167, + "grad_norm": 0.12283274531364441, + "learning_rate": 0.002, + "loss": 2.352, + "step": 103530 + }, + { + "epoch": 0.400256683830465, + "grad_norm": 0.09345348924398422, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 103540 + }, + { + "epoch": 0.40029534103384823, + "grad_norm": 0.1171153336763382, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 103550 + }, + { + "epoch": 0.40033399823723154, + "grad_norm": 0.10845030099153519, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 103560 + }, + { + "epoch": 0.4003726554406148, + "grad_norm": 0.1053931936621666, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 103570 + }, + { + "epoch": 0.4004113126439981, + "grad_norm": 0.13287438452243805, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 103580 + }, + { + "epoch": 0.40044996984738135, + "grad_norm": 0.10590993613004684, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 103590 + }, + { + "epoch": 0.40048862705076466, + "grad_norm": 0.10484614968299866, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 103600 + }, + { + "epoch": 0.4005272842541479, + "grad_norm": 0.09930567443370819, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 103610 + }, + { + "epoch": 0.4005659414575312, + "grad_norm": 0.1418037712574005, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 103620 + }, + { + "epoch": 0.40060459866091447, + "grad_norm": 0.11796095222234726, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 103630 + }, + { + "epoch": 0.4006432558642978, + "grad_norm": 0.10199625045061111, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 103640 + }, + { + "epoch": 0.40068191306768103, + "grad_norm": 0.11979342997074127, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 103650 + }, + { + "epoch": 0.40072057027106434, + "grad_norm": 0.1384526789188385, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 103660 + }, + { + "epoch": 0.4007592274744476, + "grad_norm": 0.09928222000598907, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 103670 + }, + { + "epoch": 0.40079788467783084, + "grad_norm": 0.10234517604112625, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 103680 + }, + { + "epoch": 0.40083654188121415, + "grad_norm": 0.10509918630123138, + "learning_rate": 0.002, + "loss": 2.342, + "step": 103690 + }, + { + "epoch": 0.4008751990845974, + "grad_norm": 0.12255855649709702, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 103700 + }, + { + "epoch": 0.4009138562879807, + "grad_norm": 0.10494012385606766, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 103710 + }, + { + "epoch": 0.40095251349136396, + "grad_norm": 0.09403003007173538, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 103720 + }, + { + "epoch": 0.40099117069474727, + "grad_norm": 0.1168377697467804, + "learning_rate": 0.002, + "loss": 2.356, + "step": 103730 + }, + { + "epoch": 0.4010298278981305, + "grad_norm": 0.10256733745336533, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 103740 + }, + { + "epoch": 0.4010684851015138, + "grad_norm": 0.11215299367904663, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 103750 + }, + { + "epoch": 0.4011071423048971, + "grad_norm": 0.10858602821826935, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 103760 + }, + { + "epoch": 0.4011457995082804, + "grad_norm": 0.10192175954580307, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 103770 + }, + { + "epoch": 0.40118445671166364, + "grad_norm": 0.10576779395341873, + "learning_rate": 0.002, + "loss": 2.352, + "step": 103780 + }, + { + "epoch": 0.40122311391504695, + "grad_norm": 0.12473911792039871, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 103790 + }, + { + "epoch": 0.4012617711184302, + "grad_norm": 0.10249465703964233, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 103800 + }, + { + "epoch": 0.4013004283218135, + "grad_norm": 0.11050129681825638, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 103810 + }, + { + "epoch": 0.40133908552519676, + "grad_norm": 0.11205238103866577, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 103820 + }, + { + "epoch": 0.40137774272858007, + "grad_norm": 0.12690870463848114, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 103830 + }, + { + "epoch": 0.4014163999319633, + "grad_norm": 0.10052044689655304, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 103840 + }, + { + "epoch": 0.4014550571353466, + "grad_norm": 0.10286065191030502, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 103850 + }, + { + "epoch": 0.4014937143387299, + "grad_norm": 0.11306209862232208, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 103860 + }, + { + "epoch": 0.40153237154211313, + "grad_norm": 0.10765783488750458, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 103870 + }, + { + "epoch": 0.40157102874549644, + "grad_norm": 0.0955042615532875, + "learning_rate": 0.002, + "loss": 2.328, + "step": 103880 + }, + { + "epoch": 0.4016096859488797, + "grad_norm": 0.10637594014406204, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 103890 + }, + { + "epoch": 0.401648343152263, + "grad_norm": 0.1104235053062439, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 103900 + }, + { + "epoch": 0.40168700035564625, + "grad_norm": 0.09370309114456177, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 103910 + }, + { + "epoch": 0.40172565755902956, + "grad_norm": 0.12469807267189026, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 103920 + }, + { + "epoch": 0.4017643147624128, + "grad_norm": 0.12554563581943512, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 103930 + }, + { + "epoch": 0.4018029719657961, + "grad_norm": 0.10871759802103043, + "learning_rate": 0.002, + "loss": 2.357, + "step": 103940 + }, + { + "epoch": 0.40184162916917937, + "grad_norm": 0.10306213796138763, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 103950 + }, + { + "epoch": 0.4018802863725627, + "grad_norm": 0.11437251418828964, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 103960 + }, + { + "epoch": 0.40191894357594593, + "grad_norm": 0.09903883934020996, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 103970 + }, + { + "epoch": 0.40195760077932924, + "grad_norm": 0.10756850987672806, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 103980 + }, + { + "epoch": 0.4019962579827125, + "grad_norm": 0.10248870402574539, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 103990 + }, + { + "epoch": 0.4020349151860958, + "grad_norm": 0.11387482285499573, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 104000 + }, + { + "epoch": 0.40207357238947905, + "grad_norm": 0.10296018421649933, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 104010 + }, + { + "epoch": 0.40211222959286236, + "grad_norm": 0.11493543535470963, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 104020 + }, + { + "epoch": 0.4021508867962456, + "grad_norm": 0.11589168012142181, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 104030 + }, + { + "epoch": 0.4021895439996289, + "grad_norm": 0.09854254126548767, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 104040 + }, + { + "epoch": 0.40222820120301217, + "grad_norm": 0.10222586244344711, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 104050 + }, + { + "epoch": 0.4022668584063954, + "grad_norm": 0.11131079494953156, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 104060 + }, + { + "epoch": 0.40230551560977873, + "grad_norm": 0.11473394930362701, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 104070 + }, + { + "epoch": 0.402344172813162, + "grad_norm": 0.10697763413190842, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 104080 + }, + { + "epoch": 0.4023828300165453, + "grad_norm": 0.12346579879522324, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 104090 + }, + { + "epoch": 0.40242148721992854, + "grad_norm": 0.11302345246076584, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 104100 + }, + { + "epoch": 0.40246014442331185, + "grad_norm": 0.09973450005054474, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 104110 + }, + { + "epoch": 0.4024988016266951, + "grad_norm": 0.10161063820123672, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 104120 + }, + { + "epoch": 0.4025374588300784, + "grad_norm": 0.10558301210403442, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 104130 + }, + { + "epoch": 0.40257611603346166, + "grad_norm": 0.10620249807834625, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 104140 + }, + { + "epoch": 0.40261477323684497, + "grad_norm": 0.10933729261159897, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 104150 + }, + { + "epoch": 0.4026534304402282, + "grad_norm": 0.11010383814573288, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 104160 + }, + { + "epoch": 0.40269208764361153, + "grad_norm": 0.10498680174350739, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 104170 + }, + { + "epoch": 0.4027307448469948, + "grad_norm": 0.1227358728647232, + "learning_rate": 0.002, + "loss": 2.353, + "step": 104180 + }, + { + "epoch": 0.4027694020503781, + "grad_norm": 0.1105983778834343, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 104190 + }, + { + "epoch": 0.40280805925376134, + "grad_norm": 0.12710198760032654, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 104200 + }, + { + "epoch": 0.40284671645714465, + "grad_norm": 0.10611477494239807, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 104210 + }, + { + "epoch": 0.4028853736605279, + "grad_norm": 0.10403914749622345, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 104220 + }, + { + "epoch": 0.4029240308639112, + "grad_norm": 0.10941457003355026, + "learning_rate": 0.002, + "loss": 2.344, + "step": 104230 + }, + { + "epoch": 0.40296268806729446, + "grad_norm": 0.10502897948026657, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 104240 + }, + { + "epoch": 0.4030013452706777, + "grad_norm": 0.10019665956497192, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 104250 + }, + { + "epoch": 0.403040002474061, + "grad_norm": 0.13944409787654877, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 104260 + }, + { + "epoch": 0.4030786596774443, + "grad_norm": 0.101788729429245, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 104270 + }, + { + "epoch": 0.4031173168808276, + "grad_norm": 0.10548502206802368, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 104280 + }, + { + "epoch": 0.40315597408421083, + "grad_norm": 0.11913346499204636, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 104290 + }, + { + "epoch": 0.40319463128759414, + "grad_norm": 0.10414118319749832, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 104300 + }, + { + "epoch": 0.4032332884909774, + "grad_norm": 0.10877209901809692, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 104310 + }, + { + "epoch": 0.4032719456943607, + "grad_norm": 0.10864819586277008, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 104320 + }, + { + "epoch": 0.40331060289774395, + "grad_norm": 0.09370414912700653, + "learning_rate": 0.002, + "loss": 2.356, + "step": 104330 + }, + { + "epoch": 0.40334926010112726, + "grad_norm": 0.0959504172205925, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 104340 + }, + { + "epoch": 0.4033879173045105, + "grad_norm": 0.11955036967992783, + "learning_rate": 0.002, + "loss": 2.357, + "step": 104350 + }, + { + "epoch": 0.4034265745078938, + "grad_norm": 0.10818187147378922, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 104360 + }, + { + "epoch": 0.4034652317112771, + "grad_norm": 0.10869024693965912, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 104370 + }, + { + "epoch": 0.4035038889146604, + "grad_norm": 0.11019674688577652, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 104380 + }, + { + "epoch": 0.40354254611804363, + "grad_norm": 0.1048496887087822, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 104390 + }, + { + "epoch": 0.40358120332142694, + "grad_norm": 0.10483365505933762, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 104400 + }, + { + "epoch": 0.4036198605248102, + "grad_norm": 0.10269634425640106, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 104410 + }, + { + "epoch": 0.40365851772819344, + "grad_norm": 0.10694144666194916, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 104420 + }, + { + "epoch": 0.40369717493157675, + "grad_norm": 0.09867972135543823, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 104430 + }, + { + "epoch": 0.40373583213496, + "grad_norm": 0.1175709143280983, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 104440 + }, + { + "epoch": 0.4037744893383433, + "grad_norm": 0.09954724460840225, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 104450 + }, + { + "epoch": 0.40381314654172656, + "grad_norm": 0.12176381796598434, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 104460 + }, + { + "epoch": 0.40385180374510987, + "grad_norm": 0.11036128550767899, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 104470 + }, + { + "epoch": 0.4038904609484931, + "grad_norm": 0.10182004421949387, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 104480 + }, + { + "epoch": 0.40392911815187643, + "grad_norm": 0.11092185229063034, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 104490 + }, + { + "epoch": 0.4039677753552597, + "grad_norm": 0.0999113991856575, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 104500 + }, + { + "epoch": 0.404006432558643, + "grad_norm": 0.10253775119781494, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 104510 + }, + { + "epoch": 0.40404508976202624, + "grad_norm": 0.10981032252311707, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 104520 + }, + { + "epoch": 0.40408374696540955, + "grad_norm": 0.09996292740106583, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 104530 + }, + { + "epoch": 0.4041224041687928, + "grad_norm": 0.13263949751853943, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 104540 + }, + { + "epoch": 0.4041610613721761, + "grad_norm": 0.11656218022108078, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 104550 + }, + { + "epoch": 0.40419971857555936, + "grad_norm": 0.11897653341293335, + "learning_rate": 0.002, + "loss": 2.342, + "step": 104560 + }, + { + "epoch": 0.40423837577894267, + "grad_norm": 0.10586295276880264, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 104570 + }, + { + "epoch": 0.4042770329823259, + "grad_norm": 0.10061287879943848, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 104580 + }, + { + "epoch": 0.40431569018570923, + "grad_norm": 0.1381102204322815, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 104590 + }, + { + "epoch": 0.4043543473890925, + "grad_norm": 0.11627264320850372, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 104600 + }, + { + "epoch": 0.40439300459247574, + "grad_norm": 0.10314036905765533, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 104610 + }, + { + "epoch": 0.40443166179585904, + "grad_norm": 0.10095860809087753, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 104620 + }, + { + "epoch": 0.4044703189992423, + "grad_norm": 0.1049952283501625, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 104630 + }, + { + "epoch": 0.4045089762026256, + "grad_norm": 0.11221372336149216, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 104640 + }, + { + "epoch": 0.40454763340600886, + "grad_norm": 0.10091055184602737, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 104650 + }, + { + "epoch": 0.40458629060939216, + "grad_norm": 0.09377772361040115, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 104660 + }, + { + "epoch": 0.4046249478127754, + "grad_norm": 0.09856727719306946, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 104670 + }, + { + "epoch": 0.4046636050161587, + "grad_norm": 0.11405321210622787, + "learning_rate": 0.002, + "loss": 2.357, + "step": 104680 + }, + { + "epoch": 0.404702262219542, + "grad_norm": 0.10105791687965393, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 104690 + }, + { + "epoch": 0.4047409194229253, + "grad_norm": 0.10604265332221985, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 104700 + }, + { + "epoch": 0.40477957662630853, + "grad_norm": 0.10855577141046524, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 104710 + }, + { + "epoch": 0.40481823382969184, + "grad_norm": 0.11611035466194153, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 104720 + }, + { + "epoch": 0.4048568910330751, + "grad_norm": 0.1073804497718811, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 104730 + }, + { + "epoch": 0.4048955482364584, + "grad_norm": 0.10479304194450378, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 104740 + }, + { + "epoch": 0.40493420543984165, + "grad_norm": 0.1819019615650177, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 104750 + }, + { + "epoch": 0.40497286264322496, + "grad_norm": 0.11257249116897583, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 104760 + }, + { + "epoch": 0.4050115198466082, + "grad_norm": 0.11527591943740845, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 104770 + }, + { + "epoch": 0.4050501770499915, + "grad_norm": 0.10990086197853088, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 104780 + }, + { + "epoch": 0.4050888342533748, + "grad_norm": 0.11660205572843552, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 104790 + }, + { + "epoch": 0.405127491456758, + "grad_norm": 0.08872044831514359, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 104800 + }, + { + "epoch": 0.40516614866014133, + "grad_norm": 0.12553000450134277, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 104810 + }, + { + "epoch": 0.4052048058635246, + "grad_norm": 0.09895485639572144, + "learning_rate": 0.002, + "loss": 2.348, + "step": 104820 + }, + { + "epoch": 0.4052434630669079, + "grad_norm": 0.11463747918605804, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 104830 + }, + { + "epoch": 0.40528212027029115, + "grad_norm": 0.09791713953018188, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 104840 + }, + { + "epoch": 0.40532077747367445, + "grad_norm": 0.13168174028396606, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 104850 + }, + { + "epoch": 0.4053594346770577, + "grad_norm": 0.10816147923469543, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 104860 + }, + { + "epoch": 0.405398091880441, + "grad_norm": 0.10885234922170639, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 104870 + }, + { + "epoch": 0.40543674908382427, + "grad_norm": 0.09620843082666397, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 104880 + }, + { + "epoch": 0.4054754062872076, + "grad_norm": 0.10717884451150894, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 104890 + }, + { + "epoch": 0.4055140634905908, + "grad_norm": 0.5260806679725647, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 104900 + }, + { + "epoch": 0.40555272069397413, + "grad_norm": 0.11013922840356827, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 104910 + }, + { + "epoch": 0.4055913778973574, + "grad_norm": 0.09905974566936493, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 104920 + }, + { + "epoch": 0.4056300351007407, + "grad_norm": 0.09774720668792725, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 104930 + }, + { + "epoch": 0.40566869230412395, + "grad_norm": 0.10519535094499588, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 104940 + }, + { + "epoch": 0.40570734950750725, + "grad_norm": 0.12374908477067947, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 104950 + }, + { + "epoch": 0.4057460067108905, + "grad_norm": 0.10747280716896057, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 104960 + }, + { + "epoch": 0.4057846639142738, + "grad_norm": 0.10043985396623611, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 104970 + }, + { + "epoch": 0.40582332111765707, + "grad_norm": 0.1304740309715271, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 104980 + }, + { + "epoch": 0.4058619783210403, + "grad_norm": 0.1062544658780098, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 104990 + }, + { + "epoch": 0.4059006355244236, + "grad_norm": 0.10171884298324585, + "learning_rate": 0.002, + "loss": 2.336, + "step": 105000 + }, + { + "epoch": 0.4059392927278069, + "grad_norm": 0.11481379717588425, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 105010 + }, + { + "epoch": 0.4059779499311902, + "grad_norm": 0.10549386590719223, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 105020 + }, + { + "epoch": 0.40601660713457344, + "grad_norm": 0.1036456972360611, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 105030 + }, + { + "epoch": 0.40605526433795675, + "grad_norm": 0.0944003090262413, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 105040 + }, + { + "epoch": 0.40609392154134, + "grad_norm": 0.11297862231731415, + "learning_rate": 0.002, + "loss": 2.352, + "step": 105050 + }, + { + "epoch": 0.4061325787447233, + "grad_norm": 0.11371901631355286, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 105060 + }, + { + "epoch": 0.40617123594810656, + "grad_norm": 0.1074351891875267, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 105070 + }, + { + "epoch": 0.40620989315148986, + "grad_norm": 0.1140996515750885, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 105080 + }, + { + "epoch": 0.4062485503548731, + "grad_norm": 0.10436806827783585, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 105090 + }, + { + "epoch": 0.4062872075582564, + "grad_norm": 0.10116159915924072, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 105100 + }, + { + "epoch": 0.4063258647616397, + "grad_norm": 0.11260448396205902, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 105110 + }, + { + "epoch": 0.406364521965023, + "grad_norm": 0.10626557469367981, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 105120 + }, + { + "epoch": 0.40640317916840624, + "grad_norm": 0.11319101601839066, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 105130 + }, + { + "epoch": 0.40644183637178954, + "grad_norm": 0.11202666908502579, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 105140 + }, + { + "epoch": 0.4064804935751728, + "grad_norm": 0.1096389889717102, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 105150 + }, + { + "epoch": 0.40651915077855605, + "grad_norm": 0.10176476836204529, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 105160 + }, + { + "epoch": 0.40655780798193936, + "grad_norm": 0.10365190356969833, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 105170 + }, + { + "epoch": 0.4065964651853226, + "grad_norm": 0.11331445723772049, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 105180 + }, + { + "epoch": 0.4066351223887059, + "grad_norm": 0.11339357495307922, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 105190 + }, + { + "epoch": 0.40667377959208917, + "grad_norm": 0.09283099323511124, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 105200 + }, + { + "epoch": 0.4067124367954725, + "grad_norm": 0.12029348313808441, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 105210 + }, + { + "epoch": 0.40675109399885573, + "grad_norm": 0.11738674342632294, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 105220 + }, + { + "epoch": 0.40678975120223904, + "grad_norm": 0.09112228453159332, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 105230 + }, + { + "epoch": 0.4068284084056223, + "grad_norm": 0.1028887927532196, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 105240 + }, + { + "epoch": 0.4068670656090056, + "grad_norm": 0.10166403651237488, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 105250 + }, + { + "epoch": 0.40690572281238885, + "grad_norm": 0.10761390626430511, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 105260 + }, + { + "epoch": 0.40694438001577216, + "grad_norm": 0.10484203696250916, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 105270 + }, + { + "epoch": 0.4069830372191554, + "grad_norm": 0.0940113440155983, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 105280 + }, + { + "epoch": 0.4070216944225387, + "grad_norm": 0.11318383365869522, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 105290 + }, + { + "epoch": 0.40706035162592197, + "grad_norm": 0.10878942161798477, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 105300 + }, + { + "epoch": 0.4070990088293053, + "grad_norm": 0.12262070178985596, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 105310 + }, + { + "epoch": 0.4071376660326885, + "grad_norm": 0.11862120032310486, + "learning_rate": 0.002, + "loss": 2.348, + "step": 105320 + }, + { + "epoch": 0.40717632323607184, + "grad_norm": 0.11528993397951126, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 105330 + }, + { + "epoch": 0.4072149804394551, + "grad_norm": 0.10077010840177536, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 105340 + }, + { + "epoch": 0.40725363764283834, + "grad_norm": 0.0977274626493454, + "learning_rate": 0.002, + "loss": 2.328, + "step": 105350 + }, + { + "epoch": 0.40729229484622165, + "grad_norm": 0.0942181721329689, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 105360 + }, + { + "epoch": 0.4073309520496049, + "grad_norm": 0.0979325994849205, + "learning_rate": 0.002, + "loss": 2.335, + "step": 105370 + }, + { + "epoch": 0.4073696092529882, + "grad_norm": 0.11181334406137466, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 105380 + }, + { + "epoch": 0.40740826645637146, + "grad_norm": 0.1126566082239151, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 105390 + }, + { + "epoch": 0.40744692365975477, + "grad_norm": 0.12782098352909088, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 105400 + }, + { + "epoch": 0.407485580863138, + "grad_norm": 0.10336902737617493, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 105410 + }, + { + "epoch": 0.4075242380665213, + "grad_norm": 0.09775971621274948, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 105420 + }, + { + "epoch": 0.4075628952699046, + "grad_norm": 0.10696987807750702, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 105430 + }, + { + "epoch": 0.4076015524732879, + "grad_norm": 0.12094870954751968, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 105440 + }, + { + "epoch": 0.40764020967667114, + "grad_norm": 0.1064288467168808, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 105450 + }, + { + "epoch": 0.40767886688005445, + "grad_norm": 0.10916435718536377, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 105460 + }, + { + "epoch": 0.4077175240834377, + "grad_norm": 0.10927937179803848, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 105470 + }, + { + "epoch": 0.407756181286821, + "grad_norm": 0.12412333488464355, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 105480 + }, + { + "epoch": 0.40779483849020426, + "grad_norm": 0.11743910610675812, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 105490 + }, + { + "epoch": 0.40783349569358757, + "grad_norm": 0.09887317568063736, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 105500 + }, + { + "epoch": 0.4078721528969708, + "grad_norm": 0.15866301953792572, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 105510 + }, + { + "epoch": 0.4079108101003541, + "grad_norm": 0.10922886431217194, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 105520 + }, + { + "epoch": 0.4079494673037374, + "grad_norm": 0.10216683894395828, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 105530 + }, + { + "epoch": 0.40798812450712063, + "grad_norm": 0.12087800353765488, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 105540 + }, + { + "epoch": 0.40802678171050394, + "grad_norm": 0.10817883163690567, + "learning_rate": 0.002, + "loss": 2.359, + "step": 105550 + }, + { + "epoch": 0.4080654389138872, + "grad_norm": 0.11017916351556778, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 105560 + }, + { + "epoch": 0.4081040961172705, + "grad_norm": 0.10396047681570053, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 105570 + }, + { + "epoch": 0.40814275332065375, + "grad_norm": 0.11955048143863678, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 105580 + }, + { + "epoch": 0.40818141052403706, + "grad_norm": 0.10369185358285904, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 105590 + }, + { + "epoch": 0.4082200677274203, + "grad_norm": 0.10445476323366165, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 105600 + }, + { + "epoch": 0.4082587249308036, + "grad_norm": 0.09951169043779373, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 105610 + }, + { + "epoch": 0.40829738213418687, + "grad_norm": 0.09170377254486084, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 105620 + }, + { + "epoch": 0.4083360393375702, + "grad_norm": 0.1086997240781784, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 105630 + }, + { + "epoch": 0.40837469654095343, + "grad_norm": 0.1046295091509819, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 105640 + }, + { + "epoch": 0.40841335374433674, + "grad_norm": 0.10843697935342789, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 105650 + }, + { + "epoch": 0.40845201094772, + "grad_norm": 0.09829798340797424, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 105660 + }, + { + "epoch": 0.4084906681511033, + "grad_norm": 0.11206449568271637, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 105670 + }, + { + "epoch": 0.40852932535448655, + "grad_norm": 0.10997811704874039, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 105680 + }, + { + "epoch": 0.40856798255786986, + "grad_norm": 0.10579683631658554, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 105690 + }, + { + "epoch": 0.4086066397612531, + "grad_norm": 0.10083601623773575, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 105700 + }, + { + "epoch": 0.4086452969646364, + "grad_norm": 0.09944254159927368, + "learning_rate": 0.002, + "loss": 2.3737, + "step": 105710 + }, + { + "epoch": 0.40868395416801967, + "grad_norm": 0.0935181975364685, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 105720 + }, + { + "epoch": 0.4087226113714029, + "grad_norm": 0.11061915010213852, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 105730 + }, + { + "epoch": 0.40876126857478623, + "grad_norm": 0.11106288433074951, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 105740 + }, + { + "epoch": 0.4087999257781695, + "grad_norm": 0.10588667541742325, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 105750 + }, + { + "epoch": 0.4088385829815528, + "grad_norm": 0.10737384855747223, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 105760 + }, + { + "epoch": 0.40887724018493604, + "grad_norm": 0.1374361664056778, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 105770 + }, + { + "epoch": 0.40891589738831935, + "grad_norm": 0.11525832861661911, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 105780 + }, + { + "epoch": 0.4089545545917026, + "grad_norm": 0.10367967188358307, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 105790 + }, + { + "epoch": 0.4089932117950859, + "grad_norm": 0.10809770226478577, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 105800 + }, + { + "epoch": 0.40903186899846916, + "grad_norm": 0.10791625082492828, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 105810 + }, + { + "epoch": 0.40907052620185247, + "grad_norm": 0.12326755374670029, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 105820 + }, + { + "epoch": 0.4091091834052357, + "grad_norm": 0.10126736760139465, + "learning_rate": 0.002, + "loss": 2.339, + "step": 105830 + }, + { + "epoch": 0.40914784060861903, + "grad_norm": 0.13051722943782806, + "learning_rate": 0.002, + "loss": 2.351, + "step": 105840 + }, + { + "epoch": 0.4091864978120023, + "grad_norm": 0.10089803487062454, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 105850 + }, + { + "epoch": 0.4092251550153856, + "grad_norm": 0.11042924225330353, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 105860 + }, + { + "epoch": 0.40926381221876884, + "grad_norm": 0.09863913804292679, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 105870 + }, + { + "epoch": 0.40930246942215215, + "grad_norm": 0.10707473754882812, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 105880 + }, + { + "epoch": 0.4093411266255354, + "grad_norm": 0.0968175083398819, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 105890 + }, + { + "epoch": 0.40937978382891865, + "grad_norm": 0.1127745509147644, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 105900 + }, + { + "epoch": 0.40941844103230196, + "grad_norm": 0.11764765530824661, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 105910 + }, + { + "epoch": 0.4094570982356852, + "grad_norm": 0.09807364642620087, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 105920 + }, + { + "epoch": 0.4094957554390685, + "grad_norm": 0.11089298129081726, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 105930 + }, + { + "epoch": 0.4095344126424518, + "grad_norm": 0.11690270900726318, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 105940 + }, + { + "epoch": 0.4095730698458351, + "grad_norm": 0.11261983960866928, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 105950 + }, + { + "epoch": 0.40961172704921833, + "grad_norm": 0.11782555282115936, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 105960 + }, + { + "epoch": 0.40965038425260164, + "grad_norm": 0.11236891895532608, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 105970 + }, + { + "epoch": 0.4096890414559849, + "grad_norm": 0.1086883395910263, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 105980 + }, + { + "epoch": 0.4097276986593682, + "grad_norm": 0.1255752593278885, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 105990 + }, + { + "epoch": 0.40976635586275145, + "grad_norm": 0.11962178349494934, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 106000 + }, + { + "epoch": 0.40980501306613476, + "grad_norm": 0.130793496966362, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 106010 + }, + { + "epoch": 0.409843670269518, + "grad_norm": 0.09094803780317307, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 106020 + }, + { + "epoch": 0.4098823274729013, + "grad_norm": 0.11750062555074692, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 106030 + }, + { + "epoch": 0.40992098467628457, + "grad_norm": 0.09984748065471649, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 106040 + }, + { + "epoch": 0.4099596418796679, + "grad_norm": 0.1055670976638794, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 106050 + }, + { + "epoch": 0.40999829908305113, + "grad_norm": 0.10549233108758926, + "learning_rate": 0.002, + "loss": 2.348, + "step": 106060 + }, + { + "epoch": 0.41003695628643444, + "grad_norm": 0.09887228906154633, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 106070 + }, + { + "epoch": 0.4100756134898177, + "grad_norm": 0.09559593349695206, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 106080 + }, + { + "epoch": 0.41011427069320094, + "grad_norm": 0.10596460849046707, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 106090 + }, + { + "epoch": 0.41015292789658425, + "grad_norm": 0.11016621440649033, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 106100 + }, + { + "epoch": 0.4101915850999675, + "grad_norm": 0.10938017815351486, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 106110 + }, + { + "epoch": 0.4102302423033508, + "grad_norm": 0.11935988813638687, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 106120 + }, + { + "epoch": 0.41026889950673406, + "grad_norm": 0.10754488408565521, + "learning_rate": 0.002, + "loss": 2.344, + "step": 106130 + }, + { + "epoch": 0.41030755671011737, + "grad_norm": 0.11438973993062973, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 106140 + }, + { + "epoch": 0.4103462139135006, + "grad_norm": 0.09833820909261703, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 106150 + }, + { + "epoch": 0.41038487111688393, + "grad_norm": 0.1259237676858902, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 106160 + }, + { + "epoch": 0.4104235283202672, + "grad_norm": 0.10356998443603516, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 106170 + }, + { + "epoch": 0.4104621855236505, + "grad_norm": 0.13003289699554443, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 106180 + }, + { + "epoch": 0.41050084272703374, + "grad_norm": 0.11426402628421783, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 106190 + }, + { + "epoch": 0.41053949993041705, + "grad_norm": 0.11478512734174728, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 106200 + }, + { + "epoch": 0.4105781571338003, + "grad_norm": 0.1121613010764122, + "learning_rate": 0.002, + "loss": 2.358, + "step": 106210 + }, + { + "epoch": 0.4106168143371836, + "grad_norm": 0.12235704064369202, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 106220 + }, + { + "epoch": 0.41065547154056686, + "grad_norm": 0.10301785916090012, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 106230 + }, + { + "epoch": 0.41069412874395017, + "grad_norm": 0.10899627953767776, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 106240 + }, + { + "epoch": 0.4107327859473334, + "grad_norm": 0.13639208674430847, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 106250 + }, + { + "epoch": 0.41077144315071673, + "grad_norm": 0.10303498804569244, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 106260 + }, + { + "epoch": 0.4108101003541, + "grad_norm": 0.09691531956195831, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 106270 + }, + { + "epoch": 0.41084875755748324, + "grad_norm": 0.11968454718589783, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 106280 + }, + { + "epoch": 0.41088741476086654, + "grad_norm": 0.11995863914489746, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 106290 + }, + { + "epoch": 0.4109260719642498, + "grad_norm": 0.10339125245809555, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 106300 + }, + { + "epoch": 0.4109647291676331, + "grad_norm": 0.11860562115907669, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 106310 + }, + { + "epoch": 0.41100338637101635, + "grad_norm": 0.11929059028625488, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 106320 + }, + { + "epoch": 0.41104204357439966, + "grad_norm": 0.11013022810220718, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 106330 + }, + { + "epoch": 0.4110807007777829, + "grad_norm": 0.09725578874349594, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 106340 + }, + { + "epoch": 0.4111193579811662, + "grad_norm": 0.10636448115110397, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 106350 + }, + { + "epoch": 0.4111580151845495, + "grad_norm": 0.11153136938810349, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 106360 + }, + { + "epoch": 0.4111966723879328, + "grad_norm": 0.10090118646621704, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 106370 + }, + { + "epoch": 0.41123532959131603, + "grad_norm": 0.112342469394207, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 106380 + }, + { + "epoch": 0.41127398679469934, + "grad_norm": 0.10965652018785477, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 106390 + }, + { + "epoch": 0.4113126439980826, + "grad_norm": 0.11831896007061005, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 106400 + }, + { + "epoch": 0.4113513012014659, + "grad_norm": 0.09602269530296326, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 106410 + }, + { + "epoch": 0.41138995840484915, + "grad_norm": 0.10719896852970123, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 106420 + }, + { + "epoch": 0.41142861560823246, + "grad_norm": 0.11348539590835571, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 106430 + }, + { + "epoch": 0.4114672728116157, + "grad_norm": 0.12239199876785278, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 106440 + }, + { + "epoch": 0.411505930014999, + "grad_norm": 0.12341707199811935, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 106450 + }, + { + "epoch": 0.4115445872183823, + "grad_norm": 0.10860970616340637, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 106460 + }, + { + "epoch": 0.4115832444217655, + "grad_norm": 0.1317441314458847, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 106470 + }, + { + "epoch": 0.41162190162514883, + "grad_norm": 0.11049704253673553, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 106480 + }, + { + "epoch": 0.4116605588285321, + "grad_norm": 0.12230638414621353, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 106490 + }, + { + "epoch": 0.4116992160319154, + "grad_norm": 0.09292108565568924, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 106500 + }, + { + "epoch": 0.41173787323529865, + "grad_norm": 0.11424490809440613, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 106510 + }, + { + "epoch": 0.41177653043868195, + "grad_norm": 0.1115012913942337, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 106520 + }, + { + "epoch": 0.4118151876420652, + "grad_norm": 0.09740482270717621, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 106530 + }, + { + "epoch": 0.4118538448454485, + "grad_norm": 0.09662970900535583, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 106540 + }, + { + "epoch": 0.41189250204883177, + "grad_norm": 0.12155226618051529, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 106550 + }, + { + "epoch": 0.4119311592522151, + "grad_norm": 0.1143067330121994, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 106560 + }, + { + "epoch": 0.4119698164555983, + "grad_norm": 0.12124653905630112, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 106570 + }, + { + "epoch": 0.41200847365898163, + "grad_norm": 0.10572107136249542, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 106580 + }, + { + "epoch": 0.4120471308623649, + "grad_norm": 0.12428019195795059, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 106590 + }, + { + "epoch": 0.4120857880657482, + "grad_norm": 0.10228022933006287, + "learning_rate": 0.002, + "loss": 2.347, + "step": 106600 + }, + { + "epoch": 0.41212444526913145, + "grad_norm": 0.11307167261838913, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 106610 + }, + { + "epoch": 0.41216310247251475, + "grad_norm": 0.10472637414932251, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 106620 + }, + { + "epoch": 0.412201759675898, + "grad_norm": 0.1070481687784195, + "learning_rate": 0.002, + "loss": 2.358, + "step": 106630 + }, + { + "epoch": 0.4122404168792813, + "grad_norm": 0.10893788933753967, + "learning_rate": 0.002, + "loss": 2.3681, + "step": 106640 + }, + { + "epoch": 0.41227907408266457, + "grad_norm": 0.09896333515644073, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 106650 + }, + { + "epoch": 0.4123177312860478, + "grad_norm": 0.10149840265512466, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 106660 + }, + { + "epoch": 0.4123563884894311, + "grad_norm": 0.10944268852472305, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 106670 + }, + { + "epoch": 0.4123950456928144, + "grad_norm": 0.10874912887811661, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 106680 + }, + { + "epoch": 0.4124337028961977, + "grad_norm": 0.12328887730836868, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 106690 + }, + { + "epoch": 0.41247236009958094, + "grad_norm": 0.10866201668977737, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 106700 + }, + { + "epoch": 0.41251101730296424, + "grad_norm": 0.10595963895320892, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 106710 + }, + { + "epoch": 0.4125496745063475, + "grad_norm": 0.12083906680345535, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 106720 + }, + { + "epoch": 0.4125883317097308, + "grad_norm": 0.10444987565279007, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 106730 + }, + { + "epoch": 0.41262698891311406, + "grad_norm": 0.11359119415283203, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 106740 + }, + { + "epoch": 0.41266564611649736, + "grad_norm": 0.10856324434280396, + "learning_rate": 0.002, + "loss": 2.342, + "step": 106750 + }, + { + "epoch": 0.4127043033198806, + "grad_norm": 0.09387435019016266, + "learning_rate": 0.002, + "loss": 2.356, + "step": 106760 + }, + { + "epoch": 0.4127429605232639, + "grad_norm": 0.12010136246681213, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 106770 + }, + { + "epoch": 0.4127816177266472, + "grad_norm": 0.11715513467788696, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 106780 + }, + { + "epoch": 0.4128202749300305, + "grad_norm": 0.10977223515510559, + "learning_rate": 0.002, + "loss": 2.355, + "step": 106790 + }, + { + "epoch": 0.41285893213341374, + "grad_norm": 0.10523109883069992, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 106800 + }, + { + "epoch": 0.41289758933679704, + "grad_norm": 0.09810391813516617, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 106810 + }, + { + "epoch": 0.4129362465401803, + "grad_norm": 0.1089128777384758, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 106820 + }, + { + "epoch": 0.41297490374356355, + "grad_norm": 0.09619535505771637, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 106830 + }, + { + "epoch": 0.41301356094694686, + "grad_norm": 0.10872559249401093, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 106840 + }, + { + "epoch": 0.4130522181503301, + "grad_norm": 0.11064718663692474, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 106850 + }, + { + "epoch": 0.4130908753537134, + "grad_norm": 0.0972963497042656, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 106860 + }, + { + "epoch": 0.41312953255709667, + "grad_norm": 0.1047365665435791, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 106870 + }, + { + "epoch": 0.41316818976048, + "grad_norm": 0.1001274511218071, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 106880 + }, + { + "epoch": 0.41320684696386323, + "grad_norm": 0.12085501104593277, + "learning_rate": 0.002, + "loss": 2.341, + "step": 106890 + }, + { + "epoch": 0.41324550416724654, + "grad_norm": 0.09879327565431595, + "learning_rate": 0.002, + "loss": 2.347, + "step": 106900 + }, + { + "epoch": 0.4132841613706298, + "grad_norm": 0.10587608814239502, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 106910 + }, + { + "epoch": 0.4133228185740131, + "grad_norm": 0.12716144323349, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 106920 + }, + { + "epoch": 0.41336147577739635, + "grad_norm": 0.11554614454507828, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 106930 + }, + { + "epoch": 0.41340013298077966, + "grad_norm": 0.09272408485412598, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 106940 + }, + { + "epoch": 0.4134387901841629, + "grad_norm": 0.11044779419898987, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 106950 + }, + { + "epoch": 0.4134774473875462, + "grad_norm": 0.1153455302119255, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 106960 + }, + { + "epoch": 0.41351610459092947, + "grad_norm": 0.10462018847465515, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 106970 + }, + { + "epoch": 0.4135547617943128, + "grad_norm": 0.10811108350753784, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 106980 + }, + { + "epoch": 0.413593418997696, + "grad_norm": 0.10190818458795547, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 106990 + }, + { + "epoch": 0.41363207620107934, + "grad_norm": 0.11271360516548157, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 107000 + }, + { + "epoch": 0.4136707334044626, + "grad_norm": 0.09467026591300964, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 107010 + }, + { + "epoch": 0.41370939060784584, + "grad_norm": 0.11904910951852798, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 107020 + }, + { + "epoch": 0.41374804781122915, + "grad_norm": 0.09495735913515091, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 107030 + }, + { + "epoch": 0.4137867050146124, + "grad_norm": 0.14995649456977844, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 107040 + }, + { + "epoch": 0.4138253622179957, + "grad_norm": 0.09876397252082825, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 107050 + }, + { + "epoch": 0.41386401942137896, + "grad_norm": 0.1055774912238121, + "learning_rate": 0.002, + "loss": 2.333, + "step": 107060 + }, + { + "epoch": 0.41390267662476227, + "grad_norm": 0.10995199531316757, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 107070 + }, + { + "epoch": 0.4139413338281455, + "grad_norm": 0.10983167588710785, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 107080 + }, + { + "epoch": 0.4139799910315288, + "grad_norm": 0.11265630275011063, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 107090 + }, + { + "epoch": 0.4140186482349121, + "grad_norm": 0.11353180557489395, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 107100 + }, + { + "epoch": 0.4140573054382954, + "grad_norm": 0.100088931620121, + "learning_rate": 0.002, + "loss": 2.338, + "step": 107110 + }, + { + "epoch": 0.41409596264167864, + "grad_norm": 0.09908263385295868, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 107120 + }, + { + "epoch": 0.41413461984506195, + "grad_norm": 0.10703715682029724, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 107130 + }, + { + "epoch": 0.4141732770484452, + "grad_norm": 0.1116807758808136, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 107140 + }, + { + "epoch": 0.4142119342518285, + "grad_norm": 0.1046561673283577, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 107150 + }, + { + "epoch": 0.41425059145521176, + "grad_norm": 0.09532133489847183, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 107160 + }, + { + "epoch": 0.41428924865859507, + "grad_norm": 0.10459303855895996, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 107170 + }, + { + "epoch": 0.4143279058619783, + "grad_norm": 0.12302389740943909, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 107180 + }, + { + "epoch": 0.4143665630653616, + "grad_norm": 0.11073164641857147, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 107190 + }, + { + "epoch": 0.4144052202687449, + "grad_norm": 0.1090584397315979, + "learning_rate": 0.002, + "loss": 2.354, + "step": 107200 + }, + { + "epoch": 0.41444387747212813, + "grad_norm": 0.1135617047548294, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 107210 + }, + { + "epoch": 0.41448253467551144, + "grad_norm": 0.10791900008916855, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 107220 + }, + { + "epoch": 0.4145211918788947, + "grad_norm": 0.10380267351865768, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 107230 + }, + { + "epoch": 0.414559849082278, + "grad_norm": 0.10777231305837631, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 107240 + }, + { + "epoch": 0.41459850628566125, + "grad_norm": 0.10638809949159622, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 107250 + }, + { + "epoch": 0.41463716348904456, + "grad_norm": 0.09734046459197998, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 107260 + }, + { + "epoch": 0.4146758206924278, + "grad_norm": 0.10681114345788956, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 107270 + }, + { + "epoch": 0.4147144778958111, + "grad_norm": 0.11719990521669388, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 107280 + }, + { + "epoch": 0.41475313509919437, + "grad_norm": 0.11085479706525803, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 107290 + }, + { + "epoch": 0.4147917923025777, + "grad_norm": 0.11180248111486435, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 107300 + }, + { + "epoch": 0.41483044950596093, + "grad_norm": 0.11796993762254715, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 107310 + }, + { + "epoch": 0.41486910670934424, + "grad_norm": 0.09738306701183319, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 107320 + }, + { + "epoch": 0.4149077639127275, + "grad_norm": 0.12929093837738037, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 107330 + }, + { + "epoch": 0.4149464211161108, + "grad_norm": 0.10308624804019928, + "learning_rate": 0.002, + "loss": 2.3718, + "step": 107340 + }, + { + "epoch": 0.41498507831949405, + "grad_norm": 0.11912292242050171, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 107350 + }, + { + "epoch": 0.41502373552287736, + "grad_norm": 0.09912280738353729, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 107360 + }, + { + "epoch": 0.4150623927262606, + "grad_norm": 0.10287779569625854, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 107370 + }, + { + "epoch": 0.4151010499296439, + "grad_norm": 0.1134791225194931, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 107380 + }, + { + "epoch": 0.41513970713302717, + "grad_norm": 0.12196041643619537, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 107390 + }, + { + "epoch": 0.4151783643364104, + "grad_norm": 0.10882119834423065, + "learning_rate": 0.002, + "loss": 2.332, + "step": 107400 + }, + { + "epoch": 0.41521702153979373, + "grad_norm": 0.11768088489770889, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 107410 + }, + { + "epoch": 0.415255678743177, + "grad_norm": 0.10867585241794586, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 107420 + }, + { + "epoch": 0.4152943359465603, + "grad_norm": 0.10891842097043991, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 107430 + }, + { + "epoch": 0.41533299314994354, + "grad_norm": 0.11367695778608322, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 107440 + }, + { + "epoch": 0.41537165035332685, + "grad_norm": 0.11949791759252548, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 107450 + }, + { + "epoch": 0.4154103075567101, + "grad_norm": 0.10001518577337265, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 107460 + }, + { + "epoch": 0.4154489647600934, + "grad_norm": 0.0977843701839447, + "learning_rate": 0.002, + "loss": 2.358, + "step": 107470 + }, + { + "epoch": 0.41548762196347666, + "grad_norm": 0.12059198319911957, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 107480 + }, + { + "epoch": 0.41552627916685997, + "grad_norm": 0.11329180002212524, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 107490 + }, + { + "epoch": 0.4155649363702432, + "grad_norm": 0.11127132177352905, + "learning_rate": 0.002, + "loss": 2.355, + "step": 107500 + }, + { + "epoch": 0.41560359357362653, + "grad_norm": 0.11088550090789795, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 107510 + }, + { + "epoch": 0.4156422507770098, + "grad_norm": 0.10160195827484131, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 107520 + }, + { + "epoch": 0.4156809079803931, + "grad_norm": 0.10547743737697601, + "learning_rate": 0.002, + "loss": 2.351, + "step": 107530 + }, + { + "epoch": 0.41571956518377634, + "grad_norm": 0.09817437827587128, + "learning_rate": 0.002, + "loss": 2.343, + "step": 107540 + }, + { + "epoch": 0.41575822238715965, + "grad_norm": 0.10416698455810547, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 107550 + }, + { + "epoch": 0.4157968795905429, + "grad_norm": 0.11405066400766373, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 107560 + }, + { + "epoch": 0.41583553679392615, + "grad_norm": 0.10036198049783707, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 107570 + }, + { + "epoch": 0.41587419399730946, + "grad_norm": 0.09698426723480225, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 107580 + }, + { + "epoch": 0.4159128512006927, + "grad_norm": 0.1098899245262146, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 107590 + }, + { + "epoch": 0.415951508404076, + "grad_norm": 0.10067665576934814, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 107600 + }, + { + "epoch": 0.4159901656074593, + "grad_norm": 0.12178201973438263, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 107610 + }, + { + "epoch": 0.4160288228108426, + "grad_norm": 0.10082918405532837, + "learning_rate": 0.002, + "loss": 2.357, + "step": 107620 + }, + { + "epoch": 0.41606748001422583, + "grad_norm": 0.10588428378105164, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 107630 + }, + { + "epoch": 0.41610613721760914, + "grad_norm": 0.11798638105392456, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 107640 + }, + { + "epoch": 0.4161447944209924, + "grad_norm": 0.10255949944257736, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 107650 + }, + { + "epoch": 0.4161834516243757, + "grad_norm": 0.11976686865091324, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 107660 + }, + { + "epoch": 0.41622210882775895, + "grad_norm": 0.09483418613672256, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 107670 + }, + { + "epoch": 0.41626076603114226, + "grad_norm": 0.09426402300596237, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 107680 + }, + { + "epoch": 0.4162994232345255, + "grad_norm": 0.09577522426843643, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 107690 + }, + { + "epoch": 0.4163380804379088, + "grad_norm": 0.09289302676916122, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 107700 + }, + { + "epoch": 0.41637673764129207, + "grad_norm": 0.11215560138225555, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 107710 + }, + { + "epoch": 0.4164153948446754, + "grad_norm": 0.10089147090911865, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 107720 + }, + { + "epoch": 0.41645405204805863, + "grad_norm": 0.10375631600618362, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 107730 + }, + { + "epoch": 0.41649270925144194, + "grad_norm": 0.1042468249797821, + "learning_rate": 0.002, + "loss": 2.343, + "step": 107740 + }, + { + "epoch": 0.4165313664548252, + "grad_norm": 0.1084354892373085, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 107750 + }, + { + "epoch": 0.41657002365820844, + "grad_norm": 0.10817048698663712, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 107760 + }, + { + "epoch": 0.41660868086159175, + "grad_norm": 0.09909099340438843, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 107770 + }, + { + "epoch": 0.416647338064975, + "grad_norm": 0.09349465370178223, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 107780 + }, + { + "epoch": 0.4166859952683583, + "grad_norm": 0.12862420082092285, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 107790 + }, + { + "epoch": 0.41672465247174156, + "grad_norm": 0.10976967960596085, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 107800 + }, + { + "epoch": 0.41676330967512487, + "grad_norm": 0.17169824242591858, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 107810 + }, + { + "epoch": 0.4168019668785081, + "grad_norm": 0.09951794892549515, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 107820 + }, + { + "epoch": 0.41684062408189143, + "grad_norm": 0.11703871935606003, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 107830 + }, + { + "epoch": 0.4168792812852747, + "grad_norm": 0.10398931801319122, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 107840 + }, + { + "epoch": 0.416917938488658, + "grad_norm": 0.09673362970352173, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 107850 + }, + { + "epoch": 0.41695659569204124, + "grad_norm": 0.11419400572776794, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 107860 + }, + { + "epoch": 0.41699525289542455, + "grad_norm": 0.126008540391922, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 107870 + }, + { + "epoch": 0.4170339100988078, + "grad_norm": 0.11196989566087723, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 107880 + }, + { + "epoch": 0.4170725673021911, + "grad_norm": 0.10262865573167801, + "learning_rate": 0.002, + "loss": 2.3723, + "step": 107890 + }, + { + "epoch": 0.41711122450557436, + "grad_norm": 0.10403737425804138, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 107900 + }, + { + "epoch": 0.41714988170895767, + "grad_norm": 0.09810178726911545, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 107910 + }, + { + "epoch": 0.4171885389123409, + "grad_norm": 0.101883165538311, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 107920 + }, + { + "epoch": 0.41722719611572423, + "grad_norm": 0.12212926894426346, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 107930 + }, + { + "epoch": 0.4172658533191075, + "grad_norm": 0.11113600432872772, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 107940 + }, + { + "epoch": 0.41730451052249073, + "grad_norm": 0.10756055265665054, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 107950 + }, + { + "epoch": 0.41734316772587404, + "grad_norm": 0.10574343800544739, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 107960 + }, + { + "epoch": 0.4173818249292573, + "grad_norm": 0.10672122985124588, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 107970 + }, + { + "epoch": 0.4174204821326406, + "grad_norm": 0.11252713203430176, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 107980 + }, + { + "epoch": 0.41745913933602385, + "grad_norm": 0.1140674352645874, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 107990 + }, + { + "epoch": 0.41749779653940716, + "grad_norm": 0.13871759176254272, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 108000 + }, + { + "epoch": 0.4175364537427904, + "grad_norm": 0.10054554790258408, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 108010 + }, + { + "epoch": 0.4175751109461737, + "grad_norm": 0.10804137587547302, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 108020 + }, + { + "epoch": 0.417613768149557, + "grad_norm": 0.1006598025560379, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 108030 + }, + { + "epoch": 0.4176524253529403, + "grad_norm": 0.14845651388168335, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 108040 + }, + { + "epoch": 0.41769108255632353, + "grad_norm": 0.10627973079681396, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 108050 + }, + { + "epoch": 0.41772973975970684, + "grad_norm": 0.11973363161087036, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 108060 + }, + { + "epoch": 0.4177683969630901, + "grad_norm": 0.12248005717992783, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 108070 + }, + { + "epoch": 0.4178070541664734, + "grad_norm": 0.09348124265670776, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 108080 + }, + { + "epoch": 0.41784571136985665, + "grad_norm": 0.10690181702375412, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 108090 + }, + { + "epoch": 0.41788436857323996, + "grad_norm": 0.10468243807554245, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 108100 + }, + { + "epoch": 0.4179230257766232, + "grad_norm": 0.1052987203001976, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 108110 + }, + { + "epoch": 0.4179616829800065, + "grad_norm": 0.1134672611951828, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 108120 + }, + { + "epoch": 0.4180003401833898, + "grad_norm": 0.10740668326616287, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 108130 + }, + { + "epoch": 0.418038997386773, + "grad_norm": 0.12365444004535675, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 108140 + }, + { + "epoch": 0.41807765459015633, + "grad_norm": 0.10862985253334045, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 108150 + }, + { + "epoch": 0.4181163117935396, + "grad_norm": 0.10410787910223007, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 108160 + }, + { + "epoch": 0.4181549689969229, + "grad_norm": 0.09590797871351242, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 108170 + }, + { + "epoch": 0.41819362620030615, + "grad_norm": 0.11057563126087189, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 108180 + }, + { + "epoch": 0.41823228340368945, + "grad_norm": 0.09386027604341507, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 108190 + }, + { + "epoch": 0.4182709406070727, + "grad_norm": 0.11849407851696014, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 108200 + }, + { + "epoch": 0.418309597810456, + "grad_norm": 0.10359001904726028, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 108210 + }, + { + "epoch": 0.41834825501383927, + "grad_norm": 0.10447119176387787, + "learning_rate": 0.002, + "loss": 2.375, + "step": 108220 + }, + { + "epoch": 0.4183869122172226, + "grad_norm": 0.10291597247123718, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 108230 + }, + { + "epoch": 0.4184255694206058, + "grad_norm": 0.12071507424116135, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 108240 + }, + { + "epoch": 0.41846422662398913, + "grad_norm": 0.12983140349388123, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 108250 + }, + { + "epoch": 0.4185028838273724, + "grad_norm": 0.0902920514345169, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 108260 + }, + { + "epoch": 0.4185415410307557, + "grad_norm": 0.107839435338974, + "learning_rate": 0.002, + "loss": 2.363, + "step": 108270 + }, + { + "epoch": 0.41858019823413894, + "grad_norm": 0.11176130920648575, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 108280 + }, + { + "epoch": 0.41861885543752225, + "grad_norm": 0.11270838975906372, + "learning_rate": 0.002, + "loss": 2.341, + "step": 108290 + }, + { + "epoch": 0.4186575126409055, + "grad_norm": 0.10063161700963974, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 108300 + }, + { + "epoch": 0.4186961698442888, + "grad_norm": 0.10945228487253189, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 108310 + }, + { + "epoch": 0.41873482704767206, + "grad_norm": 0.11797590553760529, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 108320 + }, + { + "epoch": 0.4187734842510553, + "grad_norm": 0.1306760460138321, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 108330 + }, + { + "epoch": 0.4188121414544386, + "grad_norm": 0.11011867225170135, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 108340 + }, + { + "epoch": 0.4188507986578219, + "grad_norm": 0.10377843677997589, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 108350 + }, + { + "epoch": 0.4188894558612052, + "grad_norm": 0.12875615060329437, + "learning_rate": 0.002, + "loss": 2.355, + "step": 108360 + }, + { + "epoch": 0.41892811306458844, + "grad_norm": 0.10549406707286835, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 108370 + }, + { + "epoch": 0.41896677026797174, + "grad_norm": 0.12035589665174484, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 108380 + }, + { + "epoch": 0.419005427471355, + "grad_norm": 0.11725050956010818, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 108390 + }, + { + "epoch": 0.4190440846747383, + "grad_norm": 0.09805501252412796, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 108400 + }, + { + "epoch": 0.41908274187812156, + "grad_norm": 0.11357685923576355, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 108410 + }, + { + "epoch": 0.41912139908150486, + "grad_norm": 0.13415689766407013, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 108420 + }, + { + "epoch": 0.4191600562848881, + "grad_norm": 0.10046670585870743, + "learning_rate": 0.002, + "loss": 2.343, + "step": 108430 + }, + { + "epoch": 0.4191987134882714, + "grad_norm": 0.11026618629693985, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 108440 + }, + { + "epoch": 0.4192373706916547, + "grad_norm": 0.10505351424217224, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 108450 + }, + { + "epoch": 0.419276027895038, + "grad_norm": 0.114077128469944, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 108460 + }, + { + "epoch": 0.41931468509842124, + "grad_norm": 0.11202042549848557, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 108470 + }, + { + "epoch": 0.41935334230180454, + "grad_norm": 0.11489993333816528, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 108480 + }, + { + "epoch": 0.4193919995051878, + "grad_norm": 0.11202628910541534, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 108490 + }, + { + "epoch": 0.41943065670857105, + "grad_norm": 0.10482347756624222, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 108500 + }, + { + "epoch": 0.41946931391195436, + "grad_norm": 0.10478508472442627, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 108510 + }, + { + "epoch": 0.4195079711153376, + "grad_norm": 0.10179168730974197, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 108520 + }, + { + "epoch": 0.4195466283187209, + "grad_norm": 0.10658016055822372, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 108530 + }, + { + "epoch": 0.41958528552210417, + "grad_norm": 0.12446645647287369, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 108540 + }, + { + "epoch": 0.4196239427254875, + "grad_norm": 0.11415350437164307, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 108550 + }, + { + "epoch": 0.4196625999288707, + "grad_norm": 0.11967852711677551, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 108560 + }, + { + "epoch": 0.41970125713225404, + "grad_norm": 0.11305100470781326, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 108570 + }, + { + "epoch": 0.4197399143356373, + "grad_norm": 0.09932440519332886, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 108580 + }, + { + "epoch": 0.4197785715390206, + "grad_norm": 0.10772234946489334, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 108590 + }, + { + "epoch": 0.41981722874240385, + "grad_norm": 0.1011858731508255, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 108600 + }, + { + "epoch": 0.41985588594578716, + "grad_norm": 0.10514667630195618, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 108610 + }, + { + "epoch": 0.4198945431491704, + "grad_norm": 0.0941380187869072, + "learning_rate": 0.002, + "loss": 2.35, + "step": 108620 + }, + { + "epoch": 0.4199332003525537, + "grad_norm": 0.10625889152288437, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 108630 + }, + { + "epoch": 0.41997185755593697, + "grad_norm": 0.12152034789323807, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 108640 + }, + { + "epoch": 0.4200105147593203, + "grad_norm": 0.11208917945623398, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 108650 + }, + { + "epoch": 0.4200491719627035, + "grad_norm": 0.10172926634550095, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 108660 + }, + { + "epoch": 0.42008782916608683, + "grad_norm": 0.10808614641427994, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 108670 + }, + { + "epoch": 0.4201264863694701, + "grad_norm": 0.10961339622735977, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 108680 + }, + { + "epoch": 0.42016514357285334, + "grad_norm": 0.09312523901462555, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 108690 + }, + { + "epoch": 0.42020380077623665, + "grad_norm": 0.09796839952468872, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 108700 + }, + { + "epoch": 0.4202424579796199, + "grad_norm": 0.11329052597284317, + "learning_rate": 0.002, + "loss": 2.356, + "step": 108710 + }, + { + "epoch": 0.4202811151830032, + "grad_norm": 0.11482075601816177, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 108720 + }, + { + "epoch": 0.42031977238638646, + "grad_norm": 0.11668254435062408, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 108730 + }, + { + "epoch": 0.42035842958976977, + "grad_norm": 0.0998915284872055, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 108740 + }, + { + "epoch": 0.420397086793153, + "grad_norm": 0.11637427657842636, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 108750 + }, + { + "epoch": 0.4204357439965363, + "grad_norm": 0.10949467122554779, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 108760 + }, + { + "epoch": 0.4204744011999196, + "grad_norm": 0.10018786042928696, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 108770 + }, + { + "epoch": 0.4205130584033029, + "grad_norm": 0.12903913855552673, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 108780 + }, + { + "epoch": 0.42055171560668614, + "grad_norm": 0.0959692895412445, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 108790 + }, + { + "epoch": 0.42059037281006945, + "grad_norm": 0.11546919494867325, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 108800 + }, + { + "epoch": 0.4206290300134527, + "grad_norm": 0.10684224218130112, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 108810 + }, + { + "epoch": 0.420667687216836, + "grad_norm": 0.10135342925786972, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 108820 + }, + { + "epoch": 0.42070634442021926, + "grad_norm": 0.09952717274427414, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 108830 + }, + { + "epoch": 0.42074500162360257, + "grad_norm": 0.11792684346437454, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 108840 + }, + { + "epoch": 0.4207836588269858, + "grad_norm": 0.108956478536129, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 108850 + }, + { + "epoch": 0.4208223160303691, + "grad_norm": 0.1198933944106102, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 108860 + }, + { + "epoch": 0.4208609732337524, + "grad_norm": 0.09738875180482864, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 108870 + }, + { + "epoch": 0.42089963043713563, + "grad_norm": 0.10686634480953217, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 108880 + }, + { + "epoch": 0.42093828764051894, + "grad_norm": 0.09045381098985672, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 108890 + }, + { + "epoch": 0.4209769448439022, + "grad_norm": 0.0957198441028595, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 108900 + }, + { + "epoch": 0.4210156020472855, + "grad_norm": 0.10955697298049927, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 108910 + }, + { + "epoch": 0.42105425925066875, + "grad_norm": 0.10267753154039383, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 108920 + }, + { + "epoch": 0.42109291645405206, + "grad_norm": 0.09929932653903961, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 108930 + }, + { + "epoch": 0.4211315736574353, + "grad_norm": 0.10407166928052902, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 108940 + }, + { + "epoch": 0.4211702308608186, + "grad_norm": 0.11709648370742798, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 108950 + }, + { + "epoch": 0.42120888806420187, + "grad_norm": 0.09369426965713501, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 108960 + }, + { + "epoch": 0.4212475452675852, + "grad_norm": 0.1129760593175888, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 108970 + }, + { + "epoch": 0.42128620247096843, + "grad_norm": 0.10691390931606293, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 108980 + }, + { + "epoch": 0.42132485967435174, + "grad_norm": 0.11483962833881378, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 108990 + }, + { + "epoch": 0.421363516877735, + "grad_norm": 0.11706017702817917, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 109000 + }, + { + "epoch": 0.4214021740811183, + "grad_norm": 0.09031584113836288, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 109010 + }, + { + "epoch": 0.42144083128450155, + "grad_norm": 0.1046423614025116, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 109020 + }, + { + "epoch": 0.42147948848788486, + "grad_norm": 0.10996993631124496, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 109030 + }, + { + "epoch": 0.4215181456912681, + "grad_norm": 0.10464781522750854, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 109040 + }, + { + "epoch": 0.4215568028946514, + "grad_norm": 0.10103486478328705, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 109050 + }, + { + "epoch": 0.42159546009803467, + "grad_norm": 0.11956936120986938, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 109060 + }, + { + "epoch": 0.4216341173014179, + "grad_norm": 0.10601706057786942, + "learning_rate": 0.002, + "loss": 2.369, + "step": 109070 + }, + { + "epoch": 0.42167277450480123, + "grad_norm": 0.11325680464506149, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 109080 + }, + { + "epoch": 0.4217114317081845, + "grad_norm": 0.11050796508789062, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 109090 + }, + { + "epoch": 0.4217500889115678, + "grad_norm": 0.11937505006790161, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 109100 + }, + { + "epoch": 0.42178874611495104, + "grad_norm": 0.10740794986486435, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 109110 + }, + { + "epoch": 0.42182740331833435, + "grad_norm": 0.10873711109161377, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 109120 + }, + { + "epoch": 0.4218660605217176, + "grad_norm": 0.09789435565471649, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 109130 + }, + { + "epoch": 0.4219047177251009, + "grad_norm": 0.12149273604154587, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 109140 + }, + { + "epoch": 0.42194337492848416, + "grad_norm": 0.12185237556695938, + "learning_rate": 0.002, + "loss": 2.349, + "step": 109150 + }, + { + "epoch": 0.42198203213186747, + "grad_norm": 0.10570470988750458, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 109160 + }, + { + "epoch": 0.4220206893352507, + "grad_norm": 0.11031711846590042, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 109170 + }, + { + "epoch": 0.42205934653863403, + "grad_norm": 0.1117752268910408, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 109180 + }, + { + "epoch": 0.4220980037420173, + "grad_norm": 0.10328806936740875, + "learning_rate": 0.002, + "loss": 2.3696, + "step": 109190 + }, + { + "epoch": 0.4221366609454006, + "grad_norm": 0.11777733266353607, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 109200 + }, + { + "epoch": 0.42217531814878384, + "grad_norm": 0.10123222321271896, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 109210 + }, + { + "epoch": 0.42221397535216715, + "grad_norm": 0.09601963311433792, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 109220 + }, + { + "epoch": 0.4222526325555504, + "grad_norm": 0.10355490446090698, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 109230 + }, + { + "epoch": 0.42229128975893365, + "grad_norm": 0.12778176367282867, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 109240 + }, + { + "epoch": 0.42232994696231696, + "grad_norm": 0.1110726073384285, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 109250 + }, + { + "epoch": 0.4223686041657002, + "grad_norm": 0.10866408050060272, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 109260 + }, + { + "epoch": 0.4224072613690835, + "grad_norm": 0.10042106360197067, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 109270 + }, + { + "epoch": 0.42244591857246677, + "grad_norm": 0.11243009567260742, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 109280 + }, + { + "epoch": 0.4224845757758501, + "grad_norm": 0.10251044481992722, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 109290 + }, + { + "epoch": 0.42252323297923333, + "grad_norm": 0.0979798436164856, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 109300 + }, + { + "epoch": 0.42256189018261664, + "grad_norm": 0.1106082871556282, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 109310 + }, + { + "epoch": 0.4226005473859999, + "grad_norm": 0.11181111633777618, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 109320 + }, + { + "epoch": 0.4226392045893832, + "grad_norm": 0.09771327674388885, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 109330 + }, + { + "epoch": 0.42267786179276645, + "grad_norm": 0.11357173323631287, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 109340 + }, + { + "epoch": 0.42271651899614976, + "grad_norm": 0.10991322249174118, + "learning_rate": 0.002, + "loss": 2.354, + "step": 109350 + }, + { + "epoch": 0.422755176199533, + "grad_norm": 0.09731178730726242, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 109360 + }, + { + "epoch": 0.4227938334029163, + "grad_norm": 0.09690798819065094, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 109370 + }, + { + "epoch": 0.42283249060629957, + "grad_norm": 0.09169499576091766, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 109380 + }, + { + "epoch": 0.4228711478096829, + "grad_norm": 0.11018433421850204, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 109390 + }, + { + "epoch": 0.42290980501306613, + "grad_norm": 0.10204283148050308, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 109400 + }, + { + "epoch": 0.42294846221644944, + "grad_norm": 0.12003272771835327, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 109410 + }, + { + "epoch": 0.4229871194198327, + "grad_norm": 0.10572025179862976, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 109420 + }, + { + "epoch": 0.42302577662321594, + "grad_norm": 0.11625831574201584, + "learning_rate": 0.002, + "loss": 2.346, + "step": 109430 + }, + { + "epoch": 0.42306443382659925, + "grad_norm": 0.11393840610980988, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 109440 + }, + { + "epoch": 0.4231030910299825, + "grad_norm": 0.10184957087039948, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 109450 + }, + { + "epoch": 0.4231417482333658, + "grad_norm": 0.11889674514532089, + "learning_rate": 0.002, + "loss": 2.343, + "step": 109460 + }, + { + "epoch": 0.42318040543674906, + "grad_norm": 0.09927832335233688, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 109470 + }, + { + "epoch": 0.42321906264013237, + "grad_norm": 0.09848331660032272, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 109480 + }, + { + "epoch": 0.4232577198435156, + "grad_norm": 0.1316344141960144, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 109490 + }, + { + "epoch": 0.42329637704689893, + "grad_norm": 0.09667333960533142, + "learning_rate": 0.002, + "loss": 2.354, + "step": 109500 + }, + { + "epoch": 0.4233350342502822, + "grad_norm": 0.11364229023456573, + "learning_rate": 0.002, + "loss": 2.336, + "step": 109510 + }, + { + "epoch": 0.4233736914536655, + "grad_norm": 0.09983102232217789, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 109520 + }, + { + "epoch": 0.42341234865704874, + "grad_norm": 0.10466358810663223, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 109530 + }, + { + "epoch": 0.42345100586043205, + "grad_norm": 0.17259664833545685, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 109540 + }, + { + "epoch": 0.4234896630638153, + "grad_norm": 0.11260571330785751, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 109550 + }, + { + "epoch": 0.4235283202671986, + "grad_norm": 0.11814837157726288, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 109560 + }, + { + "epoch": 0.42356697747058186, + "grad_norm": 0.11993677169084549, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 109570 + }, + { + "epoch": 0.42360563467396517, + "grad_norm": 0.12143225222826004, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 109580 + }, + { + "epoch": 0.4236442918773484, + "grad_norm": 0.10307957977056503, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 109590 + }, + { + "epoch": 0.42368294908073173, + "grad_norm": 0.09886326640844345, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 109600 + }, + { + "epoch": 0.423721606284115, + "grad_norm": 0.10816150903701782, + "learning_rate": 0.002, + "loss": 2.342, + "step": 109610 + }, + { + "epoch": 0.42376026348749823, + "grad_norm": 0.09745252877473831, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 109620 + }, + { + "epoch": 0.42379892069088154, + "grad_norm": 0.11790237575769424, + "learning_rate": 0.002, + "loss": 2.344, + "step": 109630 + }, + { + "epoch": 0.4238375778942648, + "grad_norm": 0.11370725184679031, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 109640 + }, + { + "epoch": 0.4238762350976481, + "grad_norm": 0.1040082573890686, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 109650 + }, + { + "epoch": 0.42391489230103135, + "grad_norm": 0.12172899395227432, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 109660 + }, + { + "epoch": 0.42395354950441466, + "grad_norm": 0.12151210010051727, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 109670 + }, + { + "epoch": 0.4239922067077979, + "grad_norm": 0.09635493159294128, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 109680 + }, + { + "epoch": 0.4240308639111812, + "grad_norm": 0.1302810162305832, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 109690 + }, + { + "epoch": 0.4240695211145645, + "grad_norm": 0.1121583878993988, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 109700 + }, + { + "epoch": 0.4241081783179478, + "grad_norm": 0.10202564299106598, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 109710 + }, + { + "epoch": 0.42414683552133103, + "grad_norm": 0.11339136958122253, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 109720 + }, + { + "epoch": 0.42418549272471434, + "grad_norm": 0.10718164592981339, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 109730 + }, + { + "epoch": 0.4242241499280976, + "grad_norm": 0.11814633756875992, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 109740 + }, + { + "epoch": 0.4242628071314809, + "grad_norm": 0.1083814725279808, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 109750 + }, + { + "epoch": 0.42430146433486415, + "grad_norm": 0.10616223514080048, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 109760 + }, + { + "epoch": 0.42434012153824746, + "grad_norm": 0.1067415177822113, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 109770 + }, + { + "epoch": 0.4243787787416307, + "grad_norm": 0.1123105064034462, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 109780 + }, + { + "epoch": 0.424417435945014, + "grad_norm": 0.10695965588092804, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 109790 + }, + { + "epoch": 0.4244560931483973, + "grad_norm": 0.11023906618356705, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 109800 + }, + { + "epoch": 0.4244947503517805, + "grad_norm": 0.11257080733776093, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 109810 + }, + { + "epoch": 0.42453340755516383, + "grad_norm": 0.12878021597862244, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 109820 + }, + { + "epoch": 0.4245720647585471, + "grad_norm": 0.10409737378358841, + "learning_rate": 0.002, + "loss": 2.35, + "step": 109830 + }, + { + "epoch": 0.4246107219619304, + "grad_norm": 0.11641906946897507, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 109840 + }, + { + "epoch": 0.42464937916531365, + "grad_norm": 0.10731948912143707, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 109850 + }, + { + "epoch": 0.42468803636869695, + "grad_norm": 0.1166120246052742, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 109860 + }, + { + "epoch": 0.4247266935720802, + "grad_norm": 0.1179104596376419, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 109870 + }, + { + "epoch": 0.4247653507754635, + "grad_norm": 0.10144560039043427, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 109880 + }, + { + "epoch": 0.42480400797884676, + "grad_norm": 0.10600446164608002, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 109890 + }, + { + "epoch": 0.4248426651822301, + "grad_norm": 0.10021763294935226, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 109900 + }, + { + "epoch": 0.4248813223856133, + "grad_norm": 0.12647105753421783, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 109910 + }, + { + "epoch": 0.42491997958899663, + "grad_norm": 0.10528162866830826, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 109920 + }, + { + "epoch": 0.4249586367923799, + "grad_norm": 0.09694304317235947, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 109930 + }, + { + "epoch": 0.4249972939957632, + "grad_norm": 0.12218614667654037, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 109940 + }, + { + "epoch": 0.42503595119914644, + "grad_norm": 0.09037092328071594, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 109950 + }, + { + "epoch": 0.42507460840252975, + "grad_norm": 0.10441536456346512, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 109960 + }, + { + "epoch": 0.425113265605913, + "grad_norm": 0.09831688553094864, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 109970 + }, + { + "epoch": 0.42515192280929626, + "grad_norm": 0.13294687867164612, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 109980 + }, + { + "epoch": 0.42519058001267956, + "grad_norm": 0.10908882319927216, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 109990 + }, + { + "epoch": 0.4252292372160628, + "grad_norm": 0.12384915351867676, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 110000 + }, + { + "epoch": 0.4252678944194461, + "grad_norm": 0.09297414124011993, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 110010 + }, + { + "epoch": 0.4253065516228294, + "grad_norm": 0.10207641124725342, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 110020 + }, + { + "epoch": 0.4253452088262127, + "grad_norm": 0.09486313164234161, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 110030 + }, + { + "epoch": 0.42538386602959594, + "grad_norm": 0.10835953801870346, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 110040 + }, + { + "epoch": 0.42542252323297924, + "grad_norm": 0.10523147881031036, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 110050 + }, + { + "epoch": 0.4254611804363625, + "grad_norm": 0.10926689207553864, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 110060 + }, + { + "epoch": 0.4254998376397458, + "grad_norm": 0.10223328322172165, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 110070 + }, + { + "epoch": 0.42553849484312906, + "grad_norm": 0.1133745089173317, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 110080 + }, + { + "epoch": 0.42557715204651236, + "grad_norm": 0.11308858543634415, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 110090 + }, + { + "epoch": 0.4256158092498956, + "grad_norm": 0.10442328453063965, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 110100 + }, + { + "epoch": 0.4256544664532789, + "grad_norm": 0.1094212755560875, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 110110 + }, + { + "epoch": 0.4256931236566622, + "grad_norm": 0.1028483510017395, + "learning_rate": 0.002, + "loss": 2.347, + "step": 110120 + }, + { + "epoch": 0.4257317808600455, + "grad_norm": 0.10775095969438553, + "learning_rate": 0.002, + "loss": 2.343, + "step": 110130 + }, + { + "epoch": 0.42577043806342874, + "grad_norm": 0.10518835484981537, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 110140 + }, + { + "epoch": 0.42580909526681204, + "grad_norm": 0.11880367249250412, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 110150 + }, + { + "epoch": 0.4258477524701953, + "grad_norm": 0.10569731891155243, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 110160 + }, + { + "epoch": 0.42588640967357855, + "grad_norm": 0.10588982701301575, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 110170 + }, + { + "epoch": 0.42592506687696186, + "grad_norm": 0.10703343152999878, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 110180 + }, + { + "epoch": 0.4259637240803451, + "grad_norm": 0.10179496556520462, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 110190 + }, + { + "epoch": 0.4260023812837284, + "grad_norm": 0.11455044895410538, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 110200 + }, + { + "epoch": 0.42604103848711167, + "grad_norm": 0.1237921491265297, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 110210 + }, + { + "epoch": 0.426079695690495, + "grad_norm": 0.10922089219093323, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 110220 + }, + { + "epoch": 0.4261183528938782, + "grad_norm": 0.11585081368684769, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 110230 + }, + { + "epoch": 0.42615701009726153, + "grad_norm": 0.09888070076704025, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 110240 + }, + { + "epoch": 0.4261956673006448, + "grad_norm": 0.1655680388212204, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 110250 + }, + { + "epoch": 0.4262343245040281, + "grad_norm": 0.11325088143348694, + "learning_rate": 0.002, + "loss": 2.358, + "step": 110260 + }, + { + "epoch": 0.42627298170741135, + "grad_norm": 0.13202178478240967, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 110270 + }, + { + "epoch": 0.42631163891079465, + "grad_norm": 0.11255593597888947, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 110280 + }, + { + "epoch": 0.4263502961141779, + "grad_norm": 0.1038581058382988, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 110290 + }, + { + "epoch": 0.4263889533175612, + "grad_norm": 0.11428995430469513, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 110300 + }, + { + "epoch": 0.42642761052094447, + "grad_norm": 0.10118252038955688, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 110310 + }, + { + "epoch": 0.4264662677243278, + "grad_norm": 0.10601963847875595, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 110320 + }, + { + "epoch": 0.426504924927711, + "grad_norm": 0.11903627216815948, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 110330 + }, + { + "epoch": 0.42654358213109433, + "grad_norm": 0.10223052650690079, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 110340 + }, + { + "epoch": 0.4265822393344776, + "grad_norm": 0.3356073796749115, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 110350 + }, + { + "epoch": 0.42662089653786084, + "grad_norm": 0.10752939432859421, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 110360 + }, + { + "epoch": 0.42665955374124415, + "grad_norm": 0.10979057103395462, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 110370 + }, + { + "epoch": 0.4266982109446274, + "grad_norm": 0.10704939812421799, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 110380 + }, + { + "epoch": 0.4267368681480107, + "grad_norm": 0.09147896617650986, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 110390 + }, + { + "epoch": 0.42677552535139396, + "grad_norm": 0.14855432510375977, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 110400 + }, + { + "epoch": 0.42681418255477727, + "grad_norm": 0.11184597760438919, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 110410 + }, + { + "epoch": 0.4268528397581605, + "grad_norm": 0.08798114955425262, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 110420 + }, + { + "epoch": 0.4268914969615438, + "grad_norm": 0.10517899692058563, + "learning_rate": 0.002, + "loss": 2.344, + "step": 110430 + }, + { + "epoch": 0.4269301541649271, + "grad_norm": 0.09603867679834366, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 110440 + }, + { + "epoch": 0.4269688113683104, + "grad_norm": 0.10815239697694778, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 110450 + }, + { + "epoch": 0.42700746857169364, + "grad_norm": 0.09845545142889023, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 110460 + }, + { + "epoch": 0.42704612577507695, + "grad_norm": 0.0918077826499939, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 110470 + }, + { + "epoch": 0.4270847829784602, + "grad_norm": 0.11321431398391724, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 110480 + }, + { + "epoch": 0.4271234401818435, + "grad_norm": 0.11410441249608994, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 110490 + }, + { + "epoch": 0.42716209738522676, + "grad_norm": 0.09834035485982895, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 110500 + }, + { + "epoch": 0.42720075458861007, + "grad_norm": 0.1138286218047142, + "learning_rate": 0.002, + "loss": 2.354, + "step": 110510 + }, + { + "epoch": 0.4272394117919933, + "grad_norm": 0.11498411744832993, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 110520 + }, + { + "epoch": 0.4272780689953766, + "grad_norm": 0.11137287318706512, + "learning_rate": 0.002, + "loss": 2.35, + "step": 110530 + }, + { + "epoch": 0.4273167261987599, + "grad_norm": 0.10131389647722244, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 110540 + }, + { + "epoch": 0.42735538340214313, + "grad_norm": 0.10402471572160721, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 110550 + }, + { + "epoch": 0.42739404060552644, + "grad_norm": 0.11300648748874664, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 110560 + }, + { + "epoch": 0.4274326978089097, + "grad_norm": 0.09731554985046387, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 110570 + }, + { + "epoch": 0.427471355012293, + "grad_norm": 0.1155574768781662, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 110580 + }, + { + "epoch": 0.42751001221567625, + "grad_norm": 0.10829325020313263, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 110590 + }, + { + "epoch": 0.42754866941905956, + "grad_norm": 0.10412159562110901, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 110600 + }, + { + "epoch": 0.4275873266224428, + "grad_norm": 0.11188670992851257, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 110610 + }, + { + "epoch": 0.4276259838258261, + "grad_norm": 0.1147642582654953, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 110620 + }, + { + "epoch": 0.42766464102920937, + "grad_norm": 0.11237642914056778, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 110630 + }, + { + "epoch": 0.4277032982325927, + "grad_norm": 0.13152167201042175, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 110640 + }, + { + "epoch": 0.42774195543597593, + "grad_norm": 0.10723838210105896, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 110650 + }, + { + "epoch": 0.42778061263935924, + "grad_norm": 0.10246115922927856, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 110660 + }, + { + "epoch": 0.4278192698427425, + "grad_norm": 0.11409907788038254, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 110670 + }, + { + "epoch": 0.4278579270461258, + "grad_norm": 0.11728665977716446, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 110680 + }, + { + "epoch": 0.42789658424950905, + "grad_norm": 0.12186159938573837, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 110690 + }, + { + "epoch": 0.42793524145289236, + "grad_norm": 0.10480479896068573, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 110700 + }, + { + "epoch": 0.4279738986562756, + "grad_norm": 0.10415849089622498, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 110710 + }, + { + "epoch": 0.4280125558596589, + "grad_norm": 0.10350564867258072, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 110720 + }, + { + "epoch": 0.42805121306304217, + "grad_norm": 0.10367821902036667, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 110730 + }, + { + "epoch": 0.4280898702664254, + "grad_norm": 0.10381826758384705, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 110740 + }, + { + "epoch": 0.42812852746980873, + "grad_norm": 0.10306168347597122, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 110750 + }, + { + "epoch": 0.428167184673192, + "grad_norm": 0.12796978652477264, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 110760 + }, + { + "epoch": 0.4282058418765753, + "grad_norm": 0.10191497951745987, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 110770 + }, + { + "epoch": 0.42824449907995854, + "grad_norm": 0.10926347225904465, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 110780 + }, + { + "epoch": 0.42828315628334185, + "grad_norm": 0.10035771876573563, + "learning_rate": 0.002, + "loss": 2.364, + "step": 110790 + }, + { + "epoch": 0.4283218134867251, + "grad_norm": 0.10627880692481995, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 110800 + }, + { + "epoch": 0.4283604706901084, + "grad_norm": 0.10428674519062042, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 110810 + }, + { + "epoch": 0.42839912789349166, + "grad_norm": 0.11040990799665451, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 110820 + }, + { + "epoch": 0.42843778509687497, + "grad_norm": 0.09725743532180786, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 110830 + }, + { + "epoch": 0.4284764423002582, + "grad_norm": 0.10322292149066925, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 110840 + }, + { + "epoch": 0.42851509950364153, + "grad_norm": 0.12217211723327637, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 110850 + }, + { + "epoch": 0.4285537567070248, + "grad_norm": 0.1275206059217453, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 110860 + }, + { + "epoch": 0.4285924139104081, + "grad_norm": 0.11368425190448761, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 110870 + }, + { + "epoch": 0.42863107111379134, + "grad_norm": 0.10147102922201157, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 110880 + }, + { + "epoch": 0.42866972831717465, + "grad_norm": 0.0996956005692482, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 110890 + }, + { + "epoch": 0.4287083855205579, + "grad_norm": 0.3376283645629883, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 110900 + }, + { + "epoch": 0.42874704272394115, + "grad_norm": 0.13506639003753662, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 110910 + }, + { + "epoch": 0.42878569992732446, + "grad_norm": 0.10100368410348892, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 110920 + }, + { + "epoch": 0.4288243571307077, + "grad_norm": 0.09164312481880188, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 110930 + }, + { + "epoch": 0.428863014334091, + "grad_norm": 0.12385259568691254, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 110940 + }, + { + "epoch": 0.42890167153747427, + "grad_norm": 0.1286320984363556, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 110950 + }, + { + "epoch": 0.4289403287408576, + "grad_norm": 0.11350031942129135, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 110960 + }, + { + "epoch": 0.42897898594424083, + "grad_norm": 0.10365412384271622, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 110970 + }, + { + "epoch": 0.42901764314762414, + "grad_norm": 0.1234087273478508, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 110980 + }, + { + "epoch": 0.4290563003510074, + "grad_norm": 0.11059076339006424, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 110990 + }, + { + "epoch": 0.4290949575543907, + "grad_norm": 0.09807975590229034, + "learning_rate": 0.002, + "loss": 2.36, + "step": 111000 + }, + { + "epoch": 0.42913361475777395, + "grad_norm": 0.12637066841125488, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 111010 + }, + { + "epoch": 0.42917227196115726, + "grad_norm": 0.08945341408252716, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 111020 + }, + { + "epoch": 0.4292109291645405, + "grad_norm": 0.10467803478240967, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 111030 + }, + { + "epoch": 0.4292495863679238, + "grad_norm": 0.09956691414117813, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 111040 + }, + { + "epoch": 0.42928824357130707, + "grad_norm": 0.1146853044629097, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 111050 + }, + { + "epoch": 0.4293269007746904, + "grad_norm": 0.10632400959730148, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 111060 + }, + { + "epoch": 0.42936555797807363, + "grad_norm": 0.09750671684741974, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 111070 + }, + { + "epoch": 0.42940421518145694, + "grad_norm": 0.10774068534374237, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 111080 + }, + { + "epoch": 0.4294428723848402, + "grad_norm": 0.09877938777208328, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 111090 + }, + { + "epoch": 0.42948152958822344, + "grad_norm": 0.09424333274364471, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 111100 + }, + { + "epoch": 0.42952018679160675, + "grad_norm": 0.1110386922955513, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 111110 + }, + { + "epoch": 0.42955884399499, + "grad_norm": 0.10476934909820557, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 111120 + }, + { + "epoch": 0.4295975011983733, + "grad_norm": 0.11968885362148285, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 111130 + }, + { + "epoch": 0.42963615840175656, + "grad_norm": 0.10490576177835464, + "learning_rate": 0.002, + "loss": 2.344, + "step": 111140 + }, + { + "epoch": 0.42967481560513987, + "grad_norm": 0.09844032675027847, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 111150 + }, + { + "epoch": 0.4297134728085231, + "grad_norm": 0.11125150322914124, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 111160 + }, + { + "epoch": 0.42975213001190643, + "grad_norm": 0.09913921356201172, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 111170 + }, + { + "epoch": 0.4297907872152897, + "grad_norm": 0.11865279078483582, + "learning_rate": 0.002, + "loss": 2.35, + "step": 111180 + }, + { + "epoch": 0.429829444418673, + "grad_norm": 0.10810180008411407, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 111190 + }, + { + "epoch": 0.42986810162205624, + "grad_norm": 0.10936781018972397, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 111200 + }, + { + "epoch": 0.42990675882543955, + "grad_norm": 0.10779301077127457, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 111210 + }, + { + "epoch": 0.4299454160288228, + "grad_norm": 0.10778950154781342, + "learning_rate": 0.002, + "loss": 2.338, + "step": 111220 + }, + { + "epoch": 0.4299840732322061, + "grad_norm": 0.10507801920175552, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 111230 + }, + { + "epoch": 0.43002273043558936, + "grad_norm": 0.10925329476594925, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 111240 + }, + { + "epoch": 0.43006138763897267, + "grad_norm": 0.11180612444877625, + "learning_rate": 0.002, + "loss": 2.344, + "step": 111250 + }, + { + "epoch": 0.4301000448423559, + "grad_norm": 0.10876971483230591, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 111260 + }, + { + "epoch": 0.43013870204573923, + "grad_norm": 0.09879743307828903, + "learning_rate": 0.002, + "loss": 2.352, + "step": 111270 + }, + { + "epoch": 0.4301773592491225, + "grad_norm": 0.1002826914191246, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 111280 + }, + { + "epoch": 0.43021601645250573, + "grad_norm": 0.10194902122020721, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 111290 + }, + { + "epoch": 0.43025467365588904, + "grad_norm": 0.13493533432483673, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 111300 + }, + { + "epoch": 0.4302933308592723, + "grad_norm": 0.10006940364837646, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 111310 + }, + { + "epoch": 0.4303319880626556, + "grad_norm": 0.10241258889436722, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 111320 + }, + { + "epoch": 0.43037064526603885, + "grad_norm": 0.10722459852695465, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 111330 + }, + { + "epoch": 0.43040930246942216, + "grad_norm": 0.11641300469636917, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 111340 + }, + { + "epoch": 0.4304479596728054, + "grad_norm": 0.11539609730243683, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 111350 + }, + { + "epoch": 0.4304866168761887, + "grad_norm": 0.10943485796451569, + "learning_rate": 0.002, + "loss": 2.357, + "step": 111360 + }, + { + "epoch": 0.430525274079572, + "grad_norm": 0.09740972518920898, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 111370 + }, + { + "epoch": 0.4305639312829553, + "grad_norm": 0.11207325756549835, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 111380 + }, + { + "epoch": 0.43060258848633853, + "grad_norm": 0.11338721215724945, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 111390 + }, + { + "epoch": 0.43064124568972184, + "grad_norm": 0.09858185797929764, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 111400 + }, + { + "epoch": 0.4306799028931051, + "grad_norm": 0.10881024599075317, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 111410 + }, + { + "epoch": 0.4307185600964884, + "grad_norm": 0.10683208703994751, + "learning_rate": 0.002, + "loss": 2.346, + "step": 111420 + }, + { + "epoch": 0.43075721729987165, + "grad_norm": 0.09608019888401031, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 111430 + }, + { + "epoch": 0.43079587450325496, + "grad_norm": 0.08652333915233612, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 111440 + }, + { + "epoch": 0.4308345317066382, + "grad_norm": 0.11403698474168777, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 111450 + }, + { + "epoch": 0.4308731889100215, + "grad_norm": 0.10857101529836655, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 111460 + }, + { + "epoch": 0.4309118461134048, + "grad_norm": 0.10597855597734451, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 111470 + }, + { + "epoch": 0.430950503316788, + "grad_norm": 0.0985519140958786, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 111480 + }, + { + "epoch": 0.43098916052017133, + "grad_norm": 0.14048723876476288, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 111490 + }, + { + "epoch": 0.4310278177235546, + "grad_norm": 0.09988056868314743, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 111500 + }, + { + "epoch": 0.4310664749269379, + "grad_norm": 0.10084901750087738, + "learning_rate": 0.002, + "loss": 2.342, + "step": 111510 + }, + { + "epoch": 0.43110513213032114, + "grad_norm": 0.11319395154714584, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 111520 + }, + { + "epoch": 0.43114378933370445, + "grad_norm": 0.11077584326267242, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 111530 + }, + { + "epoch": 0.4311824465370877, + "grad_norm": 0.10431598871946335, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 111540 + }, + { + "epoch": 0.431221103740471, + "grad_norm": 0.1093798279762268, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 111550 + }, + { + "epoch": 0.43125976094385426, + "grad_norm": 0.11040189862251282, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 111560 + }, + { + "epoch": 0.43129841814723757, + "grad_norm": 0.09806570410728455, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 111570 + }, + { + "epoch": 0.4313370753506208, + "grad_norm": 0.11054784804582596, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 111580 + }, + { + "epoch": 0.43137573255400413, + "grad_norm": 0.11820884793996811, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 111590 + }, + { + "epoch": 0.4314143897573874, + "grad_norm": 0.10436604171991348, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 111600 + }, + { + "epoch": 0.4314530469607707, + "grad_norm": 0.12815545499324799, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 111610 + }, + { + "epoch": 0.43149170416415394, + "grad_norm": 0.090470090508461, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 111620 + }, + { + "epoch": 0.43153036136753725, + "grad_norm": 0.09031805396080017, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 111630 + }, + { + "epoch": 0.4315690185709205, + "grad_norm": 0.10491258651018143, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 111640 + }, + { + "epoch": 0.43160767577430376, + "grad_norm": 0.0920221358537674, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 111650 + }, + { + "epoch": 0.43164633297768706, + "grad_norm": 0.1239825040102005, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 111660 + }, + { + "epoch": 0.4316849901810703, + "grad_norm": 0.10015200078487396, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 111670 + }, + { + "epoch": 0.4317236473844536, + "grad_norm": 0.125696063041687, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 111680 + }, + { + "epoch": 0.4317623045878369, + "grad_norm": 0.11201971024274826, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 111690 + }, + { + "epoch": 0.4318009617912202, + "grad_norm": 0.10140449553728104, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 111700 + }, + { + "epoch": 0.43183961899460344, + "grad_norm": 0.10589606314897537, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 111710 + }, + { + "epoch": 0.43187827619798674, + "grad_norm": 0.10249914228916168, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 111720 + }, + { + "epoch": 0.43191693340137, + "grad_norm": 0.10037936270236969, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 111730 + }, + { + "epoch": 0.4319555906047533, + "grad_norm": 0.09914037585258484, + "learning_rate": 0.002, + "loss": 2.345, + "step": 111740 + }, + { + "epoch": 0.43199424780813656, + "grad_norm": 0.09331130981445312, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 111750 + }, + { + "epoch": 0.43203290501151986, + "grad_norm": 0.1144673079252243, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 111760 + }, + { + "epoch": 0.4320715622149031, + "grad_norm": 0.13211724162101746, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 111770 + }, + { + "epoch": 0.4321102194182864, + "grad_norm": 0.11853010207414627, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 111780 + }, + { + "epoch": 0.4321488766216697, + "grad_norm": 0.10304831713438034, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 111790 + }, + { + "epoch": 0.432187533825053, + "grad_norm": 0.102897509932518, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 111800 + }, + { + "epoch": 0.43222619102843624, + "grad_norm": 0.10276034474372864, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 111810 + }, + { + "epoch": 0.43226484823181954, + "grad_norm": 0.10357806831598282, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 111820 + }, + { + "epoch": 0.4323035054352028, + "grad_norm": 0.10969896614551544, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 111830 + }, + { + "epoch": 0.43234216263858605, + "grad_norm": 0.1067805290222168, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 111840 + }, + { + "epoch": 0.43238081984196935, + "grad_norm": 0.10147634893655777, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 111850 + }, + { + "epoch": 0.4324194770453526, + "grad_norm": 0.13259513676166534, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 111860 + }, + { + "epoch": 0.4324581342487359, + "grad_norm": 0.09387633949518204, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 111870 + }, + { + "epoch": 0.43249679145211917, + "grad_norm": 0.10201136022806168, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 111880 + }, + { + "epoch": 0.4325354486555025, + "grad_norm": 0.11776763200759888, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 111890 + }, + { + "epoch": 0.4325741058588857, + "grad_norm": 0.1148151308298111, + "learning_rate": 0.002, + "loss": 2.343, + "step": 111900 + }, + { + "epoch": 0.43261276306226903, + "grad_norm": 0.11505398154258728, + "learning_rate": 0.002, + "loss": 2.344, + "step": 111910 + }, + { + "epoch": 0.4326514202656523, + "grad_norm": 0.10341343283653259, + "learning_rate": 0.002, + "loss": 2.355, + "step": 111920 + }, + { + "epoch": 0.4326900774690356, + "grad_norm": 0.1147983968257904, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 111930 + }, + { + "epoch": 0.43272873467241885, + "grad_norm": 0.08892353624105453, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 111940 + }, + { + "epoch": 0.43276739187580215, + "grad_norm": 0.09897063672542572, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 111950 + }, + { + "epoch": 0.4328060490791854, + "grad_norm": 0.12240561097860336, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 111960 + }, + { + "epoch": 0.4328447062825687, + "grad_norm": 0.10131847858428955, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 111970 + }, + { + "epoch": 0.43288336348595197, + "grad_norm": 0.10665290057659149, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 111980 + }, + { + "epoch": 0.4329220206893353, + "grad_norm": 0.09720829874277115, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 111990 + }, + { + "epoch": 0.4329606778927185, + "grad_norm": 0.10075974464416504, + "learning_rate": 0.002, + "loss": 2.346, + "step": 112000 + }, + { + "epoch": 0.43299933509610183, + "grad_norm": 0.14467671513557434, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 112010 + }, + { + "epoch": 0.4330379922994851, + "grad_norm": 0.09993736445903778, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 112020 + }, + { + "epoch": 0.43307664950286834, + "grad_norm": 0.12287890911102295, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 112030 + }, + { + "epoch": 0.43311530670625165, + "grad_norm": 0.13064667582511902, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 112040 + }, + { + "epoch": 0.4331539639096349, + "grad_norm": 0.150161474943161, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 112050 + }, + { + "epoch": 0.4331926211130182, + "grad_norm": 0.24644172191619873, + "learning_rate": 0.002, + "loss": 2.3769, + "step": 112060 + }, + { + "epoch": 0.43323127831640146, + "grad_norm": 0.09621595591306686, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 112070 + }, + { + "epoch": 0.43326993551978477, + "grad_norm": 0.11061085760593414, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 112080 + }, + { + "epoch": 0.433308592723168, + "grad_norm": 0.11562249809503555, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 112090 + }, + { + "epoch": 0.4333472499265513, + "grad_norm": 0.10313281416893005, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 112100 + }, + { + "epoch": 0.4333859071299346, + "grad_norm": 0.09896907210350037, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 112110 + }, + { + "epoch": 0.4334245643333179, + "grad_norm": 0.09867633134126663, + "learning_rate": 0.002, + "loss": 2.341, + "step": 112120 + }, + { + "epoch": 0.43346322153670114, + "grad_norm": 0.38379985094070435, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 112130 + }, + { + "epoch": 0.43350187874008445, + "grad_norm": 0.14632292091846466, + "learning_rate": 0.002, + "loss": 2.345, + "step": 112140 + }, + { + "epoch": 0.4335405359434677, + "grad_norm": 0.12974530458450317, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 112150 + }, + { + "epoch": 0.433579193146851, + "grad_norm": 0.1288261115550995, + "learning_rate": 0.002, + "loss": 2.349, + "step": 112160 + }, + { + "epoch": 0.43361785035023426, + "grad_norm": 0.09830410033464432, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 112170 + }, + { + "epoch": 0.43365650755361757, + "grad_norm": 0.09210671484470367, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 112180 + }, + { + "epoch": 0.4336951647570008, + "grad_norm": 0.16515584290027618, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 112190 + }, + { + "epoch": 0.4337338219603841, + "grad_norm": 0.10097946971654892, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 112200 + }, + { + "epoch": 0.4337724791637674, + "grad_norm": 0.12167806178331375, + "learning_rate": 0.002, + "loss": 2.3735, + "step": 112210 + }, + { + "epoch": 0.43381113636715063, + "grad_norm": 0.11054915189743042, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 112220 + }, + { + "epoch": 0.43384979357053394, + "grad_norm": 0.10262811928987503, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 112230 + }, + { + "epoch": 0.4338884507739172, + "grad_norm": 0.11559311300516129, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 112240 + }, + { + "epoch": 0.4339271079773005, + "grad_norm": 0.09575275331735611, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 112250 + }, + { + "epoch": 0.43396576518068375, + "grad_norm": 0.10230062901973724, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 112260 + }, + { + "epoch": 0.43400442238406706, + "grad_norm": 0.10631681233644485, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 112270 + }, + { + "epoch": 0.4340430795874503, + "grad_norm": 0.14762724936008453, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 112280 + }, + { + "epoch": 0.4340817367908336, + "grad_norm": 0.1018281877040863, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 112290 + }, + { + "epoch": 0.43412039399421687, + "grad_norm": 0.10437997430562973, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 112300 + }, + { + "epoch": 0.4341590511976002, + "grad_norm": 0.10592590272426605, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 112310 + }, + { + "epoch": 0.43419770840098343, + "grad_norm": 0.09188766032457352, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 112320 + }, + { + "epoch": 0.43423636560436674, + "grad_norm": 0.11880189925432205, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 112330 + }, + { + "epoch": 0.43427502280775, + "grad_norm": 0.11617539077997208, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 112340 + }, + { + "epoch": 0.4343136800111333, + "grad_norm": 0.13290619850158691, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 112350 + }, + { + "epoch": 0.43435233721451655, + "grad_norm": 0.10754197090864182, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 112360 + }, + { + "epoch": 0.43439099441789986, + "grad_norm": 0.11557207256555557, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 112370 + }, + { + "epoch": 0.4344296516212831, + "grad_norm": 0.11429212987422943, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 112380 + }, + { + "epoch": 0.4344683088246664, + "grad_norm": 0.12526215612888336, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 112390 + }, + { + "epoch": 0.43450696602804967, + "grad_norm": 0.10692817717790604, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 112400 + }, + { + "epoch": 0.4345456232314329, + "grad_norm": 0.11485128104686737, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 112410 + }, + { + "epoch": 0.43458428043481623, + "grad_norm": 0.1053088828921318, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 112420 + }, + { + "epoch": 0.4346229376381995, + "grad_norm": 0.09685841202735901, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 112430 + }, + { + "epoch": 0.4346615948415828, + "grad_norm": 0.11361964792013168, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 112440 + }, + { + "epoch": 0.43470025204496604, + "grad_norm": 0.10083848237991333, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 112450 + }, + { + "epoch": 0.43473890924834935, + "grad_norm": 0.1087498739361763, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 112460 + }, + { + "epoch": 0.4347775664517326, + "grad_norm": 0.09891539812088013, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 112470 + }, + { + "epoch": 0.4348162236551159, + "grad_norm": 0.09671245515346527, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 112480 + }, + { + "epoch": 0.43485488085849916, + "grad_norm": 0.09899502247571945, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 112490 + }, + { + "epoch": 0.43489353806188247, + "grad_norm": 0.1066652461886406, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 112500 + }, + { + "epoch": 0.4349321952652657, + "grad_norm": 0.09613171964883804, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 112510 + }, + { + "epoch": 0.434970852468649, + "grad_norm": 0.10856905579566956, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 112520 + }, + { + "epoch": 0.4350095096720323, + "grad_norm": 0.09104996919631958, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 112530 + }, + { + "epoch": 0.4350481668754156, + "grad_norm": 0.12133617699146271, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 112540 + }, + { + "epoch": 0.43508682407879884, + "grad_norm": 0.10629113763570786, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 112550 + }, + { + "epoch": 0.43512548128218215, + "grad_norm": 0.10068577527999878, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 112560 + }, + { + "epoch": 0.4351641384855654, + "grad_norm": 0.11508452892303467, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 112570 + }, + { + "epoch": 0.43520279568894865, + "grad_norm": 0.09170261025428772, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 112580 + }, + { + "epoch": 0.43524145289233196, + "grad_norm": 0.1024169772863388, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 112590 + }, + { + "epoch": 0.4352801100957152, + "grad_norm": 0.11252007633447647, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 112600 + }, + { + "epoch": 0.4353187672990985, + "grad_norm": 0.10464402288198471, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 112610 + }, + { + "epoch": 0.43535742450248177, + "grad_norm": 0.10425686091184616, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 112620 + }, + { + "epoch": 0.4353960817058651, + "grad_norm": 0.12892098724842072, + "learning_rate": 0.002, + "loss": 2.3728, + "step": 112630 + }, + { + "epoch": 0.43543473890924833, + "grad_norm": 0.11767508834600449, + "learning_rate": 0.002, + "loss": 2.328, + "step": 112640 + }, + { + "epoch": 0.43547339611263164, + "grad_norm": 0.09892310202121735, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 112650 + }, + { + "epoch": 0.4355120533160149, + "grad_norm": 0.11061010509729385, + "learning_rate": 0.002, + "loss": 2.3697, + "step": 112660 + }, + { + "epoch": 0.4355507105193982, + "grad_norm": 0.10130377113819122, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 112670 + }, + { + "epoch": 0.43558936772278145, + "grad_norm": 0.12196584790945053, + "learning_rate": 0.002, + "loss": 2.352, + "step": 112680 + }, + { + "epoch": 0.43562802492616476, + "grad_norm": 0.10804404318332672, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 112690 + }, + { + "epoch": 0.435666682129548, + "grad_norm": 0.11497610062360764, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 112700 + }, + { + "epoch": 0.4357053393329313, + "grad_norm": 0.10728848725557327, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 112710 + }, + { + "epoch": 0.43574399653631457, + "grad_norm": 0.11768807470798492, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 112720 + }, + { + "epoch": 0.4357826537396979, + "grad_norm": 0.1070348247885704, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 112730 + }, + { + "epoch": 0.43582131094308113, + "grad_norm": 0.09731265902519226, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 112740 + }, + { + "epoch": 0.43585996814646444, + "grad_norm": 0.11236754804849625, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 112750 + }, + { + "epoch": 0.4358986253498477, + "grad_norm": 0.10371419787406921, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 112760 + }, + { + "epoch": 0.43593728255323094, + "grad_norm": 0.09995149075984955, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 112770 + }, + { + "epoch": 0.43597593975661425, + "grad_norm": 0.09958817064762115, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 112780 + }, + { + "epoch": 0.4360145969599975, + "grad_norm": 0.1253259927034378, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 112790 + }, + { + "epoch": 0.4360532541633808, + "grad_norm": 0.10836859792470932, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 112800 + }, + { + "epoch": 0.43609191136676406, + "grad_norm": 0.09147986769676208, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 112810 + }, + { + "epoch": 0.43613056857014737, + "grad_norm": 0.09477005153894424, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 112820 + }, + { + "epoch": 0.4361692257735306, + "grad_norm": 0.1311466097831726, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 112830 + }, + { + "epoch": 0.43620788297691393, + "grad_norm": 0.12641702592372894, + "learning_rate": 0.002, + "loss": 2.355, + "step": 112840 + }, + { + "epoch": 0.4362465401802972, + "grad_norm": 0.09795732796192169, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 112850 + }, + { + "epoch": 0.4362851973836805, + "grad_norm": 0.11147502809762955, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 112860 + }, + { + "epoch": 0.43632385458706374, + "grad_norm": 0.10798709094524384, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 112870 + }, + { + "epoch": 0.43636251179044705, + "grad_norm": 0.11606871336698532, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 112880 + }, + { + "epoch": 0.4364011689938303, + "grad_norm": 0.10911009460687637, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 112890 + }, + { + "epoch": 0.4364398261972136, + "grad_norm": 0.11125440150499344, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 112900 + }, + { + "epoch": 0.43647848340059686, + "grad_norm": 0.10836660861968994, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 112910 + }, + { + "epoch": 0.43651714060398017, + "grad_norm": 0.0975693017244339, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 112920 + }, + { + "epoch": 0.4365557978073634, + "grad_norm": 0.10515625029802322, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 112930 + }, + { + "epoch": 0.43659445501074673, + "grad_norm": 0.09607485681772232, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 112940 + }, + { + "epoch": 0.43663311221413, + "grad_norm": 0.08798225969076157, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 112950 + }, + { + "epoch": 0.43667176941751323, + "grad_norm": 0.11401963979005814, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 112960 + }, + { + "epoch": 0.43671042662089654, + "grad_norm": 0.09933006763458252, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 112970 + }, + { + "epoch": 0.4367490838242798, + "grad_norm": 0.10628395527601242, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 112980 + }, + { + "epoch": 0.4367877410276631, + "grad_norm": 0.0965963825583458, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 112990 + }, + { + "epoch": 0.43682639823104635, + "grad_norm": 0.10263420641422272, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 113000 + }, + { + "epoch": 0.43686505543442966, + "grad_norm": 0.10733772069215775, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 113010 + }, + { + "epoch": 0.4369037126378129, + "grad_norm": 0.09779093414545059, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 113020 + }, + { + "epoch": 0.4369423698411962, + "grad_norm": 0.10382230579853058, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 113030 + }, + { + "epoch": 0.4369810270445795, + "grad_norm": 0.2067124843597412, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 113040 + }, + { + "epoch": 0.4370196842479628, + "grad_norm": 0.11711885035037994, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 113050 + }, + { + "epoch": 0.43705834145134603, + "grad_norm": 0.10530546307563782, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 113060 + }, + { + "epoch": 0.43709699865472934, + "grad_norm": 0.10417833924293518, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 113070 + }, + { + "epoch": 0.4371356558581126, + "grad_norm": 0.09886866062879562, + "learning_rate": 0.002, + "loss": 2.352, + "step": 113080 + }, + { + "epoch": 0.4371743130614959, + "grad_norm": 0.12263850122690201, + "learning_rate": 0.002, + "loss": 2.35, + "step": 113090 + }, + { + "epoch": 0.43721297026487915, + "grad_norm": 0.09797932207584381, + "learning_rate": 0.002, + "loss": 2.344, + "step": 113100 + }, + { + "epoch": 0.43725162746826246, + "grad_norm": 0.12123297899961472, + "learning_rate": 0.002, + "loss": 2.3624, + "step": 113110 + }, + { + "epoch": 0.4372902846716457, + "grad_norm": 0.09888701885938644, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 113120 + }, + { + "epoch": 0.437328941875029, + "grad_norm": 0.10616670548915863, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 113130 + }, + { + "epoch": 0.4373675990784123, + "grad_norm": 0.10324602574110031, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 113140 + }, + { + "epoch": 0.4374062562817955, + "grad_norm": 0.13094359636306763, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 113150 + }, + { + "epoch": 0.43744491348517883, + "grad_norm": 0.12776924669742584, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 113160 + }, + { + "epoch": 0.4374835706885621, + "grad_norm": 0.11114688962697983, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 113170 + }, + { + "epoch": 0.4375222278919454, + "grad_norm": 0.10153741389513016, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 113180 + }, + { + "epoch": 0.43756088509532864, + "grad_norm": 0.0986599400639534, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 113190 + }, + { + "epoch": 0.43759954229871195, + "grad_norm": 0.10165860503911972, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 113200 + }, + { + "epoch": 0.4376381995020952, + "grad_norm": 0.11720269918441772, + "learning_rate": 0.002, + "loss": 2.344, + "step": 113210 + }, + { + "epoch": 0.4376768567054785, + "grad_norm": 0.11033625155687332, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 113220 + }, + { + "epoch": 0.43771551390886176, + "grad_norm": 0.5819602012634277, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 113230 + }, + { + "epoch": 0.43775417111224507, + "grad_norm": 0.11656757444143295, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 113240 + }, + { + "epoch": 0.4377928283156283, + "grad_norm": 0.10410122573375702, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 113250 + }, + { + "epoch": 0.43783148551901163, + "grad_norm": 0.0943315178155899, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 113260 + }, + { + "epoch": 0.4378701427223949, + "grad_norm": 0.18185904622077942, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 113270 + }, + { + "epoch": 0.4379087999257782, + "grad_norm": 0.10419418662786484, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 113280 + }, + { + "epoch": 0.43794745712916144, + "grad_norm": 0.11047537624835968, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 113290 + }, + { + "epoch": 0.43798611433254475, + "grad_norm": 0.12000361829996109, + "learning_rate": 0.002, + "loss": 2.345, + "step": 113300 + }, + { + "epoch": 0.438024771535928, + "grad_norm": 0.11603201925754547, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 113310 + }, + { + "epoch": 0.43806342873931126, + "grad_norm": 0.10754760354757309, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 113320 + }, + { + "epoch": 0.43810208594269456, + "grad_norm": 0.10942035913467407, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 113330 + }, + { + "epoch": 0.4381407431460778, + "grad_norm": 0.11428725719451904, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 113340 + }, + { + "epoch": 0.4381794003494611, + "grad_norm": 0.11185505986213684, + "learning_rate": 0.002, + "loss": 2.347, + "step": 113350 + }, + { + "epoch": 0.4382180575528444, + "grad_norm": 0.10271166265010834, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 113360 + }, + { + "epoch": 0.4382567147562277, + "grad_norm": 0.09774298220872879, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 113370 + }, + { + "epoch": 0.43829537195961094, + "grad_norm": 0.11461903899908066, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 113380 + }, + { + "epoch": 0.43833402916299424, + "grad_norm": 0.12484222650527954, + "learning_rate": 0.002, + "loss": 2.347, + "step": 113390 + }, + { + "epoch": 0.4383726863663775, + "grad_norm": 0.11602794378995895, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 113400 + }, + { + "epoch": 0.4384113435697608, + "grad_norm": 0.111027292907238, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 113410 + }, + { + "epoch": 0.43845000077314406, + "grad_norm": 0.08952493220567703, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 113420 + }, + { + "epoch": 0.43848865797652736, + "grad_norm": 0.10877718031406403, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 113430 + }, + { + "epoch": 0.4385273151799106, + "grad_norm": 0.11232464015483856, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 113440 + }, + { + "epoch": 0.4385659723832939, + "grad_norm": 0.10439947247505188, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 113450 + }, + { + "epoch": 0.4386046295866772, + "grad_norm": 0.10224592685699463, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 113460 + }, + { + "epoch": 0.4386432867900605, + "grad_norm": 0.10106103122234344, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 113470 + }, + { + "epoch": 0.43868194399344373, + "grad_norm": 0.1027471050620079, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 113480 + }, + { + "epoch": 0.43872060119682704, + "grad_norm": 0.10447635501623154, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 113490 + }, + { + "epoch": 0.4387592584002103, + "grad_norm": 0.10832199454307556, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 113500 + }, + { + "epoch": 0.43879791560359355, + "grad_norm": 0.375355064868927, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 113510 + }, + { + "epoch": 0.43883657280697685, + "grad_norm": 0.12057796865701675, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 113520 + }, + { + "epoch": 0.4388752300103601, + "grad_norm": 0.1038069799542427, + "learning_rate": 0.002, + "loss": 2.36, + "step": 113530 + }, + { + "epoch": 0.4389138872137434, + "grad_norm": 0.09568572789430618, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 113540 + }, + { + "epoch": 0.43895254441712667, + "grad_norm": 0.10505779832601547, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 113550 + }, + { + "epoch": 0.43899120162051, + "grad_norm": 0.12001272290945053, + "learning_rate": 0.002, + "loss": 2.338, + "step": 113560 + }, + { + "epoch": 0.4390298588238932, + "grad_norm": 0.1079864352941513, + "learning_rate": 0.002, + "loss": 2.335, + "step": 113570 + }, + { + "epoch": 0.43906851602727653, + "grad_norm": 0.09099382907152176, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 113580 + }, + { + "epoch": 0.4391071732306598, + "grad_norm": 0.09558238834142685, + "learning_rate": 0.002, + "loss": 2.347, + "step": 113590 + }, + { + "epoch": 0.4391458304340431, + "grad_norm": 0.10473648458719254, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 113600 + }, + { + "epoch": 0.43918448763742635, + "grad_norm": 0.0934872105717659, + "learning_rate": 0.002, + "loss": 2.346, + "step": 113610 + }, + { + "epoch": 0.43922314484080965, + "grad_norm": 0.1030026376247406, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 113620 + }, + { + "epoch": 0.4392618020441929, + "grad_norm": 0.11831945925951004, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 113630 + }, + { + "epoch": 0.4393004592475762, + "grad_norm": 0.10450316220521927, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 113640 + }, + { + "epoch": 0.43933911645095947, + "grad_norm": 0.11427527666091919, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 113650 + }, + { + "epoch": 0.4393777736543428, + "grad_norm": 0.10517244786024094, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 113660 + }, + { + "epoch": 0.439416430857726, + "grad_norm": 0.1093309074640274, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 113670 + }, + { + "epoch": 0.43945508806110933, + "grad_norm": 0.1326448619365692, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 113680 + }, + { + "epoch": 0.4394937452644926, + "grad_norm": 0.10119234025478363, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 113690 + }, + { + "epoch": 0.43953240246787584, + "grad_norm": 0.10754355788230896, + "learning_rate": 0.002, + "loss": 2.362, + "step": 113700 + }, + { + "epoch": 0.43957105967125915, + "grad_norm": 0.09021865576505661, + "learning_rate": 0.002, + "loss": 2.351, + "step": 113710 + }, + { + "epoch": 0.4396097168746424, + "grad_norm": 0.11766369640827179, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 113720 + }, + { + "epoch": 0.4396483740780257, + "grad_norm": 0.11109083145856857, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 113730 + }, + { + "epoch": 0.43968703128140896, + "grad_norm": 0.09245065599679947, + "learning_rate": 0.002, + "loss": 2.356, + "step": 113740 + }, + { + "epoch": 0.43972568848479227, + "grad_norm": 0.11703155189752579, + "learning_rate": 0.002, + "loss": 2.358, + "step": 113750 + }, + { + "epoch": 0.4397643456881755, + "grad_norm": 0.10193216800689697, + "learning_rate": 0.002, + "loss": 2.352, + "step": 113760 + }, + { + "epoch": 0.4398030028915588, + "grad_norm": 0.1027296856045723, + "learning_rate": 0.002, + "loss": 2.347, + "step": 113770 + }, + { + "epoch": 0.4398416600949421, + "grad_norm": 0.11095414310693741, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 113780 + }, + { + "epoch": 0.4398803172983254, + "grad_norm": 0.12157269567251205, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 113790 + }, + { + "epoch": 0.43991897450170864, + "grad_norm": 0.12535102665424347, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 113800 + }, + { + "epoch": 0.43995763170509194, + "grad_norm": 0.09027545899152756, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 113810 + }, + { + "epoch": 0.4399962889084752, + "grad_norm": 0.16895151138305664, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 113820 + }, + { + "epoch": 0.4400349461118585, + "grad_norm": 0.18759407103061676, + "learning_rate": 0.002, + "loss": 2.36, + "step": 113830 + }, + { + "epoch": 0.44007360331524176, + "grad_norm": 0.10348519682884216, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 113840 + }, + { + "epoch": 0.44011226051862506, + "grad_norm": 0.10666997730731964, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 113850 + }, + { + "epoch": 0.4401509177220083, + "grad_norm": 0.10717163980007172, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 113860 + }, + { + "epoch": 0.4401895749253916, + "grad_norm": 0.09396836906671524, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 113870 + }, + { + "epoch": 0.4402282321287749, + "grad_norm": 0.12772560119628906, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 113880 + }, + { + "epoch": 0.44026688933215813, + "grad_norm": 0.11134528368711472, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 113890 + }, + { + "epoch": 0.44030554653554144, + "grad_norm": 0.10744709521532059, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 113900 + }, + { + "epoch": 0.4403442037389247, + "grad_norm": 0.11233912408351898, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 113910 + }, + { + "epoch": 0.440382860942308, + "grad_norm": 0.10462363809347153, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 113920 + }, + { + "epoch": 0.44042151814569125, + "grad_norm": 0.1127980500459671, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 113930 + }, + { + "epoch": 0.44046017534907456, + "grad_norm": 0.09823184460401535, + "learning_rate": 0.002, + "loss": 2.334, + "step": 113940 + }, + { + "epoch": 0.4404988325524578, + "grad_norm": 0.08919413387775421, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 113950 + }, + { + "epoch": 0.4405374897558411, + "grad_norm": 0.10333508253097534, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 113960 + }, + { + "epoch": 0.44057614695922437, + "grad_norm": 0.11753777414560318, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 113970 + }, + { + "epoch": 0.4406148041626077, + "grad_norm": 0.11250483244657516, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 113980 + }, + { + "epoch": 0.44065346136599093, + "grad_norm": 0.10561315715312958, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 113990 + }, + { + "epoch": 0.44069211856937424, + "grad_norm": 0.09429409354925156, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 114000 + }, + { + "epoch": 0.4407307757727575, + "grad_norm": 0.09992121905088425, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 114010 + }, + { + "epoch": 0.4407694329761408, + "grad_norm": 0.10621625185012817, + "learning_rate": 0.002, + "loss": 2.363, + "step": 114020 + }, + { + "epoch": 0.44080809017952405, + "grad_norm": 0.11136475950479507, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 114030 + }, + { + "epoch": 0.44084674738290736, + "grad_norm": 0.10999602824449539, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 114040 + }, + { + "epoch": 0.4408854045862906, + "grad_norm": 0.12542486190795898, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 114050 + }, + { + "epoch": 0.44092406178967386, + "grad_norm": 0.10794009268283844, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 114060 + }, + { + "epoch": 0.44096271899305717, + "grad_norm": 0.11233028769493103, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 114070 + }, + { + "epoch": 0.4410013761964404, + "grad_norm": 0.10456335544586182, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 114080 + }, + { + "epoch": 0.4410400333998237, + "grad_norm": 0.10484910011291504, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 114090 + }, + { + "epoch": 0.441078690603207, + "grad_norm": 0.09089253842830658, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 114100 + }, + { + "epoch": 0.4411173478065903, + "grad_norm": 0.10500083863735199, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 114110 + }, + { + "epoch": 0.44115600500997354, + "grad_norm": 0.12656979262828827, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 114120 + }, + { + "epoch": 0.44119466221335685, + "grad_norm": 0.1000988706946373, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 114130 + }, + { + "epoch": 0.4412333194167401, + "grad_norm": 0.0932462066411972, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 114140 + }, + { + "epoch": 0.4412719766201234, + "grad_norm": 0.14996211230754852, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 114150 + }, + { + "epoch": 0.44131063382350666, + "grad_norm": 0.0966678187251091, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 114160 + }, + { + "epoch": 0.44134929102688997, + "grad_norm": 0.10398583114147186, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 114170 + }, + { + "epoch": 0.4413879482302732, + "grad_norm": 0.09673210978507996, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 114180 + }, + { + "epoch": 0.4414266054336565, + "grad_norm": 0.10650409013032913, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 114190 + }, + { + "epoch": 0.4414652626370398, + "grad_norm": 0.10166922956705093, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 114200 + }, + { + "epoch": 0.4415039198404231, + "grad_norm": 0.11047915369272232, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 114210 + }, + { + "epoch": 0.44154257704380634, + "grad_norm": 0.09343116730451584, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 114220 + }, + { + "epoch": 0.44158123424718965, + "grad_norm": 0.10396973788738251, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 114230 + }, + { + "epoch": 0.4416198914505729, + "grad_norm": 0.10365176200866699, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 114240 + }, + { + "epoch": 0.44165854865395615, + "grad_norm": 0.11292040348052979, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 114250 + }, + { + "epoch": 0.44169720585733946, + "grad_norm": 0.10333476215600967, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 114260 + }, + { + "epoch": 0.4417358630607227, + "grad_norm": 0.08777636289596558, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 114270 + }, + { + "epoch": 0.441774520264106, + "grad_norm": 0.12246387451887131, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 114280 + }, + { + "epoch": 0.44181317746748927, + "grad_norm": 0.10196879506111145, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 114290 + }, + { + "epoch": 0.4418518346708726, + "grad_norm": 0.1406039446592331, + "learning_rate": 0.002, + "loss": 2.348, + "step": 114300 + }, + { + "epoch": 0.44189049187425583, + "grad_norm": 0.09151335805654526, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 114310 + }, + { + "epoch": 0.44192914907763914, + "grad_norm": 0.12211549282073975, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 114320 + }, + { + "epoch": 0.4419678062810224, + "grad_norm": 0.103364959359169, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 114330 + }, + { + "epoch": 0.4420064634844057, + "grad_norm": 0.095526784658432, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 114340 + }, + { + "epoch": 0.44204512068778895, + "grad_norm": 0.10776218771934509, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 114350 + }, + { + "epoch": 0.44208377789117226, + "grad_norm": 0.12116540968418121, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 114360 + }, + { + "epoch": 0.4421224350945555, + "grad_norm": 0.10187121480703354, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 114370 + }, + { + "epoch": 0.4421610922979388, + "grad_norm": 0.12751945853233337, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 114380 + }, + { + "epoch": 0.44219974950132207, + "grad_norm": 0.12061352282762527, + "learning_rate": 0.002, + "loss": 2.357, + "step": 114390 + }, + { + "epoch": 0.4422384067047054, + "grad_norm": 0.09569384902715683, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 114400 + }, + { + "epoch": 0.44227706390808863, + "grad_norm": 0.10532968491315842, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 114410 + }, + { + "epoch": 0.44231572111147194, + "grad_norm": 0.1037789061665535, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 114420 + }, + { + "epoch": 0.4423543783148552, + "grad_norm": 0.09751158207654953, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 114430 + }, + { + "epoch": 0.44239303551823844, + "grad_norm": 0.11524856090545654, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 114440 + }, + { + "epoch": 0.44243169272162175, + "grad_norm": 0.10979925096035004, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 114450 + }, + { + "epoch": 0.442470349925005, + "grad_norm": 0.1250380575656891, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 114460 + }, + { + "epoch": 0.4425090071283883, + "grad_norm": 0.11559075117111206, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 114470 + }, + { + "epoch": 0.44254766433177156, + "grad_norm": 0.10545665770769119, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 114480 + }, + { + "epoch": 0.44258632153515487, + "grad_norm": 0.13213802874088287, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 114490 + }, + { + "epoch": 0.4426249787385381, + "grad_norm": 0.10205340385437012, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 114500 + }, + { + "epoch": 0.44266363594192143, + "grad_norm": 0.10072396695613861, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 114510 + }, + { + "epoch": 0.4427022931453047, + "grad_norm": 0.10048835724592209, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 114520 + }, + { + "epoch": 0.442740950348688, + "grad_norm": 0.10904062539339066, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 114530 + }, + { + "epoch": 0.44277960755207124, + "grad_norm": 0.09817338734865189, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 114540 + }, + { + "epoch": 0.44281826475545455, + "grad_norm": 0.12179743498563766, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 114550 + }, + { + "epoch": 0.4428569219588378, + "grad_norm": 0.11451654881238937, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 114560 + }, + { + "epoch": 0.4428955791622211, + "grad_norm": 0.11346857994794846, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 114570 + }, + { + "epoch": 0.44293423636560436, + "grad_norm": 0.13127179443836212, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 114580 + }, + { + "epoch": 0.44297289356898767, + "grad_norm": 0.10564865916967392, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 114590 + }, + { + "epoch": 0.4430115507723709, + "grad_norm": 0.09019293636083603, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 114600 + }, + { + "epoch": 0.44305020797575423, + "grad_norm": 0.12438137829303741, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 114610 + }, + { + "epoch": 0.4430888651791375, + "grad_norm": 0.12015697360038757, + "learning_rate": 0.002, + "loss": 2.357, + "step": 114620 + }, + { + "epoch": 0.44312752238252073, + "grad_norm": 0.10316593199968338, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 114630 + }, + { + "epoch": 0.44316617958590404, + "grad_norm": 0.0989539623260498, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 114640 + }, + { + "epoch": 0.4432048367892873, + "grad_norm": 0.10684391111135483, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 114650 + }, + { + "epoch": 0.4432434939926706, + "grad_norm": 0.09858309477567673, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 114660 + }, + { + "epoch": 0.44328215119605385, + "grad_norm": 0.12590645253658295, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 114670 + }, + { + "epoch": 0.44332080839943716, + "grad_norm": 0.1031150296330452, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 114680 + }, + { + "epoch": 0.4433594656028204, + "grad_norm": 0.09509457647800446, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 114690 + }, + { + "epoch": 0.4433981228062037, + "grad_norm": 0.11310373246669769, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 114700 + }, + { + "epoch": 0.443436780009587, + "grad_norm": 0.10985482484102249, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 114710 + }, + { + "epoch": 0.4434754372129703, + "grad_norm": 0.11310135573148727, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 114720 + }, + { + "epoch": 0.44351409441635353, + "grad_norm": 0.10724176466464996, + "learning_rate": 0.002, + "loss": 2.347, + "step": 114730 + }, + { + "epoch": 0.44355275161973684, + "grad_norm": 0.10607755929231644, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 114740 + }, + { + "epoch": 0.4435914088231201, + "grad_norm": 0.28465360403060913, + "learning_rate": 0.002, + "loss": 2.346, + "step": 114750 + }, + { + "epoch": 0.4436300660265034, + "grad_norm": 0.1293918639421463, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 114760 + }, + { + "epoch": 0.44366872322988665, + "grad_norm": 0.1216643676161766, + "learning_rate": 0.002, + "loss": 2.348, + "step": 114770 + }, + { + "epoch": 0.44370738043326996, + "grad_norm": 0.10900459438562393, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 114780 + }, + { + "epoch": 0.4437460376366532, + "grad_norm": 0.10473953932523727, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 114790 + }, + { + "epoch": 0.4437846948400365, + "grad_norm": 0.12707144021987915, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 114800 + }, + { + "epoch": 0.44382335204341977, + "grad_norm": 0.11088820546865463, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 114810 + }, + { + "epoch": 0.443862009246803, + "grad_norm": 0.10142777115106583, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 114820 + }, + { + "epoch": 0.44390066645018633, + "grad_norm": 0.10695470869541168, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 114830 + }, + { + "epoch": 0.4439393236535696, + "grad_norm": 0.09565724432468414, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 114840 + }, + { + "epoch": 0.4439779808569529, + "grad_norm": 0.10378613322973251, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 114850 + }, + { + "epoch": 0.44401663806033614, + "grad_norm": 0.1171041876077652, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 114860 + }, + { + "epoch": 0.44405529526371945, + "grad_norm": 0.11488772928714752, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 114870 + }, + { + "epoch": 0.4440939524671027, + "grad_norm": 0.11186092346906662, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 114880 + }, + { + "epoch": 0.444132609670486, + "grad_norm": 0.10943606495857239, + "learning_rate": 0.002, + "loss": 2.3709, + "step": 114890 + }, + { + "epoch": 0.44417126687386926, + "grad_norm": 0.11419028788805008, + "learning_rate": 0.002, + "loss": 2.339, + "step": 114900 + }, + { + "epoch": 0.44420992407725257, + "grad_norm": 0.09883607923984528, + "learning_rate": 0.002, + "loss": 2.35, + "step": 114910 + }, + { + "epoch": 0.4442485812806358, + "grad_norm": 0.10349875688552856, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 114920 + }, + { + "epoch": 0.44428723848401913, + "grad_norm": 0.09061745554208755, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 114930 + }, + { + "epoch": 0.4443258956874024, + "grad_norm": 0.1337675154209137, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 114940 + }, + { + "epoch": 0.4443645528907857, + "grad_norm": 0.10015002638101578, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 114950 + }, + { + "epoch": 0.44440321009416894, + "grad_norm": 0.12010683864355087, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 114960 + }, + { + "epoch": 0.44444186729755225, + "grad_norm": 0.11135932058095932, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 114970 + }, + { + "epoch": 0.4444805245009355, + "grad_norm": 0.09960712492465973, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 114980 + }, + { + "epoch": 0.44451918170431876, + "grad_norm": 0.10387620329856873, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 114990 + }, + { + "epoch": 0.44455783890770206, + "grad_norm": 0.10625986009836197, + "learning_rate": 0.002, + "loss": 2.355, + "step": 115000 + }, + { + "epoch": 0.4445964961110853, + "grad_norm": 0.11116498708724976, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 115010 + }, + { + "epoch": 0.4446351533144686, + "grad_norm": 0.12776482105255127, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 115020 + }, + { + "epoch": 0.4446738105178519, + "grad_norm": 0.10398684442043304, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 115030 + }, + { + "epoch": 0.4447124677212352, + "grad_norm": 0.1710490733385086, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 115040 + }, + { + "epoch": 0.44475112492461844, + "grad_norm": 0.10064823180437088, + "learning_rate": 0.002, + "loss": 2.364, + "step": 115050 + }, + { + "epoch": 0.44478978212800174, + "grad_norm": 0.11258933693170547, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 115060 + }, + { + "epoch": 0.444828439331385, + "grad_norm": 0.10680379718542099, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 115070 + }, + { + "epoch": 0.4448670965347683, + "grad_norm": 0.11084065586328506, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 115080 + }, + { + "epoch": 0.44490575373815155, + "grad_norm": 0.09208144247531891, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 115090 + }, + { + "epoch": 0.44494441094153486, + "grad_norm": 0.09825203567743301, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 115100 + }, + { + "epoch": 0.4449830681449181, + "grad_norm": 0.09931337088346481, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 115110 + }, + { + "epoch": 0.4450217253483014, + "grad_norm": 0.11392378062009811, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 115120 + }, + { + "epoch": 0.4450603825516847, + "grad_norm": 0.09332132339477539, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 115130 + }, + { + "epoch": 0.445099039755068, + "grad_norm": 0.10715038329362869, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 115140 + }, + { + "epoch": 0.44513769695845123, + "grad_norm": 0.11407089233398438, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 115150 + }, + { + "epoch": 0.44517635416183454, + "grad_norm": 0.1000797227025032, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 115160 + }, + { + "epoch": 0.4452150113652178, + "grad_norm": 0.10885439068078995, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 115170 + }, + { + "epoch": 0.44525366856860105, + "grad_norm": 0.09690678864717484, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 115180 + }, + { + "epoch": 0.44529232577198435, + "grad_norm": 0.18793104588985443, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 115190 + }, + { + "epoch": 0.4453309829753676, + "grad_norm": 0.1313229501247406, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 115200 + }, + { + "epoch": 0.4453696401787509, + "grad_norm": 0.10109826177358627, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 115210 + }, + { + "epoch": 0.44540829738213417, + "grad_norm": 0.11365307122468948, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 115220 + }, + { + "epoch": 0.4454469545855175, + "grad_norm": 0.10179514437913895, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 115230 + }, + { + "epoch": 0.4454856117889007, + "grad_norm": 0.10260636359453201, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 115240 + }, + { + "epoch": 0.44552426899228403, + "grad_norm": 0.10231651365756989, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 115250 + }, + { + "epoch": 0.4455629261956673, + "grad_norm": 0.09702098369598389, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 115260 + }, + { + "epoch": 0.4456015833990506, + "grad_norm": 0.09935378283262253, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 115270 + }, + { + "epoch": 0.44564024060243385, + "grad_norm": 0.10840914398431778, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 115280 + }, + { + "epoch": 0.44567889780581715, + "grad_norm": 0.10769101232290268, + "learning_rate": 0.002, + "loss": 2.35, + "step": 115290 + }, + { + "epoch": 0.4457175550092004, + "grad_norm": 0.10299813747406006, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 115300 + }, + { + "epoch": 0.4457562122125837, + "grad_norm": 0.10182159394025803, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 115310 + }, + { + "epoch": 0.44579486941596697, + "grad_norm": 0.10414126515388489, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 115320 + }, + { + "epoch": 0.4458335266193503, + "grad_norm": 0.10255824774503708, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 115330 + }, + { + "epoch": 0.4458721838227335, + "grad_norm": 0.09214485436677933, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 115340 + }, + { + "epoch": 0.44591084102611683, + "grad_norm": 0.11276800185441971, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 115350 + }, + { + "epoch": 0.4459494982295001, + "grad_norm": 0.10205808281898499, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 115360 + }, + { + "epoch": 0.44598815543288334, + "grad_norm": 0.15296092629432678, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 115370 + }, + { + "epoch": 0.44602681263626665, + "grad_norm": 0.10987062007188797, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 115380 + }, + { + "epoch": 0.4460654698396499, + "grad_norm": 0.11441478133201599, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 115390 + }, + { + "epoch": 0.4461041270430332, + "grad_norm": 0.107122503221035, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 115400 + }, + { + "epoch": 0.44614278424641646, + "grad_norm": 0.1339341700077057, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 115410 + }, + { + "epoch": 0.44618144144979976, + "grad_norm": 0.11719755083322525, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 115420 + }, + { + "epoch": 0.446220098653183, + "grad_norm": 0.1091657504439354, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 115430 + }, + { + "epoch": 0.4462587558565663, + "grad_norm": 0.10963524878025055, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 115440 + }, + { + "epoch": 0.4462974130599496, + "grad_norm": 0.11549586802721024, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 115450 + }, + { + "epoch": 0.4463360702633329, + "grad_norm": 0.12684787809848785, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 115460 + }, + { + "epoch": 0.44637472746671614, + "grad_norm": 0.11087344586849213, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 115470 + }, + { + "epoch": 0.44641338467009944, + "grad_norm": 0.10655102878808975, + "learning_rate": 0.002, + "loss": 2.346, + "step": 115480 + }, + { + "epoch": 0.4464520418734827, + "grad_norm": 0.14614103734493256, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 115490 + }, + { + "epoch": 0.446490699076866, + "grad_norm": 0.09883083403110504, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 115500 + }, + { + "epoch": 0.44652935628024926, + "grad_norm": 0.1201692447066307, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 115510 + }, + { + "epoch": 0.44656801348363256, + "grad_norm": 0.10232232511043549, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 115520 + }, + { + "epoch": 0.4466066706870158, + "grad_norm": 0.1238783597946167, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 115530 + }, + { + "epoch": 0.4466453278903991, + "grad_norm": 0.10835893452167511, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 115540 + }, + { + "epoch": 0.4466839850937824, + "grad_norm": 0.10148458927869797, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 115550 + }, + { + "epoch": 0.44672264229716563, + "grad_norm": 0.10020631551742554, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 115560 + }, + { + "epoch": 0.44676129950054894, + "grad_norm": 0.10603410005569458, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 115570 + }, + { + "epoch": 0.4467999567039322, + "grad_norm": 0.10621050745248795, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 115580 + }, + { + "epoch": 0.4468386139073155, + "grad_norm": 0.10222116112709045, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 115590 + }, + { + "epoch": 0.44687727111069875, + "grad_norm": 0.09559807181358337, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 115600 + }, + { + "epoch": 0.44691592831408206, + "grad_norm": 0.09763092547655106, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 115610 + }, + { + "epoch": 0.4469545855174653, + "grad_norm": 0.1289861798286438, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 115620 + }, + { + "epoch": 0.4469932427208486, + "grad_norm": 0.10587324947118759, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 115630 + }, + { + "epoch": 0.44703189992423187, + "grad_norm": 0.09899197518825531, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 115640 + }, + { + "epoch": 0.4470705571276152, + "grad_norm": 0.09850729256868362, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 115650 + }, + { + "epoch": 0.44710921433099843, + "grad_norm": 0.12134348601102829, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 115660 + }, + { + "epoch": 0.44714787153438174, + "grad_norm": 0.11509352177381516, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 115670 + }, + { + "epoch": 0.447186528737765, + "grad_norm": 0.08912909030914307, + "learning_rate": 0.002, + "loss": 2.352, + "step": 115680 + }, + { + "epoch": 0.4472251859411483, + "grad_norm": 0.11786410212516785, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 115690 + }, + { + "epoch": 0.44726384314453155, + "grad_norm": 0.09729806333780289, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 115700 + }, + { + "epoch": 0.44730250034791486, + "grad_norm": 0.09955301135778427, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 115710 + }, + { + "epoch": 0.4473411575512981, + "grad_norm": 0.1184239611029625, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 115720 + }, + { + "epoch": 0.44737981475468136, + "grad_norm": 0.1331377476453781, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 115730 + }, + { + "epoch": 0.44741847195806467, + "grad_norm": 0.11014413833618164, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 115740 + }, + { + "epoch": 0.4474571291614479, + "grad_norm": 0.10066406428813934, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 115750 + }, + { + "epoch": 0.4474957863648312, + "grad_norm": 0.09046106040477753, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 115760 + }, + { + "epoch": 0.4475344435682145, + "grad_norm": 0.12679710984230042, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 115770 + }, + { + "epoch": 0.4475731007715978, + "grad_norm": 0.10072343051433563, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 115780 + }, + { + "epoch": 0.44761175797498104, + "grad_norm": 0.10047661513090134, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 115790 + }, + { + "epoch": 0.44765041517836435, + "grad_norm": 0.09472613781690598, + "learning_rate": 0.002, + "loss": 2.349, + "step": 115800 + }, + { + "epoch": 0.4476890723817476, + "grad_norm": 0.10094655305147171, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 115810 + }, + { + "epoch": 0.4477277295851309, + "grad_norm": 0.1129259541630745, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 115820 + }, + { + "epoch": 0.44776638678851416, + "grad_norm": 0.2693578898906708, + "learning_rate": 0.002, + "loss": 2.357, + "step": 115830 + }, + { + "epoch": 0.44780504399189747, + "grad_norm": 0.11392874270677567, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 115840 + }, + { + "epoch": 0.4478437011952807, + "grad_norm": 0.13713346421718597, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 115850 + }, + { + "epoch": 0.447882358398664, + "grad_norm": 0.110591359436512, + "learning_rate": 0.002, + "loss": 2.356, + "step": 115860 + }, + { + "epoch": 0.4479210156020473, + "grad_norm": 0.10545430332422256, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 115870 + }, + { + "epoch": 0.4479596728054306, + "grad_norm": 0.1109808161854744, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 115880 + }, + { + "epoch": 0.44799833000881384, + "grad_norm": 0.10120480507612228, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 115890 + }, + { + "epoch": 0.44803698721219715, + "grad_norm": 0.12015772610902786, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 115900 + }, + { + "epoch": 0.4480756444155804, + "grad_norm": 0.1165841743350029, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 115910 + }, + { + "epoch": 0.44811430161896365, + "grad_norm": 0.11997988820075989, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 115920 + }, + { + "epoch": 0.44815295882234696, + "grad_norm": 0.09375101327896118, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 115930 + }, + { + "epoch": 0.4481916160257302, + "grad_norm": 0.10501191020011902, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 115940 + }, + { + "epoch": 0.4482302732291135, + "grad_norm": 0.11917419731616974, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 115950 + }, + { + "epoch": 0.44826893043249677, + "grad_norm": 0.10625232756137848, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 115960 + }, + { + "epoch": 0.4483075876358801, + "grad_norm": 0.1068049892783165, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 115970 + }, + { + "epoch": 0.44834624483926333, + "grad_norm": 0.11341347545385361, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 115980 + }, + { + "epoch": 0.44838490204264664, + "grad_norm": 0.10867293179035187, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 115990 + }, + { + "epoch": 0.4484235592460299, + "grad_norm": 0.110105499625206, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 116000 + }, + { + "epoch": 0.4484622164494132, + "grad_norm": 0.11558972299098969, + "learning_rate": 0.002, + "loss": 2.348, + "step": 116010 + }, + { + "epoch": 0.44850087365279645, + "grad_norm": 0.12024278193712234, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 116020 + }, + { + "epoch": 0.44853953085617976, + "grad_norm": 0.09318528324365616, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 116030 + }, + { + "epoch": 0.448578188059563, + "grad_norm": 0.11388210207223892, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 116040 + }, + { + "epoch": 0.4486168452629463, + "grad_norm": 0.10485441237688065, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 116050 + }, + { + "epoch": 0.44865550246632957, + "grad_norm": 0.10908669233322144, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 116060 + }, + { + "epoch": 0.4486941596697129, + "grad_norm": 0.11425649374723434, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 116070 + }, + { + "epoch": 0.44873281687309613, + "grad_norm": 0.09546269476413727, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 116080 + }, + { + "epoch": 0.44877147407647944, + "grad_norm": 0.10710975527763367, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 116090 + }, + { + "epoch": 0.4488101312798627, + "grad_norm": 0.12068114429712296, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 116100 + }, + { + "epoch": 0.44884878848324594, + "grad_norm": 0.10486262291669846, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 116110 + }, + { + "epoch": 0.44888744568662925, + "grad_norm": 0.10621446371078491, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 116120 + }, + { + "epoch": 0.4489261028900125, + "grad_norm": 0.09761782735586166, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 116130 + }, + { + "epoch": 0.4489647600933958, + "grad_norm": 0.10787412524223328, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 116140 + }, + { + "epoch": 0.44900341729677906, + "grad_norm": 0.11185034364461899, + "learning_rate": 0.002, + "loss": 2.359, + "step": 116150 + }, + { + "epoch": 0.44904207450016237, + "grad_norm": 0.10436546057462692, + "learning_rate": 0.002, + "loss": 2.356, + "step": 116160 + }, + { + "epoch": 0.4490807317035456, + "grad_norm": 0.1262926459312439, + "learning_rate": 0.002, + "loss": 2.3689, + "step": 116170 + }, + { + "epoch": 0.44911938890692893, + "grad_norm": 0.09157133102416992, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 116180 + }, + { + "epoch": 0.4491580461103122, + "grad_norm": 0.1170177310705185, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 116190 + }, + { + "epoch": 0.4491967033136955, + "grad_norm": 0.10202539712190628, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 116200 + }, + { + "epoch": 0.44923536051707874, + "grad_norm": 0.11776548624038696, + "learning_rate": 0.002, + "loss": 2.343, + "step": 116210 + }, + { + "epoch": 0.44927401772046205, + "grad_norm": 0.09582582116127014, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 116220 + }, + { + "epoch": 0.4493126749238453, + "grad_norm": 0.09644076973199844, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 116230 + }, + { + "epoch": 0.4493513321272286, + "grad_norm": 0.11195076256990433, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 116240 + }, + { + "epoch": 0.44938998933061186, + "grad_norm": 0.11746636778116226, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 116250 + }, + { + "epoch": 0.44942864653399517, + "grad_norm": 0.11495273560285568, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 116260 + }, + { + "epoch": 0.4494673037373784, + "grad_norm": 0.12250115722417831, + "learning_rate": 0.002, + "loss": 2.33, + "step": 116270 + }, + { + "epoch": 0.44950596094076173, + "grad_norm": 0.10969327390193939, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 116280 + }, + { + "epoch": 0.449544618144145, + "grad_norm": 0.10717156529426575, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 116290 + }, + { + "epoch": 0.44958327534752823, + "grad_norm": 0.12968280911445618, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 116300 + }, + { + "epoch": 0.44962193255091154, + "grad_norm": 0.10125463455915451, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 116310 + }, + { + "epoch": 0.4496605897542948, + "grad_norm": 0.10570531338453293, + "learning_rate": 0.002, + "loss": 2.349, + "step": 116320 + }, + { + "epoch": 0.4496992469576781, + "grad_norm": 0.10413254052400589, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 116330 + }, + { + "epoch": 0.44973790416106135, + "grad_norm": 0.11833761632442474, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 116340 + }, + { + "epoch": 0.44977656136444466, + "grad_norm": 0.09715671837329865, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 116350 + }, + { + "epoch": 0.4498152185678279, + "grad_norm": 0.09942399710416794, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 116360 + }, + { + "epoch": 0.4498538757712112, + "grad_norm": 0.1249130591750145, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 116370 + }, + { + "epoch": 0.4498925329745945, + "grad_norm": 0.11511149257421494, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 116380 + }, + { + "epoch": 0.4499311901779778, + "grad_norm": 0.117387555539608, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 116390 + }, + { + "epoch": 0.44996984738136103, + "grad_norm": 0.11409103870391846, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 116400 + }, + { + "epoch": 0.45000850458474434, + "grad_norm": 0.09024668484926224, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 116410 + }, + { + "epoch": 0.4500471617881276, + "grad_norm": 0.12117420881986618, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 116420 + }, + { + "epoch": 0.4500858189915109, + "grad_norm": 0.12811824679374695, + "learning_rate": 0.002, + "loss": 2.3953, + "step": 116430 + }, + { + "epoch": 0.45012447619489415, + "grad_norm": 0.09323813021183014, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 116440 + }, + { + "epoch": 0.45016313339827746, + "grad_norm": 0.10673702508211136, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 116450 + }, + { + "epoch": 0.4502017906016607, + "grad_norm": 0.1002158671617508, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 116460 + }, + { + "epoch": 0.450240447805044, + "grad_norm": 0.08856503665447235, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 116470 + }, + { + "epoch": 0.45027910500842727, + "grad_norm": 0.14465701580047607, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 116480 + }, + { + "epoch": 0.4503177622118105, + "grad_norm": 0.11650863289833069, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 116490 + }, + { + "epoch": 0.45035641941519383, + "grad_norm": 0.10695286840200424, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 116500 + }, + { + "epoch": 0.4503950766185771, + "grad_norm": 0.11719808727502823, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 116510 + }, + { + "epoch": 0.4504337338219604, + "grad_norm": 0.11228390038013458, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 116520 + }, + { + "epoch": 0.45047239102534364, + "grad_norm": 0.10490066558122635, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 116530 + }, + { + "epoch": 0.45051104822872695, + "grad_norm": 0.10596594959497452, + "learning_rate": 0.002, + "loss": 2.336, + "step": 116540 + }, + { + "epoch": 0.4505497054321102, + "grad_norm": 0.10827124118804932, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 116550 + }, + { + "epoch": 0.4505883626354935, + "grad_norm": 0.09110372513532639, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 116560 + }, + { + "epoch": 0.45062701983887676, + "grad_norm": 0.11297664791345596, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 116570 + }, + { + "epoch": 0.45066567704226007, + "grad_norm": 0.10109726339578629, + "learning_rate": 0.002, + "loss": 2.352, + "step": 116580 + }, + { + "epoch": 0.4507043342456433, + "grad_norm": 0.12487292289733887, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 116590 + }, + { + "epoch": 0.45074299144902663, + "grad_norm": 0.11954599618911743, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 116600 + }, + { + "epoch": 0.4507816486524099, + "grad_norm": 0.11460407823324203, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 116610 + }, + { + "epoch": 0.4508203058557932, + "grad_norm": 0.1472836583852768, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 116620 + }, + { + "epoch": 0.45085896305917644, + "grad_norm": 0.11723505705595016, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 116630 + }, + { + "epoch": 0.45089762026255975, + "grad_norm": 0.11846832185983658, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 116640 + }, + { + "epoch": 0.450936277465943, + "grad_norm": 0.10739443451166153, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 116650 + }, + { + "epoch": 0.45097493466932626, + "grad_norm": 0.0924198254942894, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 116660 + }, + { + "epoch": 0.45101359187270956, + "grad_norm": 0.09949216991662979, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 116670 + }, + { + "epoch": 0.4510522490760928, + "grad_norm": 0.10973398387432098, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 116680 + }, + { + "epoch": 0.4510909062794761, + "grad_norm": 0.10357681661844254, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 116690 + }, + { + "epoch": 0.4511295634828594, + "grad_norm": 0.11836923658847809, + "learning_rate": 0.002, + "loss": 2.3656, + "step": 116700 + }, + { + "epoch": 0.4511682206862427, + "grad_norm": 0.09963072091341019, + "learning_rate": 0.002, + "loss": 2.353, + "step": 116710 + }, + { + "epoch": 0.45120687788962593, + "grad_norm": 0.10210157185792923, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 116720 + }, + { + "epoch": 0.45124553509300924, + "grad_norm": 0.12839585542678833, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 116730 + }, + { + "epoch": 0.4512841922963925, + "grad_norm": 0.10952276736497879, + "learning_rate": 0.002, + "loss": 2.354, + "step": 116740 + }, + { + "epoch": 0.4513228494997758, + "grad_norm": 0.10620319843292236, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 116750 + }, + { + "epoch": 0.45136150670315905, + "grad_norm": 0.13847695291042328, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 116760 + }, + { + "epoch": 0.45140016390654236, + "grad_norm": 0.1073933020234108, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 116770 + }, + { + "epoch": 0.4514388211099256, + "grad_norm": 0.11338157206773758, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 116780 + }, + { + "epoch": 0.4514774783133089, + "grad_norm": 0.10555189102888107, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 116790 + }, + { + "epoch": 0.4515161355166922, + "grad_norm": 0.0956127718091011, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 116800 + }, + { + "epoch": 0.4515547927200755, + "grad_norm": 0.10321953892707825, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 116810 + }, + { + "epoch": 0.45159344992345873, + "grad_norm": 0.12122472375631332, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 116820 + }, + { + "epoch": 0.45163210712684204, + "grad_norm": 0.12436135113239288, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 116830 + }, + { + "epoch": 0.4516707643302253, + "grad_norm": 0.1039997860789299, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 116840 + }, + { + "epoch": 0.45170942153360855, + "grad_norm": 0.11182855814695358, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 116850 + }, + { + "epoch": 0.45174807873699185, + "grad_norm": 0.11596731841564178, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 116860 + }, + { + "epoch": 0.4517867359403751, + "grad_norm": 0.10430099815130234, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 116870 + }, + { + "epoch": 0.4518253931437584, + "grad_norm": 0.11345577985048294, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 116880 + }, + { + "epoch": 0.45186405034714167, + "grad_norm": 0.10795613378286362, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 116890 + }, + { + "epoch": 0.451902707550525, + "grad_norm": 0.13586848974227905, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 116900 + }, + { + "epoch": 0.4519413647539082, + "grad_norm": 0.1062573790550232, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 116910 + }, + { + "epoch": 0.45198002195729153, + "grad_norm": 0.11689643561840057, + "learning_rate": 0.002, + "loss": 2.336, + "step": 116920 + }, + { + "epoch": 0.4520186791606748, + "grad_norm": 0.11215382069349289, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 116930 + }, + { + "epoch": 0.4520573363640581, + "grad_norm": 0.10423940420150757, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 116940 + }, + { + "epoch": 0.45209599356744135, + "grad_norm": 0.11624042689800262, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 116950 + }, + { + "epoch": 0.45213465077082465, + "grad_norm": 0.10433658957481384, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 116960 + }, + { + "epoch": 0.4521733079742079, + "grad_norm": 0.10768541693687439, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 116970 + }, + { + "epoch": 0.4522119651775912, + "grad_norm": 0.10143450647592545, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 116980 + }, + { + "epoch": 0.45225062238097447, + "grad_norm": 0.1296466588973999, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 116990 + }, + { + "epoch": 0.4522892795843578, + "grad_norm": 0.13517975807189941, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 117000 + }, + { + "epoch": 0.452327936787741, + "grad_norm": 0.10195448994636536, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 117010 + }, + { + "epoch": 0.45236659399112433, + "grad_norm": 0.1134767085313797, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 117020 + }, + { + "epoch": 0.4524052511945076, + "grad_norm": 0.11139202117919922, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 117030 + }, + { + "epoch": 0.45244390839789084, + "grad_norm": 0.11013373732566833, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 117040 + }, + { + "epoch": 0.45248256560127414, + "grad_norm": 0.11789267510175705, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 117050 + }, + { + "epoch": 0.4525212228046574, + "grad_norm": 0.10006678104400635, + "learning_rate": 0.002, + "loss": 2.356, + "step": 117060 + }, + { + "epoch": 0.4525598800080407, + "grad_norm": 0.10965073108673096, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 117070 + }, + { + "epoch": 0.45259853721142396, + "grad_norm": 0.10759425163269043, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 117080 + }, + { + "epoch": 0.45263719441480726, + "grad_norm": 0.1084289476275444, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 117090 + }, + { + "epoch": 0.4526758516181905, + "grad_norm": 0.10556847602128983, + "learning_rate": 0.002, + "loss": 2.346, + "step": 117100 + }, + { + "epoch": 0.4527145088215738, + "grad_norm": 0.11164150387048721, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 117110 + }, + { + "epoch": 0.4527531660249571, + "grad_norm": 0.12561391294002533, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 117120 + }, + { + "epoch": 0.4527918232283404, + "grad_norm": 0.09579742699861526, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 117130 + }, + { + "epoch": 0.45283048043172364, + "grad_norm": 0.11333385109901428, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 117140 + }, + { + "epoch": 0.45286913763510694, + "grad_norm": 0.1069633960723877, + "learning_rate": 0.002, + "loss": 2.355, + "step": 117150 + }, + { + "epoch": 0.4529077948384902, + "grad_norm": 0.10790170729160309, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 117160 + }, + { + "epoch": 0.4529464520418735, + "grad_norm": 0.11147533357143402, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 117170 + }, + { + "epoch": 0.45298510924525676, + "grad_norm": 0.1039792075753212, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 117180 + }, + { + "epoch": 0.45302376644864006, + "grad_norm": 0.10051432996988297, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 117190 + }, + { + "epoch": 0.4530624236520233, + "grad_norm": 0.09797567129135132, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 117200 + }, + { + "epoch": 0.4531010808554066, + "grad_norm": 0.10028452426195145, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 117210 + }, + { + "epoch": 0.4531397380587899, + "grad_norm": 0.09154338389635086, + "learning_rate": 0.002, + "loss": 2.35, + "step": 117220 + }, + { + "epoch": 0.45317839526217313, + "grad_norm": 0.1039407029747963, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 117230 + }, + { + "epoch": 0.45321705246555644, + "grad_norm": 0.10166822373867035, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 117240 + }, + { + "epoch": 0.4532557096689397, + "grad_norm": 0.12528187036514282, + "learning_rate": 0.002, + "loss": 2.346, + "step": 117250 + }, + { + "epoch": 0.453294366872323, + "grad_norm": 0.10523614287376404, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 117260 + }, + { + "epoch": 0.45333302407570625, + "grad_norm": 0.09844973683357239, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 117270 + }, + { + "epoch": 0.45337168127908956, + "grad_norm": 0.11845612525939941, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 117280 + }, + { + "epoch": 0.4534103384824728, + "grad_norm": 0.11746399104595184, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 117290 + }, + { + "epoch": 0.4534489956858561, + "grad_norm": 0.11793000251054764, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 117300 + }, + { + "epoch": 0.45348765288923937, + "grad_norm": 0.11313361674547195, + "learning_rate": 0.002, + "loss": 2.3683, + "step": 117310 + }, + { + "epoch": 0.4535263100926227, + "grad_norm": 0.1121147871017456, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 117320 + }, + { + "epoch": 0.4535649672960059, + "grad_norm": 0.1194700226187706, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 117330 + }, + { + "epoch": 0.45360362449938924, + "grad_norm": 0.11465243995189667, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 117340 + }, + { + "epoch": 0.4536422817027725, + "grad_norm": 0.11006400734186172, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 117350 + }, + { + "epoch": 0.4536809389061558, + "grad_norm": 0.12547846138477325, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 117360 + }, + { + "epoch": 0.45371959610953905, + "grad_norm": 0.12957924604415894, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 117370 + }, + { + "epoch": 0.45375825331292235, + "grad_norm": 0.09238374978303909, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 117380 + }, + { + "epoch": 0.4537969105163056, + "grad_norm": 0.091926708817482, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 117390 + }, + { + "epoch": 0.45383556771968886, + "grad_norm": 0.10020801424980164, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 117400 + }, + { + "epoch": 0.45387422492307217, + "grad_norm": 0.10550692677497864, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 117410 + }, + { + "epoch": 0.4539128821264554, + "grad_norm": 0.10293002426624298, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 117420 + }, + { + "epoch": 0.4539515393298387, + "grad_norm": 0.11234964430332184, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 117430 + }, + { + "epoch": 0.453990196533222, + "grad_norm": 0.10801722854375839, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 117440 + }, + { + "epoch": 0.4540288537366053, + "grad_norm": 0.10281901806592941, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 117450 + }, + { + "epoch": 0.45406751093998854, + "grad_norm": 0.11679302155971527, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 117460 + }, + { + "epoch": 0.45410616814337185, + "grad_norm": 0.10115484893321991, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 117470 + }, + { + "epoch": 0.4541448253467551, + "grad_norm": 0.09839174151420593, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 117480 + }, + { + "epoch": 0.4541834825501384, + "grad_norm": 0.10131075978279114, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 117490 + }, + { + "epoch": 0.45422213975352166, + "grad_norm": 0.10719174891710281, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 117500 + }, + { + "epoch": 0.45426079695690497, + "grad_norm": 0.11066577583551407, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 117510 + }, + { + "epoch": 0.4542994541602882, + "grad_norm": 0.11224183440208435, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 117520 + }, + { + "epoch": 0.4543381113636715, + "grad_norm": 0.09913508594036102, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 117530 + }, + { + "epoch": 0.4543767685670548, + "grad_norm": 0.12280930578708649, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 117540 + }, + { + "epoch": 0.4544154257704381, + "grad_norm": 0.11988666653633118, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 117550 + }, + { + "epoch": 0.45445408297382134, + "grad_norm": 0.09960682690143585, + "learning_rate": 0.002, + "loss": 2.348, + "step": 117560 + }, + { + "epoch": 0.45449274017720465, + "grad_norm": 0.10411369055509567, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 117570 + }, + { + "epoch": 0.4545313973805879, + "grad_norm": 0.10298977047204971, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 117580 + }, + { + "epoch": 0.45457005458397115, + "grad_norm": 0.10172754526138306, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 117590 + }, + { + "epoch": 0.45460871178735446, + "grad_norm": 0.11252429336309433, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 117600 + }, + { + "epoch": 0.4546473689907377, + "grad_norm": 0.10582167655229568, + "learning_rate": 0.002, + "loss": 2.339, + "step": 117610 + }, + { + "epoch": 0.454686026194121, + "grad_norm": 0.11229343712329865, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 117620 + }, + { + "epoch": 0.45472468339750427, + "grad_norm": 0.10725530236959457, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 117630 + }, + { + "epoch": 0.4547633406008876, + "grad_norm": 0.11920682340860367, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 117640 + }, + { + "epoch": 0.45480199780427083, + "grad_norm": 0.11216481029987335, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 117650 + }, + { + "epoch": 0.45484065500765414, + "grad_norm": 0.10069788247346878, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 117660 + }, + { + "epoch": 0.4548793122110374, + "grad_norm": 0.1241251602768898, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 117670 + }, + { + "epoch": 0.4549179694144207, + "grad_norm": 0.11363086104393005, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 117680 + }, + { + "epoch": 0.45495662661780395, + "grad_norm": 0.1133902296423912, + "learning_rate": 0.002, + "loss": 2.336, + "step": 117690 + }, + { + "epoch": 0.45499528382118726, + "grad_norm": 0.10708627104759216, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 117700 + }, + { + "epoch": 0.4550339410245705, + "grad_norm": 0.10656888037919998, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 117710 + }, + { + "epoch": 0.4550725982279538, + "grad_norm": 0.09741010516881943, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 117720 + }, + { + "epoch": 0.45511125543133707, + "grad_norm": 0.10540743917226791, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 117730 + }, + { + "epoch": 0.4551499126347204, + "grad_norm": 0.10656697303056717, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 117740 + }, + { + "epoch": 0.45518856983810363, + "grad_norm": 0.10528460144996643, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 117750 + }, + { + "epoch": 0.45522722704148694, + "grad_norm": 0.1179598867893219, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 117760 + }, + { + "epoch": 0.4552658842448702, + "grad_norm": 0.12058539688587189, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 117770 + }, + { + "epoch": 0.45530454144825344, + "grad_norm": 0.12516923248767853, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 117780 + }, + { + "epoch": 0.45534319865163675, + "grad_norm": 0.1060568243265152, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 117790 + }, + { + "epoch": 0.45538185585502, + "grad_norm": 0.11407436430454254, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 117800 + }, + { + "epoch": 0.4554205130584033, + "grad_norm": 0.10866278409957886, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 117810 + }, + { + "epoch": 0.45545917026178656, + "grad_norm": 0.10732495784759521, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 117820 + }, + { + "epoch": 0.45549782746516987, + "grad_norm": 0.11802234500646591, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 117830 + }, + { + "epoch": 0.4555364846685531, + "grad_norm": 0.11777086555957794, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 117840 + }, + { + "epoch": 0.45557514187193643, + "grad_norm": 0.10621151328086853, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 117850 + }, + { + "epoch": 0.4556137990753197, + "grad_norm": 0.09682837873697281, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 117860 + }, + { + "epoch": 0.455652456278703, + "grad_norm": 0.11486103385686874, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 117870 + }, + { + "epoch": 0.45569111348208624, + "grad_norm": 0.10555505752563477, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 117880 + }, + { + "epoch": 0.45572977068546955, + "grad_norm": 0.12206471711397171, + "learning_rate": 0.002, + "loss": 2.347, + "step": 117890 + }, + { + "epoch": 0.4557684278888528, + "grad_norm": 0.10294688493013382, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 117900 + }, + { + "epoch": 0.4558070850922361, + "grad_norm": 0.09726481139659882, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 117910 + }, + { + "epoch": 0.45584574229561936, + "grad_norm": 0.11977904289960861, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 117920 + }, + { + "epoch": 0.45588439949900267, + "grad_norm": 0.10810381174087524, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 117930 + }, + { + "epoch": 0.4559230567023859, + "grad_norm": 0.12523780763149261, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 117940 + }, + { + "epoch": 0.45596171390576923, + "grad_norm": 0.10021597892045975, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 117950 + }, + { + "epoch": 0.4560003711091525, + "grad_norm": 0.11003972589969635, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 117960 + }, + { + "epoch": 0.45603902831253573, + "grad_norm": 0.10835086554288864, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 117970 + }, + { + "epoch": 0.45607768551591904, + "grad_norm": 0.11682787537574768, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 117980 + }, + { + "epoch": 0.4561163427193023, + "grad_norm": 0.10805117338895798, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 117990 + }, + { + "epoch": 0.4561549999226856, + "grad_norm": 0.11354291439056396, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 118000 + }, + { + "epoch": 0.45619365712606885, + "grad_norm": 0.09457213431596756, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 118010 + }, + { + "epoch": 0.45623231432945216, + "grad_norm": 0.11127649247646332, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 118020 + }, + { + "epoch": 0.4562709715328354, + "grad_norm": 0.1210864931344986, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 118030 + }, + { + "epoch": 0.4563096287362187, + "grad_norm": 0.10766540467739105, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 118040 + }, + { + "epoch": 0.45634828593960197, + "grad_norm": 0.09710508584976196, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 118050 + }, + { + "epoch": 0.4563869431429853, + "grad_norm": 0.10217183083295822, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 118060 + }, + { + "epoch": 0.45642560034636853, + "grad_norm": 0.11859377473592758, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 118070 + }, + { + "epoch": 0.45646425754975184, + "grad_norm": 0.11485655605792999, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 118080 + }, + { + "epoch": 0.4565029147531351, + "grad_norm": 0.10804521292448044, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 118090 + }, + { + "epoch": 0.4565415719565184, + "grad_norm": 0.11177793145179749, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 118100 + }, + { + "epoch": 0.45658022915990165, + "grad_norm": 0.13285163044929504, + "learning_rate": 0.002, + "loss": 2.341, + "step": 118110 + }, + { + "epoch": 0.45661888636328496, + "grad_norm": 0.101204052567482, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 118120 + }, + { + "epoch": 0.4566575435666682, + "grad_norm": 0.10703273862600327, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 118130 + }, + { + "epoch": 0.4566962007700515, + "grad_norm": 0.10772348195314407, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 118140 + }, + { + "epoch": 0.45673485797343477, + "grad_norm": 0.1055298000574112, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 118150 + }, + { + "epoch": 0.456773515176818, + "grad_norm": 0.10721943527460098, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 118160 + }, + { + "epoch": 0.45681217238020133, + "grad_norm": 0.12398935854434967, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 118170 + }, + { + "epoch": 0.4568508295835846, + "grad_norm": 0.10382281243801117, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 118180 + }, + { + "epoch": 0.4568894867869679, + "grad_norm": 0.09265174716711044, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 118190 + }, + { + "epoch": 0.45692814399035114, + "grad_norm": 0.09246476739645004, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 118200 + }, + { + "epoch": 0.45696680119373445, + "grad_norm": 0.10811164975166321, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 118210 + }, + { + "epoch": 0.4570054583971177, + "grad_norm": 0.11175240576267242, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 118220 + }, + { + "epoch": 0.457044115600501, + "grad_norm": 0.11679556965827942, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 118230 + }, + { + "epoch": 0.45708277280388426, + "grad_norm": 0.11951835453510284, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 118240 + }, + { + "epoch": 0.45712143000726757, + "grad_norm": 0.09550821036100388, + "learning_rate": 0.002, + "loss": 2.346, + "step": 118250 + }, + { + "epoch": 0.4571600872106508, + "grad_norm": 0.12797249853610992, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 118260 + }, + { + "epoch": 0.45719874441403413, + "grad_norm": 0.1069146916270256, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 118270 + }, + { + "epoch": 0.4572374016174174, + "grad_norm": 0.11669927090406418, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 118280 + }, + { + "epoch": 0.4572760588208007, + "grad_norm": 0.09638587385416031, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 118290 + }, + { + "epoch": 0.45731471602418394, + "grad_norm": 0.10993216931819916, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 118300 + }, + { + "epoch": 0.45735337322756725, + "grad_norm": 0.10969716310501099, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 118310 + }, + { + "epoch": 0.4573920304309505, + "grad_norm": 0.10108502954244614, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 118320 + }, + { + "epoch": 0.45743068763433375, + "grad_norm": 0.11068734526634216, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 118330 + }, + { + "epoch": 0.45746934483771706, + "grad_norm": 0.09397805482149124, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 118340 + }, + { + "epoch": 0.4575080020411003, + "grad_norm": 0.10946174710988998, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 118350 + }, + { + "epoch": 0.4575466592444836, + "grad_norm": 0.11118168383836746, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 118360 + }, + { + "epoch": 0.4575853164478669, + "grad_norm": 0.11338678002357483, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 118370 + }, + { + "epoch": 0.4576239736512502, + "grad_norm": 0.10616643726825714, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 118380 + }, + { + "epoch": 0.45766263085463343, + "grad_norm": 0.0960552766919136, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 118390 + }, + { + "epoch": 0.45770128805801674, + "grad_norm": 0.10193514823913574, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 118400 + }, + { + "epoch": 0.4577399452614, + "grad_norm": 0.10242512077093124, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 118410 + }, + { + "epoch": 0.4577786024647833, + "grad_norm": 0.09158801287412643, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 118420 + }, + { + "epoch": 0.45781725966816655, + "grad_norm": 0.11092250794172287, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 118430 + }, + { + "epoch": 0.45785591687154986, + "grad_norm": 0.11364707350730896, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 118440 + }, + { + "epoch": 0.4578945740749331, + "grad_norm": 0.10631339997053146, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 118450 + }, + { + "epoch": 0.4579332312783164, + "grad_norm": 0.11272207647562027, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 118460 + }, + { + "epoch": 0.4579718884816997, + "grad_norm": 0.109578438103199, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 118470 + }, + { + "epoch": 0.458010545685083, + "grad_norm": 0.09565993398427963, + "learning_rate": 0.002, + "loss": 2.361, + "step": 118480 + }, + { + "epoch": 0.45804920288846623, + "grad_norm": 0.1042008176445961, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 118490 + }, + { + "epoch": 0.45808786009184954, + "grad_norm": 0.09246491640806198, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 118500 + }, + { + "epoch": 0.4581265172952328, + "grad_norm": 0.0987660214304924, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 118510 + }, + { + "epoch": 0.45816517449861605, + "grad_norm": 0.1291125863790512, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 118520 + }, + { + "epoch": 0.45820383170199935, + "grad_norm": 0.11253218352794647, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 118530 + }, + { + "epoch": 0.4582424889053826, + "grad_norm": 0.10364680737257004, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 118540 + }, + { + "epoch": 0.4582811461087659, + "grad_norm": 0.09472087025642395, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 118550 + }, + { + "epoch": 0.45831980331214917, + "grad_norm": 0.11546590924263, + "learning_rate": 0.002, + "loss": 2.35, + "step": 118560 + }, + { + "epoch": 0.4583584605155325, + "grad_norm": 0.11254972964525223, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 118570 + }, + { + "epoch": 0.4583971177189157, + "grad_norm": 0.11288855969905853, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 118580 + }, + { + "epoch": 0.45843577492229903, + "grad_norm": 0.10879432410001755, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 118590 + }, + { + "epoch": 0.4584744321256823, + "grad_norm": 0.10375107824802399, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 118600 + }, + { + "epoch": 0.4585130893290656, + "grad_norm": 0.10116180032491684, + "learning_rate": 0.002, + "loss": 2.342, + "step": 118610 + }, + { + "epoch": 0.45855174653244885, + "grad_norm": 0.13571816682815552, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 118620 + }, + { + "epoch": 0.45859040373583215, + "grad_norm": 0.10424613207578659, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 118630 + }, + { + "epoch": 0.4586290609392154, + "grad_norm": 0.10607179999351501, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 118640 + }, + { + "epoch": 0.4586677181425987, + "grad_norm": 0.10820239782333374, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 118650 + }, + { + "epoch": 0.45870637534598196, + "grad_norm": 0.11633388698101044, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 118660 + }, + { + "epoch": 0.4587450325493653, + "grad_norm": 0.10916215181350708, + "learning_rate": 0.002, + "loss": 2.351, + "step": 118670 + }, + { + "epoch": 0.4587836897527485, + "grad_norm": 0.11252785474061966, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 118680 + }, + { + "epoch": 0.45882234695613183, + "grad_norm": 0.11671953648328781, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 118690 + }, + { + "epoch": 0.4588610041595151, + "grad_norm": 0.09520015120506287, + "learning_rate": 0.002, + "loss": 2.352, + "step": 118700 + }, + { + "epoch": 0.45889966136289834, + "grad_norm": 0.09781093150377274, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 118710 + }, + { + "epoch": 0.45893831856628164, + "grad_norm": 0.10954856872558594, + "learning_rate": 0.002, + "loss": 2.351, + "step": 118720 + }, + { + "epoch": 0.4589769757696649, + "grad_norm": 0.11329742521047592, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 118730 + }, + { + "epoch": 0.4590156329730482, + "grad_norm": 0.13266512751579285, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 118740 + }, + { + "epoch": 0.45905429017643146, + "grad_norm": 0.10615793615579605, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 118750 + }, + { + "epoch": 0.45909294737981476, + "grad_norm": 0.09207666665315628, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 118760 + }, + { + "epoch": 0.459131604583198, + "grad_norm": 0.1197124645113945, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 118770 + }, + { + "epoch": 0.4591702617865813, + "grad_norm": 0.10759207606315613, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 118780 + }, + { + "epoch": 0.4592089189899646, + "grad_norm": 0.09776882082223892, + "learning_rate": 0.002, + "loss": 2.352, + "step": 118790 + }, + { + "epoch": 0.4592475761933479, + "grad_norm": 0.10756875574588776, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 118800 + }, + { + "epoch": 0.45928623339673114, + "grad_norm": 0.11720873415470123, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 118810 + }, + { + "epoch": 0.45932489060011444, + "grad_norm": 0.23412977159023285, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 118820 + }, + { + "epoch": 0.4593635478034977, + "grad_norm": 0.11052930355072021, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 118830 + }, + { + "epoch": 0.459402205006881, + "grad_norm": 0.09955132752656937, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 118840 + }, + { + "epoch": 0.45944086221026426, + "grad_norm": 0.10409852862358093, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 118850 + }, + { + "epoch": 0.45947951941364756, + "grad_norm": 0.1087784692645073, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 118860 + }, + { + "epoch": 0.4595181766170308, + "grad_norm": 0.1173655092716217, + "learning_rate": 0.002, + "loss": 2.341, + "step": 118870 + }, + { + "epoch": 0.4595568338204141, + "grad_norm": 0.09329547733068466, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 118880 + }, + { + "epoch": 0.4595954910237974, + "grad_norm": 0.12537939846515656, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 118890 + }, + { + "epoch": 0.45963414822718063, + "grad_norm": 0.09709435701370239, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 118900 + }, + { + "epoch": 0.45967280543056394, + "grad_norm": 0.10174285620450974, + "learning_rate": 0.002, + "loss": 2.349, + "step": 118910 + }, + { + "epoch": 0.4597114626339472, + "grad_norm": 0.11365412175655365, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 118920 + }, + { + "epoch": 0.4597501198373305, + "grad_norm": 0.10826610773801804, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 118930 + }, + { + "epoch": 0.45978877704071375, + "grad_norm": 0.11835940182209015, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 118940 + }, + { + "epoch": 0.45982743424409706, + "grad_norm": 0.11163625121116638, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 118950 + }, + { + "epoch": 0.4598660914474803, + "grad_norm": 0.09987737238407135, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 118960 + }, + { + "epoch": 0.4599047486508636, + "grad_norm": 0.10360037535429001, + "learning_rate": 0.002, + "loss": 2.358, + "step": 118970 + }, + { + "epoch": 0.45994340585424687, + "grad_norm": 0.10967511683702469, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 118980 + }, + { + "epoch": 0.4599820630576302, + "grad_norm": 0.11492753028869629, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 118990 + }, + { + "epoch": 0.4600207202610134, + "grad_norm": 0.10047702491283417, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 119000 + }, + { + "epoch": 0.46005937746439673, + "grad_norm": 0.10438437014818192, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 119010 + }, + { + "epoch": 0.46009803466778, + "grad_norm": 0.11283348500728607, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 119020 + }, + { + "epoch": 0.4601366918711633, + "grad_norm": 0.11161735653877258, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 119030 + }, + { + "epoch": 0.46017534907454655, + "grad_norm": 0.11847127974033356, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 119040 + }, + { + "epoch": 0.46021400627792985, + "grad_norm": 0.11654791980981827, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 119050 + }, + { + "epoch": 0.4602526634813131, + "grad_norm": 0.1058049201965332, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 119060 + }, + { + "epoch": 0.46029132068469636, + "grad_norm": 0.12000872939825058, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 119070 + }, + { + "epoch": 0.46032997788807967, + "grad_norm": 0.11992703378200531, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 119080 + }, + { + "epoch": 0.4603686350914629, + "grad_norm": 0.12298092991113663, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 119090 + }, + { + "epoch": 0.4604072922948462, + "grad_norm": 0.0997709259390831, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 119100 + }, + { + "epoch": 0.4604459494982295, + "grad_norm": 0.11276139318943024, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 119110 + }, + { + "epoch": 0.4604846067016128, + "grad_norm": 0.09762197732925415, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 119120 + }, + { + "epoch": 0.46052326390499604, + "grad_norm": 0.1194218322634697, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 119130 + }, + { + "epoch": 0.46056192110837935, + "grad_norm": 0.10560303181409836, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 119140 + }, + { + "epoch": 0.4606005783117626, + "grad_norm": 0.10876715183258057, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 119150 + }, + { + "epoch": 0.4606392355151459, + "grad_norm": 0.11960139125585556, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 119160 + }, + { + "epoch": 0.46067789271852916, + "grad_norm": 0.11625376343727112, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 119170 + }, + { + "epoch": 0.46071654992191247, + "grad_norm": 0.11519750207662582, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 119180 + }, + { + "epoch": 0.4607552071252957, + "grad_norm": 0.12271353602409363, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 119190 + }, + { + "epoch": 0.460793864328679, + "grad_norm": 0.08917110413312912, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 119200 + }, + { + "epoch": 0.4608325215320623, + "grad_norm": 0.09352830797433853, + "learning_rate": 0.002, + "loss": 2.347, + "step": 119210 + }, + { + "epoch": 0.4608711787354456, + "grad_norm": 0.10510279983282089, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 119220 + }, + { + "epoch": 0.46090983593882884, + "grad_norm": 0.10788548737764359, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 119230 + }, + { + "epoch": 0.46094849314221215, + "grad_norm": 0.10123184323310852, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 119240 + }, + { + "epoch": 0.4609871503455954, + "grad_norm": 0.1042218804359436, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 119250 + }, + { + "epoch": 0.46102580754897865, + "grad_norm": 0.09787441790103912, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 119260 + }, + { + "epoch": 0.46106446475236196, + "grad_norm": 0.0965803861618042, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 119270 + }, + { + "epoch": 0.4611031219557452, + "grad_norm": 0.11218937486410141, + "learning_rate": 0.002, + "loss": 2.354, + "step": 119280 + }, + { + "epoch": 0.4611417791591285, + "grad_norm": 0.0979524776339531, + "learning_rate": 0.002, + "loss": 2.352, + "step": 119290 + }, + { + "epoch": 0.46118043636251177, + "grad_norm": 0.1069769412279129, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 119300 + }, + { + "epoch": 0.4612190935658951, + "grad_norm": 0.09180501848459244, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 119310 + }, + { + "epoch": 0.46125775076927833, + "grad_norm": 0.12309186905622482, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 119320 + }, + { + "epoch": 0.46129640797266164, + "grad_norm": 0.10278813540935516, + "learning_rate": 0.002, + "loss": 2.352, + "step": 119330 + }, + { + "epoch": 0.4613350651760449, + "grad_norm": 0.16567133367061615, + "learning_rate": 0.002, + "loss": 2.352, + "step": 119340 + }, + { + "epoch": 0.4613737223794282, + "grad_norm": 0.1133866161108017, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 119350 + }, + { + "epoch": 0.46141237958281145, + "grad_norm": 0.10620232671499252, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 119360 + }, + { + "epoch": 0.46145103678619476, + "grad_norm": 0.1399594396352768, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 119370 + }, + { + "epoch": 0.461489693989578, + "grad_norm": 0.1203150525689125, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 119380 + }, + { + "epoch": 0.4615283511929613, + "grad_norm": 0.10792354494333267, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 119390 + }, + { + "epoch": 0.46156700839634457, + "grad_norm": 0.09377129375934601, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 119400 + }, + { + "epoch": 0.4616056655997279, + "grad_norm": 0.1000465452671051, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 119410 + }, + { + "epoch": 0.46164432280311113, + "grad_norm": 0.11150357872247696, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 119420 + }, + { + "epoch": 0.46168298000649444, + "grad_norm": 0.10797961056232452, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 119430 + }, + { + "epoch": 0.4617216372098777, + "grad_norm": 0.11019590497016907, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 119440 + }, + { + "epoch": 0.46176029441326094, + "grad_norm": 0.11343441158533096, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 119450 + }, + { + "epoch": 0.46179895161664425, + "grad_norm": 0.09743047505617142, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 119460 + }, + { + "epoch": 0.4618376088200275, + "grad_norm": 0.13833467662334442, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 119470 + }, + { + "epoch": 0.4618762660234108, + "grad_norm": 0.09775927662849426, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 119480 + }, + { + "epoch": 0.46191492322679406, + "grad_norm": 0.10678848624229431, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 119490 + }, + { + "epoch": 0.46195358043017737, + "grad_norm": 0.10635264962911606, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 119500 + }, + { + "epoch": 0.4619922376335606, + "grad_norm": 0.11372309923171997, + "learning_rate": 0.002, + "loss": 2.359, + "step": 119510 + }, + { + "epoch": 0.46203089483694393, + "grad_norm": 0.10508405417203903, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 119520 + }, + { + "epoch": 0.4620695520403272, + "grad_norm": 0.1135542243719101, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 119530 + }, + { + "epoch": 0.4621082092437105, + "grad_norm": 0.09791223704814911, + "learning_rate": 0.002, + "loss": 2.362, + "step": 119540 + }, + { + "epoch": 0.46214686644709374, + "grad_norm": 0.10771148651838303, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 119550 + }, + { + "epoch": 0.46218552365047705, + "grad_norm": 0.10225772112607956, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 119560 + }, + { + "epoch": 0.4622241808538603, + "grad_norm": 0.1033085361123085, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 119570 + }, + { + "epoch": 0.4622628380572436, + "grad_norm": 0.10448309034109116, + "learning_rate": 0.002, + "loss": 2.346, + "step": 119580 + }, + { + "epoch": 0.46230149526062686, + "grad_norm": 0.1192668080329895, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 119590 + }, + { + "epoch": 0.46234015246401017, + "grad_norm": 0.1274099349975586, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 119600 + }, + { + "epoch": 0.4623788096673934, + "grad_norm": 0.11079815030097961, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 119610 + }, + { + "epoch": 0.4624174668707767, + "grad_norm": 0.11107289791107178, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 119620 + }, + { + "epoch": 0.46245612407416, + "grad_norm": 0.11228213459253311, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 119630 + }, + { + "epoch": 0.46249478127754323, + "grad_norm": 0.09557343274354935, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 119640 + }, + { + "epoch": 0.46253343848092654, + "grad_norm": 0.11540845036506653, + "learning_rate": 0.002, + "loss": 2.3707, + "step": 119650 + }, + { + "epoch": 0.4625720956843098, + "grad_norm": 0.11660248786211014, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 119660 + }, + { + "epoch": 0.4626107528876931, + "grad_norm": 0.12330517917871475, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 119670 + }, + { + "epoch": 0.46264941009107635, + "grad_norm": 0.09484302997589111, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 119680 + }, + { + "epoch": 0.46268806729445966, + "grad_norm": 0.11608867347240448, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 119690 + }, + { + "epoch": 0.4627267244978429, + "grad_norm": 0.09752960503101349, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 119700 + }, + { + "epoch": 0.4627653817012262, + "grad_norm": 0.1270848512649536, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 119710 + }, + { + "epoch": 0.46280403890460947, + "grad_norm": 0.0989656075835228, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 119720 + }, + { + "epoch": 0.4628426961079928, + "grad_norm": 0.1194680854678154, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 119730 + }, + { + "epoch": 0.46288135331137603, + "grad_norm": 0.10372116416692734, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 119740 + }, + { + "epoch": 0.46292001051475934, + "grad_norm": 0.10059253126382828, + "learning_rate": 0.002, + "loss": 2.343, + "step": 119750 + }, + { + "epoch": 0.4629586677181426, + "grad_norm": 0.11401177197694778, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 119760 + }, + { + "epoch": 0.4629973249215259, + "grad_norm": 0.17351770401000977, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 119770 + }, + { + "epoch": 0.46303598212490915, + "grad_norm": 0.11136732250452042, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 119780 + }, + { + "epoch": 0.46307463932829246, + "grad_norm": 0.10837043076753616, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 119790 + }, + { + "epoch": 0.4631132965316757, + "grad_norm": 0.10472016781568527, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 119800 + }, + { + "epoch": 0.46315195373505896, + "grad_norm": 0.09861727058887482, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 119810 + }, + { + "epoch": 0.46319061093844227, + "grad_norm": 0.10650566965341568, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 119820 + }, + { + "epoch": 0.4632292681418255, + "grad_norm": 0.12352261692285538, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 119830 + }, + { + "epoch": 0.46326792534520883, + "grad_norm": 0.10251409560441971, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 119840 + }, + { + "epoch": 0.4633065825485921, + "grad_norm": 0.10705935209989548, + "learning_rate": 0.002, + "loss": 2.352, + "step": 119850 + }, + { + "epoch": 0.4633452397519754, + "grad_norm": 0.1259768307209015, + "learning_rate": 0.002, + "loss": 2.338, + "step": 119860 + }, + { + "epoch": 0.46338389695535864, + "grad_norm": 0.11686693131923676, + "learning_rate": 0.002, + "loss": 2.351, + "step": 119870 + }, + { + "epoch": 0.46342255415874195, + "grad_norm": 0.11357530206441879, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 119880 + }, + { + "epoch": 0.4634612113621252, + "grad_norm": 0.10879925638437271, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 119890 + }, + { + "epoch": 0.4634998685655085, + "grad_norm": 0.09799963235855103, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 119900 + }, + { + "epoch": 0.46353852576889176, + "grad_norm": 0.11603069305419922, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 119910 + }, + { + "epoch": 0.46357718297227507, + "grad_norm": 0.11923540383577347, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 119920 + }, + { + "epoch": 0.4636158401756583, + "grad_norm": 0.10628203302621841, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 119930 + }, + { + "epoch": 0.46365449737904163, + "grad_norm": 0.10463366657495499, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 119940 + }, + { + "epoch": 0.4636931545824249, + "grad_norm": 0.11544150114059448, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 119950 + }, + { + "epoch": 0.4637318117858082, + "grad_norm": 0.10797338932752609, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 119960 + }, + { + "epoch": 0.46377046898919144, + "grad_norm": 0.09882956743240356, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 119970 + }, + { + "epoch": 0.46380912619257475, + "grad_norm": 0.10176735371351242, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 119980 + }, + { + "epoch": 0.463847783395958, + "grad_norm": 0.12177812308073044, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 119990 + }, + { + "epoch": 0.46388644059934125, + "grad_norm": 0.10070200264453888, + "learning_rate": 0.002, + "loss": 2.371, + "step": 120000 + }, + { + "epoch": 0.46392509780272456, + "grad_norm": 0.08883144706487656, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 120010 + }, + { + "epoch": 0.4639637550061078, + "grad_norm": 0.11577492207288742, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 120020 + }, + { + "epoch": 0.4640024122094911, + "grad_norm": 0.09652300924062729, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 120030 + }, + { + "epoch": 0.4640410694128744, + "grad_norm": 0.10715745389461517, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 120040 + }, + { + "epoch": 0.4640797266162577, + "grad_norm": 0.1004326194524765, + "learning_rate": 0.002, + "loss": 2.332, + "step": 120050 + }, + { + "epoch": 0.46411838381964093, + "grad_norm": 0.09592131525278091, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 120060 + }, + { + "epoch": 0.46415704102302424, + "grad_norm": 0.13525542616844177, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 120070 + }, + { + "epoch": 0.4641956982264075, + "grad_norm": 0.11197216808795929, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 120080 + }, + { + "epoch": 0.4642343554297908, + "grad_norm": 0.11991088837385178, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 120090 + }, + { + "epoch": 0.46427301263317405, + "grad_norm": 0.12654219567775726, + "learning_rate": 0.002, + "loss": 2.349, + "step": 120100 + }, + { + "epoch": 0.46431166983655736, + "grad_norm": 0.10239842534065247, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 120110 + }, + { + "epoch": 0.4643503270399406, + "grad_norm": 0.12372080236673355, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 120120 + }, + { + "epoch": 0.4643889842433239, + "grad_norm": 0.10336051136255264, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 120130 + }, + { + "epoch": 0.4644276414467072, + "grad_norm": 0.11218868941068649, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 120140 + }, + { + "epoch": 0.4644662986500905, + "grad_norm": 0.10370685160160065, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 120150 + }, + { + "epoch": 0.46450495585347373, + "grad_norm": 0.09634726494550705, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 120160 + }, + { + "epoch": 0.46454361305685704, + "grad_norm": 0.132036954164505, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 120170 + }, + { + "epoch": 0.4645822702602403, + "grad_norm": 0.14281754195690155, + "learning_rate": 0.002, + "loss": 2.357, + "step": 120180 + }, + { + "epoch": 0.46462092746362355, + "grad_norm": 0.11279735714197159, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 120190 + }, + { + "epoch": 0.46465958466700685, + "grad_norm": 0.09632159769535065, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 120200 + }, + { + "epoch": 0.4646982418703901, + "grad_norm": 0.09436524659395218, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 120210 + }, + { + "epoch": 0.4647368990737734, + "grad_norm": 0.12093179672956467, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 120220 + }, + { + "epoch": 0.46477555627715667, + "grad_norm": 0.1084708645939827, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 120230 + }, + { + "epoch": 0.46481421348054, + "grad_norm": 0.11820647865533829, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 120240 + }, + { + "epoch": 0.4648528706839232, + "grad_norm": 0.12325529754161835, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 120250 + }, + { + "epoch": 0.46489152788730653, + "grad_norm": 0.10659166425466537, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 120260 + }, + { + "epoch": 0.4649301850906898, + "grad_norm": 0.11430692672729492, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 120270 + }, + { + "epoch": 0.4649688422940731, + "grad_norm": 0.12177309393882751, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 120280 + }, + { + "epoch": 0.46500749949745634, + "grad_norm": 0.12660956382751465, + "learning_rate": 0.002, + "loss": 2.349, + "step": 120290 + }, + { + "epoch": 0.46504615670083965, + "grad_norm": 0.08679116517305374, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 120300 + }, + { + "epoch": 0.4650848139042229, + "grad_norm": 0.09967411309480667, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 120310 + }, + { + "epoch": 0.4651234711076062, + "grad_norm": 0.11712680757045746, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 120320 + }, + { + "epoch": 0.46516212831098946, + "grad_norm": 0.11553264409303665, + "learning_rate": 0.002, + "loss": 2.339, + "step": 120330 + }, + { + "epoch": 0.46520078551437277, + "grad_norm": 0.11434249579906464, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 120340 + }, + { + "epoch": 0.465239442717756, + "grad_norm": 0.1029350757598877, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 120350 + }, + { + "epoch": 0.46527809992113933, + "grad_norm": 0.11101929098367691, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 120360 + }, + { + "epoch": 0.4653167571245226, + "grad_norm": 0.10793200880289078, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 120370 + }, + { + "epoch": 0.46535541432790584, + "grad_norm": 0.1273803412914276, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 120380 + }, + { + "epoch": 0.46539407153128914, + "grad_norm": 0.14132443070411682, + "learning_rate": 0.002, + "loss": 2.356, + "step": 120390 + }, + { + "epoch": 0.4654327287346724, + "grad_norm": 0.10636676847934723, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 120400 + }, + { + "epoch": 0.4654713859380557, + "grad_norm": 0.0932055413722992, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 120410 + }, + { + "epoch": 0.46551004314143896, + "grad_norm": 0.1264282763004303, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 120420 + }, + { + "epoch": 0.46554870034482226, + "grad_norm": 0.10335405170917511, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 120430 + }, + { + "epoch": 0.4655873575482055, + "grad_norm": 0.10554596036672592, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 120440 + }, + { + "epoch": 0.4656260147515888, + "grad_norm": 0.12399185448884964, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 120450 + }, + { + "epoch": 0.4656646719549721, + "grad_norm": 0.0971696674823761, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 120460 + }, + { + "epoch": 0.4657033291583554, + "grad_norm": 0.11128117889165878, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 120470 + }, + { + "epoch": 0.46574198636173864, + "grad_norm": 0.09656219929456711, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 120480 + }, + { + "epoch": 0.46578064356512194, + "grad_norm": 0.09573480486869812, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 120490 + }, + { + "epoch": 0.4658193007685052, + "grad_norm": 0.11154934018850327, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 120500 + }, + { + "epoch": 0.4658579579718885, + "grad_norm": 0.11035740375518799, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 120510 + }, + { + "epoch": 0.46589661517527176, + "grad_norm": 0.09658867120742798, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 120520 + }, + { + "epoch": 0.46593527237865506, + "grad_norm": 0.12649919092655182, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 120530 + }, + { + "epoch": 0.4659739295820383, + "grad_norm": 0.11713672429323196, + "learning_rate": 0.002, + "loss": 2.365, + "step": 120540 + }, + { + "epoch": 0.4660125867854216, + "grad_norm": 0.11696921288967133, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 120550 + }, + { + "epoch": 0.4660512439888049, + "grad_norm": 0.10550013184547424, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 120560 + }, + { + "epoch": 0.4660899011921881, + "grad_norm": 0.12237689644098282, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 120570 + }, + { + "epoch": 0.46612855839557144, + "grad_norm": 0.09278661757707596, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 120580 + }, + { + "epoch": 0.4661672155989547, + "grad_norm": 0.11413309723138809, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 120590 + }, + { + "epoch": 0.466205872802338, + "grad_norm": 0.13561297953128815, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 120600 + }, + { + "epoch": 0.46624453000572125, + "grad_norm": 0.11021458357572556, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 120610 + }, + { + "epoch": 0.46628318720910455, + "grad_norm": 0.11534292995929718, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 120620 + }, + { + "epoch": 0.4663218444124878, + "grad_norm": 0.10146640241146088, + "learning_rate": 0.002, + "loss": 2.338, + "step": 120630 + }, + { + "epoch": 0.4663605016158711, + "grad_norm": 0.11225304752588272, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 120640 + }, + { + "epoch": 0.46639915881925437, + "grad_norm": 0.10609450936317444, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 120650 + }, + { + "epoch": 0.4664378160226377, + "grad_norm": 0.10264197736978531, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 120660 + }, + { + "epoch": 0.4664764732260209, + "grad_norm": 0.09689023345708847, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 120670 + }, + { + "epoch": 0.46651513042940423, + "grad_norm": 0.108772873878479, + "learning_rate": 0.002, + "loss": 2.346, + "step": 120680 + }, + { + "epoch": 0.4665537876327875, + "grad_norm": 0.1061214953660965, + "learning_rate": 0.002, + "loss": 2.35, + "step": 120690 + }, + { + "epoch": 0.4665924448361708, + "grad_norm": 0.10163940489292145, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 120700 + }, + { + "epoch": 0.46663110203955405, + "grad_norm": 0.11704286187887192, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 120710 + }, + { + "epoch": 0.46666975924293735, + "grad_norm": 0.11735977232456207, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 120720 + }, + { + "epoch": 0.4667084164463206, + "grad_norm": 0.1283426731824875, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 120730 + }, + { + "epoch": 0.46674707364970386, + "grad_norm": 0.10897085815668106, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 120740 + }, + { + "epoch": 0.46678573085308717, + "grad_norm": 0.10965883731842041, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 120750 + }, + { + "epoch": 0.4668243880564704, + "grad_norm": 0.10167060047388077, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 120760 + }, + { + "epoch": 0.4668630452598537, + "grad_norm": 0.10578198730945587, + "learning_rate": 0.002, + "loss": 2.348, + "step": 120770 + }, + { + "epoch": 0.466901702463237, + "grad_norm": 0.10897000133991241, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 120780 + }, + { + "epoch": 0.4669403596666203, + "grad_norm": 0.0896591916680336, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 120790 + }, + { + "epoch": 0.46697901687000354, + "grad_norm": 0.11927013844251633, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 120800 + }, + { + "epoch": 0.46701767407338685, + "grad_norm": 0.10862559825181961, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 120810 + }, + { + "epoch": 0.4670563312767701, + "grad_norm": 0.13308750092983246, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 120820 + }, + { + "epoch": 0.4670949884801534, + "grad_norm": 0.09521917998790741, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 120830 + }, + { + "epoch": 0.46713364568353666, + "grad_norm": 0.10568059235811234, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 120840 + }, + { + "epoch": 0.46717230288691997, + "grad_norm": 0.1008000299334526, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 120850 + }, + { + "epoch": 0.4672109600903032, + "grad_norm": 0.33562374114990234, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 120860 + }, + { + "epoch": 0.4672496172936865, + "grad_norm": 0.12237101048231125, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 120870 + }, + { + "epoch": 0.4672882744970698, + "grad_norm": 0.10964284092187881, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 120880 + }, + { + "epoch": 0.4673269317004531, + "grad_norm": 0.09678712487220764, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 120890 + }, + { + "epoch": 0.46736558890383634, + "grad_norm": 0.11241725087165833, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 120900 + }, + { + "epoch": 0.46740424610721965, + "grad_norm": 0.10393916815519333, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 120910 + }, + { + "epoch": 0.4674429033106029, + "grad_norm": 0.11094099283218384, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 120920 + }, + { + "epoch": 0.46748156051398615, + "grad_norm": 0.1192275732755661, + "learning_rate": 0.002, + "loss": 2.349, + "step": 120930 + }, + { + "epoch": 0.46752021771736946, + "grad_norm": 0.13173754513263702, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 120940 + }, + { + "epoch": 0.4675588749207527, + "grad_norm": 0.1284404844045639, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 120950 + }, + { + "epoch": 0.467597532124136, + "grad_norm": 0.1140296459197998, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 120960 + }, + { + "epoch": 0.46763618932751927, + "grad_norm": 0.10858230292797089, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 120970 + }, + { + "epoch": 0.4676748465309026, + "grad_norm": 0.14189264178276062, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 120980 + }, + { + "epoch": 0.46771350373428583, + "grad_norm": 0.10394598543643951, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 120990 + }, + { + "epoch": 0.46775216093766914, + "grad_norm": 0.10648351162672043, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 121000 + }, + { + "epoch": 0.4677908181410524, + "grad_norm": 0.09850535541772842, + "learning_rate": 0.002, + "loss": 2.338, + "step": 121010 + }, + { + "epoch": 0.4678294753444357, + "grad_norm": 0.09851415455341339, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 121020 + }, + { + "epoch": 0.46786813254781895, + "grad_norm": 0.1181519404053688, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 121030 + }, + { + "epoch": 0.46790678975120226, + "grad_norm": 0.10987094044685364, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 121040 + }, + { + "epoch": 0.4679454469545855, + "grad_norm": 0.10694500803947449, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 121050 + }, + { + "epoch": 0.4679841041579688, + "grad_norm": 0.1143125668168068, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 121060 + }, + { + "epoch": 0.46802276136135207, + "grad_norm": 0.12903359532356262, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 121070 + }, + { + "epoch": 0.4680614185647354, + "grad_norm": 0.09866703301668167, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 121080 + }, + { + "epoch": 0.46810007576811863, + "grad_norm": 0.11391236633062363, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 121090 + }, + { + "epoch": 0.46813873297150194, + "grad_norm": 0.0955762043595314, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 121100 + }, + { + "epoch": 0.4681773901748852, + "grad_norm": 0.11622832715511322, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 121110 + }, + { + "epoch": 0.46821604737826844, + "grad_norm": 0.11479691416025162, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 121120 + }, + { + "epoch": 0.46825470458165175, + "grad_norm": 0.09819727391004562, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 121130 + }, + { + "epoch": 0.468293361785035, + "grad_norm": 0.11308932304382324, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 121140 + }, + { + "epoch": 0.4683320189884183, + "grad_norm": 0.10155956447124481, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 121150 + }, + { + "epoch": 0.46837067619180156, + "grad_norm": 0.11019018292427063, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 121160 + }, + { + "epoch": 0.46840933339518487, + "grad_norm": 0.10457126796245575, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 121170 + }, + { + "epoch": 0.4684479905985681, + "grad_norm": 0.11174652725458145, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 121180 + }, + { + "epoch": 0.46848664780195143, + "grad_norm": 0.10429858416318893, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 121190 + }, + { + "epoch": 0.4685253050053347, + "grad_norm": 0.09792893379926682, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 121200 + }, + { + "epoch": 0.468563962208718, + "grad_norm": 0.10697484761476517, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 121210 + }, + { + "epoch": 0.46860261941210124, + "grad_norm": 0.10429663211107254, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 121220 + }, + { + "epoch": 0.46864127661548455, + "grad_norm": 0.09487517178058624, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 121230 + }, + { + "epoch": 0.4686799338188678, + "grad_norm": 0.10496007651090622, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 121240 + }, + { + "epoch": 0.4687185910222511, + "grad_norm": 0.11141261458396912, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 121250 + }, + { + "epoch": 0.46875724822563436, + "grad_norm": 0.10244353115558624, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 121260 + }, + { + "epoch": 0.46879590542901767, + "grad_norm": 0.09761404246091843, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 121270 + }, + { + "epoch": 0.4688345626324009, + "grad_norm": 0.10012496262788773, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 121280 + }, + { + "epoch": 0.4688732198357842, + "grad_norm": 0.10322960466146469, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 121290 + }, + { + "epoch": 0.4689118770391675, + "grad_norm": 0.11318720132112503, + "learning_rate": 0.002, + "loss": 2.341, + "step": 121300 + }, + { + "epoch": 0.46895053424255073, + "grad_norm": 0.10540834069252014, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 121310 + }, + { + "epoch": 0.46898919144593404, + "grad_norm": 0.10919986665248871, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 121320 + }, + { + "epoch": 0.4690278486493173, + "grad_norm": 0.11813310533761978, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 121330 + }, + { + "epoch": 0.4690665058527006, + "grad_norm": 0.10450516641139984, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 121340 + }, + { + "epoch": 0.46910516305608385, + "grad_norm": 0.10707175731658936, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 121350 + }, + { + "epoch": 0.46914382025946716, + "grad_norm": 0.1057899221777916, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 121360 + }, + { + "epoch": 0.4691824774628504, + "grad_norm": 0.10432370007038116, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 121370 + }, + { + "epoch": 0.4692211346662337, + "grad_norm": 0.10156414657831192, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 121380 + }, + { + "epoch": 0.46925979186961697, + "grad_norm": 0.11768826842308044, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 121390 + }, + { + "epoch": 0.4692984490730003, + "grad_norm": 0.10005386918783188, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 121400 + }, + { + "epoch": 0.46933710627638353, + "grad_norm": 0.11784195154905319, + "learning_rate": 0.002, + "loss": 2.3712, + "step": 121410 + }, + { + "epoch": 0.46937576347976684, + "grad_norm": 0.10894471406936646, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 121420 + }, + { + "epoch": 0.4694144206831501, + "grad_norm": 0.11698637902736664, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 121430 + }, + { + "epoch": 0.4694530778865334, + "grad_norm": 0.11290238797664642, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 121440 + }, + { + "epoch": 0.46949173508991665, + "grad_norm": 0.11281818896532059, + "learning_rate": 0.002, + "loss": 2.345, + "step": 121450 + }, + { + "epoch": 0.46953039229329996, + "grad_norm": 0.10703438520431519, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 121460 + }, + { + "epoch": 0.4695690494966832, + "grad_norm": 0.11932453513145447, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 121470 + }, + { + "epoch": 0.46960770670006646, + "grad_norm": 0.1095815896987915, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 121480 + }, + { + "epoch": 0.46964636390344977, + "grad_norm": 0.13058489561080933, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 121490 + }, + { + "epoch": 0.469685021106833, + "grad_norm": 0.09637417644262314, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 121500 + }, + { + "epoch": 0.46972367831021633, + "grad_norm": 0.09962920099496841, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 121510 + }, + { + "epoch": 0.4697623355135996, + "grad_norm": 0.09866471588611603, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 121520 + }, + { + "epoch": 0.4698009927169829, + "grad_norm": 0.1037471741437912, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 121530 + }, + { + "epoch": 0.46983964992036614, + "grad_norm": 0.09907791763544083, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 121540 + }, + { + "epoch": 0.46987830712374945, + "grad_norm": 0.11282161623239517, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 121550 + }, + { + "epoch": 0.4699169643271327, + "grad_norm": 0.11053518950939178, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 121560 + }, + { + "epoch": 0.469955621530516, + "grad_norm": 0.10161669552326202, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 121570 + }, + { + "epoch": 0.46999427873389926, + "grad_norm": 0.09895357489585876, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 121580 + }, + { + "epoch": 0.47003293593728257, + "grad_norm": 0.1091216579079628, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 121590 + }, + { + "epoch": 0.4700715931406658, + "grad_norm": 0.10275205224752426, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 121600 + }, + { + "epoch": 0.47011025034404913, + "grad_norm": 0.10430356115102768, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 121610 + }, + { + "epoch": 0.4701489075474324, + "grad_norm": 0.11154796928167343, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 121620 + }, + { + "epoch": 0.4701875647508157, + "grad_norm": 0.11724287271499634, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 121630 + }, + { + "epoch": 0.47022622195419894, + "grad_norm": 0.11500924080610275, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 121640 + }, + { + "epoch": 0.47026487915758225, + "grad_norm": 0.10347431898117065, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 121650 + }, + { + "epoch": 0.4703035363609655, + "grad_norm": 0.11929845064878464, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 121660 + }, + { + "epoch": 0.47034219356434875, + "grad_norm": 0.12049948424100876, + "learning_rate": 0.002, + "loss": 2.33, + "step": 121670 + }, + { + "epoch": 0.47038085076773206, + "grad_norm": 0.1123681291937828, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 121680 + }, + { + "epoch": 0.4704195079711153, + "grad_norm": 0.1003938540816307, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 121690 + }, + { + "epoch": 0.4704581651744986, + "grad_norm": 0.10262428224086761, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 121700 + }, + { + "epoch": 0.4704968223778819, + "grad_norm": 0.10206400603055954, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 121710 + }, + { + "epoch": 0.4705354795812652, + "grad_norm": 0.10609682649374008, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 121720 + }, + { + "epoch": 0.47057413678464843, + "grad_norm": 0.10343533009290695, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 121730 + }, + { + "epoch": 0.47061279398803174, + "grad_norm": 0.10270936787128448, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 121740 + }, + { + "epoch": 0.470651451191415, + "grad_norm": 0.12486699223518372, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 121750 + }, + { + "epoch": 0.4706901083947983, + "grad_norm": 0.10693617910146713, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 121760 + }, + { + "epoch": 0.47072876559818155, + "grad_norm": 0.1021292507648468, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 121770 + }, + { + "epoch": 0.47076742280156486, + "grad_norm": 0.12084102630615234, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 121780 + }, + { + "epoch": 0.4708060800049481, + "grad_norm": 0.1022479236125946, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 121790 + }, + { + "epoch": 0.4708447372083314, + "grad_norm": 0.09856412559747696, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 121800 + }, + { + "epoch": 0.4708833944117147, + "grad_norm": 0.10755357146263123, + "learning_rate": 0.002, + "loss": 2.351, + "step": 121810 + }, + { + "epoch": 0.470922051615098, + "grad_norm": 0.1091640517115593, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 121820 + }, + { + "epoch": 0.47096070881848123, + "grad_norm": 0.11964549869298935, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 121830 + }, + { + "epoch": 0.47099936602186454, + "grad_norm": 0.10800044983625412, + "learning_rate": 0.002, + "loss": 2.356, + "step": 121840 + }, + { + "epoch": 0.4710380232252478, + "grad_norm": 0.10545913875102997, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 121850 + }, + { + "epoch": 0.47107668042863104, + "grad_norm": 0.10835971683263779, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 121860 + }, + { + "epoch": 0.47111533763201435, + "grad_norm": 0.09890252351760864, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 121870 + }, + { + "epoch": 0.4711539948353976, + "grad_norm": 0.10678128153085709, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 121880 + }, + { + "epoch": 0.4711926520387809, + "grad_norm": 0.10318324714899063, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 121890 + }, + { + "epoch": 0.47123130924216416, + "grad_norm": 0.10819438844919205, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 121900 + }, + { + "epoch": 0.4712699664455475, + "grad_norm": 0.11838934570550919, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 121910 + }, + { + "epoch": 0.4713086236489307, + "grad_norm": 0.10129489004611969, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 121920 + }, + { + "epoch": 0.47134728085231403, + "grad_norm": 0.11001982539892197, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 121930 + }, + { + "epoch": 0.4713859380556973, + "grad_norm": 0.10711158812046051, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 121940 + }, + { + "epoch": 0.4714245952590806, + "grad_norm": 0.10674590617418289, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 121950 + }, + { + "epoch": 0.47146325246246384, + "grad_norm": 0.12260083109140396, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 121960 + }, + { + "epoch": 0.47150190966584715, + "grad_norm": 0.13318702578544617, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 121970 + }, + { + "epoch": 0.4715405668692304, + "grad_norm": 0.10054940730333328, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 121980 + }, + { + "epoch": 0.4715792240726137, + "grad_norm": 0.1058686152100563, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 121990 + }, + { + "epoch": 0.47161788127599696, + "grad_norm": 0.12064962089061737, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 122000 + }, + { + "epoch": 0.47165653847938027, + "grad_norm": 0.10323496162891388, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 122010 + }, + { + "epoch": 0.4716951956827635, + "grad_norm": 0.10121781378984451, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 122020 + }, + { + "epoch": 0.47173385288614683, + "grad_norm": 0.11574602872133255, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 122030 + }, + { + "epoch": 0.4717725100895301, + "grad_norm": 0.11002667993307114, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 122040 + }, + { + "epoch": 0.47181116729291334, + "grad_norm": 0.10185150802135468, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 122050 + }, + { + "epoch": 0.47184982449629664, + "grad_norm": 0.11052855104207993, + "learning_rate": 0.002, + "loss": 2.345, + "step": 122060 + }, + { + "epoch": 0.4718884816996799, + "grad_norm": 0.13415193557739258, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 122070 + }, + { + "epoch": 0.4719271389030632, + "grad_norm": 0.1063295528292656, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 122080 + }, + { + "epoch": 0.47196579610644646, + "grad_norm": 0.08993332087993622, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 122090 + }, + { + "epoch": 0.47200445330982976, + "grad_norm": 0.10279487818479538, + "learning_rate": 0.002, + "loss": 2.337, + "step": 122100 + }, + { + "epoch": 0.472043110513213, + "grad_norm": 0.11254022270441055, + "learning_rate": 0.002, + "loss": 2.354, + "step": 122110 + }, + { + "epoch": 0.4720817677165963, + "grad_norm": 0.10892201215028763, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 122120 + }, + { + "epoch": 0.4721204249199796, + "grad_norm": 0.11184471845626831, + "learning_rate": 0.002, + "loss": 2.343, + "step": 122130 + }, + { + "epoch": 0.4721590821233629, + "grad_norm": 0.11071911454200745, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 122140 + }, + { + "epoch": 0.47219773932674614, + "grad_norm": 0.120620958507061, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 122150 + }, + { + "epoch": 0.47223639653012944, + "grad_norm": 0.10598298162221909, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 122160 + }, + { + "epoch": 0.4722750537335127, + "grad_norm": 0.09908302128314972, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 122170 + }, + { + "epoch": 0.472313710936896, + "grad_norm": 0.11272618919610977, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 122180 + }, + { + "epoch": 0.47235236814027926, + "grad_norm": 0.10746448487043381, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 122190 + }, + { + "epoch": 0.47239102534366256, + "grad_norm": 0.11876311153173447, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 122200 + }, + { + "epoch": 0.4724296825470458, + "grad_norm": 0.09779126197099686, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 122210 + }, + { + "epoch": 0.4724683397504291, + "grad_norm": 0.11250808835029602, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 122220 + }, + { + "epoch": 0.4725069969538124, + "grad_norm": 0.09808599948883057, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 122230 + }, + { + "epoch": 0.4725456541571956, + "grad_norm": 0.09768354147672653, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 122240 + }, + { + "epoch": 0.47258431136057893, + "grad_norm": 0.10772417485713959, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 122250 + }, + { + "epoch": 0.4726229685639622, + "grad_norm": 0.11477760970592499, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 122260 + }, + { + "epoch": 0.4726616257673455, + "grad_norm": 0.11122085899114609, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 122270 + }, + { + "epoch": 0.47270028297072875, + "grad_norm": 0.12852375209331512, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 122280 + }, + { + "epoch": 0.47273894017411205, + "grad_norm": 0.16506214439868927, + "learning_rate": 0.002, + "loss": 2.3941, + "step": 122290 + }, + { + "epoch": 0.4727775973774953, + "grad_norm": 0.11552702635526657, + "learning_rate": 0.002, + "loss": 2.3702, + "step": 122300 + }, + { + "epoch": 0.4728162545808786, + "grad_norm": 0.10295978933572769, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 122310 + }, + { + "epoch": 0.47285491178426187, + "grad_norm": 0.33917784690856934, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 122320 + }, + { + "epoch": 0.4728935689876452, + "grad_norm": 0.09117474406957626, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 122330 + }, + { + "epoch": 0.4729322261910284, + "grad_norm": 0.11995255202054977, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 122340 + }, + { + "epoch": 0.47297088339441173, + "grad_norm": 0.10412541031837463, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 122350 + }, + { + "epoch": 0.473009540597795, + "grad_norm": 0.10575006902217865, + "learning_rate": 0.002, + "loss": 2.365, + "step": 122360 + }, + { + "epoch": 0.4730481978011783, + "grad_norm": 0.09834925830364227, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 122370 + }, + { + "epoch": 0.47308685500456155, + "grad_norm": 0.10345424711704254, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 122380 + }, + { + "epoch": 0.47312551220794485, + "grad_norm": 0.10761108249425888, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 122390 + }, + { + "epoch": 0.4731641694113281, + "grad_norm": 0.13001149892807007, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 122400 + }, + { + "epoch": 0.47320282661471136, + "grad_norm": 0.11039768904447556, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 122410 + }, + { + "epoch": 0.47324148381809467, + "grad_norm": 0.10438776016235352, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 122420 + }, + { + "epoch": 0.4732801410214779, + "grad_norm": 0.11237910389900208, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 122430 + }, + { + "epoch": 0.4733187982248612, + "grad_norm": 0.10197488218545914, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 122440 + }, + { + "epoch": 0.4733574554282445, + "grad_norm": 0.11044498533010483, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 122450 + }, + { + "epoch": 0.4733961126316278, + "grad_norm": 0.12326077371835709, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 122460 + }, + { + "epoch": 0.47343476983501104, + "grad_norm": 0.09754885733127594, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 122470 + }, + { + "epoch": 0.47347342703839435, + "grad_norm": 0.10445263981819153, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 122480 + }, + { + "epoch": 0.4735120842417776, + "grad_norm": 0.09975433349609375, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 122490 + }, + { + "epoch": 0.4735507414451609, + "grad_norm": 0.10851839184761047, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 122500 + }, + { + "epoch": 0.47358939864854416, + "grad_norm": 0.10483758896589279, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 122510 + }, + { + "epoch": 0.47362805585192747, + "grad_norm": 0.1174597442150116, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 122520 + }, + { + "epoch": 0.4736667130553107, + "grad_norm": 0.10874351114034653, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 122530 + }, + { + "epoch": 0.473705370258694, + "grad_norm": 0.11186617612838745, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 122540 + }, + { + "epoch": 0.4737440274620773, + "grad_norm": 0.09507044404745102, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 122550 + }, + { + "epoch": 0.4737826846654606, + "grad_norm": 0.09617611765861511, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 122560 + }, + { + "epoch": 0.47382134186884384, + "grad_norm": 0.11511634290218353, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 122570 + }, + { + "epoch": 0.47385999907222714, + "grad_norm": 0.12867169082164764, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 122580 + }, + { + "epoch": 0.4738986562756104, + "grad_norm": 0.11168276518583298, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 122590 + }, + { + "epoch": 0.47393731347899365, + "grad_norm": 0.10149930417537689, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 122600 + }, + { + "epoch": 0.47397597068237696, + "grad_norm": 0.11204592883586884, + "learning_rate": 0.002, + "loss": 2.3731, + "step": 122610 + }, + { + "epoch": 0.4740146278857602, + "grad_norm": 0.11496981233358383, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 122620 + }, + { + "epoch": 0.4740532850891435, + "grad_norm": 0.09762315452098846, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 122630 + }, + { + "epoch": 0.47409194229252677, + "grad_norm": 0.0918693020939827, + "learning_rate": 0.002, + "loss": 2.353, + "step": 122640 + }, + { + "epoch": 0.4741305994959101, + "grad_norm": 0.12558548152446747, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 122650 + }, + { + "epoch": 0.47416925669929333, + "grad_norm": 0.10448439419269562, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 122660 + }, + { + "epoch": 0.47420791390267664, + "grad_norm": 0.08786539733409882, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 122670 + }, + { + "epoch": 0.4742465711060599, + "grad_norm": 0.12373369932174683, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 122680 + }, + { + "epoch": 0.4742852283094432, + "grad_norm": 0.10301606357097626, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 122690 + }, + { + "epoch": 0.47432388551282645, + "grad_norm": 0.1180417463183403, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 122700 + }, + { + "epoch": 0.47436254271620976, + "grad_norm": 0.0997176319360733, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 122710 + }, + { + "epoch": 0.474401199919593, + "grad_norm": 0.11349517852067947, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 122720 + }, + { + "epoch": 0.4744398571229763, + "grad_norm": 0.11657247692346573, + "learning_rate": 0.002, + "loss": 2.3765, + "step": 122730 + }, + { + "epoch": 0.47447851432635957, + "grad_norm": 0.09847909212112427, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 122740 + }, + { + "epoch": 0.4745171715297429, + "grad_norm": 0.11855094879865646, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 122750 + }, + { + "epoch": 0.47455582873312613, + "grad_norm": 0.10409026592969894, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 122760 + }, + { + "epoch": 0.47459448593650944, + "grad_norm": 0.10284674912691116, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 122770 + }, + { + "epoch": 0.4746331431398927, + "grad_norm": 0.11371070891618729, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 122780 + }, + { + "epoch": 0.47467180034327594, + "grad_norm": 0.10059019923210144, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 122790 + }, + { + "epoch": 0.47471045754665925, + "grad_norm": 0.11485622823238373, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 122800 + }, + { + "epoch": 0.4747491147500425, + "grad_norm": 0.09271082282066345, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 122810 + }, + { + "epoch": 0.4747877719534258, + "grad_norm": 0.10824739187955856, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 122820 + }, + { + "epoch": 0.47482642915680906, + "grad_norm": 0.11803317815065384, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 122830 + }, + { + "epoch": 0.47486508636019237, + "grad_norm": 0.09994529187679291, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 122840 + }, + { + "epoch": 0.4749037435635756, + "grad_norm": 0.11159958690404892, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 122850 + }, + { + "epoch": 0.4749424007669589, + "grad_norm": 0.10315810143947601, + "learning_rate": 0.002, + "loss": 2.333, + "step": 122860 + }, + { + "epoch": 0.4749810579703422, + "grad_norm": 0.11093452572822571, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 122870 + }, + { + "epoch": 0.4750197151737255, + "grad_norm": 0.14293049275875092, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 122880 + }, + { + "epoch": 0.47505837237710874, + "grad_norm": 0.10940203070640564, + "learning_rate": 0.002, + "loss": 2.348, + "step": 122890 + }, + { + "epoch": 0.47509702958049205, + "grad_norm": 0.11061827093362808, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 122900 + }, + { + "epoch": 0.4751356867838753, + "grad_norm": 0.11959835141897202, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 122910 + }, + { + "epoch": 0.4751743439872586, + "grad_norm": 0.1079825758934021, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 122920 + }, + { + "epoch": 0.47521300119064186, + "grad_norm": 0.11268473416566849, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 122930 + }, + { + "epoch": 0.47525165839402517, + "grad_norm": 0.10862148553133011, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 122940 + }, + { + "epoch": 0.4752903155974084, + "grad_norm": 0.09216571599245071, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 122950 + }, + { + "epoch": 0.4753289728007917, + "grad_norm": 0.11650997400283813, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 122960 + }, + { + "epoch": 0.475367630004175, + "grad_norm": 0.09918678551912308, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 122970 + }, + { + "epoch": 0.47540628720755823, + "grad_norm": 0.11022733896970749, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 122980 + }, + { + "epoch": 0.47544494441094154, + "grad_norm": 0.10531317442655563, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 122990 + }, + { + "epoch": 0.4754836016143248, + "grad_norm": 0.11401054263114929, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 123000 + }, + { + "epoch": 0.4755222588177081, + "grad_norm": 0.10972213000059128, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 123010 + }, + { + "epoch": 0.47556091602109135, + "grad_norm": 0.10765939205884933, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 123020 + }, + { + "epoch": 0.47559957322447466, + "grad_norm": 0.09859401732683182, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 123030 + }, + { + "epoch": 0.4756382304278579, + "grad_norm": 0.11380176246166229, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 123040 + }, + { + "epoch": 0.4756768876312412, + "grad_norm": 0.09912136942148209, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 123050 + }, + { + "epoch": 0.47571554483462447, + "grad_norm": 0.1289130300283432, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 123060 + }, + { + "epoch": 0.4757542020380078, + "grad_norm": 0.0963532030582428, + "learning_rate": 0.002, + "loss": 2.333, + "step": 123070 + }, + { + "epoch": 0.47579285924139103, + "grad_norm": 0.11527842283248901, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 123080 + }, + { + "epoch": 0.47583151644477434, + "grad_norm": 0.13533703982830048, + "learning_rate": 0.002, + "loss": 2.363, + "step": 123090 + }, + { + "epoch": 0.4758701736481576, + "grad_norm": 0.10471322387456894, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 123100 + }, + { + "epoch": 0.4759088308515409, + "grad_norm": 0.10058660060167313, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 123110 + }, + { + "epoch": 0.47594748805492415, + "grad_norm": 0.09999233484268188, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 123120 + }, + { + "epoch": 0.47598614525830746, + "grad_norm": 0.10986481606960297, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 123130 + }, + { + "epoch": 0.4760248024616907, + "grad_norm": 0.10944868624210358, + "learning_rate": 0.002, + "loss": 2.352, + "step": 123140 + }, + { + "epoch": 0.47606345966507396, + "grad_norm": 0.09159363061189651, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 123150 + }, + { + "epoch": 0.47610211686845727, + "grad_norm": 0.13750354945659637, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 123160 + }, + { + "epoch": 0.4761407740718405, + "grad_norm": 0.11780702322721481, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 123170 + }, + { + "epoch": 0.47617943127522383, + "grad_norm": 0.12204831838607788, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 123180 + }, + { + "epoch": 0.4762180884786071, + "grad_norm": 0.10399141907691956, + "learning_rate": 0.002, + "loss": 2.339, + "step": 123190 + }, + { + "epoch": 0.4762567456819904, + "grad_norm": 0.11878559738397598, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 123200 + }, + { + "epoch": 0.47629540288537364, + "grad_norm": 0.11291263997554779, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 123210 + }, + { + "epoch": 0.47633406008875695, + "grad_norm": 0.12872259318828583, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 123220 + }, + { + "epoch": 0.4763727172921402, + "grad_norm": 0.12508279085159302, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 123230 + }, + { + "epoch": 0.4764113744955235, + "grad_norm": 0.09549172967672348, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 123240 + }, + { + "epoch": 0.47645003169890676, + "grad_norm": 0.10898482799530029, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 123250 + }, + { + "epoch": 0.47648868890229007, + "grad_norm": 0.1020859032869339, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 123260 + }, + { + "epoch": 0.4765273461056733, + "grad_norm": 0.10750532895326614, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 123270 + }, + { + "epoch": 0.47656600330905663, + "grad_norm": 0.10456563532352448, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 123280 + }, + { + "epoch": 0.4766046605124399, + "grad_norm": 0.13879376649856567, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 123290 + }, + { + "epoch": 0.4766433177158232, + "grad_norm": 0.10747821629047394, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 123300 + }, + { + "epoch": 0.47668197491920644, + "grad_norm": 0.09391331672668457, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 123310 + }, + { + "epoch": 0.47672063212258975, + "grad_norm": 0.11444292217493057, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 123320 + }, + { + "epoch": 0.476759289325973, + "grad_norm": 0.10892808437347412, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 123330 + }, + { + "epoch": 0.47679794652935625, + "grad_norm": 0.10487792640924454, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 123340 + }, + { + "epoch": 0.47683660373273956, + "grad_norm": 0.10460419952869415, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 123350 + }, + { + "epoch": 0.4768752609361228, + "grad_norm": 0.11522373557090759, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 123360 + }, + { + "epoch": 0.4769139181395061, + "grad_norm": 0.09701727330684662, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 123370 + }, + { + "epoch": 0.4769525753428894, + "grad_norm": 0.10497508198022842, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 123380 + }, + { + "epoch": 0.4769912325462727, + "grad_norm": 0.10454043000936508, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 123390 + }, + { + "epoch": 0.47702988974965593, + "grad_norm": 0.10469161719083786, + "learning_rate": 0.002, + "loss": 2.343, + "step": 123400 + }, + { + "epoch": 0.47706854695303924, + "grad_norm": 0.11177811026573181, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 123410 + }, + { + "epoch": 0.4771072041564225, + "grad_norm": 0.13299429416656494, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 123420 + }, + { + "epoch": 0.4771458613598058, + "grad_norm": 0.127616286277771, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 123430 + }, + { + "epoch": 0.47718451856318905, + "grad_norm": 0.09650331735610962, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 123440 + }, + { + "epoch": 0.47722317576657236, + "grad_norm": 0.09756970405578613, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 123450 + }, + { + "epoch": 0.4772618329699556, + "grad_norm": 0.12383294105529785, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 123460 + }, + { + "epoch": 0.4773004901733389, + "grad_norm": 0.09876053035259247, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 123470 + }, + { + "epoch": 0.4773391473767222, + "grad_norm": 0.09043329954147339, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 123480 + }, + { + "epoch": 0.4773778045801055, + "grad_norm": 0.11116903275251389, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 123490 + }, + { + "epoch": 0.47741646178348873, + "grad_norm": 0.08807190507650375, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 123500 + }, + { + "epoch": 0.47745511898687204, + "grad_norm": 0.12583951652050018, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 123510 + }, + { + "epoch": 0.4774937761902553, + "grad_norm": 0.10756245255470276, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 123520 + }, + { + "epoch": 0.47753243339363854, + "grad_norm": 0.1139204278588295, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 123530 + }, + { + "epoch": 0.47757109059702185, + "grad_norm": 0.10103306919336319, + "learning_rate": 0.002, + "loss": 2.355, + "step": 123540 + }, + { + "epoch": 0.4776097478004051, + "grad_norm": 0.11110308021306992, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 123550 + }, + { + "epoch": 0.4776484050037884, + "grad_norm": 0.11258696764707565, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 123560 + }, + { + "epoch": 0.47768706220717166, + "grad_norm": 0.10868509113788605, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 123570 + }, + { + "epoch": 0.47772571941055497, + "grad_norm": 0.10797320306301117, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 123580 + }, + { + "epoch": 0.4777643766139382, + "grad_norm": 0.1106133833527565, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 123590 + }, + { + "epoch": 0.47780303381732153, + "grad_norm": 0.09830787777900696, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 123600 + }, + { + "epoch": 0.4778416910207048, + "grad_norm": 0.09236044436693192, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 123610 + }, + { + "epoch": 0.4778803482240881, + "grad_norm": 0.0985744521021843, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 123620 + }, + { + "epoch": 0.47791900542747134, + "grad_norm": 0.12322218716144562, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 123630 + }, + { + "epoch": 0.47795766263085465, + "grad_norm": 0.11628399044275284, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 123640 + }, + { + "epoch": 0.4779963198342379, + "grad_norm": 0.09713403135538101, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 123650 + }, + { + "epoch": 0.4780349770376212, + "grad_norm": 0.10573708266019821, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 123660 + }, + { + "epoch": 0.47807363424100446, + "grad_norm": 0.16359750926494598, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 123670 + }, + { + "epoch": 0.47811229144438777, + "grad_norm": 0.10506831854581833, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 123680 + }, + { + "epoch": 0.478150948647771, + "grad_norm": 0.10553120076656342, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 123690 + }, + { + "epoch": 0.47818960585115433, + "grad_norm": 0.11690998077392578, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 123700 + }, + { + "epoch": 0.4782282630545376, + "grad_norm": 0.2327994406223297, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 123710 + }, + { + "epoch": 0.47826692025792084, + "grad_norm": 0.11086133122444153, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 123720 + }, + { + "epoch": 0.47830557746130414, + "grad_norm": 0.10380079597234726, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 123730 + }, + { + "epoch": 0.4783442346646874, + "grad_norm": 0.10399412363767624, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 123740 + }, + { + "epoch": 0.4783828918680707, + "grad_norm": 0.11090266704559326, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 123750 + }, + { + "epoch": 0.47842154907145396, + "grad_norm": 0.10452635586261749, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 123760 + }, + { + "epoch": 0.47846020627483726, + "grad_norm": 0.0872868224978447, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 123770 + }, + { + "epoch": 0.4784988634782205, + "grad_norm": 0.09890952706336975, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 123780 + }, + { + "epoch": 0.4785375206816038, + "grad_norm": 0.10068909078836441, + "learning_rate": 0.002, + "loss": 2.347, + "step": 123790 + }, + { + "epoch": 0.4785761778849871, + "grad_norm": 0.10055209696292877, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 123800 + }, + { + "epoch": 0.4786148350883704, + "grad_norm": 0.1076897531747818, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 123810 + }, + { + "epoch": 0.47865349229175363, + "grad_norm": 0.11537522822618484, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 123820 + }, + { + "epoch": 0.47869214949513694, + "grad_norm": 0.11433115601539612, + "learning_rate": 0.002, + "loss": 2.34, + "step": 123830 + }, + { + "epoch": 0.4787308066985202, + "grad_norm": 0.1042742133140564, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 123840 + }, + { + "epoch": 0.4787694639019035, + "grad_norm": 0.1332925260066986, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 123850 + }, + { + "epoch": 0.47880812110528675, + "grad_norm": 0.0977521538734436, + "learning_rate": 0.002, + "loss": 2.338, + "step": 123860 + }, + { + "epoch": 0.47884677830867006, + "grad_norm": 0.09739074110984802, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 123870 + }, + { + "epoch": 0.4788854355120533, + "grad_norm": 0.10039547830820084, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 123880 + }, + { + "epoch": 0.47892409271543657, + "grad_norm": 0.10721756517887115, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 123890 + }, + { + "epoch": 0.4789627499188199, + "grad_norm": 0.09866327792406082, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 123900 + }, + { + "epoch": 0.4790014071222031, + "grad_norm": 0.10998773574829102, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 123910 + }, + { + "epoch": 0.47904006432558643, + "grad_norm": 0.10089613497257233, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 123920 + }, + { + "epoch": 0.4790787215289697, + "grad_norm": 0.09514022618532181, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 123930 + }, + { + "epoch": 0.479117378732353, + "grad_norm": 0.11414384096860886, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 123940 + }, + { + "epoch": 0.47915603593573625, + "grad_norm": 0.09830380976200104, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 123950 + }, + { + "epoch": 0.47919469313911955, + "grad_norm": 0.10151248425245285, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 123960 + }, + { + "epoch": 0.4792333503425028, + "grad_norm": 0.09892397373914719, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 123970 + }, + { + "epoch": 0.4792720075458861, + "grad_norm": 0.11178483814001083, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 123980 + }, + { + "epoch": 0.47931066474926937, + "grad_norm": 0.106821209192276, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 123990 + }, + { + "epoch": 0.4793493219526527, + "grad_norm": 0.09730260819196701, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 124000 + }, + { + "epoch": 0.4793879791560359, + "grad_norm": 0.10343889147043228, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 124010 + }, + { + "epoch": 0.47942663635941923, + "grad_norm": 0.09810855239629745, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 124020 + }, + { + "epoch": 0.4794652935628025, + "grad_norm": 0.1482008844614029, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 124030 + }, + { + "epoch": 0.4795039507661858, + "grad_norm": 0.11742330342531204, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 124040 + }, + { + "epoch": 0.47954260796956905, + "grad_norm": 0.09411367028951645, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 124050 + }, + { + "epoch": 0.47958126517295235, + "grad_norm": 0.11513230204582214, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 124060 + }, + { + "epoch": 0.4796199223763356, + "grad_norm": 0.11209332942962646, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 124070 + }, + { + "epoch": 0.47965857957971886, + "grad_norm": 0.10911991447210312, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 124080 + }, + { + "epoch": 0.47969723678310217, + "grad_norm": 0.12374506890773773, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 124090 + }, + { + "epoch": 0.4797358939864854, + "grad_norm": 0.1096247136592865, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 124100 + }, + { + "epoch": 0.4797745511898687, + "grad_norm": 0.12387961149215698, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 124110 + }, + { + "epoch": 0.479813208393252, + "grad_norm": 0.10381369292736053, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 124120 + }, + { + "epoch": 0.4798518655966353, + "grad_norm": 0.11828626692295074, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 124130 + }, + { + "epoch": 0.47989052280001854, + "grad_norm": 0.1190611943602562, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 124140 + }, + { + "epoch": 0.47992918000340185, + "grad_norm": 0.10298268496990204, + "learning_rate": 0.002, + "loss": 2.3688, + "step": 124150 + }, + { + "epoch": 0.4799678372067851, + "grad_norm": 0.1297299563884735, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 124160 + }, + { + "epoch": 0.4800064944101684, + "grad_norm": 0.11486640572547913, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 124170 + }, + { + "epoch": 0.48004515161355166, + "grad_norm": 0.10039033740758896, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 124180 + }, + { + "epoch": 0.48008380881693496, + "grad_norm": 0.09772678464651108, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 124190 + }, + { + "epoch": 0.4801224660203182, + "grad_norm": 0.16087579727172852, + "learning_rate": 0.002, + "loss": 2.343, + "step": 124200 + }, + { + "epoch": 0.4801611232237015, + "grad_norm": 0.11107642948627472, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 124210 + }, + { + "epoch": 0.4801997804270848, + "grad_norm": 0.10909386724233627, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 124220 + }, + { + "epoch": 0.4802384376304681, + "grad_norm": 0.11320231109857559, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 124230 + }, + { + "epoch": 0.48027709483385134, + "grad_norm": 0.10628072172403336, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 124240 + }, + { + "epoch": 0.48031575203723464, + "grad_norm": 0.11253766715526581, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 124250 + }, + { + "epoch": 0.4803544092406179, + "grad_norm": 0.10708357393741608, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 124260 + }, + { + "epoch": 0.48039306644400115, + "grad_norm": 0.09570637345314026, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 124270 + }, + { + "epoch": 0.48043172364738446, + "grad_norm": 0.11563023924827576, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 124280 + }, + { + "epoch": 0.4804703808507677, + "grad_norm": 0.1064223125576973, + "learning_rate": 0.002, + "loss": 2.346, + "step": 124290 + }, + { + "epoch": 0.480509038054151, + "grad_norm": 0.09856715053319931, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 124300 + }, + { + "epoch": 0.48054769525753427, + "grad_norm": 0.11088183522224426, + "learning_rate": 0.002, + "loss": 2.346, + "step": 124310 + }, + { + "epoch": 0.4805863524609176, + "grad_norm": 0.13400298357009888, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 124320 + }, + { + "epoch": 0.48062500966430083, + "grad_norm": 0.1328463852405548, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 124330 + }, + { + "epoch": 0.48066366686768414, + "grad_norm": 0.11681412905454636, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 124340 + }, + { + "epoch": 0.4807023240710674, + "grad_norm": 0.09900808334350586, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 124350 + }, + { + "epoch": 0.4807409812744507, + "grad_norm": 0.12010405212640762, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 124360 + }, + { + "epoch": 0.48077963847783395, + "grad_norm": 0.10821161419153214, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 124370 + }, + { + "epoch": 0.48081829568121726, + "grad_norm": 0.10221979022026062, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 124380 + }, + { + "epoch": 0.4808569528846005, + "grad_norm": 0.119261234998703, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 124390 + }, + { + "epoch": 0.4808956100879838, + "grad_norm": 0.10663552582263947, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 124400 + }, + { + "epoch": 0.48093426729136707, + "grad_norm": 0.12499512732028961, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 124410 + }, + { + "epoch": 0.4809729244947504, + "grad_norm": 0.1076861023902893, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 124420 + }, + { + "epoch": 0.48101158169813363, + "grad_norm": 0.11870467662811279, + "learning_rate": 0.002, + "loss": 2.358, + "step": 124430 + }, + { + "epoch": 0.48105023890151694, + "grad_norm": 0.1098322793841362, + "learning_rate": 0.002, + "loss": 2.335, + "step": 124440 + }, + { + "epoch": 0.4810888961049002, + "grad_norm": 0.10048480331897736, + "learning_rate": 0.002, + "loss": 2.35, + "step": 124450 + }, + { + "epoch": 0.48112755330828344, + "grad_norm": 0.10514318197965622, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 124460 + }, + { + "epoch": 0.48116621051166675, + "grad_norm": 0.10487706959247589, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 124470 + }, + { + "epoch": 0.48120486771505, + "grad_norm": 0.10981044918298721, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 124480 + }, + { + "epoch": 0.4812435249184333, + "grad_norm": 0.10225850343704224, + "learning_rate": 0.002, + "loss": 2.36, + "step": 124490 + }, + { + "epoch": 0.48128218212181656, + "grad_norm": 0.09725990891456604, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 124500 + }, + { + "epoch": 0.48132083932519987, + "grad_norm": 0.10735096782445908, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 124510 + }, + { + "epoch": 0.4813594965285831, + "grad_norm": 0.09924648702144623, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 124520 + }, + { + "epoch": 0.4813981537319664, + "grad_norm": 0.11738327890634537, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 124530 + }, + { + "epoch": 0.4814368109353497, + "grad_norm": 0.0980398878455162, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 124540 + }, + { + "epoch": 0.481475468138733, + "grad_norm": 0.12604741752147675, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 124550 + }, + { + "epoch": 0.48151412534211624, + "grad_norm": 0.12514206767082214, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 124560 + }, + { + "epoch": 0.48155278254549955, + "grad_norm": 0.09885002672672272, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 124570 + }, + { + "epoch": 0.4815914397488828, + "grad_norm": 0.10877029597759247, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 124580 + }, + { + "epoch": 0.4816300969522661, + "grad_norm": 0.11007339507341385, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 124590 + }, + { + "epoch": 0.48166875415564936, + "grad_norm": 0.09676380455493927, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 124600 + }, + { + "epoch": 0.48170741135903267, + "grad_norm": 0.09930353611707687, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 124610 + }, + { + "epoch": 0.4817460685624159, + "grad_norm": 0.09808854013681412, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 124620 + }, + { + "epoch": 0.4817847257657992, + "grad_norm": 0.1069636344909668, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 124630 + }, + { + "epoch": 0.4818233829691825, + "grad_norm": 0.10339025408029556, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 124640 + }, + { + "epoch": 0.48186204017256573, + "grad_norm": 0.1034790500998497, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 124650 + }, + { + "epoch": 0.48190069737594904, + "grad_norm": 0.10819303244352341, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 124660 + }, + { + "epoch": 0.4819393545793323, + "grad_norm": 0.10512588918209076, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 124670 + }, + { + "epoch": 0.4819780117827156, + "grad_norm": 0.16597643494606018, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 124680 + }, + { + "epoch": 0.48201666898609885, + "grad_norm": 0.11072135716676712, + "learning_rate": 0.002, + "loss": 2.339, + "step": 124690 + }, + { + "epoch": 0.48205532618948216, + "grad_norm": 0.09571421891450882, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 124700 + }, + { + "epoch": 0.4820939833928654, + "grad_norm": 0.10639669746160507, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 124710 + }, + { + "epoch": 0.4821326405962487, + "grad_norm": 0.09967085719108582, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 124720 + }, + { + "epoch": 0.48217129779963197, + "grad_norm": 0.10771028697490692, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 124730 + }, + { + "epoch": 0.4822099550030153, + "grad_norm": 0.09660494327545166, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 124740 + }, + { + "epoch": 0.48224861220639853, + "grad_norm": 0.10943452268838882, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 124750 + }, + { + "epoch": 0.48228726940978184, + "grad_norm": 0.10899591445922852, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 124760 + }, + { + "epoch": 0.4823259266131651, + "grad_norm": 0.1283608227968216, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 124770 + }, + { + "epoch": 0.4823645838165484, + "grad_norm": 0.11032495647668839, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 124780 + }, + { + "epoch": 0.48240324101993165, + "grad_norm": 0.10227882117033005, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 124790 + }, + { + "epoch": 0.48244189822331496, + "grad_norm": 0.10366970300674438, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 124800 + }, + { + "epoch": 0.4824805554266982, + "grad_norm": 0.11385279148817062, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 124810 + }, + { + "epoch": 0.48251921263008146, + "grad_norm": 0.08986172080039978, + "learning_rate": 0.002, + "loss": 2.341, + "step": 124820 + }, + { + "epoch": 0.48255786983346477, + "grad_norm": 0.10763401538133621, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 124830 + }, + { + "epoch": 0.482596527036848, + "grad_norm": 0.10179536789655685, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 124840 + }, + { + "epoch": 0.48263518424023133, + "grad_norm": 0.1055702343583107, + "learning_rate": 0.002, + "loss": 2.335, + "step": 124850 + }, + { + "epoch": 0.4826738414436146, + "grad_norm": 0.140970841050148, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 124860 + }, + { + "epoch": 0.4827124986469979, + "grad_norm": 0.10648707300424576, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 124870 + }, + { + "epoch": 0.48275115585038114, + "grad_norm": 0.105586476624012, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 124880 + }, + { + "epoch": 0.48278981305376445, + "grad_norm": 0.10715337097644806, + "learning_rate": 0.002, + "loss": 2.346, + "step": 124890 + }, + { + "epoch": 0.4828284702571477, + "grad_norm": 0.1060248464345932, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 124900 + }, + { + "epoch": 0.482867127460531, + "grad_norm": 0.09489964693784714, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 124910 + }, + { + "epoch": 0.48290578466391426, + "grad_norm": 0.10424128919839859, + "learning_rate": 0.002, + "loss": 2.3745, + "step": 124920 + }, + { + "epoch": 0.48294444186729757, + "grad_norm": 0.10901130735874176, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 124930 + }, + { + "epoch": 0.4829830990706808, + "grad_norm": 0.11290793865919113, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 124940 + }, + { + "epoch": 0.48302175627406413, + "grad_norm": 0.10265372693538666, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 124950 + }, + { + "epoch": 0.4830604134774474, + "grad_norm": 0.10744495689868927, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 124960 + }, + { + "epoch": 0.4830990706808307, + "grad_norm": 0.0985356792807579, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 124970 + }, + { + "epoch": 0.48313772788421394, + "grad_norm": 0.10968119651079178, + "learning_rate": 0.002, + "loss": 2.354, + "step": 124980 + }, + { + "epoch": 0.48317638508759725, + "grad_norm": 0.09952443093061447, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 124990 + }, + { + "epoch": 0.4832150422909805, + "grad_norm": 0.11583472788333893, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 125000 + }, + { + "epoch": 0.48325369949436375, + "grad_norm": 0.12367252260446548, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 125010 + }, + { + "epoch": 0.48329235669774706, + "grad_norm": 0.11487246304750443, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 125020 + }, + { + "epoch": 0.4833310139011303, + "grad_norm": 0.10422641038894653, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 125030 + }, + { + "epoch": 0.4833696711045136, + "grad_norm": 0.09209615737199783, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 125040 + }, + { + "epoch": 0.4834083283078969, + "grad_norm": 0.1276664286851883, + "learning_rate": 0.002, + "loss": 2.338, + "step": 125050 + }, + { + "epoch": 0.4834469855112802, + "grad_norm": 0.10056187212467194, + "learning_rate": 0.002, + "loss": 2.348, + "step": 125060 + }, + { + "epoch": 0.48348564271466343, + "grad_norm": 0.1188889592885971, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 125070 + }, + { + "epoch": 0.48352429991804674, + "grad_norm": 0.11096154153347015, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 125080 + }, + { + "epoch": 0.48356295712143, + "grad_norm": 0.119388647377491, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 125090 + }, + { + "epoch": 0.4836016143248133, + "grad_norm": 0.09515579789876938, + "learning_rate": 0.002, + "loss": 2.333, + "step": 125100 + }, + { + "epoch": 0.48364027152819655, + "grad_norm": 0.09334330260753632, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 125110 + }, + { + "epoch": 0.48367892873157986, + "grad_norm": 0.0963970422744751, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 125120 + }, + { + "epoch": 0.4837175859349631, + "grad_norm": 0.10869818180799484, + "learning_rate": 0.002, + "loss": 2.345, + "step": 125130 + }, + { + "epoch": 0.4837562431383464, + "grad_norm": 0.11080613732337952, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 125140 + }, + { + "epoch": 0.4837949003417297, + "grad_norm": 0.11019641160964966, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 125150 + }, + { + "epoch": 0.483833557545113, + "grad_norm": 0.11051522940397263, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 125160 + }, + { + "epoch": 0.48387221474849623, + "grad_norm": 0.12491607666015625, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 125170 + }, + { + "epoch": 0.48391087195187954, + "grad_norm": 0.11003629863262177, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 125180 + }, + { + "epoch": 0.4839495291552628, + "grad_norm": 0.10570420324802399, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 125190 + }, + { + "epoch": 0.48398818635864604, + "grad_norm": 0.10359431803226471, + "learning_rate": 0.002, + "loss": 2.357, + "step": 125200 + }, + { + "epoch": 0.48402684356202935, + "grad_norm": 0.10841794312000275, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 125210 + }, + { + "epoch": 0.4840655007654126, + "grad_norm": 0.1006321832537651, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 125220 + }, + { + "epoch": 0.4841041579687959, + "grad_norm": 0.1357637345790863, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 125230 + }, + { + "epoch": 0.48414281517217916, + "grad_norm": 0.10982130467891693, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 125240 + }, + { + "epoch": 0.48418147237556247, + "grad_norm": 0.09725379198789597, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 125250 + }, + { + "epoch": 0.4842201295789457, + "grad_norm": 0.11814479529857635, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 125260 + }, + { + "epoch": 0.48425878678232903, + "grad_norm": 0.10812816023826599, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 125270 + }, + { + "epoch": 0.4842974439857123, + "grad_norm": 0.10966924577951431, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 125280 + }, + { + "epoch": 0.4843361011890956, + "grad_norm": 0.1525818407535553, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 125290 + }, + { + "epoch": 0.48437475839247884, + "grad_norm": 0.10309594124555588, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 125300 + }, + { + "epoch": 0.48441341559586215, + "grad_norm": 0.12443433701992035, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 125310 + }, + { + "epoch": 0.4844520727992454, + "grad_norm": 0.0991070494055748, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 125320 + }, + { + "epoch": 0.4844907300026287, + "grad_norm": 0.10828681290149689, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 125330 + }, + { + "epoch": 0.48452938720601196, + "grad_norm": 0.10650653392076492, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 125340 + }, + { + "epoch": 0.48456804440939527, + "grad_norm": 0.10827747732400894, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 125350 + }, + { + "epoch": 0.4846067016127785, + "grad_norm": 0.12869510054588318, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 125360 + }, + { + "epoch": 0.48464535881616183, + "grad_norm": 0.11589004844427109, + "learning_rate": 0.002, + "loss": 2.346, + "step": 125370 + }, + { + "epoch": 0.4846840160195451, + "grad_norm": 0.29300642013549805, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 125380 + }, + { + "epoch": 0.48472267322292834, + "grad_norm": 0.0952758640050888, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 125390 + }, + { + "epoch": 0.48476133042631164, + "grad_norm": 0.10119640827178955, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 125400 + }, + { + "epoch": 0.4847999876296949, + "grad_norm": 0.10181231796741486, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 125410 + }, + { + "epoch": 0.4848386448330782, + "grad_norm": 0.10348128527402878, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 125420 + }, + { + "epoch": 0.48487730203646145, + "grad_norm": 0.09146574139595032, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 125430 + }, + { + "epoch": 0.48491595923984476, + "grad_norm": 0.10100752860307693, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 125440 + }, + { + "epoch": 0.484954616443228, + "grad_norm": 0.108201764523983, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 125450 + }, + { + "epoch": 0.4849932736466113, + "grad_norm": 0.10299346596002579, + "learning_rate": 0.002, + "loss": 2.356, + "step": 125460 + }, + { + "epoch": 0.4850319308499946, + "grad_norm": 0.12397720664739609, + "learning_rate": 0.002, + "loss": 2.335, + "step": 125470 + }, + { + "epoch": 0.4850705880533779, + "grad_norm": 0.1161360964179039, + "learning_rate": 0.002, + "loss": 2.334, + "step": 125480 + }, + { + "epoch": 0.48510924525676113, + "grad_norm": 0.11088062077760696, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 125490 + }, + { + "epoch": 0.48514790246014444, + "grad_norm": 0.12293235212564468, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 125500 + }, + { + "epoch": 0.4851865596635277, + "grad_norm": 0.09698466211557388, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 125510 + }, + { + "epoch": 0.485225216866911, + "grad_norm": 0.10574761778116226, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 125520 + }, + { + "epoch": 0.48526387407029425, + "grad_norm": 0.10661862790584564, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 125530 + }, + { + "epoch": 0.48530253127367756, + "grad_norm": 0.1068946123123169, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 125540 + }, + { + "epoch": 0.4853411884770608, + "grad_norm": 0.10147390514612198, + "learning_rate": 0.002, + "loss": 2.338, + "step": 125550 + }, + { + "epoch": 0.48537984568044407, + "grad_norm": 0.10097593814134598, + "learning_rate": 0.002, + "loss": 2.336, + "step": 125560 + }, + { + "epoch": 0.4854185028838274, + "grad_norm": 0.11848179996013641, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 125570 + }, + { + "epoch": 0.4854571600872106, + "grad_norm": 0.10089132189750671, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 125580 + }, + { + "epoch": 0.48549581729059393, + "grad_norm": 0.12234753370285034, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 125590 + }, + { + "epoch": 0.4855344744939772, + "grad_norm": 0.10000978410243988, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 125600 + }, + { + "epoch": 0.4855731316973605, + "grad_norm": 0.11176779121160507, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 125610 + }, + { + "epoch": 0.48561178890074375, + "grad_norm": 0.11679171770811081, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 125620 + }, + { + "epoch": 0.48565044610412705, + "grad_norm": 0.1252773553133011, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 125630 + }, + { + "epoch": 0.4856891033075103, + "grad_norm": 0.11438603699207306, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 125640 + }, + { + "epoch": 0.4857277605108936, + "grad_norm": 0.11496452242136002, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 125650 + }, + { + "epoch": 0.48576641771427687, + "grad_norm": 0.11636804789304733, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 125660 + }, + { + "epoch": 0.4858050749176602, + "grad_norm": 0.11302290111780167, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 125670 + }, + { + "epoch": 0.4858437321210434, + "grad_norm": 0.1039479523897171, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 125680 + }, + { + "epoch": 0.48588238932442673, + "grad_norm": 0.09467640519142151, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 125690 + }, + { + "epoch": 0.48592104652781, + "grad_norm": 0.12681657075881958, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 125700 + }, + { + "epoch": 0.4859597037311933, + "grad_norm": 0.09812672436237335, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 125710 + }, + { + "epoch": 0.48599836093457655, + "grad_norm": 0.1105395182967186, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 125720 + }, + { + "epoch": 0.48603701813795985, + "grad_norm": 0.09311152249574661, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 125730 + }, + { + "epoch": 0.4860756753413431, + "grad_norm": 0.1042751669883728, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 125740 + }, + { + "epoch": 0.48611433254472636, + "grad_norm": 0.11312663555145264, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 125750 + }, + { + "epoch": 0.48615298974810967, + "grad_norm": 0.11612541973590851, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 125760 + }, + { + "epoch": 0.4861916469514929, + "grad_norm": 0.09691762179136276, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 125770 + }, + { + "epoch": 0.4862303041548762, + "grad_norm": 0.11213060468435287, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 125780 + }, + { + "epoch": 0.4862689613582595, + "grad_norm": 0.10857041925191879, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 125790 + }, + { + "epoch": 0.4863076185616428, + "grad_norm": 0.15382245182991028, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 125800 + }, + { + "epoch": 0.48634627576502604, + "grad_norm": 0.10181780904531479, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 125810 + }, + { + "epoch": 0.48638493296840934, + "grad_norm": 0.09610498696565628, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 125820 + }, + { + "epoch": 0.4864235901717926, + "grad_norm": 0.11725539714097977, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 125830 + }, + { + "epoch": 0.4864622473751759, + "grad_norm": 0.09810120612382889, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 125840 + }, + { + "epoch": 0.48650090457855916, + "grad_norm": 0.11101103574037552, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 125850 + }, + { + "epoch": 0.48653956178194246, + "grad_norm": 0.10683691501617432, + "learning_rate": 0.002, + "loss": 2.339, + "step": 125860 + }, + { + "epoch": 0.4865782189853257, + "grad_norm": 0.10464399307966232, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 125870 + }, + { + "epoch": 0.486616876188709, + "grad_norm": 0.11023180186748505, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 125880 + }, + { + "epoch": 0.4866555333920923, + "grad_norm": 0.11341548711061478, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 125890 + }, + { + "epoch": 0.4866941905954756, + "grad_norm": 0.10827740281820297, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 125900 + }, + { + "epoch": 0.48673284779885884, + "grad_norm": 0.10786271095275879, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 125910 + }, + { + "epoch": 0.48677150500224214, + "grad_norm": 0.0976567268371582, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 125920 + }, + { + "epoch": 0.4868101622056254, + "grad_norm": 0.11942003667354584, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 125930 + }, + { + "epoch": 0.48684881940900865, + "grad_norm": 0.10531553626060486, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 125940 + }, + { + "epoch": 0.48688747661239196, + "grad_norm": 0.1033153161406517, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 125950 + }, + { + "epoch": 0.4869261338157752, + "grad_norm": 0.0929771140217781, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 125960 + }, + { + "epoch": 0.4869647910191585, + "grad_norm": 0.13447654247283936, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 125970 + }, + { + "epoch": 0.48700344822254177, + "grad_norm": 0.11186330020427704, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 125980 + }, + { + "epoch": 0.4870421054259251, + "grad_norm": 0.11655236035585403, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 125990 + }, + { + "epoch": 0.48708076262930833, + "grad_norm": 0.11176680773496628, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 126000 + }, + { + "epoch": 0.48711941983269164, + "grad_norm": 0.11391759663820267, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 126010 + }, + { + "epoch": 0.4871580770360749, + "grad_norm": 0.10402825474739075, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 126020 + }, + { + "epoch": 0.4871967342394582, + "grad_norm": 0.11938010901212692, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 126030 + }, + { + "epoch": 0.48723539144284145, + "grad_norm": 0.10057584196329117, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 126040 + }, + { + "epoch": 0.48727404864622476, + "grad_norm": 0.10689040273427963, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 126050 + }, + { + "epoch": 0.487312705849608, + "grad_norm": 0.09272339940071106, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 126060 + }, + { + "epoch": 0.4873513630529913, + "grad_norm": 0.11007925868034363, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 126070 + }, + { + "epoch": 0.48739002025637457, + "grad_norm": 0.09506697207689285, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 126080 + }, + { + "epoch": 0.4874286774597579, + "grad_norm": 0.12454500794410706, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 126090 + }, + { + "epoch": 0.4874673346631411, + "grad_norm": 0.10810870677232742, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 126100 + }, + { + "epoch": 0.48750599186652444, + "grad_norm": 0.10550742596387863, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 126110 + }, + { + "epoch": 0.4875446490699077, + "grad_norm": 0.10646315664052963, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 126120 + }, + { + "epoch": 0.48758330627329094, + "grad_norm": 0.10827473551034927, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 126130 + }, + { + "epoch": 0.48762196347667425, + "grad_norm": 0.11294981837272644, + "learning_rate": 0.002, + "loss": 2.357, + "step": 126140 + }, + { + "epoch": 0.4876606206800575, + "grad_norm": 0.10049781948328018, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 126150 + }, + { + "epoch": 0.4876992778834408, + "grad_norm": 0.12226846069097519, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 126160 + }, + { + "epoch": 0.48773793508682406, + "grad_norm": 0.12626990675926208, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 126170 + }, + { + "epoch": 0.48777659229020737, + "grad_norm": 0.1084417849779129, + "learning_rate": 0.002, + "loss": 2.334, + "step": 126180 + }, + { + "epoch": 0.4878152494935906, + "grad_norm": 0.11008724570274353, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 126190 + }, + { + "epoch": 0.4878539066969739, + "grad_norm": 0.10245554149150848, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 126200 + }, + { + "epoch": 0.4878925639003572, + "grad_norm": 0.11041338741779327, + "learning_rate": 0.002, + "loss": 2.338, + "step": 126210 + }, + { + "epoch": 0.4879312211037405, + "grad_norm": 0.1254657804965973, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 126220 + }, + { + "epoch": 0.48796987830712374, + "grad_norm": 0.10623673349618912, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 126230 + }, + { + "epoch": 0.48800853551050705, + "grad_norm": 0.12370909750461578, + "learning_rate": 0.002, + "loss": 2.348, + "step": 126240 + }, + { + "epoch": 0.4880471927138903, + "grad_norm": 0.11274704337120056, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 126250 + }, + { + "epoch": 0.4880858499172736, + "grad_norm": 0.11869961023330688, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 126260 + }, + { + "epoch": 0.48812450712065686, + "grad_norm": 0.1175747886300087, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 126270 + }, + { + "epoch": 0.48816316432404017, + "grad_norm": 0.27420172095298767, + "learning_rate": 0.002, + "loss": 2.3672, + "step": 126280 + }, + { + "epoch": 0.4882018215274234, + "grad_norm": 0.11592134833335876, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 126290 + }, + { + "epoch": 0.4882404787308067, + "grad_norm": 0.1103459820151329, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 126300 + }, + { + "epoch": 0.48827913593419, + "grad_norm": 0.09347846359014511, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 126310 + }, + { + "epoch": 0.48831779313757323, + "grad_norm": 0.1197313666343689, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 126320 + }, + { + "epoch": 0.48835645034095654, + "grad_norm": 0.11836214363574982, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 126330 + }, + { + "epoch": 0.4883951075443398, + "grad_norm": 0.10949929803609848, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 126340 + }, + { + "epoch": 0.4884337647477231, + "grad_norm": 0.10934491455554962, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 126350 + }, + { + "epoch": 0.48847242195110635, + "grad_norm": 0.09713473916053772, + "learning_rate": 0.002, + "loss": 2.359, + "step": 126360 + }, + { + "epoch": 0.48851107915448966, + "grad_norm": 0.1307293176651001, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 126370 + }, + { + "epoch": 0.4885497363578729, + "grad_norm": 0.10963564366102219, + "learning_rate": 0.002, + "loss": 2.346, + "step": 126380 + }, + { + "epoch": 0.4885883935612562, + "grad_norm": 0.10998211055994034, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 126390 + }, + { + "epoch": 0.48862705076463947, + "grad_norm": 0.10524381697177887, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 126400 + }, + { + "epoch": 0.4886657079680228, + "grad_norm": 0.11870896816253662, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 126410 + }, + { + "epoch": 0.48870436517140603, + "grad_norm": 0.11174721270799637, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 126420 + }, + { + "epoch": 0.48874302237478934, + "grad_norm": 0.14074304699897766, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 126430 + }, + { + "epoch": 0.4887816795781726, + "grad_norm": 0.10730758309364319, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 126440 + }, + { + "epoch": 0.4888203367815559, + "grad_norm": 0.10556095838546753, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 126450 + }, + { + "epoch": 0.48885899398493915, + "grad_norm": 0.10455413907766342, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 126460 + }, + { + "epoch": 0.48889765118832246, + "grad_norm": 0.2152157425880432, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 126470 + }, + { + "epoch": 0.4889363083917057, + "grad_norm": 0.10853318870067596, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 126480 + }, + { + "epoch": 0.48897496559508896, + "grad_norm": 0.11397414654493332, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 126490 + }, + { + "epoch": 0.48901362279847227, + "grad_norm": 0.09437750279903412, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 126500 + }, + { + "epoch": 0.4890522800018555, + "grad_norm": 0.10448405146598816, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 126510 + }, + { + "epoch": 0.48909093720523883, + "grad_norm": 0.1160251796245575, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 126520 + }, + { + "epoch": 0.4891295944086221, + "grad_norm": 0.09574171900749207, + "learning_rate": 0.002, + "loss": 2.338, + "step": 126530 + }, + { + "epoch": 0.4891682516120054, + "grad_norm": 0.12774688005447388, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 126540 + }, + { + "epoch": 0.48920690881538864, + "grad_norm": 0.106713205575943, + "learning_rate": 0.002, + "loss": 2.34, + "step": 126550 + }, + { + "epoch": 0.48924556601877195, + "grad_norm": 0.11310985684394836, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 126560 + }, + { + "epoch": 0.4892842232221552, + "grad_norm": 0.10112718492746353, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 126570 + }, + { + "epoch": 0.4893228804255385, + "grad_norm": 0.11339341849088669, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 126580 + }, + { + "epoch": 0.48936153762892176, + "grad_norm": 0.10261591523885727, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 126590 + }, + { + "epoch": 0.48940019483230507, + "grad_norm": 0.10041617602109909, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 126600 + }, + { + "epoch": 0.4894388520356883, + "grad_norm": 0.10402018576860428, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 126610 + }, + { + "epoch": 0.48947750923907163, + "grad_norm": 0.09408276528120041, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 126620 + }, + { + "epoch": 0.4895161664424549, + "grad_norm": 0.10646459460258484, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 126630 + }, + { + "epoch": 0.4895548236458382, + "grad_norm": 0.12016189843416214, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 126640 + }, + { + "epoch": 0.48959348084922144, + "grad_norm": 0.0968141183257103, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 126650 + }, + { + "epoch": 0.48963213805260475, + "grad_norm": 0.10623986274003983, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 126660 + }, + { + "epoch": 0.489670795255988, + "grad_norm": 0.10878663510084152, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 126670 + }, + { + "epoch": 0.48970945245937125, + "grad_norm": 0.10133571922779083, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 126680 + }, + { + "epoch": 0.48974810966275456, + "grad_norm": 0.1187170147895813, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 126690 + }, + { + "epoch": 0.4897867668661378, + "grad_norm": 0.09845730662345886, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 126700 + }, + { + "epoch": 0.4898254240695211, + "grad_norm": 0.12468785792589188, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 126710 + }, + { + "epoch": 0.4898640812729044, + "grad_norm": 0.09631633758544922, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 126720 + }, + { + "epoch": 0.4899027384762877, + "grad_norm": 0.10853462666273117, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 126730 + }, + { + "epoch": 0.48994139567967093, + "grad_norm": 0.10383742302656174, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 126740 + }, + { + "epoch": 0.48998005288305424, + "grad_norm": 0.1175794005393982, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 126750 + }, + { + "epoch": 0.4900187100864375, + "grad_norm": 0.10090035945177078, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 126760 + }, + { + "epoch": 0.4900573672898208, + "grad_norm": 0.09507987648248672, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 126770 + }, + { + "epoch": 0.49009602449320405, + "grad_norm": 0.12469930946826935, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 126780 + }, + { + "epoch": 0.49013468169658736, + "grad_norm": 0.10618849843740463, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 126790 + }, + { + "epoch": 0.4901733388999706, + "grad_norm": 0.11613799631595612, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 126800 + }, + { + "epoch": 0.4902119961033539, + "grad_norm": 0.11558814346790314, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 126810 + }, + { + "epoch": 0.49025065330673717, + "grad_norm": 0.11019840091466904, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 126820 + }, + { + "epoch": 0.4902893105101205, + "grad_norm": 0.11108105629682541, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 126830 + }, + { + "epoch": 0.49032796771350373, + "grad_norm": 0.11231342703104019, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 126840 + }, + { + "epoch": 0.49036662491688704, + "grad_norm": 0.10218144208192825, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 126850 + }, + { + "epoch": 0.4904052821202703, + "grad_norm": 0.11010507494211197, + "learning_rate": 0.002, + "loss": 2.3713, + "step": 126860 + }, + { + "epoch": 0.49044393932365354, + "grad_norm": 0.11340287327766418, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 126870 + }, + { + "epoch": 0.49048259652703685, + "grad_norm": 0.09621971100568771, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 126880 + }, + { + "epoch": 0.4905212537304201, + "grad_norm": 0.12170908600091934, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 126890 + }, + { + "epoch": 0.4905599109338034, + "grad_norm": 0.10535162687301636, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 126900 + }, + { + "epoch": 0.49059856813718666, + "grad_norm": 0.1528225988149643, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 126910 + }, + { + "epoch": 0.49063722534056997, + "grad_norm": 0.12900884449481964, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 126920 + }, + { + "epoch": 0.4906758825439532, + "grad_norm": 0.1097668707370758, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 126930 + }, + { + "epoch": 0.49071453974733653, + "grad_norm": 0.10988660156726837, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 126940 + }, + { + "epoch": 0.4907531969507198, + "grad_norm": 0.09913916140794754, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 126950 + }, + { + "epoch": 0.4907918541541031, + "grad_norm": 0.09544407576322556, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 126960 + }, + { + "epoch": 0.49083051135748634, + "grad_norm": 0.11369612067937851, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 126970 + }, + { + "epoch": 0.49086916856086965, + "grad_norm": 0.10173063725233078, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 126980 + }, + { + "epoch": 0.4909078257642529, + "grad_norm": 0.10611432045698166, + "learning_rate": 0.002, + "loss": 2.34, + "step": 126990 + }, + { + "epoch": 0.4909464829676362, + "grad_norm": 0.10398635268211365, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 127000 + }, + { + "epoch": 0.49098514017101946, + "grad_norm": 0.11209771037101746, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 127010 + }, + { + "epoch": 0.49102379737440277, + "grad_norm": 0.10647541284561157, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 127020 + }, + { + "epoch": 0.491062454577786, + "grad_norm": 0.11205994337797165, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 127030 + }, + { + "epoch": 0.49110111178116933, + "grad_norm": 0.10731545090675354, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 127040 + }, + { + "epoch": 0.4911397689845526, + "grad_norm": 0.13301315903663635, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 127050 + }, + { + "epoch": 0.49117842618793583, + "grad_norm": 0.1114644855260849, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 127060 + }, + { + "epoch": 0.49121708339131914, + "grad_norm": 0.09928295761346817, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 127070 + }, + { + "epoch": 0.4912557405947024, + "grad_norm": 0.14364327490329742, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 127080 + }, + { + "epoch": 0.4912943977980857, + "grad_norm": 0.11864378303289413, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 127090 + }, + { + "epoch": 0.49133305500146895, + "grad_norm": 0.09710802137851715, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 127100 + }, + { + "epoch": 0.49137171220485226, + "grad_norm": 0.10308346152305603, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 127110 + }, + { + "epoch": 0.4914103694082355, + "grad_norm": 0.10085774958133698, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 127120 + }, + { + "epoch": 0.4914490266116188, + "grad_norm": 0.11910171061754227, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 127130 + }, + { + "epoch": 0.4914876838150021, + "grad_norm": 0.11054100096225739, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 127140 + }, + { + "epoch": 0.4915263410183854, + "grad_norm": 0.1082221195101738, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 127150 + }, + { + "epoch": 0.49156499822176863, + "grad_norm": 0.11659803986549377, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 127160 + }, + { + "epoch": 0.49160365542515194, + "grad_norm": 0.11233268678188324, + "learning_rate": 0.002, + "loss": 2.351, + "step": 127170 + }, + { + "epoch": 0.4916423126285352, + "grad_norm": 0.10978901386260986, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 127180 + }, + { + "epoch": 0.4916809698319185, + "grad_norm": 0.1231280267238617, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 127190 + }, + { + "epoch": 0.49171962703530175, + "grad_norm": 0.11095661669969559, + "learning_rate": 0.002, + "loss": 2.344, + "step": 127200 + }, + { + "epoch": 0.49175828423868506, + "grad_norm": 0.11212368309497833, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 127210 + }, + { + "epoch": 0.4917969414420683, + "grad_norm": 0.12635929882526398, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 127220 + }, + { + "epoch": 0.49183559864545157, + "grad_norm": 0.11959027498960495, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 127230 + }, + { + "epoch": 0.4918742558488349, + "grad_norm": 0.11061688512563705, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 127240 + }, + { + "epoch": 0.4919129130522181, + "grad_norm": 0.09959302842617035, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 127250 + }, + { + "epoch": 0.49195157025560143, + "grad_norm": 0.10022852569818497, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 127260 + }, + { + "epoch": 0.4919902274589847, + "grad_norm": 0.13130536675453186, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 127270 + }, + { + "epoch": 0.492028884662368, + "grad_norm": 0.31289398670196533, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 127280 + }, + { + "epoch": 0.49206754186575125, + "grad_norm": 0.09189872443675995, + "learning_rate": 0.002, + "loss": 2.354, + "step": 127290 + }, + { + "epoch": 0.49210619906913455, + "grad_norm": 0.11592382937669754, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 127300 + }, + { + "epoch": 0.4921448562725178, + "grad_norm": 0.10866158455610275, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 127310 + }, + { + "epoch": 0.4921835134759011, + "grad_norm": 0.11046077311038971, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 127320 + }, + { + "epoch": 0.49222217067928437, + "grad_norm": 0.11631123721599579, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 127330 + }, + { + "epoch": 0.4922608278826677, + "grad_norm": 0.1088087409734726, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 127340 + }, + { + "epoch": 0.4922994850860509, + "grad_norm": 0.11409708112478256, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 127350 + }, + { + "epoch": 0.49233814228943423, + "grad_norm": 0.10936682671308517, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 127360 + }, + { + "epoch": 0.4923767994928175, + "grad_norm": 0.13295626640319824, + "learning_rate": 0.002, + "loss": 2.347, + "step": 127370 + }, + { + "epoch": 0.4924154566962008, + "grad_norm": 0.1150369718670845, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 127380 + }, + { + "epoch": 0.49245411389958405, + "grad_norm": 0.11830049008131027, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 127390 + }, + { + "epoch": 0.49249277110296735, + "grad_norm": 0.10365139693021774, + "learning_rate": 0.002, + "loss": 2.331, + "step": 127400 + }, + { + "epoch": 0.4925314283063506, + "grad_norm": 0.10849923640489578, + "learning_rate": 0.002, + "loss": 2.342, + "step": 127410 + }, + { + "epoch": 0.49257008550973386, + "grad_norm": 0.11630803346633911, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 127420 + }, + { + "epoch": 0.49260874271311716, + "grad_norm": 0.11408057808876038, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 127430 + }, + { + "epoch": 0.4926473999165004, + "grad_norm": 0.10046610981225967, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 127440 + }, + { + "epoch": 0.4926860571198837, + "grad_norm": 0.1012267991900444, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 127450 + }, + { + "epoch": 0.492724714323267, + "grad_norm": 0.11138573288917542, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 127460 + }, + { + "epoch": 0.4927633715266503, + "grad_norm": 0.10981214046478271, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 127470 + }, + { + "epoch": 0.49280202873003354, + "grad_norm": 0.11571098864078522, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 127480 + }, + { + "epoch": 0.49284068593341684, + "grad_norm": 0.11805303394794464, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 127490 + }, + { + "epoch": 0.4928793431368001, + "grad_norm": 0.09402794390916824, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 127500 + }, + { + "epoch": 0.4929180003401834, + "grad_norm": 0.1224559023976326, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 127510 + }, + { + "epoch": 0.49295665754356666, + "grad_norm": 0.09864436089992523, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 127520 + }, + { + "epoch": 0.49299531474694996, + "grad_norm": 0.10893671214580536, + "learning_rate": 0.002, + "loss": 2.346, + "step": 127530 + }, + { + "epoch": 0.4930339719503332, + "grad_norm": 0.11042977124452591, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 127540 + }, + { + "epoch": 0.4930726291537165, + "grad_norm": 0.1300388127565384, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 127550 + }, + { + "epoch": 0.4931112863570998, + "grad_norm": 0.10813228785991669, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 127560 + }, + { + "epoch": 0.4931499435604831, + "grad_norm": 0.10544244199991226, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 127570 + }, + { + "epoch": 0.49318860076386634, + "grad_norm": 0.09903642535209656, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 127580 + }, + { + "epoch": 0.49322725796724964, + "grad_norm": 0.11719724535942078, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 127590 + }, + { + "epoch": 0.4932659151706329, + "grad_norm": 0.11094460636377335, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 127600 + }, + { + "epoch": 0.49330457237401615, + "grad_norm": 0.12684324383735657, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 127610 + }, + { + "epoch": 0.49334322957739946, + "grad_norm": 0.11697521805763245, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 127620 + }, + { + "epoch": 0.4933818867807827, + "grad_norm": 0.11327816545963287, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 127630 + }, + { + "epoch": 0.493420543984166, + "grad_norm": 0.09532005339860916, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 127640 + }, + { + "epoch": 0.49345920118754927, + "grad_norm": 0.1013709083199501, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 127650 + }, + { + "epoch": 0.4934978583909326, + "grad_norm": 0.09491346776485443, + "learning_rate": 0.002, + "loss": 2.344, + "step": 127660 + }, + { + "epoch": 0.4935365155943158, + "grad_norm": 0.11858315020799637, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 127670 + }, + { + "epoch": 0.49357517279769914, + "grad_norm": 0.10509752482175827, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 127680 + }, + { + "epoch": 0.4936138300010824, + "grad_norm": 0.11521925032138824, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 127690 + }, + { + "epoch": 0.4936524872044657, + "grad_norm": 0.11256805062294006, + "learning_rate": 0.002, + "loss": 2.351, + "step": 127700 + }, + { + "epoch": 0.49369114440784895, + "grad_norm": 0.11606287211179733, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 127710 + }, + { + "epoch": 0.49372980161123226, + "grad_norm": 0.10571981966495514, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 127720 + }, + { + "epoch": 0.4937684588146155, + "grad_norm": 0.12093351036310196, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 127730 + }, + { + "epoch": 0.4938071160179988, + "grad_norm": 0.1062152162194252, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 127740 + }, + { + "epoch": 0.49384577322138207, + "grad_norm": 0.10367966443300247, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 127750 + }, + { + "epoch": 0.4938844304247654, + "grad_norm": 0.127140611410141, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 127760 + }, + { + "epoch": 0.4939230876281486, + "grad_norm": 0.10023011267185211, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 127770 + }, + { + "epoch": 0.49396174483153193, + "grad_norm": 0.10536835342645645, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 127780 + }, + { + "epoch": 0.4940004020349152, + "grad_norm": 0.10575197637081146, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 127790 + }, + { + "epoch": 0.49403905923829844, + "grad_norm": 0.10220493376255035, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 127800 + }, + { + "epoch": 0.49407771644168175, + "grad_norm": 0.10080619156360626, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 127810 + }, + { + "epoch": 0.494116373645065, + "grad_norm": 0.10619170218706131, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 127820 + }, + { + "epoch": 0.4941550308484483, + "grad_norm": 0.13515208661556244, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 127830 + }, + { + "epoch": 0.49419368805183156, + "grad_norm": 0.11757437884807587, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 127840 + }, + { + "epoch": 0.49423234525521487, + "grad_norm": 0.10760632157325745, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 127850 + }, + { + "epoch": 0.4942710024585981, + "grad_norm": 0.10090624541044235, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 127860 + }, + { + "epoch": 0.4943096596619814, + "grad_norm": 0.11018454283475876, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 127870 + }, + { + "epoch": 0.4943483168653647, + "grad_norm": 0.1165466159582138, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 127880 + }, + { + "epoch": 0.494386974068748, + "grad_norm": 0.1118311733007431, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 127890 + }, + { + "epoch": 0.49442563127213124, + "grad_norm": 0.10799223929643631, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 127900 + }, + { + "epoch": 0.49446428847551455, + "grad_norm": 0.10268426686525345, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 127910 + }, + { + "epoch": 0.4945029456788978, + "grad_norm": 0.11427894234657288, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 127920 + }, + { + "epoch": 0.4945416028822811, + "grad_norm": 0.10355595499277115, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 127930 + }, + { + "epoch": 0.49458026008566436, + "grad_norm": 0.11134828627109528, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 127940 + }, + { + "epoch": 0.49461891728904767, + "grad_norm": 0.10238852351903915, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 127950 + }, + { + "epoch": 0.4946575744924309, + "grad_norm": 0.11048177629709244, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 127960 + }, + { + "epoch": 0.49469623169581417, + "grad_norm": 0.11856822669506073, + "learning_rate": 0.002, + "loss": 2.369, + "step": 127970 + }, + { + "epoch": 0.4947348888991975, + "grad_norm": 0.10213296115398407, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 127980 + }, + { + "epoch": 0.49477354610258073, + "grad_norm": 0.11366453021764755, + "learning_rate": 0.002, + "loss": 2.348, + "step": 127990 + }, + { + "epoch": 0.49481220330596404, + "grad_norm": 0.10160555690526962, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 128000 + }, + { + "epoch": 0.4948508605093473, + "grad_norm": 0.10848793387413025, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 128010 + }, + { + "epoch": 0.4948895177127306, + "grad_norm": 0.09237051010131836, + "learning_rate": 0.002, + "loss": 2.363, + "step": 128020 + }, + { + "epoch": 0.49492817491611385, + "grad_norm": 0.11140402406454086, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 128030 + }, + { + "epoch": 0.49496683211949716, + "grad_norm": 0.109286829829216, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 128040 + }, + { + "epoch": 0.4950054893228804, + "grad_norm": 0.1083264946937561, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 128050 + }, + { + "epoch": 0.4950441465262637, + "grad_norm": 0.1095544695854187, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 128060 + }, + { + "epoch": 0.49508280372964697, + "grad_norm": 0.12022789567708969, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 128070 + }, + { + "epoch": 0.4951214609330303, + "grad_norm": 0.10353036969900131, + "learning_rate": 0.002, + "loss": 2.34, + "step": 128080 + }, + { + "epoch": 0.49516011813641353, + "grad_norm": 0.1073271781206131, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 128090 + }, + { + "epoch": 0.49519877533979684, + "grad_norm": 0.09842212498188019, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 128100 + }, + { + "epoch": 0.4952374325431801, + "grad_norm": 0.09421800076961517, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 128110 + }, + { + "epoch": 0.4952760897465634, + "grad_norm": 0.10729176551103592, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 128120 + }, + { + "epoch": 0.49531474694994665, + "grad_norm": 0.09626664221286774, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 128130 + }, + { + "epoch": 0.49535340415332996, + "grad_norm": 0.09824251383543015, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 128140 + }, + { + "epoch": 0.4953920613567132, + "grad_norm": 0.1067798063158989, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 128150 + }, + { + "epoch": 0.49543071856009646, + "grad_norm": 0.09855811297893524, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 128160 + }, + { + "epoch": 0.49546937576347977, + "grad_norm": 0.11580448597669601, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 128170 + }, + { + "epoch": 0.495508032966863, + "grad_norm": 0.10759995877742767, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 128180 + }, + { + "epoch": 0.49554669017024633, + "grad_norm": 0.09637139737606049, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 128190 + }, + { + "epoch": 0.4955853473736296, + "grad_norm": 0.0985492467880249, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 128200 + }, + { + "epoch": 0.4956240045770129, + "grad_norm": 0.10081609338521957, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 128210 + }, + { + "epoch": 0.49566266178039614, + "grad_norm": 0.10274836421012878, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 128220 + }, + { + "epoch": 0.49570131898377945, + "grad_norm": 0.10830628126859665, + "learning_rate": 0.002, + "loss": 2.347, + "step": 128230 + }, + { + "epoch": 0.4957399761871627, + "grad_norm": 0.11750640720129013, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 128240 + }, + { + "epoch": 0.495778633390546, + "grad_norm": 0.10376337170600891, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 128250 + }, + { + "epoch": 0.49581729059392926, + "grad_norm": 0.13361400365829468, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 128260 + }, + { + "epoch": 0.49585594779731257, + "grad_norm": 0.11120904237031937, + "learning_rate": 0.002, + "loss": 2.336, + "step": 128270 + }, + { + "epoch": 0.4958946050006958, + "grad_norm": 0.10228724032640457, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 128280 + }, + { + "epoch": 0.49593326220407913, + "grad_norm": 0.11770356446504593, + "learning_rate": 0.002, + "loss": 2.332, + "step": 128290 + }, + { + "epoch": 0.4959719194074624, + "grad_norm": 0.10043749958276749, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 128300 + }, + { + "epoch": 0.4960105766108457, + "grad_norm": 0.10765615850687027, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 128310 + }, + { + "epoch": 0.49604923381422894, + "grad_norm": 0.11104804277420044, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 128320 + }, + { + "epoch": 0.49608789101761225, + "grad_norm": 0.11131534725427628, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 128330 + }, + { + "epoch": 0.4961265482209955, + "grad_norm": 0.10066875070333481, + "learning_rate": 0.002, + "loss": 2.333, + "step": 128340 + }, + { + "epoch": 0.49616520542437875, + "grad_norm": 0.11946538835763931, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 128350 + }, + { + "epoch": 0.49620386262776206, + "grad_norm": 0.10993815213441849, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 128360 + }, + { + "epoch": 0.4962425198311453, + "grad_norm": 0.11155513674020767, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 128370 + }, + { + "epoch": 0.4962811770345286, + "grad_norm": 0.12873508036136627, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 128380 + }, + { + "epoch": 0.49631983423791187, + "grad_norm": 0.1165364608168602, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 128390 + }, + { + "epoch": 0.4963584914412952, + "grad_norm": 0.11247461289167404, + "learning_rate": 0.002, + "loss": 2.343, + "step": 128400 + }, + { + "epoch": 0.49639714864467843, + "grad_norm": 0.11606108397245407, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 128410 + }, + { + "epoch": 0.49643580584806174, + "grad_norm": 0.09354253858327866, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 128420 + }, + { + "epoch": 0.496474463051445, + "grad_norm": 0.18941552937030792, + "learning_rate": 0.002, + "loss": 2.363, + "step": 128430 + }, + { + "epoch": 0.4965131202548283, + "grad_norm": 0.09419083595275879, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 128440 + }, + { + "epoch": 0.49655177745821155, + "grad_norm": 0.11070279777050018, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 128450 + }, + { + "epoch": 0.49659043466159486, + "grad_norm": 0.09962885826826096, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 128460 + }, + { + "epoch": 0.4966290918649781, + "grad_norm": 0.11432572454214096, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 128470 + }, + { + "epoch": 0.4966677490683614, + "grad_norm": 0.11048974096775055, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 128480 + }, + { + "epoch": 0.49670640627174467, + "grad_norm": 0.14594443142414093, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 128490 + }, + { + "epoch": 0.496745063475128, + "grad_norm": 0.1116267740726471, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 128500 + }, + { + "epoch": 0.49678372067851123, + "grad_norm": 0.10765746980905533, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 128510 + }, + { + "epoch": 0.49682237788189454, + "grad_norm": 0.10805536806583405, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 128520 + }, + { + "epoch": 0.4968610350852778, + "grad_norm": 0.11227666586637497, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 128530 + }, + { + "epoch": 0.49689969228866104, + "grad_norm": 0.12149425595998764, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 128540 + }, + { + "epoch": 0.49693834949204435, + "grad_norm": 0.10537626594305038, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 128550 + }, + { + "epoch": 0.4969770066954276, + "grad_norm": 0.12122012674808502, + "learning_rate": 0.002, + "loss": 2.345, + "step": 128560 + }, + { + "epoch": 0.4970156638988109, + "grad_norm": 0.11633715033531189, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 128570 + }, + { + "epoch": 0.49705432110219416, + "grad_norm": 0.10596884787082672, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 128580 + }, + { + "epoch": 0.49709297830557747, + "grad_norm": 0.12135255336761475, + "learning_rate": 0.002, + "loss": 2.337, + "step": 128590 + }, + { + "epoch": 0.4971316355089607, + "grad_norm": 0.11075810343027115, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 128600 + }, + { + "epoch": 0.49717029271234403, + "grad_norm": 0.10570067167282104, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 128610 + }, + { + "epoch": 0.4972089499157273, + "grad_norm": 0.10998471081256866, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 128620 + }, + { + "epoch": 0.4972476071191106, + "grad_norm": 0.12023153156042099, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 128630 + }, + { + "epoch": 0.49728626432249384, + "grad_norm": 0.0980997234582901, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 128640 + }, + { + "epoch": 0.49732492152587715, + "grad_norm": 0.1252102553844452, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 128650 + }, + { + "epoch": 0.4973635787292604, + "grad_norm": 0.10539890080690384, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 128660 + }, + { + "epoch": 0.4974022359326437, + "grad_norm": 0.10025840252637863, + "learning_rate": 0.002, + "loss": 2.348, + "step": 128670 + }, + { + "epoch": 0.49744089313602696, + "grad_norm": 0.10762711614370346, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 128680 + }, + { + "epoch": 0.49747955033941027, + "grad_norm": 0.11508306860923767, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 128690 + }, + { + "epoch": 0.4975182075427935, + "grad_norm": 0.10356348007917404, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 128700 + }, + { + "epoch": 0.49755686474617683, + "grad_norm": 0.10061529278755188, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 128710 + }, + { + "epoch": 0.4975955219495601, + "grad_norm": 0.10842433571815491, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 128720 + }, + { + "epoch": 0.49763417915294333, + "grad_norm": 0.12198733538389206, + "learning_rate": 0.002, + "loss": 2.335, + "step": 128730 + }, + { + "epoch": 0.49767283635632664, + "grad_norm": 0.10084047168493271, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 128740 + }, + { + "epoch": 0.4977114935597099, + "grad_norm": 0.104703888297081, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 128750 + }, + { + "epoch": 0.4977501507630932, + "grad_norm": 0.10577589273452759, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 128760 + }, + { + "epoch": 0.49778880796647645, + "grad_norm": 0.10533466190099716, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 128770 + }, + { + "epoch": 0.49782746516985976, + "grad_norm": 0.10518418252468109, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 128780 + }, + { + "epoch": 0.497866122373243, + "grad_norm": 0.10616995394229889, + "learning_rate": 0.002, + "loss": 2.362, + "step": 128790 + }, + { + "epoch": 0.4979047795766263, + "grad_norm": 0.10706468671560287, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 128800 + }, + { + "epoch": 0.4979434367800096, + "grad_norm": 0.1249483972787857, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 128810 + }, + { + "epoch": 0.4979820939833929, + "grad_norm": 0.09538888186216354, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 128820 + }, + { + "epoch": 0.49802075118677613, + "grad_norm": 0.10221359133720398, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 128830 + }, + { + "epoch": 0.49805940839015944, + "grad_norm": 0.11646133661270142, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 128840 + }, + { + "epoch": 0.4980980655935427, + "grad_norm": 0.1112755611538887, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 128850 + }, + { + "epoch": 0.498136722796926, + "grad_norm": 0.09283029288053513, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 128860 + }, + { + "epoch": 0.49817538000030925, + "grad_norm": 0.13229452073574066, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 128870 + }, + { + "epoch": 0.49821403720369256, + "grad_norm": 0.1138056218624115, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 128880 + }, + { + "epoch": 0.4982526944070758, + "grad_norm": 0.10786010324954987, + "learning_rate": 0.002, + "loss": 2.346, + "step": 128890 + }, + { + "epoch": 0.49829135161045907, + "grad_norm": 0.10349131375551224, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 128900 + }, + { + "epoch": 0.4983300088138424, + "grad_norm": 0.1148381382226944, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 128910 + }, + { + "epoch": 0.4983686660172256, + "grad_norm": 0.11891456693410873, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 128920 + }, + { + "epoch": 0.49840732322060893, + "grad_norm": 0.10900149494409561, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 128930 + }, + { + "epoch": 0.4984459804239922, + "grad_norm": 0.10199940949678421, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 128940 + }, + { + "epoch": 0.4984846376273755, + "grad_norm": 0.1393444687128067, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 128950 + }, + { + "epoch": 0.49852329483075875, + "grad_norm": 0.2699110507965088, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 128960 + }, + { + "epoch": 0.49856195203414205, + "grad_norm": 0.0982867032289505, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 128970 + }, + { + "epoch": 0.4986006092375253, + "grad_norm": 0.10448093712329865, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 128980 + }, + { + "epoch": 0.4986392664409086, + "grad_norm": 0.11845577508211136, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 128990 + }, + { + "epoch": 0.49867792364429187, + "grad_norm": 0.0945153683423996, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 129000 + }, + { + "epoch": 0.4987165808476752, + "grad_norm": 0.10529087483882904, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 129010 + }, + { + "epoch": 0.4987552380510584, + "grad_norm": 0.10217724740505219, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 129020 + }, + { + "epoch": 0.49879389525444173, + "grad_norm": 0.10975181311368942, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 129030 + }, + { + "epoch": 0.498832552457825, + "grad_norm": 0.11213059723377228, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 129040 + }, + { + "epoch": 0.4988712096612083, + "grad_norm": 0.11686363816261292, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 129050 + }, + { + "epoch": 0.49890986686459154, + "grad_norm": 0.10080690681934357, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 129060 + }, + { + "epoch": 0.49894852406797485, + "grad_norm": 0.12356174737215042, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 129070 + }, + { + "epoch": 0.4989871812713581, + "grad_norm": 0.1226542666554451, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 129080 + }, + { + "epoch": 0.49902583847474136, + "grad_norm": 0.10394148528575897, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 129090 + }, + { + "epoch": 0.49906449567812466, + "grad_norm": 0.1452575922012329, + "learning_rate": 0.002, + "loss": 2.349, + "step": 129100 + }, + { + "epoch": 0.4991031528815079, + "grad_norm": 0.11420813202857971, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 129110 + }, + { + "epoch": 0.4991418100848912, + "grad_norm": 0.0996331125497818, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 129120 + }, + { + "epoch": 0.4991804672882745, + "grad_norm": 0.10831142961978912, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 129130 + }, + { + "epoch": 0.4992191244916578, + "grad_norm": 0.121832475066185, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 129140 + }, + { + "epoch": 0.49925778169504104, + "grad_norm": 0.10833895206451416, + "learning_rate": 0.002, + "loss": 2.346, + "step": 129150 + }, + { + "epoch": 0.49929643889842434, + "grad_norm": 0.10350599884986877, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 129160 + }, + { + "epoch": 0.4993350961018076, + "grad_norm": 0.10486671328544617, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 129170 + }, + { + "epoch": 0.4993737533051909, + "grad_norm": 0.10589516907930374, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 129180 + }, + { + "epoch": 0.49941241050857416, + "grad_norm": 0.10069865733385086, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 129190 + }, + { + "epoch": 0.49945106771195746, + "grad_norm": 0.11168158054351807, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 129200 + }, + { + "epoch": 0.4994897249153407, + "grad_norm": 0.1095777153968811, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 129210 + }, + { + "epoch": 0.499528382118724, + "grad_norm": 0.10152468830347061, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 129220 + }, + { + "epoch": 0.4995670393221073, + "grad_norm": 0.11011513322591782, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 129230 + }, + { + "epoch": 0.4996056965254906, + "grad_norm": 0.10783960670232773, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 129240 + }, + { + "epoch": 0.49964435372887384, + "grad_norm": 0.09583844989538193, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 129250 + }, + { + "epoch": 0.49968301093225714, + "grad_norm": 0.10869450867176056, + "learning_rate": 0.002, + "loss": 2.35, + "step": 129260 + }, + { + "epoch": 0.4997216681356404, + "grad_norm": 0.10898219048976898, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 129270 + }, + { + "epoch": 0.49976032533902365, + "grad_norm": 0.1137361004948616, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 129280 + }, + { + "epoch": 0.49979898254240696, + "grad_norm": 0.09693686664104462, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 129290 + }, + { + "epoch": 0.4998376397457902, + "grad_norm": 0.10234058648347855, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 129300 + }, + { + "epoch": 0.4998762969491735, + "grad_norm": 0.11275883764028549, + "learning_rate": 0.002, + "loss": 2.342, + "step": 129310 + }, + { + "epoch": 0.49991495415255677, + "grad_norm": 0.1097671240568161, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 129320 + }, + { + "epoch": 0.4999536113559401, + "grad_norm": 0.09512518346309662, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 129330 + }, + { + "epoch": 0.4999922685593233, + "grad_norm": 0.10958248376846313, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 129340 + }, + { + "epoch": 0.5000309257627066, + "grad_norm": 0.10563676059246063, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 129350 + }, + { + "epoch": 0.5000695829660899, + "grad_norm": 0.09640498459339142, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 129360 + }, + { + "epoch": 0.5001082401694732, + "grad_norm": 0.11537022888660431, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 129370 + }, + { + "epoch": 0.5001468973728564, + "grad_norm": 0.0939837396144867, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 129380 + }, + { + "epoch": 0.5001855545762397, + "grad_norm": 0.1068049892783165, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 129390 + }, + { + "epoch": 0.5002242117796231, + "grad_norm": 0.12465416640043259, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 129400 + }, + { + "epoch": 0.5002628689830063, + "grad_norm": 0.0991302952170372, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 129410 + }, + { + "epoch": 0.5003015261863896, + "grad_norm": 0.10132446140050888, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 129420 + }, + { + "epoch": 0.5003401833897728, + "grad_norm": 0.11166176944971085, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 129430 + }, + { + "epoch": 0.5003788405931562, + "grad_norm": 0.11701422929763794, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 129440 + }, + { + "epoch": 0.5004174977965394, + "grad_norm": 0.1262463927268982, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 129450 + }, + { + "epoch": 0.5004561549999227, + "grad_norm": 0.10851752758026123, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 129460 + }, + { + "epoch": 0.5004948122033059, + "grad_norm": 0.1244642361998558, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 129470 + }, + { + "epoch": 0.5005334694066892, + "grad_norm": 0.10973615199327469, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 129480 + }, + { + "epoch": 0.5005721266100726, + "grad_norm": 0.11159950494766235, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 129490 + }, + { + "epoch": 0.5006107838134558, + "grad_norm": 0.09034644067287445, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 129500 + }, + { + "epoch": 0.5006494410168391, + "grad_norm": 0.10297048836946487, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 129510 + }, + { + "epoch": 0.5006880982202223, + "grad_norm": 0.10359738022089005, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 129520 + }, + { + "epoch": 0.5007267554236057, + "grad_norm": 0.10247504711151123, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 129530 + }, + { + "epoch": 0.5007654126269889, + "grad_norm": 0.10605670511722565, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 129540 + }, + { + "epoch": 0.5008040698303722, + "grad_norm": 0.10165400058031082, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 129550 + }, + { + "epoch": 0.5008427270337554, + "grad_norm": 0.1123841404914856, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 129560 + }, + { + "epoch": 0.5008813842371388, + "grad_norm": 0.10684897750616074, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 129570 + }, + { + "epoch": 0.500920041440522, + "grad_norm": 0.11832645535469055, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 129580 + }, + { + "epoch": 0.5009586986439053, + "grad_norm": 0.10235374420881271, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 129590 + }, + { + "epoch": 0.5009973558472886, + "grad_norm": 0.10725278407335281, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 129600 + }, + { + "epoch": 0.5010360130506719, + "grad_norm": 0.11649779975414276, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 129610 + }, + { + "epoch": 0.5010746702540552, + "grad_norm": 0.10545599460601807, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 129620 + }, + { + "epoch": 0.5011133274574384, + "grad_norm": 0.10189778357744217, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 129630 + }, + { + "epoch": 0.5011519846608217, + "grad_norm": 0.11193004250526428, + "learning_rate": 0.002, + "loss": 2.354, + "step": 129640 + }, + { + "epoch": 0.5011906418642049, + "grad_norm": 0.11135837435722351, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 129650 + }, + { + "epoch": 0.5012292990675883, + "grad_norm": 0.10167776793241501, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 129660 + }, + { + "epoch": 0.5012679562709715, + "grad_norm": 0.10497588664293289, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 129670 + }, + { + "epoch": 0.5013066134743548, + "grad_norm": 0.13115350902080536, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 129680 + }, + { + "epoch": 0.501345270677738, + "grad_norm": 0.09938187897205353, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 129690 + }, + { + "epoch": 0.5013839278811214, + "grad_norm": 0.09681079536676407, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 129700 + }, + { + "epoch": 0.5014225850845047, + "grad_norm": 0.0996105894446373, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 129710 + }, + { + "epoch": 0.5014612422878879, + "grad_norm": 0.12206865847110748, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 129720 + }, + { + "epoch": 0.5014998994912712, + "grad_norm": 0.10603801906108856, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 129730 + }, + { + "epoch": 0.5015385566946545, + "grad_norm": 0.12295734882354736, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 129740 + }, + { + "epoch": 0.5015772138980378, + "grad_norm": 0.1133185625076294, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 129750 + }, + { + "epoch": 0.501615871101421, + "grad_norm": 0.10275447368621826, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 129760 + }, + { + "epoch": 0.5016545283048043, + "grad_norm": 0.10109806805849075, + "learning_rate": 0.002, + "loss": 2.356, + "step": 129770 + }, + { + "epoch": 0.5016931855081876, + "grad_norm": 0.09771363437175751, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 129780 + }, + { + "epoch": 0.5017318427115709, + "grad_norm": 0.14310520887374878, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 129790 + }, + { + "epoch": 0.5017704999149541, + "grad_norm": 0.11893128603696823, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 129800 + }, + { + "epoch": 0.5018091571183374, + "grad_norm": 0.09260301291942596, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 129810 + }, + { + "epoch": 0.5018478143217208, + "grad_norm": 0.11436281353235245, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 129820 + }, + { + "epoch": 0.501886471525104, + "grad_norm": 0.09662805497646332, + "learning_rate": 0.002, + "loss": 2.345, + "step": 129830 + }, + { + "epoch": 0.5019251287284873, + "grad_norm": 0.1126139834523201, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 129840 + }, + { + "epoch": 0.5019637859318705, + "grad_norm": 0.11318148672580719, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 129850 + }, + { + "epoch": 0.5020024431352538, + "grad_norm": 0.11841975152492523, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 129860 + }, + { + "epoch": 0.5020411003386371, + "grad_norm": 0.08732382208108902, + "learning_rate": 0.002, + "loss": 2.34, + "step": 129870 + }, + { + "epoch": 0.5020797575420204, + "grad_norm": 0.11381547152996063, + "learning_rate": 0.002, + "loss": 2.336, + "step": 129880 + }, + { + "epoch": 0.5021184147454036, + "grad_norm": 0.10493790358304977, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 129890 + }, + { + "epoch": 0.5021570719487869, + "grad_norm": 0.10133479535579681, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 129900 + }, + { + "epoch": 0.5021957291521703, + "grad_norm": 0.10650533437728882, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 129910 + }, + { + "epoch": 0.5022343863555535, + "grad_norm": 0.09722936898469925, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 129920 + }, + { + "epoch": 0.5022730435589368, + "grad_norm": 0.09154581278562546, + "learning_rate": 0.002, + "loss": 2.343, + "step": 129930 + }, + { + "epoch": 0.50231170076232, + "grad_norm": 0.12030097842216492, + "learning_rate": 0.002, + "loss": 2.336, + "step": 129940 + }, + { + "epoch": 0.5023503579657034, + "grad_norm": 0.11775567382574081, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 129950 + }, + { + "epoch": 0.5023890151690866, + "grad_norm": 0.10541986674070358, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 129960 + }, + { + "epoch": 0.5024276723724699, + "grad_norm": 0.12167876213788986, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 129970 + }, + { + "epoch": 0.5024663295758531, + "grad_norm": 0.10480458289384842, + "learning_rate": 0.002, + "loss": 2.347, + "step": 129980 + }, + { + "epoch": 0.5025049867792365, + "grad_norm": 0.10177701711654663, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 129990 + }, + { + "epoch": 0.5025436439826197, + "grad_norm": 0.11270109564065933, + "learning_rate": 0.002, + "loss": 2.34, + "step": 130000 + }, + { + "epoch": 0.502582301186003, + "grad_norm": 0.104780413210392, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 130010 + }, + { + "epoch": 0.5026209583893863, + "grad_norm": 0.09986669570207596, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 130020 + }, + { + "epoch": 0.5026596155927695, + "grad_norm": 0.13555224239826202, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 130030 + }, + { + "epoch": 0.5026982727961529, + "grad_norm": 0.10494866967201233, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 130040 + }, + { + "epoch": 0.5027369299995361, + "grad_norm": 0.11657863110303879, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 130050 + }, + { + "epoch": 0.5027755872029194, + "grad_norm": 0.11362355947494507, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 130060 + }, + { + "epoch": 0.5028142444063026, + "grad_norm": 0.09364651143550873, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 130070 + }, + { + "epoch": 0.502852901609686, + "grad_norm": 0.11096946895122528, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 130080 + }, + { + "epoch": 0.5028915588130692, + "grad_norm": 0.10207531601190567, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 130090 + }, + { + "epoch": 0.5029302160164525, + "grad_norm": 0.10843487828969955, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 130100 + }, + { + "epoch": 0.5029688732198357, + "grad_norm": 0.11173061281442642, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 130110 + }, + { + "epoch": 0.5030075304232191, + "grad_norm": 0.10927116125822067, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 130120 + }, + { + "epoch": 0.5030461876266024, + "grad_norm": 0.10148067772388458, + "learning_rate": 0.002, + "loss": 2.335, + "step": 130130 + }, + { + "epoch": 0.5030848448299856, + "grad_norm": 0.09976466745138168, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 130140 + }, + { + "epoch": 0.5031235020333689, + "grad_norm": 0.10571061074733734, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 130150 + }, + { + "epoch": 0.5031621592367522, + "grad_norm": 0.10799040645360947, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 130160 + }, + { + "epoch": 0.5032008164401355, + "grad_norm": 0.0908653512597084, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 130170 + }, + { + "epoch": 0.5032394736435187, + "grad_norm": 0.10622545331716537, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 130180 + }, + { + "epoch": 0.503278130846902, + "grad_norm": 0.11057796329259872, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 130190 + }, + { + "epoch": 0.5033167880502852, + "grad_norm": 0.10583652555942535, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 130200 + }, + { + "epoch": 0.5033554452536686, + "grad_norm": 0.10508795082569122, + "learning_rate": 0.002, + "loss": 2.345, + "step": 130210 + }, + { + "epoch": 0.5033941024570519, + "grad_norm": 0.09605761617422104, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 130220 + }, + { + "epoch": 0.5034327596604351, + "grad_norm": 0.11529167741537094, + "learning_rate": 0.002, + "loss": 2.363, + "step": 130230 + }, + { + "epoch": 0.5034714168638184, + "grad_norm": 0.09331204742193222, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 130240 + }, + { + "epoch": 0.5035100740672017, + "grad_norm": 0.11913208663463593, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 130250 + }, + { + "epoch": 0.503548731270585, + "grad_norm": 0.10425525903701782, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 130260 + }, + { + "epoch": 0.5035873884739682, + "grad_norm": 0.10939491540193558, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 130270 + }, + { + "epoch": 0.5036260456773515, + "grad_norm": 0.09465670585632324, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 130280 + }, + { + "epoch": 0.5036647028807348, + "grad_norm": 0.1188560351729393, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 130290 + }, + { + "epoch": 0.5037033600841181, + "grad_norm": 0.11721429228782654, + "learning_rate": 0.002, + "loss": 2.344, + "step": 130300 + }, + { + "epoch": 0.5037420172875013, + "grad_norm": 0.097111776471138, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 130310 + }, + { + "epoch": 0.5037806744908846, + "grad_norm": 0.11451169103384018, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 130320 + }, + { + "epoch": 0.503819331694268, + "grad_norm": 0.14097385108470917, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 130330 + }, + { + "epoch": 0.5038579888976512, + "grad_norm": 0.10153955221176147, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 130340 + }, + { + "epoch": 0.5038966461010345, + "grad_norm": 0.1101381704211235, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 130350 + }, + { + "epoch": 0.5039353033044177, + "grad_norm": 0.10483207553625107, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 130360 + }, + { + "epoch": 0.5039739605078011, + "grad_norm": 0.10726115107536316, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 130370 + }, + { + "epoch": 0.5040126177111843, + "grad_norm": 0.11522553116083145, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 130380 + }, + { + "epoch": 0.5040512749145676, + "grad_norm": 0.1336357742547989, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 130390 + }, + { + "epoch": 0.5040899321179508, + "grad_norm": 0.10702812671661377, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 130400 + }, + { + "epoch": 0.5041285893213341, + "grad_norm": 0.10702671110630035, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 130410 + }, + { + "epoch": 0.5041672465247174, + "grad_norm": 0.09648557752370834, + "learning_rate": 0.002, + "loss": 2.346, + "step": 130420 + }, + { + "epoch": 0.5042059037281007, + "grad_norm": 0.10503020137548447, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 130430 + }, + { + "epoch": 0.504244560931484, + "grad_norm": 0.10085074603557587, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 130440 + }, + { + "epoch": 0.5042832181348672, + "grad_norm": 0.10082501918077469, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 130450 + }, + { + "epoch": 0.5043218753382506, + "grad_norm": 0.10579691082239151, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 130460 + }, + { + "epoch": 0.5043605325416338, + "grad_norm": 0.12046706676483154, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 130470 + }, + { + "epoch": 0.5043991897450171, + "grad_norm": 0.10785841941833496, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 130480 + }, + { + "epoch": 0.5044378469484003, + "grad_norm": 0.12022893875837326, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 130490 + }, + { + "epoch": 0.5044765041517837, + "grad_norm": 0.10601069033145905, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 130500 + }, + { + "epoch": 0.5045151613551669, + "grad_norm": 0.10324139893054962, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 130510 + }, + { + "epoch": 0.5045538185585502, + "grad_norm": 0.0889090746641159, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 130520 + }, + { + "epoch": 0.5045924757619334, + "grad_norm": 0.10269230604171753, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 130530 + }, + { + "epoch": 0.5046311329653168, + "grad_norm": 0.10001306235790253, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 130540 + }, + { + "epoch": 0.5046697901687001, + "grad_norm": 0.11116419732570648, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 130550 + }, + { + "epoch": 0.5047084473720833, + "grad_norm": 0.1122574731707573, + "learning_rate": 0.002, + "loss": 2.339, + "step": 130560 + }, + { + "epoch": 0.5047471045754666, + "grad_norm": 0.106594018638134, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 130570 + }, + { + "epoch": 0.5047857617788498, + "grad_norm": 0.11066550761461258, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 130580 + }, + { + "epoch": 0.5048244189822332, + "grad_norm": 0.09080637246370316, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 130590 + }, + { + "epoch": 0.5048630761856164, + "grad_norm": 0.1192973256111145, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 130600 + }, + { + "epoch": 0.5049017333889997, + "grad_norm": 0.1105991080403328, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 130610 + }, + { + "epoch": 0.5049403905923829, + "grad_norm": 0.20441003143787384, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 130620 + }, + { + "epoch": 0.5049790477957663, + "grad_norm": 0.10766471177339554, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 130630 + }, + { + "epoch": 0.5050177049991496, + "grad_norm": 0.09288868308067322, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 130640 + }, + { + "epoch": 0.5050563622025328, + "grad_norm": 0.10671941936016083, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 130650 + }, + { + "epoch": 0.5050950194059161, + "grad_norm": 0.10453400015830994, + "learning_rate": 0.002, + "loss": 2.35, + "step": 130660 + }, + { + "epoch": 0.5051336766092994, + "grad_norm": 0.10739947855472565, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 130670 + }, + { + "epoch": 0.5051723338126827, + "grad_norm": 0.12816528975963593, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 130680 + }, + { + "epoch": 0.5052109910160659, + "grad_norm": 0.09332219511270523, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 130690 + }, + { + "epoch": 0.5052496482194492, + "grad_norm": 0.12734173238277435, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 130700 + }, + { + "epoch": 0.5052883054228325, + "grad_norm": 0.0906364843249321, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 130710 + }, + { + "epoch": 0.5053269626262158, + "grad_norm": 0.10444950312376022, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 130720 + }, + { + "epoch": 0.505365619829599, + "grad_norm": 0.10656848549842834, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 130730 + }, + { + "epoch": 0.5054042770329823, + "grad_norm": 0.09719150513410568, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 130740 + }, + { + "epoch": 0.5054429342363657, + "grad_norm": 0.10844217985868454, + "learning_rate": 0.002, + "loss": 2.356, + "step": 130750 + }, + { + "epoch": 0.5054815914397489, + "grad_norm": 0.12327215820550919, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 130760 + }, + { + "epoch": 0.5055202486431322, + "grad_norm": 0.10725454241037369, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 130770 + }, + { + "epoch": 0.5055589058465154, + "grad_norm": 0.10240764170885086, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 130780 + }, + { + "epoch": 0.5055975630498987, + "grad_norm": 0.09825988858938217, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 130790 + }, + { + "epoch": 0.505636220253282, + "grad_norm": 0.1060064509510994, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 130800 + }, + { + "epoch": 0.5056748774566653, + "grad_norm": 0.10984325408935547, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 130810 + }, + { + "epoch": 0.5057135346600485, + "grad_norm": 0.10869551450014114, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 130820 + }, + { + "epoch": 0.5057521918634318, + "grad_norm": 0.1109204962849617, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 130830 + }, + { + "epoch": 0.5057908490668152, + "grad_norm": 0.1029016375541687, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 130840 + }, + { + "epoch": 0.5058295062701984, + "grad_norm": 0.12351314723491669, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 130850 + }, + { + "epoch": 0.5058681634735817, + "grad_norm": 0.1079796850681305, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 130860 + }, + { + "epoch": 0.5059068206769649, + "grad_norm": 0.10102064162492752, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 130870 + }, + { + "epoch": 0.5059454778803483, + "grad_norm": 0.11627139151096344, + "learning_rate": 0.002, + "loss": 2.35, + "step": 130880 + }, + { + "epoch": 0.5059841350837315, + "grad_norm": 0.11690767109394073, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 130890 + }, + { + "epoch": 0.5060227922871148, + "grad_norm": 0.09732042998075485, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 130900 + }, + { + "epoch": 0.506061449490498, + "grad_norm": 0.12113215774297714, + "learning_rate": 0.002, + "loss": 2.343, + "step": 130910 + }, + { + "epoch": 0.5061001066938814, + "grad_norm": 0.09814827144145966, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 130920 + }, + { + "epoch": 0.5061387638972646, + "grad_norm": 0.10524283349514008, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 130930 + }, + { + "epoch": 0.5061774211006479, + "grad_norm": 0.13482894003391266, + "learning_rate": 0.002, + "loss": 2.356, + "step": 130940 + }, + { + "epoch": 0.5062160783040311, + "grad_norm": 0.08584418147802353, + "learning_rate": 0.002, + "loss": 2.357, + "step": 130950 + }, + { + "epoch": 0.5062547355074144, + "grad_norm": 0.1154203787446022, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 130960 + }, + { + "epoch": 0.5062933927107978, + "grad_norm": 0.09992490708827972, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 130970 + }, + { + "epoch": 0.506332049914181, + "grad_norm": 0.1054072305560112, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 130980 + }, + { + "epoch": 0.5063707071175643, + "grad_norm": 0.11291274428367615, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 130990 + }, + { + "epoch": 0.5064093643209475, + "grad_norm": 0.11428273469209671, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 131000 + }, + { + "epoch": 0.5064480215243309, + "grad_norm": 0.1516738086938858, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 131010 + }, + { + "epoch": 0.5064866787277141, + "grad_norm": 0.10782832652330399, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 131020 + }, + { + "epoch": 0.5065253359310974, + "grad_norm": 0.10205426067113876, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 131030 + }, + { + "epoch": 0.5065639931344806, + "grad_norm": 0.1182415708899498, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 131040 + }, + { + "epoch": 0.506602650337864, + "grad_norm": 0.11133012175559998, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 131050 + }, + { + "epoch": 0.5066413075412473, + "grad_norm": 0.09714315831661224, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 131060 + }, + { + "epoch": 0.5066799647446305, + "grad_norm": 0.11032336205244064, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 131070 + }, + { + "epoch": 0.5067186219480138, + "grad_norm": 0.09999191761016846, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 131080 + }, + { + "epoch": 0.5067572791513971, + "grad_norm": 0.09123971313238144, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 131090 + }, + { + "epoch": 0.5067959363547804, + "grad_norm": 0.11429370939731598, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 131100 + }, + { + "epoch": 0.5068345935581636, + "grad_norm": 0.10124003887176514, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 131110 + }, + { + "epoch": 0.5068732507615469, + "grad_norm": 0.09939180314540863, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 131120 + }, + { + "epoch": 0.5069119079649301, + "grad_norm": 0.10667199641466141, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 131130 + }, + { + "epoch": 0.5069505651683135, + "grad_norm": 0.10647431761026382, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 131140 + }, + { + "epoch": 0.5069892223716967, + "grad_norm": 0.10949549823999405, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 131150 + }, + { + "epoch": 0.50702787957508, + "grad_norm": 0.11368773132562637, + "learning_rate": 0.002, + "loss": 2.356, + "step": 131160 + }, + { + "epoch": 0.5070665367784633, + "grad_norm": 0.10527981072664261, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 131170 + }, + { + "epoch": 0.5071051939818466, + "grad_norm": 0.09757716208696365, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 131180 + }, + { + "epoch": 0.5071438511852299, + "grad_norm": 0.11202974617481232, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 131190 + }, + { + "epoch": 0.5071825083886131, + "grad_norm": 0.10303257405757904, + "learning_rate": 0.002, + "loss": 2.353, + "step": 131200 + }, + { + "epoch": 0.5072211655919964, + "grad_norm": 0.11027651280164719, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 131210 + }, + { + "epoch": 0.5072598227953797, + "grad_norm": 0.10823901742696762, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 131220 + }, + { + "epoch": 0.507298479998763, + "grad_norm": 0.1229693591594696, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 131230 + }, + { + "epoch": 0.5073371372021462, + "grad_norm": 0.1097574308514595, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 131240 + }, + { + "epoch": 0.5073757944055295, + "grad_norm": 0.11523716151714325, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 131250 + }, + { + "epoch": 0.5074144516089129, + "grad_norm": 0.10482656955718994, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 131260 + }, + { + "epoch": 0.5074531088122961, + "grad_norm": 0.10720503330230713, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 131270 + }, + { + "epoch": 0.5074917660156794, + "grad_norm": 0.10162218660116196, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 131280 + }, + { + "epoch": 0.5075304232190626, + "grad_norm": 0.1127084493637085, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 131290 + }, + { + "epoch": 0.507569080422446, + "grad_norm": 0.10835334658622742, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 131300 + }, + { + "epoch": 0.5076077376258292, + "grad_norm": 0.10543724149465561, + "learning_rate": 0.002, + "loss": 2.342, + "step": 131310 + }, + { + "epoch": 0.5076463948292125, + "grad_norm": 0.10903951525688171, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 131320 + }, + { + "epoch": 0.5076850520325957, + "grad_norm": 0.11438053846359253, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 131330 + }, + { + "epoch": 0.507723709235979, + "grad_norm": 0.11422231048345566, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 131340 + }, + { + "epoch": 0.5077623664393623, + "grad_norm": 0.11027810722589493, + "learning_rate": 0.002, + "loss": 2.349, + "step": 131350 + }, + { + "epoch": 0.5078010236427456, + "grad_norm": 0.104742631316185, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 131360 + }, + { + "epoch": 0.5078396808461288, + "grad_norm": 0.11140044778585434, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 131370 + }, + { + "epoch": 0.5078783380495121, + "grad_norm": 0.10416561365127563, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 131380 + }, + { + "epoch": 0.5079169952528955, + "grad_norm": 0.11030837148427963, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 131390 + }, + { + "epoch": 0.5079556524562787, + "grad_norm": 0.10846863687038422, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 131400 + }, + { + "epoch": 0.507994309659662, + "grad_norm": 0.11026975512504578, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 131410 + }, + { + "epoch": 0.5080329668630452, + "grad_norm": 0.09743861854076385, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 131420 + }, + { + "epoch": 0.5080716240664286, + "grad_norm": 0.13267552852630615, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 131430 + }, + { + "epoch": 0.5081102812698118, + "grad_norm": 0.7629449367523193, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 131440 + }, + { + "epoch": 0.5081489384731951, + "grad_norm": 0.10051924735307693, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 131450 + }, + { + "epoch": 0.5081875956765783, + "grad_norm": 0.11776295304298401, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 131460 + }, + { + "epoch": 0.5082262528799617, + "grad_norm": 0.11177230626344681, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 131470 + }, + { + "epoch": 0.508264910083345, + "grad_norm": 0.09945464134216309, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 131480 + }, + { + "epoch": 0.5083035672867282, + "grad_norm": 0.11460543423891068, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 131490 + }, + { + "epoch": 0.5083422244901115, + "grad_norm": 0.12200960516929626, + "learning_rate": 0.002, + "loss": 2.332, + "step": 131500 + }, + { + "epoch": 0.5083808816934947, + "grad_norm": 0.10961554944515228, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 131510 + }, + { + "epoch": 0.5084195388968781, + "grad_norm": 0.10165112465620041, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 131520 + }, + { + "epoch": 0.5084581961002613, + "grad_norm": 0.11187805235385895, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 131530 + }, + { + "epoch": 0.5084968533036446, + "grad_norm": 0.10341209173202515, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 131540 + }, + { + "epoch": 0.5085355105070278, + "grad_norm": 0.1048155277967453, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 131550 + }, + { + "epoch": 0.5085741677104112, + "grad_norm": 0.11941980570554733, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 131560 + }, + { + "epoch": 0.5086128249137944, + "grad_norm": 0.10180438309907913, + "learning_rate": 0.002, + "loss": 2.341, + "step": 131570 + }, + { + "epoch": 0.5086514821171777, + "grad_norm": 0.1211216077208519, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 131580 + }, + { + "epoch": 0.508690139320561, + "grad_norm": 0.10446929931640625, + "learning_rate": 0.002, + "loss": 2.357, + "step": 131590 + }, + { + "epoch": 0.5087287965239443, + "grad_norm": 0.11889198422431946, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 131600 + }, + { + "epoch": 0.5087674537273276, + "grad_norm": 0.1014224961400032, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 131610 + }, + { + "epoch": 0.5088061109307108, + "grad_norm": 0.10770139843225479, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 131620 + }, + { + "epoch": 0.5088447681340941, + "grad_norm": 0.11339019238948822, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 131630 + }, + { + "epoch": 0.5088834253374774, + "grad_norm": 0.10878494381904602, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 131640 + }, + { + "epoch": 0.5089220825408607, + "grad_norm": 0.10915534198284149, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 131650 + }, + { + "epoch": 0.5089607397442439, + "grad_norm": 0.1017685979604721, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 131660 + }, + { + "epoch": 0.5089993969476272, + "grad_norm": 0.11992502212524414, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 131670 + }, + { + "epoch": 0.5090380541510104, + "grad_norm": 0.1117842048406601, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 131680 + }, + { + "epoch": 0.5090767113543938, + "grad_norm": 0.10868057608604431, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 131690 + }, + { + "epoch": 0.5091153685577771, + "grad_norm": 0.10284089297056198, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 131700 + }, + { + "epoch": 0.5091540257611603, + "grad_norm": 0.11913391947746277, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 131710 + }, + { + "epoch": 0.5091926829645436, + "grad_norm": 0.10378926992416382, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 131720 + }, + { + "epoch": 0.5092313401679269, + "grad_norm": 0.10352809727191925, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 131730 + }, + { + "epoch": 0.5092699973713102, + "grad_norm": 0.10070081800222397, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 131740 + }, + { + "epoch": 0.5093086545746934, + "grad_norm": 0.10485085844993591, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 131750 + }, + { + "epoch": 0.5093473117780767, + "grad_norm": 0.10146090388298035, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 131760 + }, + { + "epoch": 0.50938596898146, + "grad_norm": 0.12098173797130585, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 131770 + }, + { + "epoch": 0.5094246261848433, + "grad_norm": 0.11403773725032806, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 131780 + }, + { + "epoch": 0.5094632833882266, + "grad_norm": 0.09431008994579315, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 131790 + }, + { + "epoch": 0.5095019405916098, + "grad_norm": 0.11636164784431458, + "learning_rate": 0.002, + "loss": 2.3759, + "step": 131800 + }, + { + "epoch": 0.5095405977949932, + "grad_norm": 0.09959069639444351, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 131810 + }, + { + "epoch": 0.5095792549983764, + "grad_norm": 0.11051137000322342, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 131820 + }, + { + "epoch": 0.5096179122017597, + "grad_norm": 0.10396791249513626, + "learning_rate": 0.002, + "loss": 2.349, + "step": 131830 + }, + { + "epoch": 0.5096565694051429, + "grad_norm": 0.10696316510438919, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 131840 + }, + { + "epoch": 0.5096952266085263, + "grad_norm": 0.09929095953702927, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 131850 + }, + { + "epoch": 0.5097338838119095, + "grad_norm": 0.11605452746152878, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 131860 + }, + { + "epoch": 0.5097725410152928, + "grad_norm": 0.11871679127216339, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 131870 + }, + { + "epoch": 0.509811198218676, + "grad_norm": 0.10420700907707214, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 131880 + }, + { + "epoch": 0.5098498554220593, + "grad_norm": 0.11169622838497162, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 131890 + }, + { + "epoch": 0.5098885126254427, + "grad_norm": 0.09876314550638199, + "learning_rate": 0.002, + "loss": 2.354, + "step": 131900 + }, + { + "epoch": 0.5099271698288259, + "grad_norm": 0.10073533654212952, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 131910 + }, + { + "epoch": 0.5099658270322092, + "grad_norm": 0.10538259148597717, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 131920 + }, + { + "epoch": 0.5100044842355924, + "grad_norm": 0.09020911902189255, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 131930 + }, + { + "epoch": 0.5100431414389758, + "grad_norm": 0.11380067467689514, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 131940 + }, + { + "epoch": 0.510081798642359, + "grad_norm": 0.1035328358411789, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 131950 + }, + { + "epoch": 0.5101204558457423, + "grad_norm": 0.1049763560295105, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 131960 + }, + { + "epoch": 0.5101591130491255, + "grad_norm": 0.11328870058059692, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 131970 + }, + { + "epoch": 0.5101977702525089, + "grad_norm": 0.11048241704702377, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 131980 + }, + { + "epoch": 0.5102364274558921, + "grad_norm": 0.10634758323431015, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 131990 + }, + { + "epoch": 0.5102750846592754, + "grad_norm": 0.11657426506280899, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 132000 + }, + { + "epoch": 0.5103137418626587, + "grad_norm": 0.1074022501707077, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 132010 + }, + { + "epoch": 0.510352399066042, + "grad_norm": 0.0970696434378624, + "learning_rate": 0.002, + "loss": 2.339, + "step": 132020 + }, + { + "epoch": 0.5103910562694253, + "grad_norm": 0.10929609090089798, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 132030 + }, + { + "epoch": 0.5104297134728085, + "grad_norm": 0.10729232430458069, + "learning_rate": 0.002, + "loss": 2.339, + "step": 132040 + }, + { + "epoch": 0.5104683706761918, + "grad_norm": 0.09950549900531769, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 132050 + }, + { + "epoch": 0.510507027879575, + "grad_norm": 0.10508369654417038, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 132060 + }, + { + "epoch": 0.5105456850829584, + "grad_norm": 0.12266229093074799, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 132070 + }, + { + "epoch": 0.5105843422863416, + "grad_norm": 0.10278153419494629, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 132080 + }, + { + "epoch": 0.5106229994897249, + "grad_norm": 0.10363687574863434, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 132090 + }, + { + "epoch": 0.5106616566931081, + "grad_norm": 0.11429458856582642, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 132100 + }, + { + "epoch": 0.5107003138964915, + "grad_norm": 0.0995219498872757, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 132110 + }, + { + "epoch": 0.5107389710998748, + "grad_norm": 0.11245020478963852, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 132120 + }, + { + "epoch": 0.510777628303258, + "grad_norm": 0.133084237575531, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 132130 + }, + { + "epoch": 0.5108162855066413, + "grad_norm": 0.11617525666952133, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 132140 + }, + { + "epoch": 0.5108549427100246, + "grad_norm": 0.09348950535058975, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 132150 + }, + { + "epoch": 0.5108935999134079, + "grad_norm": 0.1113290935754776, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 132160 + }, + { + "epoch": 0.5109322571167911, + "grad_norm": 0.10906348377466202, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 132170 + }, + { + "epoch": 0.5109709143201744, + "grad_norm": 0.12325325608253479, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 132180 + }, + { + "epoch": 0.5110095715235577, + "grad_norm": 0.1047777459025383, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 132190 + }, + { + "epoch": 0.511048228726941, + "grad_norm": 0.10495191812515259, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 132200 + }, + { + "epoch": 0.5110868859303243, + "grad_norm": 0.10061822086572647, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 132210 + }, + { + "epoch": 0.5111255431337075, + "grad_norm": 0.10761447250843048, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 132220 + }, + { + "epoch": 0.5111642003370909, + "grad_norm": 0.10504017025232315, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 132230 + }, + { + "epoch": 0.5112028575404741, + "grad_norm": 0.11160441488027573, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 132240 + }, + { + "epoch": 0.5112415147438574, + "grad_norm": 0.110514797270298, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 132250 + }, + { + "epoch": 0.5112801719472406, + "grad_norm": 0.11630698293447495, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 132260 + }, + { + "epoch": 0.5113188291506239, + "grad_norm": 0.11179690808057785, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 132270 + }, + { + "epoch": 0.5113574863540072, + "grad_norm": 0.10343910753726959, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 132280 + }, + { + "epoch": 0.5113961435573905, + "grad_norm": 0.10784193873405457, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 132290 + }, + { + "epoch": 0.5114348007607737, + "grad_norm": 0.10568580776453018, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 132300 + }, + { + "epoch": 0.511473457964157, + "grad_norm": 0.11240072548389435, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 132310 + }, + { + "epoch": 0.5115121151675404, + "grad_norm": 0.10685410350561142, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 132320 + }, + { + "epoch": 0.5115507723709236, + "grad_norm": 0.1029103696346283, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 132330 + }, + { + "epoch": 0.5115894295743069, + "grad_norm": 0.09647617489099503, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 132340 + }, + { + "epoch": 0.5116280867776901, + "grad_norm": 0.10524403303861618, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 132350 + }, + { + "epoch": 0.5116667439810735, + "grad_norm": 0.11678078025579453, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 132360 + }, + { + "epoch": 0.5117054011844567, + "grad_norm": 0.11135760694742203, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 132370 + }, + { + "epoch": 0.51174405838784, + "grad_norm": 0.10289976000785828, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 132380 + }, + { + "epoch": 0.5117827155912232, + "grad_norm": 0.0918273851275444, + "learning_rate": 0.002, + "loss": 2.341, + "step": 132390 + }, + { + "epoch": 0.5118213727946066, + "grad_norm": 0.11923960596323013, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 132400 + }, + { + "epoch": 0.5118600299979899, + "grad_norm": 0.1056162640452385, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 132410 + }, + { + "epoch": 0.5118986872013731, + "grad_norm": 0.10509263724088669, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 132420 + }, + { + "epoch": 0.5119373444047564, + "grad_norm": 0.09654008597135544, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 132430 + }, + { + "epoch": 0.5119760016081396, + "grad_norm": 0.09543860703706741, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 132440 + }, + { + "epoch": 0.512014658811523, + "grad_norm": 0.11143346130847931, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 132450 + }, + { + "epoch": 0.5120533160149062, + "grad_norm": 0.1058533564209938, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 132460 + }, + { + "epoch": 0.5120919732182895, + "grad_norm": 0.10159727931022644, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 132470 + }, + { + "epoch": 0.5121306304216727, + "grad_norm": 0.10234866291284561, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 132480 + }, + { + "epoch": 0.5121692876250561, + "grad_norm": 0.1149788424372673, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 132490 + }, + { + "epoch": 0.5122079448284393, + "grad_norm": 0.11671125143766403, + "learning_rate": 0.002, + "loss": 2.3703, + "step": 132500 + }, + { + "epoch": 0.5122466020318226, + "grad_norm": 0.10762688517570496, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 132510 + }, + { + "epoch": 0.5122852592352058, + "grad_norm": 0.10247467458248138, + "learning_rate": 0.002, + "loss": 2.342, + "step": 132520 + }, + { + "epoch": 0.5123239164385892, + "grad_norm": 0.12695589661598206, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 132530 + }, + { + "epoch": 0.5123625736419725, + "grad_norm": 0.10451260209083557, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 132540 + }, + { + "epoch": 0.5124012308453557, + "grad_norm": 0.10632877051830292, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 132550 + }, + { + "epoch": 0.512439888048739, + "grad_norm": 0.1000732034444809, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 132560 + }, + { + "epoch": 0.5124785452521223, + "grad_norm": 0.12949375808238983, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 132570 + }, + { + "epoch": 0.5125172024555056, + "grad_norm": 0.10160006582736969, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 132580 + }, + { + "epoch": 0.5125558596588888, + "grad_norm": 0.13972491025924683, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 132590 + }, + { + "epoch": 0.5125945168622721, + "grad_norm": 0.09255162626504898, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 132600 + }, + { + "epoch": 0.5126331740656553, + "grad_norm": 0.09891640394926071, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 132610 + }, + { + "epoch": 0.5126718312690387, + "grad_norm": 0.09517667442560196, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 132620 + }, + { + "epoch": 0.512710488472422, + "grad_norm": 0.10801737755537033, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 132630 + }, + { + "epoch": 0.5127491456758052, + "grad_norm": 0.10930386185646057, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 132640 + }, + { + "epoch": 0.5127878028791885, + "grad_norm": 0.11170261353254318, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 132650 + }, + { + "epoch": 0.5128264600825718, + "grad_norm": 0.09209370613098145, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 132660 + }, + { + "epoch": 0.5128651172859551, + "grad_norm": 0.10556315630674362, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 132670 + }, + { + "epoch": 0.5129037744893383, + "grad_norm": 0.11175452172756195, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 132680 + }, + { + "epoch": 0.5129424316927216, + "grad_norm": 0.11552779376506805, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 132690 + }, + { + "epoch": 0.5129810888961049, + "grad_norm": 0.10478057712316513, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 132700 + }, + { + "epoch": 0.5130197460994882, + "grad_norm": 0.09821119159460068, + "learning_rate": 0.002, + "loss": 2.335, + "step": 132710 + }, + { + "epoch": 0.5130584033028714, + "grad_norm": 0.12864336371421814, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 132720 + }, + { + "epoch": 0.5130970605062547, + "grad_norm": 0.12022227793931961, + "learning_rate": 0.002, + "loss": 2.348, + "step": 132730 + }, + { + "epoch": 0.5131357177096381, + "grad_norm": 0.09698469191789627, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 132740 + }, + { + "epoch": 0.5131743749130213, + "grad_norm": 0.10591412335634232, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 132750 + }, + { + "epoch": 0.5132130321164046, + "grad_norm": 0.10472962260246277, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 132760 + }, + { + "epoch": 0.5132516893197878, + "grad_norm": 0.1055840253829956, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 132770 + }, + { + "epoch": 0.5132903465231712, + "grad_norm": 0.10502316057682037, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 132780 + }, + { + "epoch": 0.5133290037265544, + "grad_norm": 0.10915154218673706, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 132790 + }, + { + "epoch": 0.5133676609299377, + "grad_norm": 0.09325850754976273, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 132800 + }, + { + "epoch": 0.5134063181333209, + "grad_norm": 0.11692541092634201, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 132810 + }, + { + "epoch": 0.5134449753367042, + "grad_norm": 0.11866552382707596, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 132820 + }, + { + "epoch": 0.5134836325400876, + "grad_norm": 0.11210706830024719, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 132830 + }, + { + "epoch": 0.5135222897434708, + "grad_norm": 0.09434731304645538, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 132840 + }, + { + "epoch": 0.5135609469468541, + "grad_norm": 0.1452234834432602, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 132850 + }, + { + "epoch": 0.5135996041502373, + "grad_norm": 0.11293261498212814, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 132860 + }, + { + "epoch": 0.5136382613536207, + "grad_norm": 0.10713877528905869, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 132870 + }, + { + "epoch": 0.5136769185570039, + "grad_norm": 0.14340457320213318, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 132880 + }, + { + "epoch": 0.5137155757603872, + "grad_norm": 0.12019462138414383, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 132890 + }, + { + "epoch": 0.5137542329637704, + "grad_norm": 0.10262832045555115, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 132900 + }, + { + "epoch": 0.5137928901671538, + "grad_norm": 0.10447874665260315, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 132910 + }, + { + "epoch": 0.513831547370537, + "grad_norm": 0.09929387271404266, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 132920 + }, + { + "epoch": 0.5138702045739203, + "grad_norm": 0.09785928577184677, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 132930 + }, + { + "epoch": 0.5139088617773035, + "grad_norm": 0.11398641020059586, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 132940 + }, + { + "epoch": 0.5139475189806869, + "grad_norm": 0.13142992556095123, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 132950 + }, + { + "epoch": 0.5139861761840702, + "grad_norm": 0.10426430404186249, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 132960 + }, + { + "epoch": 0.5140248333874534, + "grad_norm": 0.0965687558054924, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 132970 + }, + { + "epoch": 0.5140634905908367, + "grad_norm": 0.10306452959775925, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 132980 + }, + { + "epoch": 0.5141021477942199, + "grad_norm": 0.10615862160921097, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 132990 + }, + { + "epoch": 0.5141408049976033, + "grad_norm": 0.10264278203248978, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 133000 + }, + { + "epoch": 0.5141794622009865, + "grad_norm": 0.09326222538948059, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 133010 + }, + { + "epoch": 0.5142181194043698, + "grad_norm": 0.09489531815052032, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 133020 + }, + { + "epoch": 0.514256776607753, + "grad_norm": 0.12088096141815186, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 133030 + }, + { + "epoch": 0.5142954338111364, + "grad_norm": 0.10753527283668518, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 133040 + }, + { + "epoch": 0.5143340910145197, + "grad_norm": 0.10076984018087387, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 133050 + }, + { + "epoch": 0.5143727482179029, + "grad_norm": 0.12687014043331146, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 133060 + }, + { + "epoch": 0.5144114054212862, + "grad_norm": 0.1204477995634079, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 133070 + }, + { + "epoch": 0.5144500626246695, + "grad_norm": 0.12178194522857666, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 133080 + }, + { + "epoch": 0.5144887198280528, + "grad_norm": 0.10629022866487503, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 133090 + }, + { + "epoch": 0.514527377031436, + "grad_norm": 0.11349408328533173, + "learning_rate": 0.002, + "loss": 2.339, + "step": 133100 + }, + { + "epoch": 0.5145660342348193, + "grad_norm": 0.12075722217559814, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 133110 + }, + { + "epoch": 0.5146046914382026, + "grad_norm": 0.10138367116451263, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 133120 + }, + { + "epoch": 0.5146433486415859, + "grad_norm": 0.10058567672967911, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 133130 + }, + { + "epoch": 0.5146820058449691, + "grad_norm": 0.10954679548740387, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 133140 + }, + { + "epoch": 0.5147206630483524, + "grad_norm": 0.10420696437358856, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 133150 + }, + { + "epoch": 0.5147593202517358, + "grad_norm": 0.10656526684761047, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 133160 + }, + { + "epoch": 0.514797977455119, + "grad_norm": 0.10731308162212372, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 133170 + }, + { + "epoch": 0.5148366346585023, + "grad_norm": 0.1107126921415329, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 133180 + }, + { + "epoch": 0.5148752918618855, + "grad_norm": 0.10922086238861084, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 133190 + }, + { + "epoch": 0.5149139490652688, + "grad_norm": 0.10685280710458755, + "learning_rate": 0.002, + "loss": 2.344, + "step": 133200 + }, + { + "epoch": 0.5149526062686521, + "grad_norm": 0.10599326342344284, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 133210 + }, + { + "epoch": 0.5149912634720354, + "grad_norm": 0.09880273789167404, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 133220 + }, + { + "epoch": 0.5150299206754186, + "grad_norm": 0.11078929156064987, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 133230 + }, + { + "epoch": 0.5150685778788019, + "grad_norm": 0.11526217311620712, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 133240 + }, + { + "epoch": 0.5151072350821853, + "grad_norm": 0.11422278732061386, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 133250 + }, + { + "epoch": 0.5151458922855685, + "grad_norm": 0.10045097023248672, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 133260 + }, + { + "epoch": 0.5151845494889518, + "grad_norm": 0.10135480016469955, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 133270 + }, + { + "epoch": 0.515223206692335, + "grad_norm": 0.09497664123773575, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 133280 + }, + { + "epoch": 0.5152618638957184, + "grad_norm": 0.10189198702573776, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 133290 + }, + { + "epoch": 0.5153005210991016, + "grad_norm": 0.112625353038311, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 133300 + }, + { + "epoch": 0.5153391783024849, + "grad_norm": 0.12322766333818436, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 133310 + }, + { + "epoch": 0.5153778355058681, + "grad_norm": 0.10485317558050156, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 133320 + }, + { + "epoch": 0.5154164927092515, + "grad_norm": 0.11122911423444748, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 133330 + }, + { + "epoch": 0.5154551499126347, + "grad_norm": 0.1240696832537651, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 133340 + }, + { + "epoch": 0.515493807116018, + "grad_norm": 0.09978976100683212, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 133350 + }, + { + "epoch": 0.5155324643194013, + "grad_norm": 0.11819953471422195, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 133360 + }, + { + "epoch": 0.5155711215227845, + "grad_norm": 0.10220865160226822, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 133370 + }, + { + "epoch": 0.5156097787261679, + "grad_norm": 0.11670244485139847, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 133380 + }, + { + "epoch": 0.5156484359295511, + "grad_norm": 0.12394854426383972, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 133390 + }, + { + "epoch": 0.5156870931329344, + "grad_norm": 0.12229776382446289, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 133400 + }, + { + "epoch": 0.5157257503363176, + "grad_norm": 0.1157093346118927, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 133410 + }, + { + "epoch": 0.515764407539701, + "grad_norm": 0.11021224409341812, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 133420 + }, + { + "epoch": 0.5158030647430842, + "grad_norm": 0.11094634234905243, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 133430 + }, + { + "epoch": 0.5158417219464675, + "grad_norm": 0.10788990557193756, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 133440 + }, + { + "epoch": 0.5158803791498507, + "grad_norm": 0.10132350027561188, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 133450 + }, + { + "epoch": 0.5159190363532341, + "grad_norm": 0.11580069363117218, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 133460 + }, + { + "epoch": 0.5159576935566174, + "grad_norm": 0.12661530077457428, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 133470 + }, + { + "epoch": 0.5159963507600006, + "grad_norm": 0.1014246866106987, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 133480 + }, + { + "epoch": 0.5160350079633839, + "grad_norm": 0.10210530459880829, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 133490 + }, + { + "epoch": 0.5160736651667672, + "grad_norm": 0.11067615449428558, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 133500 + }, + { + "epoch": 0.5161123223701505, + "grad_norm": 0.17499856650829315, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 133510 + }, + { + "epoch": 0.5161509795735337, + "grad_norm": 0.14115779101848602, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 133520 + }, + { + "epoch": 0.516189636776917, + "grad_norm": 0.11126365512609482, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 133530 + }, + { + "epoch": 0.5162282939803002, + "grad_norm": 0.09723678976297379, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 133540 + }, + { + "epoch": 0.5162669511836836, + "grad_norm": 0.09144878387451172, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 133550 + }, + { + "epoch": 0.5163056083870669, + "grad_norm": 0.1076786145567894, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 133560 + }, + { + "epoch": 0.5163442655904501, + "grad_norm": 0.110333651304245, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 133570 + }, + { + "epoch": 0.5163829227938334, + "grad_norm": 0.10729396343231201, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 133580 + }, + { + "epoch": 0.5164215799972167, + "grad_norm": 0.1079677864909172, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 133590 + }, + { + "epoch": 0.5164602372006, + "grad_norm": 0.11192754656076431, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 133600 + }, + { + "epoch": 0.5164988944039832, + "grad_norm": 0.11061155796051025, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 133610 + }, + { + "epoch": 0.5165375516073665, + "grad_norm": 0.37179362773895264, + "learning_rate": 0.002, + "loss": 2.339, + "step": 133620 + }, + { + "epoch": 0.5165762088107498, + "grad_norm": 0.11642017960548401, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 133630 + }, + { + "epoch": 0.5166148660141331, + "grad_norm": 0.10036401450634003, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 133640 + }, + { + "epoch": 0.5166535232175163, + "grad_norm": 0.10902401804924011, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 133650 + }, + { + "epoch": 0.5166921804208996, + "grad_norm": 0.09906166046857834, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 133660 + }, + { + "epoch": 0.516730837624283, + "grad_norm": 0.0989346131682396, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 133670 + }, + { + "epoch": 0.5167694948276662, + "grad_norm": 0.1261736899614334, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 133680 + }, + { + "epoch": 0.5168081520310495, + "grad_norm": 0.10672534257173538, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 133690 + }, + { + "epoch": 0.5168468092344327, + "grad_norm": 0.10923762619495392, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 133700 + }, + { + "epoch": 0.5168854664378161, + "grad_norm": 0.10447768867015839, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 133710 + }, + { + "epoch": 0.5169241236411993, + "grad_norm": 0.13156598806381226, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 133720 + }, + { + "epoch": 0.5169627808445826, + "grad_norm": 0.11636074632406235, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 133730 + }, + { + "epoch": 0.5170014380479658, + "grad_norm": 0.093384750187397, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 133740 + }, + { + "epoch": 0.5170400952513491, + "grad_norm": 0.10545427352190018, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 133750 + }, + { + "epoch": 0.5170787524547324, + "grad_norm": 0.1253446340560913, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 133760 + }, + { + "epoch": 0.5171174096581157, + "grad_norm": 0.1060023084282875, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 133770 + }, + { + "epoch": 0.517156066861499, + "grad_norm": 0.11571837961673737, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 133780 + }, + { + "epoch": 0.5171947240648822, + "grad_norm": 0.13900338113307953, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 133790 + }, + { + "epoch": 0.5172333812682656, + "grad_norm": 0.10236191004514694, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 133800 + }, + { + "epoch": 0.5172720384716488, + "grad_norm": 0.10067922621965408, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 133810 + }, + { + "epoch": 0.5173106956750321, + "grad_norm": 0.10600189864635468, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 133820 + }, + { + "epoch": 0.5173493528784153, + "grad_norm": 0.12919996678829193, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 133830 + }, + { + "epoch": 0.5173880100817987, + "grad_norm": 0.1171584352850914, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 133840 + }, + { + "epoch": 0.5174266672851819, + "grad_norm": 0.12879115343093872, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 133850 + }, + { + "epoch": 0.5174653244885652, + "grad_norm": 0.09737744927406311, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 133860 + }, + { + "epoch": 0.5175039816919484, + "grad_norm": 0.1005203127861023, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 133870 + }, + { + "epoch": 0.5175426388953318, + "grad_norm": 0.10229729115962982, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 133880 + }, + { + "epoch": 0.5175812960987151, + "grad_norm": 0.09854832291603088, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 133890 + }, + { + "epoch": 0.5176199533020983, + "grad_norm": 0.11200766265392303, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 133900 + }, + { + "epoch": 0.5176586105054816, + "grad_norm": 0.10389938205480576, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 133910 + }, + { + "epoch": 0.5176972677088648, + "grad_norm": 0.12249651551246643, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 133920 + }, + { + "epoch": 0.5177359249122482, + "grad_norm": 0.09284602850675583, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 133930 + }, + { + "epoch": 0.5177745821156314, + "grad_norm": 0.12067190557718277, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 133940 + }, + { + "epoch": 0.5178132393190147, + "grad_norm": 0.09926807135343552, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 133950 + }, + { + "epoch": 0.5178518965223979, + "grad_norm": 0.13991133868694305, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 133960 + }, + { + "epoch": 0.5178905537257813, + "grad_norm": 0.11230497807264328, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 133970 + }, + { + "epoch": 0.5179292109291646, + "grad_norm": 0.10470463335514069, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 133980 + }, + { + "epoch": 0.5179678681325478, + "grad_norm": 0.116326704621315, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 133990 + }, + { + "epoch": 0.5180065253359311, + "grad_norm": 0.10669012367725372, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 134000 + }, + { + "epoch": 0.5180451825393144, + "grad_norm": 0.10244178026914597, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 134010 + }, + { + "epoch": 0.5180838397426977, + "grad_norm": 0.1011168584227562, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 134020 + }, + { + "epoch": 0.5181224969460809, + "grad_norm": 0.10668822377920151, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 134030 + }, + { + "epoch": 0.5181611541494642, + "grad_norm": 0.108344167470932, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 134040 + }, + { + "epoch": 0.5181998113528475, + "grad_norm": 0.1032029315829277, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 134050 + }, + { + "epoch": 0.5182384685562308, + "grad_norm": 0.10704102367162704, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 134060 + }, + { + "epoch": 0.518277125759614, + "grad_norm": 0.12949079275131226, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 134070 + }, + { + "epoch": 0.5183157829629973, + "grad_norm": 0.10617806762456894, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 134080 + }, + { + "epoch": 0.5183544401663805, + "grad_norm": 0.09986576437950134, + "learning_rate": 0.002, + "loss": 2.335, + "step": 134090 + }, + { + "epoch": 0.5183930973697639, + "grad_norm": 0.15230610966682434, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 134100 + }, + { + "epoch": 0.5184317545731472, + "grad_norm": 0.10359156876802444, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 134110 + }, + { + "epoch": 0.5184704117765304, + "grad_norm": 0.1316675990819931, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 134120 + }, + { + "epoch": 0.5185090689799137, + "grad_norm": 0.10203003883361816, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 134130 + }, + { + "epoch": 0.518547726183297, + "grad_norm": 0.11022720485925674, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 134140 + }, + { + "epoch": 0.5185863833866803, + "grad_norm": 0.1270035207271576, + "learning_rate": 0.002, + "loss": 2.359, + "step": 134150 + }, + { + "epoch": 0.5186250405900635, + "grad_norm": 0.10473570972681046, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 134160 + }, + { + "epoch": 0.5186636977934468, + "grad_norm": 0.0970320850610733, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 134170 + }, + { + "epoch": 0.5187023549968302, + "grad_norm": 0.09618403017520905, + "learning_rate": 0.002, + "loss": 2.3655, + "step": 134180 + }, + { + "epoch": 0.5187410122002134, + "grad_norm": 0.11617468297481537, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 134190 + }, + { + "epoch": 0.5187796694035967, + "grad_norm": 0.1210232824087143, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 134200 + }, + { + "epoch": 0.5188183266069799, + "grad_norm": 0.12471597641706467, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 134210 + }, + { + "epoch": 0.5188569838103633, + "grad_norm": 0.09887123852968216, + "learning_rate": 0.002, + "loss": 2.354, + "step": 134220 + }, + { + "epoch": 0.5188956410137465, + "grad_norm": 0.11552924662828445, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 134230 + }, + { + "epoch": 0.5189342982171298, + "grad_norm": 0.11775676161050797, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 134240 + }, + { + "epoch": 0.518972955420513, + "grad_norm": 0.1122833788394928, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 134250 + }, + { + "epoch": 0.5190116126238964, + "grad_norm": 0.11856210976839066, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 134260 + }, + { + "epoch": 0.5190502698272796, + "grad_norm": 0.09300824254751205, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 134270 + }, + { + "epoch": 0.5190889270306629, + "grad_norm": 0.11704012751579285, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 134280 + }, + { + "epoch": 0.5191275842340461, + "grad_norm": 0.10149552673101425, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 134290 + }, + { + "epoch": 0.5191662414374294, + "grad_norm": 0.11498774588108063, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 134300 + }, + { + "epoch": 0.5192048986408128, + "grad_norm": 0.08841398358345032, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 134310 + }, + { + "epoch": 0.519243555844196, + "grad_norm": 0.11097606271505356, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 134320 + }, + { + "epoch": 0.5192822130475793, + "grad_norm": 0.11205650866031647, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 134330 + }, + { + "epoch": 0.5193208702509625, + "grad_norm": 0.10886071622371674, + "learning_rate": 0.002, + "loss": 2.329, + "step": 134340 + }, + { + "epoch": 0.5193595274543459, + "grad_norm": 0.09254167973995209, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 134350 + }, + { + "epoch": 0.5193981846577291, + "grad_norm": 0.10301493108272552, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 134360 + }, + { + "epoch": 0.5194368418611124, + "grad_norm": 0.11026881635189056, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 134370 + }, + { + "epoch": 0.5194754990644956, + "grad_norm": 0.11774619668722153, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 134380 + }, + { + "epoch": 0.519514156267879, + "grad_norm": 0.10522367805242538, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 134390 + }, + { + "epoch": 0.5195528134712623, + "grad_norm": 0.10401474684476852, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 134400 + }, + { + "epoch": 0.5195914706746455, + "grad_norm": 0.1115586906671524, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 134410 + }, + { + "epoch": 0.5196301278780288, + "grad_norm": 0.10123543441295624, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 134420 + }, + { + "epoch": 0.5196687850814121, + "grad_norm": 0.10290665924549103, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 134430 + }, + { + "epoch": 0.5197074422847954, + "grad_norm": 0.11987538635730743, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 134440 + }, + { + "epoch": 0.5197460994881786, + "grad_norm": 0.10227857530117035, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 134450 + }, + { + "epoch": 0.5197847566915619, + "grad_norm": 0.11408598721027374, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 134460 + }, + { + "epoch": 0.5198234138949451, + "grad_norm": 0.12978363037109375, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 134470 + }, + { + "epoch": 0.5198620710983285, + "grad_norm": 0.09933390468358994, + "learning_rate": 0.002, + "loss": 2.336, + "step": 134480 + }, + { + "epoch": 0.5199007283017117, + "grad_norm": 0.11041887104511261, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 134490 + }, + { + "epoch": 0.519939385505095, + "grad_norm": 0.10262308269739151, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 134500 + }, + { + "epoch": 0.5199780427084782, + "grad_norm": 0.10440009087324142, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 134510 + }, + { + "epoch": 0.5200166999118616, + "grad_norm": 0.104102224111557, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 134520 + }, + { + "epoch": 0.5200553571152449, + "grad_norm": 0.10482411086559296, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 134530 + }, + { + "epoch": 0.5200940143186281, + "grad_norm": 0.11475711315870285, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 134540 + }, + { + "epoch": 0.5201326715220114, + "grad_norm": 0.1048799678683281, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 134550 + }, + { + "epoch": 0.5201713287253947, + "grad_norm": 0.1025136187672615, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 134560 + }, + { + "epoch": 0.520209985928778, + "grad_norm": 0.09366423636674881, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 134570 + }, + { + "epoch": 0.5202486431321612, + "grad_norm": 0.13303522765636444, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 134580 + }, + { + "epoch": 0.5202873003355445, + "grad_norm": 0.45014289021492004, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 134590 + }, + { + "epoch": 0.5203259575389279, + "grad_norm": 0.11356982588768005, + "learning_rate": 0.002, + "loss": 2.331, + "step": 134600 + }, + { + "epoch": 0.5203646147423111, + "grad_norm": 0.10263145714998245, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 134610 + }, + { + "epoch": 0.5204032719456944, + "grad_norm": 0.1106531098484993, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 134620 + }, + { + "epoch": 0.5204419291490776, + "grad_norm": 0.12091255933046341, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 134630 + }, + { + "epoch": 0.520480586352461, + "grad_norm": 0.11186621338129044, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 134640 + }, + { + "epoch": 0.5205192435558442, + "grad_norm": 0.09535179287195206, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 134650 + }, + { + "epoch": 0.5205579007592275, + "grad_norm": 0.10127545893192291, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 134660 + }, + { + "epoch": 0.5205965579626107, + "grad_norm": 0.14372475445270538, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 134670 + }, + { + "epoch": 0.520635215165994, + "grad_norm": 0.10775496810674667, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 134680 + }, + { + "epoch": 0.5206738723693773, + "grad_norm": 0.10723396390676498, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 134690 + }, + { + "epoch": 0.5207125295727606, + "grad_norm": 0.14177311956882477, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 134700 + }, + { + "epoch": 0.5207511867761438, + "grad_norm": 0.10543669015169144, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 134710 + }, + { + "epoch": 0.5207898439795271, + "grad_norm": 0.12044474482536316, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 134720 + }, + { + "epoch": 0.5208285011829105, + "grad_norm": 0.10286377370357513, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 134730 + }, + { + "epoch": 0.5208671583862937, + "grad_norm": 0.10837053507566452, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 134740 + }, + { + "epoch": 0.520905815589677, + "grad_norm": 0.10814402252435684, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 134750 + }, + { + "epoch": 0.5209444727930602, + "grad_norm": 0.15393859148025513, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 134760 + }, + { + "epoch": 0.5209831299964436, + "grad_norm": 0.11631506681442261, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 134770 + }, + { + "epoch": 0.5210217871998268, + "grad_norm": 0.10414082556962967, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 134780 + }, + { + "epoch": 0.5210604444032101, + "grad_norm": 0.11460426449775696, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 134790 + }, + { + "epoch": 0.5210991016065933, + "grad_norm": 0.11126938462257385, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 134800 + }, + { + "epoch": 0.5211377588099767, + "grad_norm": 0.10752900689840317, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 134810 + }, + { + "epoch": 0.52117641601336, + "grad_norm": 0.10768181830644608, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 134820 + }, + { + "epoch": 0.5212150732167432, + "grad_norm": 0.12294381111860275, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 134830 + }, + { + "epoch": 0.5212537304201265, + "grad_norm": 0.10825446993112564, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 134840 + }, + { + "epoch": 0.5212923876235097, + "grad_norm": 0.11277685314416885, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 134850 + }, + { + "epoch": 0.5213310448268931, + "grad_norm": 0.10696902871131897, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 134860 + }, + { + "epoch": 0.5213697020302763, + "grad_norm": 0.11747557669878006, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 134870 + }, + { + "epoch": 0.5214083592336596, + "grad_norm": 0.10795747488737106, + "learning_rate": 0.002, + "loss": 2.354, + "step": 134880 + }, + { + "epoch": 0.5214470164370428, + "grad_norm": 0.09496504813432693, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 134890 + }, + { + "epoch": 0.5214856736404262, + "grad_norm": 0.09993629902601242, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 134900 + }, + { + "epoch": 0.5215243308438094, + "grad_norm": 0.10213123261928558, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 134910 + }, + { + "epoch": 0.5215629880471927, + "grad_norm": 0.08959214389324188, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 134920 + }, + { + "epoch": 0.521601645250576, + "grad_norm": 0.09359659999608994, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 134930 + }, + { + "epoch": 0.5216403024539593, + "grad_norm": 0.10688213258981705, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 134940 + }, + { + "epoch": 0.5216789596573426, + "grad_norm": 0.11640635877847672, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 134950 + }, + { + "epoch": 0.5217176168607258, + "grad_norm": 0.09887672960758209, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 134960 + }, + { + "epoch": 0.5217562740641091, + "grad_norm": 0.10856257379055023, + "learning_rate": 0.002, + "loss": 2.328, + "step": 134970 + }, + { + "epoch": 0.5217949312674924, + "grad_norm": 0.11670080572366714, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 134980 + }, + { + "epoch": 0.5218335884708757, + "grad_norm": 0.10856100916862488, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 134990 + }, + { + "epoch": 0.5218722456742589, + "grad_norm": 0.09527178853750229, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 135000 + }, + { + "epoch": 0.5219109028776422, + "grad_norm": 0.11452498286962509, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 135010 + }, + { + "epoch": 0.5219495600810254, + "grad_norm": 0.1107560321688652, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 135020 + }, + { + "epoch": 0.5219882172844088, + "grad_norm": 0.11319515854120255, + "learning_rate": 0.002, + "loss": 2.342, + "step": 135030 + }, + { + "epoch": 0.5220268744877921, + "grad_norm": 0.09755564481019974, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 135040 + }, + { + "epoch": 0.5220655316911753, + "grad_norm": 0.11580151319503784, + "learning_rate": 0.002, + "loss": 2.3785, + "step": 135050 + }, + { + "epoch": 0.5221041888945586, + "grad_norm": 0.10912448912858963, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 135060 + }, + { + "epoch": 0.5221428460979419, + "grad_norm": 0.10042090713977814, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 135070 + }, + { + "epoch": 0.5221815033013252, + "grad_norm": 0.09757044166326523, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 135080 + }, + { + "epoch": 0.5222201605047084, + "grad_norm": 0.09776173532009125, + "learning_rate": 0.002, + "loss": 2.367, + "step": 135090 + }, + { + "epoch": 0.5222588177080917, + "grad_norm": 0.11984714865684509, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 135100 + }, + { + "epoch": 0.522297474911475, + "grad_norm": 0.10493864119052887, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 135110 + }, + { + "epoch": 0.5223361321148583, + "grad_norm": 0.1006794199347496, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 135120 + }, + { + "epoch": 0.5223747893182416, + "grad_norm": 0.12291432172060013, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 135130 + }, + { + "epoch": 0.5224134465216248, + "grad_norm": 0.11559300869703293, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 135140 + }, + { + "epoch": 0.5224521037250082, + "grad_norm": 0.10269319266080856, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 135150 + }, + { + "epoch": 0.5224907609283914, + "grad_norm": 0.11249280720949173, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 135160 + }, + { + "epoch": 0.5225294181317747, + "grad_norm": 0.10752243548631668, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 135170 + }, + { + "epoch": 0.5225680753351579, + "grad_norm": 0.10211756080389023, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 135180 + }, + { + "epoch": 0.5226067325385413, + "grad_norm": 0.12007226049900055, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 135190 + }, + { + "epoch": 0.5226453897419245, + "grad_norm": 0.10831714421510696, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 135200 + }, + { + "epoch": 0.5226840469453078, + "grad_norm": 0.09663467854261398, + "learning_rate": 0.002, + "loss": 2.338, + "step": 135210 + }, + { + "epoch": 0.522722704148691, + "grad_norm": 0.11312247067689896, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 135220 + }, + { + "epoch": 0.5227613613520743, + "grad_norm": 0.10021994262933731, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 135230 + }, + { + "epoch": 0.5228000185554577, + "grad_norm": 0.12776386737823486, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 135240 + }, + { + "epoch": 0.5228386757588409, + "grad_norm": 0.11067812889814377, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 135250 + }, + { + "epoch": 0.5228773329622242, + "grad_norm": 0.1040908619761467, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 135260 + }, + { + "epoch": 0.5229159901656074, + "grad_norm": 0.11826976388692856, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 135270 + }, + { + "epoch": 0.5229546473689908, + "grad_norm": 0.10310089588165283, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 135280 + }, + { + "epoch": 0.522993304572374, + "grad_norm": 0.13205021619796753, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 135290 + }, + { + "epoch": 0.5230319617757573, + "grad_norm": 0.0964132621884346, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 135300 + }, + { + "epoch": 0.5230706189791405, + "grad_norm": 0.10004474967718124, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 135310 + }, + { + "epoch": 0.5231092761825239, + "grad_norm": 0.1004033163189888, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 135320 + }, + { + "epoch": 0.5231479333859071, + "grad_norm": 0.09789041429758072, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 135330 + }, + { + "epoch": 0.5231865905892904, + "grad_norm": 0.13248370587825775, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 135340 + }, + { + "epoch": 0.5232252477926737, + "grad_norm": 0.10540876537561417, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 135350 + }, + { + "epoch": 0.523263904996057, + "grad_norm": 0.09602527320384979, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 135360 + }, + { + "epoch": 0.5233025621994403, + "grad_norm": 0.10474893450737, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 135370 + }, + { + "epoch": 0.5233412194028235, + "grad_norm": 0.08683910965919495, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 135380 + }, + { + "epoch": 0.5233798766062068, + "grad_norm": 0.14607366919517517, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 135390 + }, + { + "epoch": 0.52341853380959, + "grad_norm": 0.10965852439403534, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 135400 + }, + { + "epoch": 0.5234571910129734, + "grad_norm": 0.15924514830112457, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 135410 + }, + { + "epoch": 0.5234958482163566, + "grad_norm": 0.11714666336774826, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 135420 + }, + { + "epoch": 0.5235345054197399, + "grad_norm": 0.09994805604219437, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 135430 + }, + { + "epoch": 0.5235731626231231, + "grad_norm": 0.09854995459318161, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 135440 + }, + { + "epoch": 0.5236118198265065, + "grad_norm": 0.11116635799407959, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 135450 + }, + { + "epoch": 0.5236504770298898, + "grad_norm": 0.10123293846845627, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 135460 + }, + { + "epoch": 0.523689134233273, + "grad_norm": 0.10389833152294159, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 135470 + }, + { + "epoch": 0.5237277914366563, + "grad_norm": 0.11927933990955353, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 135480 + }, + { + "epoch": 0.5237664486400396, + "grad_norm": 0.11789486557245255, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 135490 + }, + { + "epoch": 0.5238051058434229, + "grad_norm": 0.08775586634874344, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 135500 + }, + { + "epoch": 0.5238437630468061, + "grad_norm": 0.10228339582681656, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 135510 + }, + { + "epoch": 0.5238824202501894, + "grad_norm": 0.10572341084480286, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 135520 + }, + { + "epoch": 0.5239210774535727, + "grad_norm": 0.10267415642738342, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 135530 + }, + { + "epoch": 0.523959734656956, + "grad_norm": 0.10416428744792938, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 135540 + }, + { + "epoch": 0.5239983918603393, + "grad_norm": 0.1263497918844223, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 135550 + }, + { + "epoch": 0.5240370490637225, + "grad_norm": 0.1601221263408661, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 135560 + }, + { + "epoch": 0.5240757062671059, + "grad_norm": 0.11268121749162674, + "learning_rate": 0.002, + "loss": 2.345, + "step": 135570 + }, + { + "epoch": 0.5241143634704891, + "grad_norm": 0.1266414374113083, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 135580 + }, + { + "epoch": 0.5241530206738724, + "grad_norm": 0.119015634059906, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 135590 + }, + { + "epoch": 0.5241916778772556, + "grad_norm": 0.10737226158380508, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 135600 + }, + { + "epoch": 0.5242303350806389, + "grad_norm": 0.09539292752742767, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 135610 + }, + { + "epoch": 0.5242689922840222, + "grad_norm": 0.09416766464710236, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 135620 + }, + { + "epoch": 0.5243076494874055, + "grad_norm": 0.0924883484840393, + "learning_rate": 0.002, + "loss": 2.346, + "step": 135630 + }, + { + "epoch": 0.5243463066907887, + "grad_norm": 0.11176314949989319, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 135640 + }, + { + "epoch": 0.524384963894172, + "grad_norm": 0.13459579646587372, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 135650 + }, + { + "epoch": 0.5244236210975554, + "grad_norm": 0.11950015276670456, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 135660 + }, + { + "epoch": 0.5244622783009386, + "grad_norm": 0.12850578129291534, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 135670 + }, + { + "epoch": 0.5245009355043219, + "grad_norm": 0.10969886183738708, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 135680 + }, + { + "epoch": 0.5245395927077051, + "grad_norm": 0.12006625533103943, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 135690 + }, + { + "epoch": 0.5245782499110885, + "grad_norm": 0.11588145047426224, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 135700 + }, + { + "epoch": 0.5246169071144717, + "grad_norm": 0.12310832738876343, + "learning_rate": 0.002, + "loss": 2.347, + "step": 135710 + }, + { + "epoch": 0.524655564317855, + "grad_norm": 0.09691675752401352, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 135720 + }, + { + "epoch": 0.5246942215212382, + "grad_norm": 0.12335009127855301, + "learning_rate": 0.002, + "loss": 2.3663, + "step": 135730 + }, + { + "epoch": 0.5247328787246216, + "grad_norm": 0.11544227600097656, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 135740 + }, + { + "epoch": 0.5247715359280049, + "grad_norm": 0.12592443823814392, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 135750 + }, + { + "epoch": 0.5248101931313881, + "grad_norm": 0.1194482147693634, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 135760 + }, + { + "epoch": 0.5248488503347714, + "grad_norm": 0.10435574501752853, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 135770 + }, + { + "epoch": 0.5248875075381546, + "grad_norm": 0.10019376128911972, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 135780 + }, + { + "epoch": 0.524926164741538, + "grad_norm": 0.10837697237730026, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 135790 + }, + { + "epoch": 0.5249648219449212, + "grad_norm": 0.12371774017810822, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 135800 + }, + { + "epoch": 0.5250034791483045, + "grad_norm": 0.09747034311294556, + "learning_rate": 0.002, + "loss": 2.344, + "step": 135810 + }, + { + "epoch": 0.5250421363516877, + "grad_norm": 0.10710786283016205, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 135820 + }, + { + "epoch": 0.5250807935550711, + "grad_norm": 0.11960620433092117, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 135830 + }, + { + "epoch": 0.5251194507584543, + "grad_norm": 0.11846979707479477, + "learning_rate": 0.002, + "loss": 2.348, + "step": 135840 + }, + { + "epoch": 0.5251581079618376, + "grad_norm": 0.12157097458839417, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 135850 + }, + { + "epoch": 0.5251967651652208, + "grad_norm": 0.1204945296049118, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 135860 + }, + { + "epoch": 0.5252354223686042, + "grad_norm": 0.10225175321102142, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 135870 + }, + { + "epoch": 0.5252740795719875, + "grad_norm": 0.11851023882627487, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 135880 + }, + { + "epoch": 0.5253127367753707, + "grad_norm": 0.09924530982971191, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 135890 + }, + { + "epoch": 0.525351393978754, + "grad_norm": 0.10862696170806885, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 135900 + }, + { + "epoch": 0.5253900511821373, + "grad_norm": 0.09692274034023285, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 135910 + }, + { + "epoch": 0.5254287083855206, + "grad_norm": 0.10528989136219025, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 135920 + }, + { + "epoch": 0.5254673655889038, + "grad_norm": 0.10078863799571991, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 135930 + }, + { + "epoch": 0.5255060227922871, + "grad_norm": 0.1006772443652153, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 135940 + }, + { + "epoch": 0.5255446799956703, + "grad_norm": 0.10570927709341049, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 135950 + }, + { + "epoch": 0.5255833371990537, + "grad_norm": 0.10867559164762497, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 135960 + }, + { + "epoch": 0.525621994402437, + "grad_norm": 0.10470971465110779, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 135970 + }, + { + "epoch": 0.5256606516058202, + "grad_norm": 0.10036683082580566, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 135980 + }, + { + "epoch": 0.5256993088092035, + "grad_norm": 0.11269212514162064, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 135990 + }, + { + "epoch": 0.5257379660125868, + "grad_norm": 0.10223264992237091, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 136000 + }, + { + "epoch": 0.5257766232159701, + "grad_norm": 0.12405657023191452, + "learning_rate": 0.002, + "loss": 2.361, + "step": 136010 + }, + { + "epoch": 0.5258152804193533, + "grad_norm": 0.11630851775407791, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 136020 + }, + { + "epoch": 0.5258539376227366, + "grad_norm": 0.11915447562932968, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 136030 + }, + { + "epoch": 0.5258925948261199, + "grad_norm": 0.13056451082229614, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 136040 + }, + { + "epoch": 0.5259312520295032, + "grad_norm": 0.12401597201824188, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 136050 + }, + { + "epoch": 0.5259699092328864, + "grad_norm": 0.1053740456700325, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 136060 + }, + { + "epoch": 0.5260085664362697, + "grad_norm": 0.12475787848234177, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 136070 + }, + { + "epoch": 0.5260472236396531, + "grad_norm": 0.10645383596420288, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 136080 + }, + { + "epoch": 0.5260858808430363, + "grad_norm": 0.10044417530298233, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 136090 + }, + { + "epoch": 0.5261245380464196, + "grad_norm": 0.11066542565822601, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 136100 + }, + { + "epoch": 0.5261631952498028, + "grad_norm": 0.1240231841802597, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 136110 + }, + { + "epoch": 0.5262018524531862, + "grad_norm": 0.10536878556013107, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 136120 + }, + { + "epoch": 0.5262405096565694, + "grad_norm": 0.11693378537893295, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 136130 + }, + { + "epoch": 0.5262791668599527, + "grad_norm": 0.11828534305095673, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 136140 + }, + { + "epoch": 0.5263178240633359, + "grad_norm": 0.12139234691858292, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 136150 + }, + { + "epoch": 0.5263564812667192, + "grad_norm": 0.11577997356653214, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 136160 + }, + { + "epoch": 0.5263951384701026, + "grad_norm": 0.09906233102083206, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 136170 + }, + { + "epoch": 0.5264337956734858, + "grad_norm": 0.11612803488969803, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 136180 + }, + { + "epoch": 0.5264724528768691, + "grad_norm": 0.10978134721517563, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 136190 + }, + { + "epoch": 0.5265111100802523, + "grad_norm": 0.12343063950538635, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 136200 + }, + { + "epoch": 0.5265497672836357, + "grad_norm": 0.1064305379986763, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 136210 + }, + { + "epoch": 0.5265884244870189, + "grad_norm": 0.09350063651800156, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 136220 + }, + { + "epoch": 0.5266270816904022, + "grad_norm": 0.10289590060710907, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 136230 + }, + { + "epoch": 0.5266657388937854, + "grad_norm": 0.12419820576906204, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 136240 + }, + { + "epoch": 0.5267043960971688, + "grad_norm": 0.09969479590654373, + "learning_rate": 0.002, + "loss": 2.343, + "step": 136250 + }, + { + "epoch": 0.526743053300552, + "grad_norm": 0.13299071788787842, + "learning_rate": 0.002, + "loss": 2.348, + "step": 136260 + }, + { + "epoch": 0.5267817105039353, + "grad_norm": 0.11333530396223068, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 136270 + }, + { + "epoch": 0.5268203677073185, + "grad_norm": 0.11156991869211197, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 136280 + }, + { + "epoch": 0.5268590249107019, + "grad_norm": 0.10726924240589142, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 136290 + }, + { + "epoch": 0.5268976821140852, + "grad_norm": 0.0964454635977745, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 136300 + }, + { + "epoch": 0.5269363393174684, + "grad_norm": 0.10612018406391144, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 136310 + }, + { + "epoch": 0.5269749965208517, + "grad_norm": 0.10159798711538315, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 136320 + }, + { + "epoch": 0.5270136537242349, + "grad_norm": 0.10592789202928543, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 136330 + }, + { + "epoch": 0.5270523109276183, + "grad_norm": 0.12359337508678436, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 136340 + }, + { + "epoch": 0.5270909681310015, + "grad_norm": 0.11304433643817902, + "learning_rate": 0.002, + "loss": 2.3726, + "step": 136350 + }, + { + "epoch": 0.5271296253343848, + "grad_norm": 0.10624481737613678, + "learning_rate": 0.002, + "loss": 2.327, + "step": 136360 + }, + { + "epoch": 0.527168282537768, + "grad_norm": 0.10739625990390778, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 136370 + }, + { + "epoch": 0.5272069397411514, + "grad_norm": 0.11046303808689117, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 136380 + }, + { + "epoch": 0.5272455969445347, + "grad_norm": 0.12088941782712936, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 136390 + }, + { + "epoch": 0.5272842541479179, + "grad_norm": 0.10500706732273102, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 136400 + }, + { + "epoch": 0.5273229113513012, + "grad_norm": 0.10844823718070984, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 136410 + }, + { + "epoch": 0.5273615685546845, + "grad_norm": 0.10434307903051376, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 136420 + }, + { + "epoch": 0.5274002257580678, + "grad_norm": 0.09942899644374847, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 136430 + }, + { + "epoch": 0.527438882961451, + "grad_norm": 0.09675901383161545, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 136440 + }, + { + "epoch": 0.5274775401648343, + "grad_norm": 0.10810984671115875, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 136450 + }, + { + "epoch": 0.5275161973682176, + "grad_norm": 0.10786008089780807, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 136460 + }, + { + "epoch": 0.5275548545716009, + "grad_norm": 0.11267884075641632, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 136470 + }, + { + "epoch": 0.5275935117749841, + "grad_norm": 0.11689767241477966, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 136480 + }, + { + "epoch": 0.5276321689783674, + "grad_norm": 0.11251311749219894, + "learning_rate": 0.002, + "loss": 2.337, + "step": 136490 + }, + { + "epoch": 0.5276708261817508, + "grad_norm": 0.1003497913479805, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 136500 + }, + { + "epoch": 0.527709483385134, + "grad_norm": 0.11317698657512665, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 136510 + }, + { + "epoch": 0.5277481405885173, + "grad_norm": 0.10377456247806549, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 136520 + }, + { + "epoch": 0.5277867977919005, + "grad_norm": 0.11414996534585953, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 136530 + }, + { + "epoch": 0.5278254549952838, + "grad_norm": 0.10959716886281967, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 136540 + }, + { + "epoch": 0.5278641121986671, + "grad_norm": 0.11749317497015, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 136550 + }, + { + "epoch": 0.5279027694020504, + "grad_norm": 0.10722503066062927, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 136560 + }, + { + "epoch": 0.5279414266054336, + "grad_norm": 0.09356562793254852, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 136570 + }, + { + "epoch": 0.5279800838088169, + "grad_norm": 0.1076977327466011, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 136580 + }, + { + "epoch": 0.5280187410122003, + "grad_norm": 0.10418250411748886, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 136590 + }, + { + "epoch": 0.5280573982155835, + "grad_norm": 0.10491588711738586, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 136600 + }, + { + "epoch": 0.5280960554189668, + "grad_norm": 0.10587549209594727, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 136610 + }, + { + "epoch": 0.52813471262235, + "grad_norm": 0.10736706107854843, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 136620 + }, + { + "epoch": 0.5281733698257334, + "grad_norm": 0.09436357021331787, + "learning_rate": 0.002, + "loss": 2.349, + "step": 136630 + }, + { + "epoch": 0.5282120270291166, + "grad_norm": 0.10800494253635406, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 136640 + }, + { + "epoch": 0.5282506842324999, + "grad_norm": 0.10391030460596085, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 136650 + }, + { + "epoch": 0.5282893414358831, + "grad_norm": 0.1147724837064743, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 136660 + }, + { + "epoch": 0.5283279986392665, + "grad_norm": 0.10503552854061127, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 136670 + }, + { + "epoch": 0.5283666558426497, + "grad_norm": 0.11024164408445358, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 136680 + }, + { + "epoch": 0.528405313046033, + "grad_norm": 0.10197118669748306, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 136690 + }, + { + "epoch": 0.5284439702494163, + "grad_norm": 0.10183871537446976, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 136700 + }, + { + "epoch": 0.5284826274527995, + "grad_norm": 0.10741639137268066, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 136710 + }, + { + "epoch": 0.5285212846561829, + "grad_norm": 0.1323143094778061, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 136720 + }, + { + "epoch": 0.5285599418595661, + "grad_norm": 0.1085241287946701, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 136730 + }, + { + "epoch": 0.5285985990629494, + "grad_norm": 0.10665200650691986, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 136740 + }, + { + "epoch": 0.5286372562663326, + "grad_norm": 0.10080068558454514, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 136750 + }, + { + "epoch": 0.528675913469716, + "grad_norm": 0.11925096064805984, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 136760 + }, + { + "epoch": 0.5287145706730992, + "grad_norm": 0.11695779860019684, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 136770 + }, + { + "epoch": 0.5287532278764825, + "grad_norm": 0.12991879880428314, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 136780 + }, + { + "epoch": 0.5287918850798657, + "grad_norm": 0.13205960392951965, + "learning_rate": 0.002, + "loss": 2.3682, + "step": 136790 + }, + { + "epoch": 0.5288305422832491, + "grad_norm": 0.0976249948143959, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 136800 + }, + { + "epoch": 0.5288691994866324, + "grad_norm": 0.11842067539691925, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 136810 + }, + { + "epoch": 0.5289078566900156, + "grad_norm": 0.13334208726882935, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 136820 + }, + { + "epoch": 0.5289465138933989, + "grad_norm": 0.11997167021036148, + "learning_rate": 0.002, + "loss": 2.3671, + "step": 136830 + }, + { + "epoch": 0.5289851710967822, + "grad_norm": 0.09686478972434998, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 136840 + }, + { + "epoch": 0.5290238283001655, + "grad_norm": 0.09126406162977219, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 136850 + }, + { + "epoch": 0.5290624855035487, + "grad_norm": 0.13515914976596832, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 136860 + }, + { + "epoch": 0.529101142706932, + "grad_norm": 0.11011700332164764, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 136870 + }, + { + "epoch": 0.5291397999103152, + "grad_norm": 0.09951812773942947, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 136880 + }, + { + "epoch": 0.5291784571136986, + "grad_norm": 0.12080974876880646, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 136890 + }, + { + "epoch": 0.5292171143170818, + "grad_norm": 0.11308450251817703, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 136900 + }, + { + "epoch": 0.5292557715204651, + "grad_norm": 0.1100267842411995, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 136910 + }, + { + "epoch": 0.5292944287238484, + "grad_norm": 0.10424294322729111, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 136920 + }, + { + "epoch": 0.5293330859272317, + "grad_norm": 0.1197114810347557, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 136930 + }, + { + "epoch": 0.529371743130615, + "grad_norm": 0.1036001443862915, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 136940 + }, + { + "epoch": 0.5294104003339982, + "grad_norm": 0.10613606870174408, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 136950 + }, + { + "epoch": 0.5294490575373815, + "grad_norm": 0.10405551642179489, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 136960 + }, + { + "epoch": 0.5294877147407648, + "grad_norm": 0.11526428908109665, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 136970 + }, + { + "epoch": 0.5295263719441481, + "grad_norm": 0.10523267835378647, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 136980 + }, + { + "epoch": 0.5295650291475313, + "grad_norm": 0.09959837049245834, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 136990 + }, + { + "epoch": 0.5296036863509146, + "grad_norm": 0.12185262143611908, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 137000 + }, + { + "epoch": 0.529642343554298, + "grad_norm": 0.09950944036245346, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 137010 + }, + { + "epoch": 0.5296810007576812, + "grad_norm": 0.1028703823685646, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 137020 + }, + { + "epoch": 0.5297196579610645, + "grad_norm": 0.11582040041685104, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 137030 + }, + { + "epoch": 0.5297583151644477, + "grad_norm": 0.11042831838130951, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 137040 + }, + { + "epoch": 0.5297969723678311, + "grad_norm": 0.108167365193367, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 137050 + }, + { + "epoch": 0.5298356295712143, + "grad_norm": 0.12642677128314972, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 137060 + }, + { + "epoch": 0.5298742867745976, + "grad_norm": 0.11467591673135757, + "learning_rate": 0.002, + "loss": 2.345, + "step": 137070 + }, + { + "epoch": 0.5299129439779808, + "grad_norm": 0.10052400082349777, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 137080 + }, + { + "epoch": 0.5299516011813641, + "grad_norm": 0.09801182150840759, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 137090 + }, + { + "epoch": 0.5299902583847474, + "grad_norm": 0.11449935287237167, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 137100 + }, + { + "epoch": 0.5300289155881307, + "grad_norm": 0.102525994181633, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 137110 + }, + { + "epoch": 0.530067572791514, + "grad_norm": 0.09240062534809113, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 137120 + }, + { + "epoch": 0.5301062299948972, + "grad_norm": 0.10398975014686584, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 137130 + }, + { + "epoch": 0.5301448871982806, + "grad_norm": 0.10602229088544846, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 137140 + }, + { + "epoch": 0.5301835444016638, + "grad_norm": 0.10612321645021439, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 137150 + }, + { + "epoch": 0.5302222016050471, + "grad_norm": 0.10876964777708054, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 137160 + }, + { + "epoch": 0.5302608588084303, + "grad_norm": 0.10908559709787369, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 137170 + }, + { + "epoch": 0.5302995160118137, + "grad_norm": 0.10353915393352509, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 137180 + }, + { + "epoch": 0.5303381732151969, + "grad_norm": 0.10041562467813492, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 137190 + }, + { + "epoch": 0.5303768304185802, + "grad_norm": 0.09648370742797852, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 137200 + }, + { + "epoch": 0.5304154876219634, + "grad_norm": 0.11177092790603638, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 137210 + }, + { + "epoch": 0.5304541448253468, + "grad_norm": 0.10650645941495895, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 137220 + }, + { + "epoch": 0.5304928020287301, + "grad_norm": 0.10645987838506699, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 137230 + }, + { + "epoch": 0.5305314592321133, + "grad_norm": 0.10007698833942413, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 137240 + }, + { + "epoch": 0.5305701164354966, + "grad_norm": 0.12854336202144623, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 137250 + }, + { + "epoch": 0.5306087736388798, + "grad_norm": 0.13256269693374634, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 137260 + }, + { + "epoch": 0.5306474308422632, + "grad_norm": 0.12798702716827393, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 137270 + }, + { + "epoch": 0.5306860880456464, + "grad_norm": 0.10192005336284637, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 137280 + }, + { + "epoch": 0.5307247452490297, + "grad_norm": 0.12237120419740677, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 137290 + }, + { + "epoch": 0.5307634024524129, + "grad_norm": 0.09982860833406448, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 137300 + }, + { + "epoch": 0.5308020596557963, + "grad_norm": 0.10656183958053589, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 137310 + }, + { + "epoch": 0.5308407168591796, + "grad_norm": 0.10240715742111206, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 137320 + }, + { + "epoch": 0.5308793740625628, + "grad_norm": 0.12004515528678894, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 137330 + }, + { + "epoch": 0.530918031265946, + "grad_norm": 0.11195909231901169, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 137340 + }, + { + "epoch": 0.5309566884693294, + "grad_norm": 0.12709975242614746, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 137350 + }, + { + "epoch": 0.5309953456727127, + "grad_norm": 0.09677889198064804, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 137360 + }, + { + "epoch": 0.5310340028760959, + "grad_norm": 0.10898634046316147, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 137370 + }, + { + "epoch": 0.5310726600794792, + "grad_norm": 0.11007361859083176, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 137380 + }, + { + "epoch": 0.5311113172828625, + "grad_norm": 0.12255366891622543, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 137390 + }, + { + "epoch": 0.5311499744862458, + "grad_norm": 0.1071476936340332, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 137400 + }, + { + "epoch": 0.531188631689629, + "grad_norm": 0.12634190917015076, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 137410 + }, + { + "epoch": 0.5312272888930123, + "grad_norm": 0.10547083616256714, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 137420 + }, + { + "epoch": 0.5312659460963955, + "grad_norm": 0.10647553950548172, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 137430 + }, + { + "epoch": 0.5313046032997789, + "grad_norm": 0.09845500439405441, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 137440 + }, + { + "epoch": 0.5313432605031622, + "grad_norm": 0.10793498903512955, + "learning_rate": 0.002, + "loss": 2.363, + "step": 137450 + }, + { + "epoch": 0.5313819177065454, + "grad_norm": 0.11223926395177841, + "learning_rate": 0.002, + "loss": 2.349, + "step": 137460 + }, + { + "epoch": 0.5314205749099287, + "grad_norm": 0.11091437190771103, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 137470 + }, + { + "epoch": 0.531459232113312, + "grad_norm": 0.11193855106830597, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 137480 + }, + { + "epoch": 0.5314978893166953, + "grad_norm": 0.11525751650333405, + "learning_rate": 0.002, + "loss": 2.341, + "step": 137490 + }, + { + "epoch": 0.5315365465200785, + "grad_norm": 0.11154419928789139, + "learning_rate": 0.002, + "loss": 2.346, + "step": 137500 + }, + { + "epoch": 0.5315752037234618, + "grad_norm": 0.10130494832992554, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 137510 + }, + { + "epoch": 0.5316138609268451, + "grad_norm": 0.1344832181930542, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 137520 + }, + { + "epoch": 0.5316525181302284, + "grad_norm": 0.11252513527870178, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 137530 + }, + { + "epoch": 0.5316911753336117, + "grad_norm": 0.11794115602970123, + "learning_rate": 0.002, + "loss": 2.344, + "step": 137540 + }, + { + "epoch": 0.5317298325369949, + "grad_norm": 0.1194150522351265, + "learning_rate": 0.002, + "loss": 2.34, + "step": 137550 + }, + { + "epoch": 0.5317684897403783, + "grad_norm": 0.10883738100528717, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 137560 + }, + { + "epoch": 0.5318071469437615, + "grad_norm": 0.11823202669620514, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 137570 + }, + { + "epoch": 0.5318458041471448, + "grad_norm": 0.10311403125524521, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 137580 + }, + { + "epoch": 0.531884461350528, + "grad_norm": 0.11634775996208191, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 137590 + }, + { + "epoch": 0.5319231185539114, + "grad_norm": 0.10529926419258118, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 137600 + }, + { + "epoch": 0.5319617757572946, + "grad_norm": 0.10747803002595901, + "learning_rate": 0.002, + "loss": 2.346, + "step": 137610 + }, + { + "epoch": 0.5320004329606779, + "grad_norm": 0.12081359326839447, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 137620 + }, + { + "epoch": 0.5320390901640611, + "grad_norm": 0.1166115254163742, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 137630 + }, + { + "epoch": 0.5320777473674444, + "grad_norm": 0.11433306336402893, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 137640 + }, + { + "epoch": 0.5321164045708278, + "grad_norm": 0.11932501196861267, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 137650 + }, + { + "epoch": 0.532155061774211, + "grad_norm": 0.12655851244926453, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 137660 + }, + { + "epoch": 0.5321937189775943, + "grad_norm": 0.09576230496168137, + "learning_rate": 0.002, + "loss": 2.342, + "step": 137670 + }, + { + "epoch": 0.5322323761809775, + "grad_norm": 0.11238545924425125, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 137680 + }, + { + "epoch": 0.5322710333843609, + "grad_norm": 0.10102082043886185, + "learning_rate": 0.002, + "loss": 2.348, + "step": 137690 + }, + { + "epoch": 0.5323096905877441, + "grad_norm": 0.10141270607709885, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 137700 + }, + { + "epoch": 0.5323483477911274, + "grad_norm": 0.09829019010066986, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 137710 + }, + { + "epoch": 0.5323870049945106, + "grad_norm": 0.09422247856855392, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 137720 + }, + { + "epoch": 0.532425662197894, + "grad_norm": 0.11723770946264267, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 137730 + }, + { + "epoch": 0.5324643194012773, + "grad_norm": 0.10489365458488464, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 137740 + }, + { + "epoch": 0.5325029766046605, + "grad_norm": 0.12095458060503006, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 137750 + }, + { + "epoch": 0.5325416338080438, + "grad_norm": 0.10421379655599594, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 137760 + }, + { + "epoch": 0.5325802910114271, + "grad_norm": 0.10324057936668396, + "learning_rate": 0.002, + "loss": 2.356, + "step": 137770 + }, + { + "epoch": 0.5326189482148104, + "grad_norm": 0.1105327233672142, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 137780 + }, + { + "epoch": 0.5326576054181936, + "grad_norm": 0.11294253170490265, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 137790 + }, + { + "epoch": 0.5326962626215769, + "grad_norm": 0.10702069848775864, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 137800 + }, + { + "epoch": 0.5327349198249601, + "grad_norm": 0.1080106571316719, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 137810 + }, + { + "epoch": 0.5327735770283435, + "grad_norm": 0.0997968465089798, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 137820 + }, + { + "epoch": 0.5328122342317267, + "grad_norm": 0.09706050157546997, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 137830 + }, + { + "epoch": 0.53285089143511, + "grad_norm": 0.17344297468662262, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 137840 + }, + { + "epoch": 0.5328895486384932, + "grad_norm": 0.11641829460859299, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 137850 + }, + { + "epoch": 0.5329282058418766, + "grad_norm": 0.12219628691673279, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 137860 + }, + { + "epoch": 0.5329668630452599, + "grad_norm": 0.09959860146045685, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 137870 + }, + { + "epoch": 0.5330055202486431, + "grad_norm": 0.10274484008550644, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 137880 + }, + { + "epoch": 0.5330441774520264, + "grad_norm": 0.11681586503982544, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 137890 + }, + { + "epoch": 0.5330828346554097, + "grad_norm": 0.11741876602172852, + "learning_rate": 0.002, + "loss": 2.337, + "step": 137900 + }, + { + "epoch": 0.533121491858793, + "grad_norm": 0.10877589136362076, + "learning_rate": 0.002, + "loss": 2.338, + "step": 137910 + }, + { + "epoch": 0.5331601490621762, + "grad_norm": 0.12460336089134216, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 137920 + }, + { + "epoch": 0.5331988062655595, + "grad_norm": 0.11148268729448318, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 137930 + }, + { + "epoch": 0.5332374634689429, + "grad_norm": 0.10965389758348465, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 137940 + }, + { + "epoch": 0.5332761206723261, + "grad_norm": 0.10817113518714905, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 137950 + }, + { + "epoch": 0.5333147778757094, + "grad_norm": 0.10625895112752914, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 137960 + }, + { + "epoch": 0.5333534350790926, + "grad_norm": 0.09788678586483002, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 137970 + }, + { + "epoch": 0.533392092282476, + "grad_norm": 0.10641621798276901, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 137980 + }, + { + "epoch": 0.5334307494858592, + "grad_norm": 0.11436722427606583, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 137990 + }, + { + "epoch": 0.5334694066892425, + "grad_norm": 0.10436925292015076, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 138000 + }, + { + "epoch": 0.5335080638926257, + "grad_norm": 0.11858268827199936, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 138010 + }, + { + "epoch": 0.533546721096009, + "grad_norm": 0.1955452710390091, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 138020 + }, + { + "epoch": 0.5335853782993923, + "grad_norm": 0.11019251495599747, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 138030 + }, + { + "epoch": 0.5336240355027756, + "grad_norm": 0.11960510909557343, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 138040 + }, + { + "epoch": 0.5336626927061588, + "grad_norm": 0.10244852304458618, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 138050 + }, + { + "epoch": 0.5337013499095421, + "grad_norm": 0.0959695354104042, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 138060 + }, + { + "epoch": 0.5337400071129255, + "grad_norm": 0.1148761510848999, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 138070 + }, + { + "epoch": 0.5337786643163087, + "grad_norm": 0.09662537276744843, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 138080 + }, + { + "epoch": 0.533817321519692, + "grad_norm": 0.1158476397395134, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 138090 + }, + { + "epoch": 0.5338559787230752, + "grad_norm": 0.11077646166086197, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 138100 + }, + { + "epoch": 0.5338946359264586, + "grad_norm": 0.1256052851676941, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 138110 + }, + { + "epoch": 0.5339332931298418, + "grad_norm": 0.11398004740476608, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 138120 + }, + { + "epoch": 0.5339719503332251, + "grad_norm": 0.10633135586977005, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 138130 + }, + { + "epoch": 0.5340106075366083, + "grad_norm": 0.10169097781181335, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 138140 + }, + { + "epoch": 0.5340492647399917, + "grad_norm": 0.10960586369037628, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 138150 + }, + { + "epoch": 0.534087921943375, + "grad_norm": 0.10723719000816345, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 138160 + }, + { + "epoch": 0.5341265791467582, + "grad_norm": 0.10737229138612747, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 138170 + }, + { + "epoch": 0.5341652363501415, + "grad_norm": 0.10901413857936859, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 138180 + }, + { + "epoch": 0.5342038935535247, + "grad_norm": 0.11041481047868729, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 138190 + }, + { + "epoch": 0.5342425507569081, + "grad_norm": 0.10067623108625412, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 138200 + }, + { + "epoch": 0.5342812079602913, + "grad_norm": 0.10747084021568298, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 138210 + }, + { + "epoch": 0.5343198651636746, + "grad_norm": 0.12419717758893967, + "learning_rate": 0.002, + "loss": 2.354, + "step": 138220 + }, + { + "epoch": 0.5343585223670578, + "grad_norm": 0.10230522602796555, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 138230 + }, + { + "epoch": 0.5343971795704412, + "grad_norm": 0.0917351171374321, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 138240 + }, + { + "epoch": 0.5344358367738244, + "grad_norm": 0.11229658871889114, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 138250 + }, + { + "epoch": 0.5344744939772077, + "grad_norm": 0.11508216708898544, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 138260 + }, + { + "epoch": 0.534513151180591, + "grad_norm": 0.10804786533117294, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 138270 + }, + { + "epoch": 0.5345518083839743, + "grad_norm": 0.10519649088382721, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 138280 + }, + { + "epoch": 0.5345904655873576, + "grad_norm": 0.1274058073759079, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 138290 + }, + { + "epoch": 0.5346291227907408, + "grad_norm": 0.09675882756710052, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 138300 + }, + { + "epoch": 0.5346677799941241, + "grad_norm": 0.09814517945051193, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 138310 + }, + { + "epoch": 0.5347064371975074, + "grad_norm": 0.11503762751817703, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 138320 + }, + { + "epoch": 0.5347450944008907, + "grad_norm": 0.110122449696064, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 138330 + }, + { + "epoch": 0.5347837516042739, + "grad_norm": 0.1314275562763214, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 138340 + }, + { + "epoch": 0.5348224088076572, + "grad_norm": 0.09663635492324829, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 138350 + }, + { + "epoch": 0.5348610660110404, + "grad_norm": 0.11120118945837021, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 138360 + }, + { + "epoch": 0.5348997232144238, + "grad_norm": 0.10792695730924606, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 138370 + }, + { + "epoch": 0.5349383804178071, + "grad_norm": 0.1011994257569313, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 138380 + }, + { + "epoch": 0.5349770376211903, + "grad_norm": 0.11302940547466278, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 138390 + }, + { + "epoch": 0.5350156948245736, + "grad_norm": 0.10505368560552597, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 138400 + }, + { + "epoch": 0.5350543520279569, + "grad_norm": 0.11198221147060394, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 138410 + }, + { + "epoch": 0.5350930092313402, + "grad_norm": 0.1090167760848999, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 138420 + }, + { + "epoch": 0.5351316664347234, + "grad_norm": 0.10570481419563293, + "learning_rate": 0.002, + "loss": 2.329, + "step": 138430 + }, + { + "epoch": 0.5351703236381067, + "grad_norm": 0.10702286660671234, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 138440 + }, + { + "epoch": 0.53520898084149, + "grad_norm": 0.12096297740936279, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 138450 + }, + { + "epoch": 0.5352476380448733, + "grad_norm": 0.12605620920658112, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 138460 + }, + { + "epoch": 0.5352862952482565, + "grad_norm": 0.10755420476198196, + "learning_rate": 0.002, + "loss": 2.33, + "step": 138470 + }, + { + "epoch": 0.5353249524516398, + "grad_norm": 0.10866609960794449, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 138480 + }, + { + "epoch": 0.5353636096550232, + "grad_norm": 0.11253587156534195, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 138490 + }, + { + "epoch": 0.5354022668584064, + "grad_norm": 0.12749694287776947, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 138500 + }, + { + "epoch": 0.5354409240617897, + "grad_norm": 0.10958480089902878, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 138510 + }, + { + "epoch": 0.5354795812651729, + "grad_norm": 0.10303416103124619, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 138520 + }, + { + "epoch": 0.5355182384685563, + "grad_norm": 0.11819668114185333, + "learning_rate": 0.002, + "loss": 2.355, + "step": 138530 + }, + { + "epoch": 0.5355568956719395, + "grad_norm": 0.2007521092891693, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 138540 + }, + { + "epoch": 0.5355955528753228, + "grad_norm": 0.09628603607416153, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 138550 + }, + { + "epoch": 0.535634210078706, + "grad_norm": 0.1116054356098175, + "learning_rate": 0.002, + "loss": 2.337, + "step": 138560 + }, + { + "epoch": 0.5356728672820893, + "grad_norm": 0.10477752238512039, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 138570 + }, + { + "epoch": 0.5357115244854727, + "grad_norm": 0.10349495708942413, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 138580 + }, + { + "epoch": 0.5357501816888559, + "grad_norm": 0.10677219182252884, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 138590 + }, + { + "epoch": 0.5357888388922392, + "grad_norm": 0.10819147527217865, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 138600 + }, + { + "epoch": 0.5358274960956224, + "grad_norm": 0.12032123655080795, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 138610 + }, + { + "epoch": 0.5358661532990058, + "grad_norm": 0.10413940250873566, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 138620 + }, + { + "epoch": 0.535904810502389, + "grad_norm": 0.10026529431343079, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 138630 + }, + { + "epoch": 0.5359434677057723, + "grad_norm": 0.10903637111186981, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 138640 + }, + { + "epoch": 0.5359821249091555, + "grad_norm": 0.11362525075674057, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 138650 + }, + { + "epoch": 0.5360207821125389, + "grad_norm": 0.10190536081790924, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 138660 + }, + { + "epoch": 0.5360594393159221, + "grad_norm": 0.09856315702199936, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 138670 + }, + { + "epoch": 0.5360980965193054, + "grad_norm": 0.10943195968866348, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 138680 + }, + { + "epoch": 0.5361367537226887, + "grad_norm": 0.11535698920488358, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 138690 + }, + { + "epoch": 0.536175410926072, + "grad_norm": 0.10688416659832001, + "learning_rate": 0.002, + "loss": 2.3621, + "step": 138700 + }, + { + "epoch": 0.5362140681294553, + "grad_norm": 0.10885433107614517, + "learning_rate": 0.002, + "loss": 2.348, + "step": 138710 + }, + { + "epoch": 0.5362527253328385, + "grad_norm": 0.09661147743463516, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 138720 + }, + { + "epoch": 0.5362913825362218, + "grad_norm": 0.11588813364505768, + "learning_rate": 0.002, + "loss": 2.338, + "step": 138730 + }, + { + "epoch": 0.536330039739605, + "grad_norm": 0.10474442690610886, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 138740 + }, + { + "epoch": 0.5363686969429884, + "grad_norm": 0.11134396493434906, + "learning_rate": 0.002, + "loss": 2.343, + "step": 138750 + }, + { + "epoch": 0.5364073541463716, + "grad_norm": 0.12790916860103607, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 138760 + }, + { + "epoch": 0.5364460113497549, + "grad_norm": 0.11726795136928558, + "learning_rate": 0.002, + "loss": 2.354, + "step": 138770 + }, + { + "epoch": 0.5364846685531381, + "grad_norm": 0.11688651889562607, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 138780 + }, + { + "epoch": 0.5365233257565215, + "grad_norm": 0.09294089674949646, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 138790 + }, + { + "epoch": 0.5365619829599048, + "grad_norm": 0.13146226108074188, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 138800 + }, + { + "epoch": 0.536600640163288, + "grad_norm": 0.11804410070180893, + "learning_rate": 0.002, + "loss": 2.359, + "step": 138810 + }, + { + "epoch": 0.5366392973666713, + "grad_norm": 0.10523460805416107, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 138820 + }, + { + "epoch": 0.5366779545700546, + "grad_norm": 0.1207517683506012, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 138830 + }, + { + "epoch": 0.5367166117734379, + "grad_norm": 0.12145643681287766, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 138840 + }, + { + "epoch": 0.5367552689768211, + "grad_norm": 0.1467278003692627, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 138850 + }, + { + "epoch": 0.5367939261802044, + "grad_norm": 0.10867208987474442, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 138860 + }, + { + "epoch": 0.5368325833835877, + "grad_norm": 0.11317738145589828, + "learning_rate": 0.002, + "loss": 2.342, + "step": 138870 + }, + { + "epoch": 0.536871240586971, + "grad_norm": 0.10063724964857101, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 138880 + }, + { + "epoch": 0.5369098977903543, + "grad_norm": 0.12380929291248322, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 138890 + }, + { + "epoch": 0.5369485549937375, + "grad_norm": 0.11145635694265366, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 138900 + }, + { + "epoch": 0.5369872121971209, + "grad_norm": 0.1016305610537529, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 138910 + }, + { + "epoch": 0.5370258694005041, + "grad_norm": 0.11410214006900787, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 138920 + }, + { + "epoch": 0.5370645266038874, + "grad_norm": 0.10275132209062576, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 138930 + }, + { + "epoch": 0.5371031838072706, + "grad_norm": 0.11877349019050598, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 138940 + }, + { + "epoch": 0.5371418410106539, + "grad_norm": 0.11219332367181778, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 138950 + }, + { + "epoch": 0.5371804982140372, + "grad_norm": 0.10791706293821335, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 138960 + }, + { + "epoch": 0.5372191554174205, + "grad_norm": 0.11491399258375168, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 138970 + }, + { + "epoch": 0.5372578126208037, + "grad_norm": 0.11004562675952911, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 138980 + }, + { + "epoch": 0.537296469824187, + "grad_norm": 0.11763054132461548, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 138990 + }, + { + "epoch": 0.5373351270275704, + "grad_norm": 0.10310228914022446, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 139000 + }, + { + "epoch": 0.5373737842309536, + "grad_norm": 0.10830710083246231, + "learning_rate": 0.002, + "loss": 2.353, + "step": 139010 + }, + { + "epoch": 0.5374124414343369, + "grad_norm": 0.0979403480887413, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 139020 + }, + { + "epoch": 0.5374510986377201, + "grad_norm": 0.1297398954629898, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 139030 + }, + { + "epoch": 0.5374897558411035, + "grad_norm": 0.14436522126197815, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 139040 + }, + { + "epoch": 0.5375284130444867, + "grad_norm": 0.09765699505805969, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 139050 + }, + { + "epoch": 0.53756707024787, + "grad_norm": 0.10815630108118057, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 139060 + }, + { + "epoch": 0.5376057274512532, + "grad_norm": 0.09410147368907928, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 139070 + }, + { + "epoch": 0.5376443846546366, + "grad_norm": 0.11704320460557938, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 139080 + }, + { + "epoch": 0.5376830418580199, + "grad_norm": 0.10289481282234192, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 139090 + }, + { + "epoch": 0.5377216990614031, + "grad_norm": 0.13339246809482574, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 139100 + }, + { + "epoch": 0.5377603562647864, + "grad_norm": 0.10395749658346176, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 139110 + }, + { + "epoch": 0.5377990134681696, + "grad_norm": 0.10145257413387299, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 139120 + }, + { + "epoch": 0.537837670671553, + "grad_norm": 0.1058330237865448, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 139130 + }, + { + "epoch": 0.5378763278749362, + "grad_norm": 0.09136255830526352, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 139140 + }, + { + "epoch": 0.5379149850783195, + "grad_norm": 0.09894274920225143, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 139150 + }, + { + "epoch": 0.5379536422817027, + "grad_norm": 0.11287065595388412, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 139160 + }, + { + "epoch": 0.5379922994850861, + "grad_norm": 0.11594919115304947, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 139170 + }, + { + "epoch": 0.5380309566884693, + "grad_norm": 0.13460049033164978, + "learning_rate": 0.002, + "loss": 2.352, + "step": 139180 + }, + { + "epoch": 0.5380696138918526, + "grad_norm": 0.11856977641582489, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 139190 + }, + { + "epoch": 0.5381082710952358, + "grad_norm": 0.10561086982488632, + "learning_rate": 0.002, + "loss": 2.345, + "step": 139200 + }, + { + "epoch": 0.5381469282986192, + "grad_norm": 0.1271483302116394, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 139210 + }, + { + "epoch": 0.5381855855020025, + "grad_norm": 0.0922880545258522, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 139220 + }, + { + "epoch": 0.5382242427053857, + "grad_norm": 0.09948939085006714, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 139230 + }, + { + "epoch": 0.538262899908769, + "grad_norm": 0.10929225385189056, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 139240 + }, + { + "epoch": 0.5383015571121523, + "grad_norm": 0.11775950342416763, + "learning_rate": 0.002, + "loss": 2.349, + "step": 139250 + }, + { + "epoch": 0.5383402143155356, + "grad_norm": 0.1184498593211174, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 139260 + }, + { + "epoch": 0.5383788715189188, + "grad_norm": 0.08900095522403717, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 139270 + }, + { + "epoch": 0.5384175287223021, + "grad_norm": 0.09807005524635315, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 139280 + }, + { + "epoch": 0.5384561859256853, + "grad_norm": 0.11427653580904007, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 139290 + }, + { + "epoch": 0.5384948431290687, + "grad_norm": 0.10617062449455261, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 139300 + }, + { + "epoch": 0.538533500332452, + "grad_norm": 0.11520641297101974, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 139310 + }, + { + "epoch": 0.5385721575358352, + "grad_norm": 0.09916675835847855, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 139320 + }, + { + "epoch": 0.5386108147392185, + "grad_norm": 0.09200314432382584, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 139330 + }, + { + "epoch": 0.5386494719426018, + "grad_norm": 0.12772326171398163, + "learning_rate": 0.002, + "loss": 2.342, + "step": 139340 + }, + { + "epoch": 0.5386881291459851, + "grad_norm": 0.09921281039714813, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 139350 + }, + { + "epoch": 0.5387267863493683, + "grad_norm": 0.11318966746330261, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 139360 + }, + { + "epoch": 0.5387654435527516, + "grad_norm": 0.10829410701990128, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 139370 + }, + { + "epoch": 0.5388041007561349, + "grad_norm": 0.10719124227762222, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 139380 + }, + { + "epoch": 0.5388427579595182, + "grad_norm": 0.11400733888149261, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 139390 + }, + { + "epoch": 0.5388814151629014, + "grad_norm": 0.10717476904392242, + "learning_rate": 0.002, + "loss": 2.34, + "step": 139400 + }, + { + "epoch": 0.5389200723662847, + "grad_norm": 0.11081114411354065, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 139410 + }, + { + "epoch": 0.5389587295696681, + "grad_norm": 0.10991322249174118, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 139420 + }, + { + "epoch": 0.5389973867730513, + "grad_norm": 0.09669278562068939, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 139430 + }, + { + "epoch": 0.5390360439764346, + "grad_norm": 0.11478671431541443, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 139440 + }, + { + "epoch": 0.5390747011798178, + "grad_norm": 0.1203511580824852, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 139450 + }, + { + "epoch": 0.5391133583832012, + "grad_norm": 0.10512226819992065, + "learning_rate": 0.002, + "loss": 2.335, + "step": 139460 + }, + { + "epoch": 0.5391520155865844, + "grad_norm": 0.09710384905338287, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 139470 + }, + { + "epoch": 0.5391906727899677, + "grad_norm": 0.29483917355537415, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 139480 + }, + { + "epoch": 0.5392293299933509, + "grad_norm": 0.12612727284431458, + "learning_rate": 0.002, + "loss": 2.352, + "step": 139490 + }, + { + "epoch": 0.5392679871967342, + "grad_norm": 0.10861579328775406, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 139500 + }, + { + "epoch": 0.5393066444001176, + "grad_norm": 0.10819701850414276, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 139510 + }, + { + "epoch": 0.5393453016035008, + "grad_norm": 0.10991454869508743, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 139520 + }, + { + "epoch": 0.5393839588068841, + "grad_norm": 0.09839512407779694, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 139530 + }, + { + "epoch": 0.5394226160102673, + "grad_norm": 0.11368145793676376, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 139540 + }, + { + "epoch": 0.5394612732136507, + "grad_norm": 0.10433456301689148, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 139550 + }, + { + "epoch": 0.5394999304170339, + "grad_norm": 0.1109948456287384, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 139560 + }, + { + "epoch": 0.5395385876204172, + "grad_norm": 0.1037093847990036, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 139570 + }, + { + "epoch": 0.5395772448238004, + "grad_norm": 0.10175781697034836, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 139580 + }, + { + "epoch": 0.5396159020271838, + "grad_norm": 0.09333990514278412, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 139590 + }, + { + "epoch": 0.539654559230567, + "grad_norm": 0.11181159317493439, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 139600 + }, + { + "epoch": 0.5396932164339503, + "grad_norm": 0.11220884323120117, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 139610 + }, + { + "epoch": 0.5397318736373335, + "grad_norm": 0.09398156404495239, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 139620 + }, + { + "epoch": 0.5397705308407169, + "grad_norm": 0.11417990177869797, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 139630 + }, + { + "epoch": 0.5398091880441002, + "grad_norm": 0.14919176697731018, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 139640 + }, + { + "epoch": 0.5398478452474834, + "grad_norm": 0.11777222901582718, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 139650 + }, + { + "epoch": 0.5398865024508667, + "grad_norm": 0.10088865458965302, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 139660 + }, + { + "epoch": 0.5399251596542499, + "grad_norm": 0.09098080545663834, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 139670 + }, + { + "epoch": 0.5399638168576333, + "grad_norm": 0.11097294092178345, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 139680 + }, + { + "epoch": 0.5400024740610165, + "grad_norm": 0.11297319829463959, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 139690 + }, + { + "epoch": 0.5400411312643998, + "grad_norm": 0.0978778824210167, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 139700 + }, + { + "epoch": 0.540079788467783, + "grad_norm": 0.12271102517843246, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 139710 + }, + { + "epoch": 0.5401184456711664, + "grad_norm": 0.10544592142105103, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 139720 + }, + { + "epoch": 0.5401571028745497, + "grad_norm": 0.10868457704782486, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 139730 + }, + { + "epoch": 0.5401957600779329, + "grad_norm": 0.13596363365650177, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 139740 + }, + { + "epoch": 0.5402344172813162, + "grad_norm": 0.10462050884962082, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 139750 + }, + { + "epoch": 0.5402730744846995, + "grad_norm": 0.12288188934326172, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 139760 + }, + { + "epoch": 0.5403117316880828, + "grad_norm": 0.11072691529989243, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 139770 + }, + { + "epoch": 0.540350388891466, + "grad_norm": 0.1335332691669464, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 139780 + }, + { + "epoch": 0.5403890460948493, + "grad_norm": 0.10119934380054474, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 139790 + }, + { + "epoch": 0.5404277032982326, + "grad_norm": 0.09935948997735977, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 139800 + }, + { + "epoch": 0.5404663605016159, + "grad_norm": 0.11677015572786331, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 139810 + }, + { + "epoch": 0.5405050177049991, + "grad_norm": 0.10641399770975113, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 139820 + }, + { + "epoch": 0.5405436749083824, + "grad_norm": 0.10183258354663849, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 139830 + }, + { + "epoch": 0.5405823321117657, + "grad_norm": 0.09802069514989853, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 139840 + }, + { + "epoch": 0.540620989315149, + "grad_norm": 0.11229890584945679, + "learning_rate": 0.002, + "loss": 2.356, + "step": 139850 + }, + { + "epoch": 0.5406596465185323, + "grad_norm": 0.10374360531568527, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 139860 + }, + { + "epoch": 0.5406983037219155, + "grad_norm": 0.10174267739057541, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 139870 + }, + { + "epoch": 0.5407369609252988, + "grad_norm": 0.11391658335924149, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 139880 + }, + { + "epoch": 0.5407756181286821, + "grad_norm": 0.11875861883163452, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 139890 + }, + { + "epoch": 0.5408142753320654, + "grad_norm": 0.10455701500177383, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 139900 + }, + { + "epoch": 0.5408529325354486, + "grad_norm": 0.11358092725276947, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 139910 + }, + { + "epoch": 0.5408915897388319, + "grad_norm": 0.11409742385149002, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 139920 + }, + { + "epoch": 0.5409302469422153, + "grad_norm": 0.14725357294082642, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 139930 + }, + { + "epoch": 0.5409689041455985, + "grad_norm": 0.10605788230895996, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 139940 + }, + { + "epoch": 0.5410075613489818, + "grad_norm": 0.107200987637043, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 139950 + }, + { + "epoch": 0.541046218552365, + "grad_norm": 0.09364209324121475, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 139960 + }, + { + "epoch": 0.5410848757557484, + "grad_norm": 0.1474541425704956, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 139970 + }, + { + "epoch": 0.5411235329591316, + "grad_norm": 0.10103508085012436, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 139980 + }, + { + "epoch": 0.5411621901625149, + "grad_norm": 0.10418781638145447, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 139990 + }, + { + "epoch": 0.5412008473658981, + "grad_norm": 0.1143510490655899, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 140000 + }, + { + "epoch": 0.5412395045692815, + "grad_norm": 0.10674919933080673, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 140010 + }, + { + "epoch": 0.5412781617726647, + "grad_norm": 0.09429977834224701, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 140020 + }, + { + "epoch": 0.541316818976048, + "grad_norm": 0.11200029402971268, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 140030 + }, + { + "epoch": 0.5413554761794312, + "grad_norm": 0.10863328725099564, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 140040 + }, + { + "epoch": 0.5413941333828145, + "grad_norm": 0.09245651960372925, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 140050 + }, + { + "epoch": 0.5414327905861979, + "grad_norm": 0.10689567774534225, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 140060 + }, + { + "epoch": 0.5414714477895811, + "grad_norm": 0.10819069296121597, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 140070 + }, + { + "epoch": 0.5415101049929644, + "grad_norm": 0.10671747475862503, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 140080 + }, + { + "epoch": 0.5415487621963476, + "grad_norm": 0.10832956433296204, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 140090 + }, + { + "epoch": 0.541587419399731, + "grad_norm": 0.11262453347444534, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 140100 + }, + { + "epoch": 0.5416260766031142, + "grad_norm": 0.10590975731611252, + "learning_rate": 0.002, + "loss": 2.342, + "step": 140110 + }, + { + "epoch": 0.5416647338064975, + "grad_norm": 0.11854864656925201, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 140120 + }, + { + "epoch": 0.5417033910098807, + "grad_norm": 0.10754364728927612, + "learning_rate": 0.002, + "loss": 2.34, + "step": 140130 + }, + { + "epoch": 0.5417420482132641, + "grad_norm": 0.10380040109157562, + "learning_rate": 0.002, + "loss": 2.36, + "step": 140140 + }, + { + "epoch": 0.5417807054166474, + "grad_norm": 0.11440842598676682, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 140150 + }, + { + "epoch": 0.5418193626200306, + "grad_norm": 0.11621855944395065, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 140160 + }, + { + "epoch": 0.5418580198234139, + "grad_norm": 0.10840783268213272, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 140170 + }, + { + "epoch": 0.5418966770267972, + "grad_norm": 0.11345444619655609, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 140180 + }, + { + "epoch": 0.5419353342301805, + "grad_norm": 0.10408294945955276, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 140190 + }, + { + "epoch": 0.5419739914335637, + "grad_norm": 0.10988804697990417, + "learning_rate": 0.002, + "loss": 2.347, + "step": 140200 + }, + { + "epoch": 0.542012648636947, + "grad_norm": 0.08823379874229431, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 140210 + }, + { + "epoch": 0.5420513058403302, + "grad_norm": 0.10055267810821533, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 140220 + }, + { + "epoch": 0.5420899630437136, + "grad_norm": 0.10901609808206558, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 140230 + }, + { + "epoch": 0.5421286202470968, + "grad_norm": 0.09420198202133179, + "learning_rate": 0.002, + "loss": 2.343, + "step": 140240 + }, + { + "epoch": 0.5421672774504801, + "grad_norm": 0.17304375767707825, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 140250 + }, + { + "epoch": 0.5422059346538634, + "grad_norm": 0.11952243000268936, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 140260 + }, + { + "epoch": 0.5422445918572467, + "grad_norm": 0.09226176142692566, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 140270 + }, + { + "epoch": 0.54228324906063, + "grad_norm": 0.10702744871377945, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 140280 + }, + { + "epoch": 0.5423219062640132, + "grad_norm": 0.10657176375389099, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 140290 + }, + { + "epoch": 0.5423605634673965, + "grad_norm": 0.10938893258571625, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 140300 + }, + { + "epoch": 0.5423992206707798, + "grad_norm": 0.10880811512470245, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 140310 + }, + { + "epoch": 0.5424378778741631, + "grad_norm": 0.11123374104499817, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 140320 + }, + { + "epoch": 0.5424765350775463, + "grad_norm": 0.09306691586971283, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 140330 + }, + { + "epoch": 0.5425151922809296, + "grad_norm": 0.11641582101583481, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 140340 + }, + { + "epoch": 0.542553849484313, + "grad_norm": 0.11220816522836685, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 140350 + }, + { + "epoch": 0.5425925066876962, + "grad_norm": 0.0909360870718956, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 140360 + }, + { + "epoch": 0.5426311638910795, + "grad_norm": 0.12731623649597168, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 140370 + }, + { + "epoch": 0.5426698210944627, + "grad_norm": 0.12812413275241852, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 140380 + }, + { + "epoch": 0.5427084782978461, + "grad_norm": 0.13419029116630554, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 140390 + }, + { + "epoch": 0.5427471355012293, + "grad_norm": 0.09206200391054153, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 140400 + }, + { + "epoch": 0.5427857927046126, + "grad_norm": 0.09752889722585678, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 140410 + }, + { + "epoch": 0.5428244499079958, + "grad_norm": 0.1198781207203865, + "learning_rate": 0.002, + "loss": 2.349, + "step": 140420 + }, + { + "epoch": 0.5428631071113791, + "grad_norm": 0.13163580000400543, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 140430 + }, + { + "epoch": 0.5429017643147624, + "grad_norm": 0.10607501119375229, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 140440 + }, + { + "epoch": 0.5429404215181457, + "grad_norm": 0.10543625801801682, + "learning_rate": 0.002, + "loss": 2.332, + "step": 140450 + }, + { + "epoch": 0.542979078721529, + "grad_norm": 0.11482521891593933, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 140460 + }, + { + "epoch": 0.5430177359249122, + "grad_norm": 0.09837967902421951, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 140470 + }, + { + "epoch": 0.5430563931282956, + "grad_norm": 0.13049250841140747, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 140480 + }, + { + "epoch": 0.5430950503316788, + "grad_norm": 0.12102434039115906, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 140490 + }, + { + "epoch": 0.5431337075350621, + "grad_norm": 0.10373283177614212, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 140500 + }, + { + "epoch": 0.5431723647384453, + "grad_norm": 0.1218980923295021, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 140510 + }, + { + "epoch": 0.5432110219418287, + "grad_norm": 0.11484548449516296, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 140520 + }, + { + "epoch": 0.5432496791452119, + "grad_norm": 0.09388495236635208, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 140530 + }, + { + "epoch": 0.5432883363485952, + "grad_norm": 0.10815958678722382, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 140540 + }, + { + "epoch": 0.5433269935519784, + "grad_norm": 0.10376013070344925, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 140550 + }, + { + "epoch": 0.5433656507553618, + "grad_norm": 0.1519942432641983, + "learning_rate": 0.002, + "loss": 2.353, + "step": 140560 + }, + { + "epoch": 0.5434043079587451, + "grad_norm": 0.09740184247493744, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 140570 + }, + { + "epoch": 0.5434429651621283, + "grad_norm": 0.11878722161054611, + "learning_rate": 0.002, + "loss": 2.337, + "step": 140580 + }, + { + "epoch": 0.5434816223655116, + "grad_norm": 0.11175355315208435, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 140590 + }, + { + "epoch": 0.5435202795688948, + "grad_norm": 0.11709204316139221, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 140600 + }, + { + "epoch": 0.5435589367722782, + "grad_norm": 0.10323283821344376, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 140610 + }, + { + "epoch": 0.5435975939756614, + "grad_norm": 0.1406097263097763, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 140620 + }, + { + "epoch": 0.5436362511790447, + "grad_norm": 0.10124669224023819, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 140630 + }, + { + "epoch": 0.5436749083824279, + "grad_norm": 0.10182762145996094, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 140640 + }, + { + "epoch": 0.5437135655858113, + "grad_norm": 0.11305060237646103, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 140650 + }, + { + "epoch": 0.5437522227891946, + "grad_norm": 0.10381458699703217, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 140660 + }, + { + "epoch": 0.5437908799925778, + "grad_norm": 0.10050679743289948, + "learning_rate": 0.002, + "loss": 2.341, + "step": 140670 + }, + { + "epoch": 0.543829537195961, + "grad_norm": 0.11455484479665756, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 140680 + }, + { + "epoch": 0.5438681943993444, + "grad_norm": 0.12211675196886063, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 140690 + }, + { + "epoch": 0.5439068516027277, + "grad_norm": 0.14766447246074677, + "learning_rate": 0.002, + "loss": 2.34, + "step": 140700 + }, + { + "epoch": 0.5439455088061109, + "grad_norm": 0.09843506664037704, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 140710 + }, + { + "epoch": 0.5439841660094942, + "grad_norm": 0.10439879447221756, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 140720 + }, + { + "epoch": 0.5440228232128775, + "grad_norm": 0.09761619567871094, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 140730 + }, + { + "epoch": 0.5440614804162608, + "grad_norm": 0.110262431204319, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 140740 + }, + { + "epoch": 0.544100137619644, + "grad_norm": 0.11131390184164047, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 140750 + }, + { + "epoch": 0.5441387948230273, + "grad_norm": 0.12816110253334045, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 140760 + }, + { + "epoch": 0.5441774520264105, + "grad_norm": 0.10264119505882263, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 140770 + }, + { + "epoch": 0.5442161092297939, + "grad_norm": 0.12289933860301971, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 140780 + }, + { + "epoch": 0.5442547664331772, + "grad_norm": 0.10959749668836594, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 140790 + }, + { + "epoch": 0.5442934236365604, + "grad_norm": 0.1002492755651474, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 140800 + }, + { + "epoch": 0.5443320808399437, + "grad_norm": 0.11740302294492722, + "learning_rate": 0.002, + "loss": 2.341, + "step": 140810 + }, + { + "epoch": 0.544370738043327, + "grad_norm": 0.09524788707494736, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 140820 + }, + { + "epoch": 0.5444093952467103, + "grad_norm": 0.10740210115909576, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 140830 + }, + { + "epoch": 0.5444480524500935, + "grad_norm": 0.10724620521068573, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 140840 + }, + { + "epoch": 0.5444867096534768, + "grad_norm": 0.09376657009124756, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 140850 + }, + { + "epoch": 0.5445253668568601, + "grad_norm": 0.12937888503074646, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 140860 + }, + { + "epoch": 0.5445640240602434, + "grad_norm": 0.11522895097732544, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 140870 + }, + { + "epoch": 0.5446026812636267, + "grad_norm": 0.10400960594415665, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 140880 + }, + { + "epoch": 0.5446413384670099, + "grad_norm": 0.126004159450531, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 140890 + }, + { + "epoch": 0.5446799956703933, + "grad_norm": 0.11208324134349823, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 140900 + }, + { + "epoch": 0.5447186528737765, + "grad_norm": 0.10716670751571655, + "learning_rate": 0.002, + "loss": 2.362, + "step": 140910 + }, + { + "epoch": 0.5447573100771598, + "grad_norm": 0.11034978926181793, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 140920 + }, + { + "epoch": 0.544795967280543, + "grad_norm": 0.11277556419372559, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 140930 + }, + { + "epoch": 0.5448346244839264, + "grad_norm": 0.1075998991727829, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 140940 + }, + { + "epoch": 0.5448732816873096, + "grad_norm": 0.12325393408536911, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 140950 + }, + { + "epoch": 0.5449119388906929, + "grad_norm": 0.11496102809906006, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 140960 + }, + { + "epoch": 0.5449505960940761, + "grad_norm": 0.09571215510368347, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 140970 + }, + { + "epoch": 0.5449892532974594, + "grad_norm": 0.10081490874290466, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 140980 + }, + { + "epoch": 0.5450279105008428, + "grad_norm": 0.11344999819993973, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 140990 + }, + { + "epoch": 0.545066567704226, + "grad_norm": 0.10012887418270111, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 141000 + }, + { + "epoch": 0.5451052249076093, + "grad_norm": 0.1071837916970253, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 141010 + }, + { + "epoch": 0.5451438821109925, + "grad_norm": 0.11736311763525009, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 141020 + }, + { + "epoch": 0.5451825393143759, + "grad_norm": 0.12453530728816986, + "learning_rate": 0.002, + "loss": 2.347, + "step": 141030 + }, + { + "epoch": 0.5452211965177591, + "grad_norm": 0.10917795449495316, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 141040 + }, + { + "epoch": 0.5452598537211424, + "grad_norm": 0.11465924978256226, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 141050 + }, + { + "epoch": 0.5452985109245256, + "grad_norm": 0.11755634099245071, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 141060 + }, + { + "epoch": 0.545337168127909, + "grad_norm": 0.13867446780204773, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 141070 + }, + { + "epoch": 0.5453758253312923, + "grad_norm": 0.10823734104633331, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 141080 + }, + { + "epoch": 0.5454144825346755, + "grad_norm": 0.1211395263671875, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 141090 + }, + { + "epoch": 0.5454531397380588, + "grad_norm": 0.10349445044994354, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 141100 + }, + { + "epoch": 0.5454917969414421, + "grad_norm": 0.11433000862598419, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 141110 + }, + { + "epoch": 0.5455304541448254, + "grad_norm": 0.12612050771713257, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 141120 + }, + { + "epoch": 0.5455691113482086, + "grad_norm": 0.09824709594249725, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 141130 + }, + { + "epoch": 0.5456077685515919, + "grad_norm": 0.1088489443063736, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 141140 + }, + { + "epoch": 0.5456464257549751, + "grad_norm": 0.1025751382112503, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 141150 + }, + { + "epoch": 0.5456850829583585, + "grad_norm": 0.10643947124481201, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 141160 + }, + { + "epoch": 0.5457237401617417, + "grad_norm": 0.13787996768951416, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 141170 + }, + { + "epoch": 0.545762397365125, + "grad_norm": 0.18211092054843903, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 141180 + }, + { + "epoch": 0.5458010545685082, + "grad_norm": 0.10266145318746567, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 141190 + }, + { + "epoch": 0.5458397117718916, + "grad_norm": 0.1027229055762291, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 141200 + }, + { + "epoch": 0.5458783689752749, + "grad_norm": 0.11025464534759521, + "learning_rate": 0.002, + "loss": 2.341, + "step": 141210 + }, + { + "epoch": 0.5459170261786581, + "grad_norm": 0.10315735638141632, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 141220 + }, + { + "epoch": 0.5459556833820414, + "grad_norm": 0.11743441969156265, + "learning_rate": 0.002, + "loss": 2.346, + "step": 141230 + }, + { + "epoch": 0.5459943405854247, + "grad_norm": 0.09454575181007385, + "learning_rate": 0.002, + "loss": 2.334, + "step": 141240 + }, + { + "epoch": 0.546032997788808, + "grad_norm": 0.10413283854722977, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 141250 + }, + { + "epoch": 0.5460716549921912, + "grad_norm": 0.10902142524719238, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 141260 + }, + { + "epoch": 0.5461103121955745, + "grad_norm": 0.10488925874233246, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 141270 + }, + { + "epoch": 0.5461489693989579, + "grad_norm": 0.12448426336050034, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 141280 + }, + { + "epoch": 0.5461876266023411, + "grad_norm": 0.14548496901988983, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 141290 + }, + { + "epoch": 0.5462262838057244, + "grad_norm": 0.09693209081888199, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 141300 + }, + { + "epoch": 0.5462649410091076, + "grad_norm": 0.09567166119813919, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 141310 + }, + { + "epoch": 0.546303598212491, + "grad_norm": 0.11992914974689484, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 141320 + }, + { + "epoch": 0.5463422554158742, + "grad_norm": 0.11967799067497253, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 141330 + }, + { + "epoch": 0.5463809126192575, + "grad_norm": 0.11840023845434189, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 141340 + }, + { + "epoch": 0.5464195698226407, + "grad_norm": 0.1122565045952797, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 141350 + }, + { + "epoch": 0.546458227026024, + "grad_norm": 0.10423989593982697, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 141360 + }, + { + "epoch": 0.5464968842294073, + "grad_norm": 0.11209941655397415, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 141370 + }, + { + "epoch": 0.5465355414327906, + "grad_norm": 0.1401730328798294, + "learning_rate": 0.002, + "loss": 2.35, + "step": 141380 + }, + { + "epoch": 0.5465741986361738, + "grad_norm": 0.10664816200733185, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 141390 + }, + { + "epoch": 0.5466128558395571, + "grad_norm": 0.09247737377882004, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 141400 + }, + { + "epoch": 0.5466515130429405, + "grad_norm": 0.10387951880693436, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 141410 + }, + { + "epoch": 0.5466901702463237, + "grad_norm": 0.10998312383890152, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 141420 + }, + { + "epoch": 0.546728827449707, + "grad_norm": 0.10908520966768265, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 141430 + }, + { + "epoch": 0.5467674846530902, + "grad_norm": 0.1527799367904663, + "learning_rate": 0.002, + "loss": 2.358, + "step": 141440 + }, + { + "epoch": 0.5468061418564736, + "grad_norm": 0.11492700129747391, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 141450 + }, + { + "epoch": 0.5468447990598568, + "grad_norm": 0.11756177246570587, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 141460 + }, + { + "epoch": 0.5468834562632401, + "grad_norm": 0.10394591093063354, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 141470 + }, + { + "epoch": 0.5469221134666233, + "grad_norm": 0.1501491218805313, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 141480 + }, + { + "epoch": 0.5469607706700067, + "grad_norm": 0.11308126896619797, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 141490 + }, + { + "epoch": 0.54699942787339, + "grad_norm": 0.10572928190231323, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 141500 + }, + { + "epoch": 0.5470380850767732, + "grad_norm": 0.11026640981435776, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 141510 + }, + { + "epoch": 0.5470767422801565, + "grad_norm": 0.0973174050450325, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 141520 + }, + { + "epoch": 0.5471153994835397, + "grad_norm": 0.11233708262443542, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 141530 + }, + { + "epoch": 0.5471540566869231, + "grad_norm": 0.10455524176359177, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 141540 + }, + { + "epoch": 0.5471927138903063, + "grad_norm": 0.11175559461116791, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 141550 + }, + { + "epoch": 0.5472313710936896, + "grad_norm": 0.10089890658855438, + "learning_rate": 0.002, + "loss": 2.367, + "step": 141560 + }, + { + "epoch": 0.5472700282970728, + "grad_norm": 0.09284067898988724, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 141570 + }, + { + "epoch": 0.5473086855004562, + "grad_norm": 0.11399402469396591, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 141580 + }, + { + "epoch": 0.5473473427038394, + "grad_norm": 0.10248827189207077, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 141590 + }, + { + "epoch": 0.5473859999072227, + "grad_norm": 0.1427232325077057, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 141600 + }, + { + "epoch": 0.547424657110606, + "grad_norm": 0.10681942850351334, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 141610 + }, + { + "epoch": 0.5474633143139893, + "grad_norm": 0.11645887047052383, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 141620 + }, + { + "epoch": 0.5475019715173726, + "grad_norm": 0.10732848942279816, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 141630 + }, + { + "epoch": 0.5475406287207558, + "grad_norm": 0.10308927297592163, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 141640 + }, + { + "epoch": 0.5475792859241391, + "grad_norm": 0.11306145042181015, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 141650 + }, + { + "epoch": 0.5476179431275224, + "grad_norm": 0.11283697187900543, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 141660 + }, + { + "epoch": 0.5476566003309057, + "grad_norm": 0.11596529185771942, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 141670 + }, + { + "epoch": 0.5476952575342889, + "grad_norm": 0.10113532841205597, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 141680 + }, + { + "epoch": 0.5477339147376722, + "grad_norm": 0.10591016709804535, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 141690 + }, + { + "epoch": 0.5477725719410554, + "grad_norm": 0.0996353030204773, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 141700 + }, + { + "epoch": 0.5478112291444388, + "grad_norm": 0.11873535811901093, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 141710 + }, + { + "epoch": 0.5478498863478221, + "grad_norm": 0.09684333205223083, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 141720 + }, + { + "epoch": 0.5478885435512053, + "grad_norm": 0.10586248338222504, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 141730 + }, + { + "epoch": 0.5479272007545886, + "grad_norm": 0.10639870911836624, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 141740 + }, + { + "epoch": 0.5479658579579719, + "grad_norm": 0.1137106642127037, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 141750 + }, + { + "epoch": 0.5480045151613552, + "grad_norm": 0.11402547359466553, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 141760 + }, + { + "epoch": 0.5480431723647384, + "grad_norm": 0.11360086500644684, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 141770 + }, + { + "epoch": 0.5480818295681217, + "grad_norm": 0.11750371754169464, + "learning_rate": 0.002, + "loss": 2.353, + "step": 141780 + }, + { + "epoch": 0.548120486771505, + "grad_norm": 0.1186307743191719, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 141790 + }, + { + "epoch": 0.5481591439748883, + "grad_norm": 0.11170139163732529, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 141800 + }, + { + "epoch": 0.5481978011782715, + "grad_norm": 0.11253993213176727, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 141810 + }, + { + "epoch": 0.5482364583816548, + "grad_norm": 0.129967600107193, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 141820 + }, + { + "epoch": 0.5482751155850382, + "grad_norm": 0.10900263488292694, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 141830 + }, + { + "epoch": 0.5483137727884214, + "grad_norm": 0.10477101802825928, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 141840 + }, + { + "epoch": 0.5483524299918047, + "grad_norm": 0.10287479311227798, + "learning_rate": 0.002, + "loss": 2.3637, + "step": 141850 + }, + { + "epoch": 0.5483910871951879, + "grad_norm": 0.10571011900901794, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 141860 + }, + { + "epoch": 0.5484297443985713, + "grad_norm": 0.10842036455869675, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 141870 + }, + { + "epoch": 0.5484684016019545, + "grad_norm": 0.09585347026586533, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 141880 + }, + { + "epoch": 0.5485070588053378, + "grad_norm": 0.10993171483278275, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 141890 + }, + { + "epoch": 0.548545716008721, + "grad_norm": 0.10170605778694153, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 141900 + }, + { + "epoch": 0.5485843732121043, + "grad_norm": 0.10986624658107758, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 141910 + }, + { + "epoch": 0.5486230304154877, + "grad_norm": 0.11059927195310593, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 141920 + }, + { + "epoch": 0.5486616876188709, + "grad_norm": 0.11290241032838821, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 141930 + }, + { + "epoch": 0.5487003448222542, + "grad_norm": 0.11301394551992416, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 141940 + }, + { + "epoch": 0.5487390020256374, + "grad_norm": 0.1104665994644165, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 141950 + }, + { + "epoch": 0.5487776592290208, + "grad_norm": 0.10794278979301453, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 141960 + }, + { + "epoch": 0.548816316432404, + "grad_norm": 0.10633352398872375, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 141970 + }, + { + "epoch": 0.5488549736357873, + "grad_norm": 0.1199800968170166, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 141980 + }, + { + "epoch": 0.5488936308391705, + "grad_norm": 0.11475013196468353, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 141990 + }, + { + "epoch": 0.5489322880425539, + "grad_norm": 0.10983271151781082, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 142000 + }, + { + "epoch": 0.5489709452459371, + "grad_norm": 0.12063111364841461, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 142010 + }, + { + "epoch": 0.5490096024493204, + "grad_norm": 0.10083327442407608, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 142020 + }, + { + "epoch": 0.5490482596527037, + "grad_norm": 0.12083666026592255, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 142030 + }, + { + "epoch": 0.549086916856087, + "grad_norm": 0.09449543058872223, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 142040 + }, + { + "epoch": 0.5491255740594703, + "grad_norm": 0.10262036323547363, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 142050 + }, + { + "epoch": 0.5491642312628535, + "grad_norm": 0.10588623583316803, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 142060 + }, + { + "epoch": 0.5492028884662368, + "grad_norm": 0.10415609180927277, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 142070 + }, + { + "epoch": 0.54924154566962, + "grad_norm": 0.12541480362415314, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 142080 + }, + { + "epoch": 0.5492802028730034, + "grad_norm": 0.11196579784154892, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 142090 + }, + { + "epoch": 0.5493188600763866, + "grad_norm": 0.11203989386558533, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 142100 + }, + { + "epoch": 0.5493575172797699, + "grad_norm": 0.09834367036819458, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 142110 + }, + { + "epoch": 0.5493961744831531, + "grad_norm": 0.12963052093982697, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 142120 + }, + { + "epoch": 0.5494348316865365, + "grad_norm": 0.10904177278280258, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 142130 + }, + { + "epoch": 0.5494734888899198, + "grad_norm": 0.13051638007164001, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 142140 + }, + { + "epoch": 0.549512146093303, + "grad_norm": 0.11035215109586716, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 142150 + }, + { + "epoch": 0.5495508032966863, + "grad_norm": 0.10016104578971863, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 142160 + }, + { + "epoch": 0.5495894605000696, + "grad_norm": 0.10304881632328033, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 142170 + }, + { + "epoch": 0.5496281177034529, + "grad_norm": 0.09799494594335556, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 142180 + }, + { + "epoch": 0.5496667749068361, + "grad_norm": 0.10154926776885986, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 142190 + }, + { + "epoch": 0.5497054321102194, + "grad_norm": 0.1152535229921341, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 142200 + }, + { + "epoch": 0.5497440893136027, + "grad_norm": 0.100237637758255, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 142210 + }, + { + "epoch": 0.549782746516986, + "grad_norm": 0.11598395556211472, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 142220 + }, + { + "epoch": 0.5498214037203693, + "grad_norm": 0.10567312687635422, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 142230 + }, + { + "epoch": 0.5498600609237525, + "grad_norm": 0.08874915540218353, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 142240 + }, + { + "epoch": 0.5498987181271359, + "grad_norm": 0.13708887994289398, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 142250 + }, + { + "epoch": 0.5499373753305191, + "grad_norm": 0.12051773816347122, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 142260 + }, + { + "epoch": 0.5499760325339024, + "grad_norm": 0.10234910249710083, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 142270 + }, + { + "epoch": 0.5500146897372856, + "grad_norm": 0.1016688346862793, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 142280 + }, + { + "epoch": 0.5500533469406689, + "grad_norm": 0.11642882972955704, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 142290 + }, + { + "epoch": 0.5500920041440522, + "grad_norm": 0.10169527679681778, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 142300 + }, + { + "epoch": 0.5501306613474355, + "grad_norm": 0.102230966091156, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 142310 + }, + { + "epoch": 0.5501693185508187, + "grad_norm": 0.12380985170602798, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 142320 + }, + { + "epoch": 0.550207975754202, + "grad_norm": 0.11084472388029099, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 142330 + }, + { + "epoch": 0.5502466329575854, + "grad_norm": 0.14835940301418304, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 142340 + }, + { + "epoch": 0.5502852901609686, + "grad_norm": 0.11222419887781143, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 142350 + }, + { + "epoch": 0.5503239473643519, + "grad_norm": 0.09371999651193619, + "learning_rate": 0.002, + "loss": 2.367, + "step": 142360 + }, + { + "epoch": 0.5503626045677351, + "grad_norm": 0.10161064565181732, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 142370 + }, + { + "epoch": 0.5504012617711185, + "grad_norm": 0.10748559236526489, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 142380 + }, + { + "epoch": 0.5504399189745017, + "grad_norm": 0.10964936017990112, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 142390 + }, + { + "epoch": 0.550478576177885, + "grad_norm": 0.10478832572698593, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 142400 + }, + { + "epoch": 0.5505172333812682, + "grad_norm": 0.11970923840999603, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 142410 + }, + { + "epoch": 0.5505558905846516, + "grad_norm": 0.1185605600476265, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 142420 + }, + { + "epoch": 0.5505945477880348, + "grad_norm": 0.11552104353904724, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 142430 + }, + { + "epoch": 0.5506332049914181, + "grad_norm": 0.11067025363445282, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 142440 + }, + { + "epoch": 0.5506718621948014, + "grad_norm": 0.11349763721227646, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 142450 + }, + { + "epoch": 0.5507105193981846, + "grad_norm": 0.12076409161090851, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 142460 + }, + { + "epoch": 0.550749176601568, + "grad_norm": 0.10295160114765167, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 142470 + }, + { + "epoch": 0.5507878338049512, + "grad_norm": 0.09861474484205246, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 142480 + }, + { + "epoch": 0.5508264910083345, + "grad_norm": 0.09792405366897583, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 142490 + }, + { + "epoch": 0.5508651482117177, + "grad_norm": 0.09810902178287506, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 142500 + }, + { + "epoch": 0.5509038054151011, + "grad_norm": 0.10388771444559097, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 142510 + }, + { + "epoch": 0.5509424626184843, + "grad_norm": 0.11705761402845383, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 142520 + }, + { + "epoch": 0.5509811198218676, + "grad_norm": 0.1030333936214447, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 142530 + }, + { + "epoch": 0.5510197770252508, + "grad_norm": 0.1025761142373085, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 142540 + }, + { + "epoch": 0.5510584342286342, + "grad_norm": 0.09871582686901093, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 142550 + }, + { + "epoch": 0.5510970914320175, + "grad_norm": 0.11431854218244553, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 142560 + }, + { + "epoch": 0.5511357486354007, + "grad_norm": 0.11743414402008057, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 142570 + }, + { + "epoch": 0.551174405838784, + "grad_norm": 0.11136042326688766, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 142580 + }, + { + "epoch": 0.5512130630421673, + "grad_norm": 0.11726684123277664, + "learning_rate": 0.002, + "loss": 2.35, + "step": 142590 + }, + { + "epoch": 0.5512517202455506, + "grad_norm": 0.11254546046257019, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 142600 + }, + { + "epoch": 0.5512903774489338, + "grad_norm": 0.11354956775903702, + "learning_rate": 0.002, + "loss": 2.358, + "step": 142610 + }, + { + "epoch": 0.5513290346523171, + "grad_norm": 0.1168670505285263, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 142620 + }, + { + "epoch": 0.5513676918557003, + "grad_norm": 0.12499973922967911, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 142630 + }, + { + "epoch": 0.5514063490590837, + "grad_norm": 0.0976676344871521, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 142640 + }, + { + "epoch": 0.551445006262467, + "grad_norm": 0.11980077624320984, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 142650 + }, + { + "epoch": 0.5514836634658502, + "grad_norm": 0.1028711274266243, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 142660 + }, + { + "epoch": 0.5515223206692335, + "grad_norm": 0.1479312628507614, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 142670 + }, + { + "epoch": 0.5515609778726168, + "grad_norm": 0.11372735351324081, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 142680 + }, + { + "epoch": 0.5515996350760001, + "grad_norm": 0.11581075191497803, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 142690 + }, + { + "epoch": 0.5516382922793833, + "grad_norm": 0.09935362637042999, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 142700 + }, + { + "epoch": 0.5516769494827666, + "grad_norm": 0.10516053438186646, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 142710 + }, + { + "epoch": 0.5517156066861499, + "grad_norm": 0.10696021467447281, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 142720 + }, + { + "epoch": 0.5517542638895332, + "grad_norm": 0.09497939795255661, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 142730 + }, + { + "epoch": 0.5517929210929164, + "grad_norm": 0.10460008680820465, + "learning_rate": 0.002, + "loss": 2.348, + "step": 142740 + }, + { + "epoch": 0.5518315782962997, + "grad_norm": 0.134171724319458, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 142750 + }, + { + "epoch": 0.5518702354996831, + "grad_norm": 0.10251723229885101, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 142760 + }, + { + "epoch": 0.5519088927030663, + "grad_norm": 0.10286859422922134, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 142770 + }, + { + "epoch": 0.5519475499064496, + "grad_norm": 0.10067896544933319, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 142780 + }, + { + "epoch": 0.5519862071098328, + "grad_norm": 0.11117151379585266, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 142790 + }, + { + "epoch": 0.5520248643132162, + "grad_norm": 0.11155954003334045, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 142800 + }, + { + "epoch": 0.5520635215165994, + "grad_norm": 0.09828852862119675, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 142810 + }, + { + "epoch": 0.5521021787199827, + "grad_norm": 0.09801884740591049, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 142820 + }, + { + "epoch": 0.5521408359233659, + "grad_norm": 0.12369256466627121, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 142830 + }, + { + "epoch": 0.5521794931267492, + "grad_norm": 0.09979232400655746, + "learning_rate": 0.002, + "loss": 2.343, + "step": 142840 + }, + { + "epoch": 0.5522181503301326, + "grad_norm": 0.10395888984203339, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 142850 + }, + { + "epoch": 0.5522568075335158, + "grad_norm": 0.09872164577245712, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 142860 + }, + { + "epoch": 0.552295464736899, + "grad_norm": 0.10278721898794174, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 142870 + }, + { + "epoch": 0.5523341219402823, + "grad_norm": 0.09452217072248459, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 142880 + }, + { + "epoch": 0.5523727791436657, + "grad_norm": 0.1081966757774353, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 142890 + }, + { + "epoch": 0.5524114363470489, + "grad_norm": 0.10426264256238937, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 142900 + }, + { + "epoch": 0.5524500935504322, + "grad_norm": 0.09925641119480133, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 142910 + }, + { + "epoch": 0.5524887507538154, + "grad_norm": 0.10067332535982132, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 142920 + }, + { + "epoch": 0.5525274079571988, + "grad_norm": 0.10544595122337341, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 142930 + }, + { + "epoch": 0.552566065160582, + "grad_norm": 0.12352099269628525, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 142940 + }, + { + "epoch": 0.5526047223639653, + "grad_norm": 0.25755220651626587, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 142950 + }, + { + "epoch": 0.5526433795673485, + "grad_norm": 0.1176406517624855, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 142960 + }, + { + "epoch": 0.5526820367707319, + "grad_norm": 0.09558644145727158, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 142970 + }, + { + "epoch": 0.5527206939741152, + "grad_norm": 0.09294940531253815, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 142980 + }, + { + "epoch": 0.5527593511774984, + "grad_norm": 0.12759459018707275, + "learning_rate": 0.002, + "loss": 2.345, + "step": 142990 + }, + { + "epoch": 0.5527980083808817, + "grad_norm": 0.1065143421292305, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 143000 + }, + { + "epoch": 0.5528366655842649, + "grad_norm": 0.10980822890996933, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 143010 + }, + { + "epoch": 0.5528753227876483, + "grad_norm": 0.10744861513376236, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 143020 + }, + { + "epoch": 0.5529139799910315, + "grad_norm": 0.10682953894138336, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 143030 + }, + { + "epoch": 0.5529526371944148, + "grad_norm": 0.1215931624174118, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 143040 + }, + { + "epoch": 0.552991294397798, + "grad_norm": 0.10288725048303604, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 143050 + }, + { + "epoch": 0.5530299516011814, + "grad_norm": 0.10882063210010529, + "learning_rate": 0.002, + "loss": 2.35, + "step": 143060 + }, + { + "epoch": 0.5530686088045647, + "grad_norm": 0.12566843628883362, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 143070 + }, + { + "epoch": 0.5531072660079479, + "grad_norm": 0.11138502508401871, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 143080 + }, + { + "epoch": 0.5531459232113312, + "grad_norm": 0.12486086040735245, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 143090 + }, + { + "epoch": 0.5531845804147145, + "grad_norm": 0.11329344660043716, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 143100 + }, + { + "epoch": 0.5532232376180978, + "grad_norm": 0.09787940979003906, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 143110 + }, + { + "epoch": 0.553261894821481, + "grad_norm": 0.1143239215016365, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 143120 + }, + { + "epoch": 0.5533005520248643, + "grad_norm": 0.10016550123691559, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 143130 + }, + { + "epoch": 0.5533392092282476, + "grad_norm": 0.09186196327209473, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 143140 + }, + { + "epoch": 0.5533778664316309, + "grad_norm": 0.09927267581224442, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 143150 + }, + { + "epoch": 0.5534165236350141, + "grad_norm": 0.10326193273067474, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 143160 + }, + { + "epoch": 0.5534551808383974, + "grad_norm": 0.1423591524362564, + "learning_rate": 0.002, + "loss": 2.338, + "step": 143170 + }, + { + "epoch": 0.5534938380417807, + "grad_norm": 0.11829890310764313, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 143180 + }, + { + "epoch": 0.553532495245164, + "grad_norm": 0.10465212166309357, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 143190 + }, + { + "epoch": 0.5535711524485473, + "grad_norm": 0.1198871061205864, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 143200 + }, + { + "epoch": 0.5536098096519305, + "grad_norm": 0.11005301028490067, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 143210 + }, + { + "epoch": 0.5536484668553138, + "grad_norm": 0.13708800077438354, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 143220 + }, + { + "epoch": 0.5536871240586971, + "grad_norm": 0.12035497277975082, + "learning_rate": 0.002, + "loss": 2.341, + "step": 143230 + }, + { + "epoch": 0.5537257812620804, + "grad_norm": 0.11575876921415329, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 143240 + }, + { + "epoch": 0.5537644384654636, + "grad_norm": 0.11982641369104385, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 143250 + }, + { + "epoch": 0.5538030956688469, + "grad_norm": 0.10384172946214676, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 143260 + }, + { + "epoch": 0.5538417528722303, + "grad_norm": 0.104493148624897, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 143270 + }, + { + "epoch": 0.5538804100756135, + "grad_norm": 0.12926560640335083, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 143280 + }, + { + "epoch": 0.5539190672789968, + "grad_norm": 0.09122234582901001, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 143290 + }, + { + "epoch": 0.55395772448238, + "grad_norm": 0.10327209532260895, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 143300 + }, + { + "epoch": 0.5539963816857634, + "grad_norm": 0.10882581770420074, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 143310 + }, + { + "epoch": 0.5540350388891466, + "grad_norm": 0.11400618404150009, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 143320 + }, + { + "epoch": 0.5540736960925299, + "grad_norm": 0.1005762591958046, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 143330 + }, + { + "epoch": 0.5541123532959131, + "grad_norm": 0.11607342213392258, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 143340 + }, + { + "epoch": 0.5541510104992965, + "grad_norm": 0.0930909588932991, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 143350 + }, + { + "epoch": 0.5541896677026797, + "grad_norm": 0.10060762614011765, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 143360 + }, + { + "epoch": 0.554228324906063, + "grad_norm": 0.11107786744832993, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 143370 + }, + { + "epoch": 0.5542669821094462, + "grad_norm": 0.11191900074481964, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 143380 + }, + { + "epoch": 0.5543056393128295, + "grad_norm": 0.12472184002399445, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 143390 + }, + { + "epoch": 0.5543442965162129, + "grad_norm": 0.11487885564565659, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 143400 + }, + { + "epoch": 0.5543829537195961, + "grad_norm": 0.10302220284938812, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 143410 + }, + { + "epoch": 0.5544216109229794, + "grad_norm": 0.10976248979568481, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 143420 + }, + { + "epoch": 0.5544602681263626, + "grad_norm": 0.11117681115865707, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 143430 + }, + { + "epoch": 0.554498925329746, + "grad_norm": 0.11047784239053726, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 143440 + }, + { + "epoch": 0.5545375825331292, + "grad_norm": 0.09889400750398636, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 143450 + }, + { + "epoch": 0.5545762397365125, + "grad_norm": 0.1134338527917862, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 143460 + }, + { + "epoch": 0.5546148969398957, + "grad_norm": 0.10511985421180725, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 143470 + }, + { + "epoch": 0.5546535541432791, + "grad_norm": 0.11839264631271362, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 143480 + }, + { + "epoch": 0.5546922113466624, + "grad_norm": 0.11493546515703201, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 143490 + }, + { + "epoch": 0.5547308685500456, + "grad_norm": 0.09989528357982635, + "learning_rate": 0.002, + "loss": 2.342, + "step": 143500 + }, + { + "epoch": 0.5547695257534289, + "grad_norm": 0.10545303672552109, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 143510 + }, + { + "epoch": 0.5548081829568122, + "grad_norm": 0.12071103602647781, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 143520 + }, + { + "epoch": 0.5548468401601955, + "grad_norm": 0.11775526404380798, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 143530 + }, + { + "epoch": 0.5548854973635787, + "grad_norm": 0.12348510324954987, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 143540 + }, + { + "epoch": 0.554924154566962, + "grad_norm": 0.11271246522665024, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 143550 + }, + { + "epoch": 0.5549628117703452, + "grad_norm": 0.1072176918387413, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 143560 + }, + { + "epoch": 0.5550014689737286, + "grad_norm": 0.10169960558414459, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 143570 + }, + { + "epoch": 0.5550401261771118, + "grad_norm": 0.10885798186063766, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 143580 + }, + { + "epoch": 0.5550787833804951, + "grad_norm": 0.12151875346899033, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 143590 + }, + { + "epoch": 0.5551174405838784, + "grad_norm": 0.1123412549495697, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 143600 + }, + { + "epoch": 0.5551560977872617, + "grad_norm": 0.09589749574661255, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 143610 + }, + { + "epoch": 0.555194754990645, + "grad_norm": 0.11981192231178284, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 143620 + }, + { + "epoch": 0.5552334121940282, + "grad_norm": 0.10519280284643173, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 143630 + }, + { + "epoch": 0.5552720693974115, + "grad_norm": 0.10929519683122635, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 143640 + }, + { + "epoch": 0.5553107266007948, + "grad_norm": 0.10268570482730865, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 143650 + }, + { + "epoch": 0.5553493838041781, + "grad_norm": 0.10233426094055176, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 143660 + }, + { + "epoch": 0.5553880410075613, + "grad_norm": 0.12025441974401474, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 143670 + }, + { + "epoch": 0.5554266982109446, + "grad_norm": 0.11578936129808426, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 143680 + }, + { + "epoch": 0.555465355414328, + "grad_norm": 0.10593968629837036, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 143690 + }, + { + "epoch": 0.5555040126177112, + "grad_norm": 0.10012836009263992, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 143700 + }, + { + "epoch": 0.5555426698210945, + "grad_norm": 0.10861680656671524, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 143710 + }, + { + "epoch": 0.5555813270244777, + "grad_norm": 0.10936512798070908, + "learning_rate": 0.002, + "loss": 2.344, + "step": 143720 + }, + { + "epoch": 0.5556199842278611, + "grad_norm": 0.11655790358781815, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 143730 + }, + { + "epoch": 0.5556586414312443, + "grad_norm": 0.09844601899385452, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 143740 + }, + { + "epoch": 0.5556972986346276, + "grad_norm": 0.11050175130367279, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 143750 + }, + { + "epoch": 0.5557359558380108, + "grad_norm": 0.10614526271820068, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 143760 + }, + { + "epoch": 0.5557746130413941, + "grad_norm": 0.1081186830997467, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 143770 + }, + { + "epoch": 0.5558132702447774, + "grad_norm": 0.10302522778511047, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 143780 + }, + { + "epoch": 0.5558519274481607, + "grad_norm": 0.10777824372053146, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 143790 + }, + { + "epoch": 0.555890584651544, + "grad_norm": 0.09283053129911423, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 143800 + }, + { + "epoch": 0.5559292418549272, + "grad_norm": 0.11310338228940964, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 143810 + }, + { + "epoch": 0.5559678990583106, + "grad_norm": 0.10526971518993378, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 143820 + }, + { + "epoch": 0.5560065562616938, + "grad_norm": 0.09571385383605957, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 143830 + }, + { + "epoch": 0.5560452134650771, + "grad_norm": 0.10621003806591034, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 143840 + }, + { + "epoch": 0.5560838706684603, + "grad_norm": 0.12751825153827667, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 143850 + }, + { + "epoch": 0.5561225278718437, + "grad_norm": 0.09856680035591125, + "learning_rate": 0.002, + "loss": 2.355, + "step": 143860 + }, + { + "epoch": 0.5561611850752269, + "grad_norm": 0.120022252202034, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 143870 + }, + { + "epoch": 0.5561998422786102, + "grad_norm": 0.10100723803043365, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 143880 + }, + { + "epoch": 0.5562384994819934, + "grad_norm": 0.11421933770179749, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 143890 + }, + { + "epoch": 0.5562771566853768, + "grad_norm": 0.11160522699356079, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 143900 + }, + { + "epoch": 0.5563158138887601, + "grad_norm": 0.10204055905342102, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 143910 + }, + { + "epoch": 0.5563544710921433, + "grad_norm": 0.10301145166158676, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 143920 + }, + { + "epoch": 0.5563931282955266, + "grad_norm": 0.12065683305263519, + "learning_rate": 0.002, + "loss": 2.347, + "step": 143930 + }, + { + "epoch": 0.5564317854989098, + "grad_norm": 0.10993140190839767, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 143940 + }, + { + "epoch": 0.5564704427022932, + "grad_norm": 0.12739704549312592, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 143950 + }, + { + "epoch": 0.5565090999056764, + "grad_norm": 0.09803897887468338, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 143960 + }, + { + "epoch": 0.5565477571090597, + "grad_norm": 0.10325773805379868, + "learning_rate": 0.002, + "loss": 2.34, + "step": 143970 + }, + { + "epoch": 0.5565864143124429, + "grad_norm": 0.10811714828014374, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 143980 + }, + { + "epoch": 0.5566250715158263, + "grad_norm": 0.13029812276363373, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 143990 + }, + { + "epoch": 0.5566637287192095, + "grad_norm": 0.09954417496919632, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 144000 + }, + { + "epoch": 0.5567023859225928, + "grad_norm": 0.10553660988807678, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 144010 + }, + { + "epoch": 0.556741043125976, + "grad_norm": 0.11054238677024841, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 144020 + }, + { + "epoch": 0.5567797003293594, + "grad_norm": 0.10247528553009033, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 144030 + }, + { + "epoch": 0.5568183575327427, + "grad_norm": 0.11531244963407516, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 144040 + }, + { + "epoch": 0.5568570147361259, + "grad_norm": 0.12554004788398743, + "learning_rate": 0.002, + "loss": 2.36, + "step": 144050 + }, + { + "epoch": 0.5568956719395092, + "grad_norm": 0.093537338078022, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 144060 + }, + { + "epoch": 0.5569343291428925, + "grad_norm": 0.10666855424642563, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 144070 + }, + { + "epoch": 0.5569729863462758, + "grad_norm": 0.10169276595115662, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 144080 + }, + { + "epoch": 0.557011643549659, + "grad_norm": 0.11072119325399399, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 144090 + }, + { + "epoch": 0.5570503007530423, + "grad_norm": 0.10343354940414429, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 144100 + }, + { + "epoch": 0.5570889579564255, + "grad_norm": 0.10450764000415802, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 144110 + }, + { + "epoch": 0.5571276151598089, + "grad_norm": 0.10256035625934601, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 144120 + }, + { + "epoch": 0.5571662723631922, + "grad_norm": 0.10732540488243103, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 144130 + }, + { + "epoch": 0.5572049295665754, + "grad_norm": 0.15823794901371002, + "learning_rate": 0.002, + "loss": 2.36, + "step": 144140 + }, + { + "epoch": 0.5572435867699587, + "grad_norm": 0.10332081466913223, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 144150 + }, + { + "epoch": 0.557282243973342, + "grad_norm": 0.1082451194524765, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 144160 + }, + { + "epoch": 0.5573209011767253, + "grad_norm": 0.12070360034704208, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 144170 + }, + { + "epoch": 0.5573595583801085, + "grad_norm": 0.11742536723613739, + "learning_rate": 0.002, + "loss": 2.354, + "step": 144180 + }, + { + "epoch": 0.5573982155834918, + "grad_norm": 0.10265970230102539, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 144190 + }, + { + "epoch": 0.5574368727868751, + "grad_norm": 0.12360739707946777, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 144200 + }, + { + "epoch": 0.5574755299902584, + "grad_norm": 0.11441248655319214, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 144210 + }, + { + "epoch": 0.5575141871936417, + "grad_norm": 0.13491380214691162, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 144220 + }, + { + "epoch": 0.5575528443970249, + "grad_norm": 0.10316541790962219, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 144230 + }, + { + "epoch": 0.5575915016004083, + "grad_norm": 0.11570821702480316, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 144240 + }, + { + "epoch": 0.5576301588037915, + "grad_norm": 0.1069808155298233, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 144250 + }, + { + "epoch": 0.5576688160071748, + "grad_norm": 0.10772182047367096, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 144260 + }, + { + "epoch": 0.557707473210558, + "grad_norm": 0.1067073792219162, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 144270 + }, + { + "epoch": 0.5577461304139414, + "grad_norm": 0.10397684574127197, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 144280 + }, + { + "epoch": 0.5577847876173246, + "grad_norm": 0.1108739972114563, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 144290 + }, + { + "epoch": 0.5578234448207079, + "grad_norm": 0.1144571602344513, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 144300 + }, + { + "epoch": 0.5578621020240911, + "grad_norm": 0.13079805672168732, + "learning_rate": 0.002, + "loss": 2.355, + "step": 144310 + }, + { + "epoch": 0.5579007592274744, + "grad_norm": 0.10899020731449127, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 144320 + }, + { + "epoch": 0.5579394164308578, + "grad_norm": 0.1061665266752243, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 144330 + }, + { + "epoch": 0.557978073634241, + "grad_norm": 0.10170763731002808, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 144340 + }, + { + "epoch": 0.5580167308376243, + "grad_norm": 0.10558304190635681, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 144350 + }, + { + "epoch": 0.5580553880410075, + "grad_norm": 0.13423477113246918, + "learning_rate": 0.002, + "loss": 2.347, + "step": 144360 + }, + { + "epoch": 0.5580940452443909, + "grad_norm": 0.11419844627380371, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 144370 + }, + { + "epoch": 0.5581327024477741, + "grad_norm": 0.09708640724420547, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 144380 + }, + { + "epoch": 0.5581713596511574, + "grad_norm": 0.10306001454591751, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 144390 + }, + { + "epoch": 0.5582100168545406, + "grad_norm": 0.10493401437997818, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 144400 + }, + { + "epoch": 0.558248674057924, + "grad_norm": 0.11392046511173248, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 144410 + }, + { + "epoch": 0.5582873312613073, + "grad_norm": 0.1155625432729721, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 144420 + }, + { + "epoch": 0.5583259884646905, + "grad_norm": 0.13787469267845154, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 144430 + }, + { + "epoch": 0.5583646456680738, + "grad_norm": 0.10516868531703949, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 144440 + }, + { + "epoch": 0.5584033028714571, + "grad_norm": 0.08933563530445099, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 144450 + }, + { + "epoch": 0.5584419600748404, + "grad_norm": 0.09746905416250229, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 144460 + }, + { + "epoch": 0.5584806172782236, + "grad_norm": 0.11433606594800949, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 144470 + }, + { + "epoch": 0.5585192744816069, + "grad_norm": 0.09467669576406479, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 144480 + }, + { + "epoch": 0.5585579316849901, + "grad_norm": 0.12223439663648605, + "learning_rate": 0.002, + "loss": 2.3648, + "step": 144490 + }, + { + "epoch": 0.5585965888883735, + "grad_norm": 0.18821455538272858, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 144500 + }, + { + "epoch": 0.5586352460917567, + "grad_norm": 0.14402391016483307, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 144510 + }, + { + "epoch": 0.55867390329514, + "grad_norm": 0.1065588966012001, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 144520 + }, + { + "epoch": 0.5587125604985232, + "grad_norm": 0.11623067408800125, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 144530 + }, + { + "epoch": 0.5587512177019066, + "grad_norm": 0.23236896097660065, + "learning_rate": 0.002, + "loss": 2.3721, + "step": 144540 + }, + { + "epoch": 0.5587898749052899, + "grad_norm": 0.10903756320476532, + "learning_rate": 0.002, + "loss": 2.3692, + "step": 144550 + }, + { + "epoch": 0.5588285321086731, + "grad_norm": 0.10570161044597626, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 144560 + }, + { + "epoch": 0.5588671893120564, + "grad_norm": 0.11697853356599808, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 144570 + }, + { + "epoch": 0.5589058465154397, + "grad_norm": 0.1024005338549614, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 144580 + }, + { + "epoch": 0.558944503718823, + "grad_norm": 0.10046295821666718, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 144590 + }, + { + "epoch": 0.5589831609222062, + "grad_norm": 0.10596208274364471, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 144600 + }, + { + "epoch": 0.5590218181255895, + "grad_norm": 0.10390448570251465, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 144610 + }, + { + "epoch": 0.5590604753289729, + "grad_norm": 0.09717035293579102, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 144620 + }, + { + "epoch": 0.5590991325323561, + "grad_norm": 0.12094360589981079, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 144630 + }, + { + "epoch": 0.5591377897357394, + "grad_norm": 0.09621819853782654, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 144640 + }, + { + "epoch": 0.5591764469391226, + "grad_norm": 0.12042798846960068, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 144650 + }, + { + "epoch": 0.559215104142506, + "grad_norm": 0.1034356951713562, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 144660 + }, + { + "epoch": 0.5592537613458892, + "grad_norm": 0.10839217156171799, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 144670 + }, + { + "epoch": 0.5592924185492725, + "grad_norm": 0.10241516679525375, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 144680 + }, + { + "epoch": 0.5593310757526557, + "grad_norm": 0.11839167773723602, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 144690 + }, + { + "epoch": 0.559369732956039, + "grad_norm": 0.1063704788684845, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 144700 + }, + { + "epoch": 0.5594083901594223, + "grad_norm": 0.12920372188091278, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 144710 + }, + { + "epoch": 0.5594470473628056, + "grad_norm": 0.1120523139834404, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 144720 + }, + { + "epoch": 0.5594857045661888, + "grad_norm": 0.10131628811359406, + "learning_rate": 0.002, + "loss": 2.324, + "step": 144730 + }, + { + "epoch": 0.5595243617695721, + "grad_norm": 0.09710616618394852, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 144740 + }, + { + "epoch": 0.5595630189729555, + "grad_norm": 0.11539177596569061, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 144750 + }, + { + "epoch": 0.5596016761763387, + "grad_norm": 0.11322654783725739, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 144760 + }, + { + "epoch": 0.559640333379722, + "grad_norm": 0.09051068872213364, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 144770 + }, + { + "epoch": 0.5596789905831052, + "grad_norm": 0.12363360822200775, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 144780 + }, + { + "epoch": 0.5597176477864886, + "grad_norm": 0.09123220294713974, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 144790 + }, + { + "epoch": 0.5597563049898718, + "grad_norm": 0.10503943264484406, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 144800 + }, + { + "epoch": 0.5597949621932551, + "grad_norm": 0.11177346110343933, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 144810 + }, + { + "epoch": 0.5598336193966383, + "grad_norm": 0.0991305410861969, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 144820 + }, + { + "epoch": 0.5598722766000217, + "grad_norm": 0.09888456016778946, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 144830 + }, + { + "epoch": 0.559910933803405, + "grad_norm": 0.10640181601047516, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 144840 + }, + { + "epoch": 0.5599495910067882, + "grad_norm": 0.14159761369228363, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 144850 + }, + { + "epoch": 0.5599882482101715, + "grad_norm": 0.10540436953306198, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 144860 + }, + { + "epoch": 0.5600269054135547, + "grad_norm": 0.10895095765590668, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 144870 + }, + { + "epoch": 0.5600655626169381, + "grad_norm": 0.10320727527141571, + "learning_rate": 0.002, + "loss": 2.355, + "step": 144880 + }, + { + "epoch": 0.5601042198203213, + "grad_norm": 0.1057295873761177, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 144890 + }, + { + "epoch": 0.5601428770237046, + "grad_norm": 0.11810392886400223, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 144900 + }, + { + "epoch": 0.5601815342270878, + "grad_norm": 0.09296156466007233, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 144910 + }, + { + "epoch": 0.5602201914304712, + "grad_norm": 0.09686148166656494, + "learning_rate": 0.002, + "loss": 2.34, + "step": 144920 + }, + { + "epoch": 0.5602588486338544, + "grad_norm": 0.09118534624576569, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 144930 + }, + { + "epoch": 0.5602975058372377, + "grad_norm": 0.1013110876083374, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 144940 + }, + { + "epoch": 0.560336163040621, + "grad_norm": 0.12402082979679108, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 144950 + }, + { + "epoch": 0.5603748202440043, + "grad_norm": 0.10473762452602386, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 144960 + }, + { + "epoch": 0.5604134774473876, + "grad_norm": 0.12455492466688156, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 144970 + }, + { + "epoch": 0.5604521346507708, + "grad_norm": 0.10267633199691772, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 144980 + }, + { + "epoch": 0.5604907918541541, + "grad_norm": 0.12730388343334198, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 144990 + }, + { + "epoch": 0.5605294490575374, + "grad_norm": 0.08786030858755112, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 145000 + }, + { + "epoch": 0.5605681062609207, + "grad_norm": 0.11927718669176102, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 145010 + }, + { + "epoch": 0.5606067634643039, + "grad_norm": 0.1271049529314041, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 145020 + }, + { + "epoch": 0.5606454206676872, + "grad_norm": 0.10028399527072906, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 145030 + }, + { + "epoch": 0.5606840778710704, + "grad_norm": 0.20037201046943665, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 145040 + }, + { + "epoch": 0.5607227350744538, + "grad_norm": 0.10821410268545151, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 145050 + }, + { + "epoch": 0.5607613922778371, + "grad_norm": 0.11034255474805832, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 145060 + }, + { + "epoch": 0.5608000494812203, + "grad_norm": 0.10494337975978851, + "learning_rate": 0.002, + "loss": 2.348, + "step": 145070 + }, + { + "epoch": 0.5608387066846036, + "grad_norm": 0.13450533151626587, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 145080 + }, + { + "epoch": 0.5608773638879869, + "grad_norm": 0.09030753374099731, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 145090 + }, + { + "epoch": 0.5609160210913702, + "grad_norm": 0.09489750862121582, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 145100 + }, + { + "epoch": 0.5609546782947534, + "grad_norm": 0.10595418512821198, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 145110 + }, + { + "epoch": 0.5609933354981367, + "grad_norm": 0.09282474219799042, + "learning_rate": 0.002, + "loss": 2.344, + "step": 145120 + }, + { + "epoch": 0.56103199270152, + "grad_norm": 0.0995539128780365, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 145130 + }, + { + "epoch": 0.5610706499049033, + "grad_norm": 0.1267012655735016, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 145140 + }, + { + "epoch": 0.5611093071082865, + "grad_norm": 0.11552777886390686, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 145150 + }, + { + "epoch": 0.5611479643116698, + "grad_norm": 0.08912569284439087, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 145160 + }, + { + "epoch": 0.5611866215150532, + "grad_norm": 0.19666577875614166, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 145170 + }, + { + "epoch": 0.5612252787184364, + "grad_norm": 0.0973074808716774, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 145180 + }, + { + "epoch": 0.5612639359218197, + "grad_norm": 0.09786782413721085, + "learning_rate": 0.002, + "loss": 2.337, + "step": 145190 + }, + { + "epoch": 0.5613025931252029, + "grad_norm": 0.10890211164951324, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 145200 + }, + { + "epoch": 0.5613412503285863, + "grad_norm": 0.10896290838718414, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 145210 + }, + { + "epoch": 0.5613799075319695, + "grad_norm": 0.10439836978912354, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 145220 + }, + { + "epoch": 0.5614185647353528, + "grad_norm": 0.10249201953411102, + "learning_rate": 0.002, + "loss": 2.361, + "step": 145230 + }, + { + "epoch": 0.561457221938736, + "grad_norm": 0.09261146932840347, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 145240 + }, + { + "epoch": 0.5614958791421193, + "grad_norm": 0.10369481146335602, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 145250 + }, + { + "epoch": 0.5615345363455027, + "grad_norm": 0.12321964651346207, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 145260 + }, + { + "epoch": 0.5615731935488859, + "grad_norm": 0.10361948609352112, + "learning_rate": 0.002, + "loss": 2.345, + "step": 145270 + }, + { + "epoch": 0.5616118507522692, + "grad_norm": 0.12577545642852783, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 145280 + }, + { + "epoch": 0.5616505079556524, + "grad_norm": 0.10888047516345978, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 145290 + }, + { + "epoch": 0.5616891651590358, + "grad_norm": 0.09915715456008911, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 145300 + }, + { + "epoch": 0.561727822362419, + "grad_norm": 0.11483432352542877, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 145310 + }, + { + "epoch": 0.5617664795658023, + "grad_norm": 0.10110090672969818, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 145320 + }, + { + "epoch": 0.5618051367691855, + "grad_norm": 0.11618124693632126, + "learning_rate": 0.002, + "loss": 2.343, + "step": 145330 + }, + { + "epoch": 0.5618437939725689, + "grad_norm": 0.10940609872341156, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 145340 + }, + { + "epoch": 0.5618824511759521, + "grad_norm": 0.10506876558065414, + "learning_rate": 0.002, + "loss": 2.358, + "step": 145350 + }, + { + "epoch": 0.5619211083793354, + "grad_norm": 0.1089290902018547, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 145360 + }, + { + "epoch": 0.5619597655827187, + "grad_norm": 0.1401052325963974, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 145370 + }, + { + "epoch": 0.561998422786102, + "grad_norm": 0.10837940871715546, + "learning_rate": 0.002, + "loss": 2.36, + "step": 145380 + }, + { + "epoch": 0.5620370799894853, + "grad_norm": 0.1082291305065155, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 145390 + }, + { + "epoch": 0.5620757371928685, + "grad_norm": 0.09650030732154846, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 145400 + }, + { + "epoch": 0.5621143943962518, + "grad_norm": 0.13818258047103882, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 145410 + }, + { + "epoch": 0.562153051599635, + "grad_norm": 0.11335279047489166, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 145420 + }, + { + "epoch": 0.5621917088030184, + "grad_norm": 0.11053285002708435, + "learning_rate": 0.002, + "loss": 2.353, + "step": 145430 + }, + { + "epoch": 0.5622303660064016, + "grad_norm": 0.10307853668928146, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 145440 + }, + { + "epoch": 0.5622690232097849, + "grad_norm": 0.11353618651628494, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 145450 + }, + { + "epoch": 0.5623076804131681, + "grad_norm": 0.11812679469585419, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 145460 + }, + { + "epoch": 0.5623463376165515, + "grad_norm": 0.09612657129764557, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 145470 + }, + { + "epoch": 0.5623849948199348, + "grad_norm": 0.10407382994890213, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 145480 + }, + { + "epoch": 0.562423652023318, + "grad_norm": 0.09928133338689804, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 145490 + }, + { + "epoch": 0.5624623092267013, + "grad_norm": 0.0923873707652092, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 145500 + }, + { + "epoch": 0.5625009664300846, + "grad_norm": 0.1048966720700264, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 145510 + }, + { + "epoch": 0.5625396236334679, + "grad_norm": 0.12021849304437637, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 145520 + }, + { + "epoch": 0.5625782808368511, + "grad_norm": 0.1024569571018219, + "learning_rate": 0.002, + "loss": 2.338, + "step": 145530 + }, + { + "epoch": 0.5626169380402344, + "grad_norm": 0.10092726349830627, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 145540 + }, + { + "epoch": 0.5626555952436177, + "grad_norm": 0.10643120110034943, + "learning_rate": 0.002, + "loss": 2.331, + "step": 145550 + }, + { + "epoch": 0.562694252447001, + "grad_norm": 0.11607813090085983, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 145560 + }, + { + "epoch": 0.5627329096503842, + "grad_norm": 0.10902582854032516, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 145570 + }, + { + "epoch": 0.5627715668537675, + "grad_norm": 0.09896028786897659, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 145580 + }, + { + "epoch": 0.5628102240571508, + "grad_norm": 0.0961156114935875, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 145590 + }, + { + "epoch": 0.5628488812605341, + "grad_norm": 0.11849000304937363, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 145600 + }, + { + "epoch": 0.5628875384639174, + "grad_norm": 0.0874343141913414, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 145610 + }, + { + "epoch": 0.5629261956673006, + "grad_norm": 0.13053591549396515, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 145620 + }, + { + "epoch": 0.5629648528706839, + "grad_norm": 0.09980365633964539, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 145630 + }, + { + "epoch": 0.5630035100740672, + "grad_norm": 0.09870453178882599, + "learning_rate": 0.002, + "loss": 2.342, + "step": 145640 + }, + { + "epoch": 0.5630421672774505, + "grad_norm": 0.10686399042606354, + "learning_rate": 0.002, + "loss": 2.355, + "step": 145650 + }, + { + "epoch": 0.5630808244808337, + "grad_norm": 0.10777558386325836, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 145660 + }, + { + "epoch": 0.563119481684217, + "grad_norm": 0.1021328940987587, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 145670 + }, + { + "epoch": 0.5631581388876004, + "grad_norm": 0.10991507768630981, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 145680 + }, + { + "epoch": 0.5631967960909836, + "grad_norm": 0.09759745746850967, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 145690 + }, + { + "epoch": 0.5632354532943669, + "grad_norm": 0.09556913375854492, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 145700 + }, + { + "epoch": 0.5632741104977501, + "grad_norm": 0.11235988140106201, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 145710 + }, + { + "epoch": 0.5633127677011335, + "grad_norm": 0.09124507009983063, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 145720 + }, + { + "epoch": 0.5633514249045167, + "grad_norm": 0.10627923160791397, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 145730 + }, + { + "epoch": 0.5633900821079, + "grad_norm": 0.10545750707387924, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 145740 + }, + { + "epoch": 0.5634287393112832, + "grad_norm": 0.09357081353664398, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 145750 + }, + { + "epoch": 0.5634673965146666, + "grad_norm": 0.11138079315423965, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 145760 + }, + { + "epoch": 0.5635060537180498, + "grad_norm": 0.09826505184173584, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 145770 + }, + { + "epoch": 0.5635447109214331, + "grad_norm": 0.10549304634332657, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 145780 + }, + { + "epoch": 0.5635833681248164, + "grad_norm": 0.10531755536794662, + "learning_rate": 0.002, + "loss": 2.346, + "step": 145790 + }, + { + "epoch": 0.5636220253281996, + "grad_norm": 0.1031210795044899, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 145800 + }, + { + "epoch": 0.563660682531583, + "grad_norm": 0.11040626466274261, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 145810 + }, + { + "epoch": 0.5636993397349662, + "grad_norm": 0.1013822853565216, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 145820 + }, + { + "epoch": 0.5637379969383495, + "grad_norm": 0.11882764846086502, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 145830 + }, + { + "epoch": 0.5637766541417327, + "grad_norm": 0.09364783018827438, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 145840 + }, + { + "epoch": 0.5638153113451161, + "grad_norm": 0.10282673686742783, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 145850 + }, + { + "epoch": 0.5638539685484993, + "grad_norm": 0.11586014181375504, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 145860 + }, + { + "epoch": 0.5638926257518826, + "grad_norm": 0.10484866052865982, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 145870 + }, + { + "epoch": 0.5639312829552658, + "grad_norm": 0.1112280786037445, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 145880 + }, + { + "epoch": 0.5639699401586492, + "grad_norm": 0.0995175763964653, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 145890 + }, + { + "epoch": 0.5640085973620325, + "grad_norm": 0.09986388683319092, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 145900 + }, + { + "epoch": 0.5640472545654157, + "grad_norm": 0.10015115886926651, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 145910 + }, + { + "epoch": 0.564085911768799, + "grad_norm": 0.10942229628562927, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 145920 + }, + { + "epoch": 0.5641245689721823, + "grad_norm": 0.10118841379880905, + "learning_rate": 0.002, + "loss": 2.344, + "step": 145930 + }, + { + "epoch": 0.5641632261755656, + "grad_norm": 0.09918641299009323, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 145940 + }, + { + "epoch": 0.5642018833789488, + "grad_norm": 0.10887645184993744, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 145950 + }, + { + "epoch": 0.5642405405823321, + "grad_norm": 0.12971638143062592, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 145960 + }, + { + "epoch": 0.5642791977857153, + "grad_norm": 0.10035094618797302, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 145970 + }, + { + "epoch": 0.5643178549890987, + "grad_norm": 0.13477998971939087, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 145980 + }, + { + "epoch": 0.564356512192482, + "grad_norm": 0.10733260959386826, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 145990 + }, + { + "epoch": 0.5643951693958652, + "grad_norm": 0.11192618310451508, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 146000 + }, + { + "epoch": 0.5644338265992485, + "grad_norm": 0.12049674242734909, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 146010 + }, + { + "epoch": 0.5644724838026318, + "grad_norm": 0.10427054762840271, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 146020 + }, + { + "epoch": 0.5645111410060151, + "grad_norm": 0.11584754288196564, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 146030 + }, + { + "epoch": 0.5645497982093983, + "grad_norm": 0.1015305295586586, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 146040 + }, + { + "epoch": 0.5645884554127816, + "grad_norm": 0.11458880454301834, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 146050 + }, + { + "epoch": 0.5646271126161649, + "grad_norm": 0.11445124447345734, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 146060 + }, + { + "epoch": 0.5646657698195482, + "grad_norm": 0.10447277128696442, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 146070 + }, + { + "epoch": 0.5647044270229314, + "grad_norm": 0.09257570654153824, + "learning_rate": 0.002, + "loss": 2.347, + "step": 146080 + }, + { + "epoch": 0.5647430842263147, + "grad_norm": 0.1130877286195755, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 146090 + }, + { + "epoch": 0.5647817414296981, + "grad_norm": 0.11452198773622513, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 146100 + }, + { + "epoch": 0.5648203986330813, + "grad_norm": 0.09652146697044373, + "learning_rate": 0.002, + "loss": 2.343, + "step": 146110 + }, + { + "epoch": 0.5648590558364646, + "grad_norm": 0.1043148934841156, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 146120 + }, + { + "epoch": 0.5648977130398478, + "grad_norm": 0.133561372756958, + "learning_rate": 0.002, + "loss": 2.327, + "step": 146130 + }, + { + "epoch": 0.5649363702432312, + "grad_norm": 0.10625911504030228, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 146140 + }, + { + "epoch": 0.5649750274466144, + "grad_norm": 0.10919652879238129, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 146150 + }, + { + "epoch": 0.5650136846499977, + "grad_norm": 0.12570203840732574, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 146160 + }, + { + "epoch": 0.5650523418533809, + "grad_norm": 0.114006906747818, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 146170 + }, + { + "epoch": 0.5650909990567642, + "grad_norm": 0.10396946966648102, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 146180 + }, + { + "epoch": 0.5651296562601476, + "grad_norm": 0.11225318908691406, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 146190 + }, + { + "epoch": 0.5651683134635308, + "grad_norm": 0.0972057357430458, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 146200 + }, + { + "epoch": 0.565206970666914, + "grad_norm": 0.09723258018493652, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 146210 + }, + { + "epoch": 0.5652456278702973, + "grad_norm": 0.12347773462533951, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 146220 + }, + { + "epoch": 0.5652842850736807, + "grad_norm": 0.10941901057958603, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 146230 + }, + { + "epoch": 0.5653229422770639, + "grad_norm": 0.11385990679264069, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 146240 + }, + { + "epoch": 0.5653615994804472, + "grad_norm": 0.09415145963430405, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 146250 + }, + { + "epoch": 0.5654002566838304, + "grad_norm": 0.0994042232632637, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 146260 + }, + { + "epoch": 0.5654389138872138, + "grad_norm": 0.098891481757164, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 146270 + }, + { + "epoch": 0.565477571090597, + "grad_norm": 0.11153262108564377, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 146280 + }, + { + "epoch": 0.5655162282939803, + "grad_norm": 0.11282942444086075, + "learning_rate": 0.002, + "loss": 2.355, + "step": 146290 + }, + { + "epoch": 0.5655548854973635, + "grad_norm": 0.11050242185592651, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 146300 + }, + { + "epoch": 0.5655935427007469, + "grad_norm": 0.10159312933683395, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 146310 + }, + { + "epoch": 0.5656321999041302, + "grad_norm": 0.12493990361690521, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 146320 + }, + { + "epoch": 0.5656708571075134, + "grad_norm": 0.10932406783103943, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 146330 + }, + { + "epoch": 0.5657095143108967, + "grad_norm": 0.10044391453266144, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 146340 + }, + { + "epoch": 0.5657481715142799, + "grad_norm": 0.1306449919939041, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 146350 + }, + { + "epoch": 0.5657868287176633, + "grad_norm": 0.100794717669487, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 146360 + }, + { + "epoch": 0.5658254859210465, + "grad_norm": 0.11264822632074356, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 146370 + }, + { + "epoch": 0.5658641431244298, + "grad_norm": 0.100074902176857, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 146380 + }, + { + "epoch": 0.565902800327813, + "grad_norm": 0.10754478722810745, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 146390 + }, + { + "epoch": 0.5659414575311964, + "grad_norm": 0.14100757241249084, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 146400 + }, + { + "epoch": 0.5659801147345797, + "grad_norm": 0.1034492775797844, + "learning_rate": 0.002, + "loss": 2.363, + "step": 146410 + }, + { + "epoch": 0.5660187719379629, + "grad_norm": 0.09591642022132874, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 146420 + }, + { + "epoch": 0.5660574291413462, + "grad_norm": 0.09747200459241867, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 146430 + }, + { + "epoch": 0.5660960863447295, + "grad_norm": 0.11737639456987381, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 146440 + }, + { + "epoch": 0.5661347435481128, + "grad_norm": 0.10552874207496643, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 146450 + }, + { + "epoch": 0.566173400751496, + "grad_norm": 0.10250578820705414, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 146460 + }, + { + "epoch": 0.5662120579548793, + "grad_norm": 0.10936874151229858, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 146470 + }, + { + "epoch": 0.5662507151582626, + "grad_norm": 0.10178228467702866, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 146480 + }, + { + "epoch": 0.5662893723616459, + "grad_norm": 0.11337237805128098, + "learning_rate": 0.002, + "loss": 2.3776, + "step": 146490 + }, + { + "epoch": 0.5663280295650291, + "grad_norm": 0.13309693336486816, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 146500 + }, + { + "epoch": 0.5663666867684124, + "grad_norm": 0.10486093163490295, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 146510 + }, + { + "epoch": 0.5664053439717956, + "grad_norm": 0.09396681189537048, + "learning_rate": 0.002, + "loss": 2.341, + "step": 146520 + }, + { + "epoch": 0.566444001175179, + "grad_norm": 0.11196771264076233, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 146530 + }, + { + "epoch": 0.5664826583785623, + "grad_norm": 0.09456878155469894, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 146540 + }, + { + "epoch": 0.5665213155819455, + "grad_norm": 0.11470114439725876, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 146550 + }, + { + "epoch": 0.5665599727853288, + "grad_norm": 0.1259901076555252, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 146560 + }, + { + "epoch": 0.5665986299887121, + "grad_norm": 0.11010333895683289, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 146570 + }, + { + "epoch": 0.5666372871920954, + "grad_norm": 0.09986169636249542, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 146580 + }, + { + "epoch": 0.5666759443954786, + "grad_norm": 0.11977176368236542, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 146590 + }, + { + "epoch": 0.5667146015988619, + "grad_norm": 0.10560222715139389, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 146600 + }, + { + "epoch": 0.5667532588022453, + "grad_norm": 0.11105663329362869, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 146610 + }, + { + "epoch": 0.5667919160056285, + "grad_norm": 0.11333081126213074, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 146620 + }, + { + "epoch": 0.5668305732090118, + "grad_norm": 0.10974690318107605, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 146630 + }, + { + "epoch": 0.566869230412395, + "grad_norm": 0.10348519682884216, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 146640 + }, + { + "epoch": 0.5669078876157784, + "grad_norm": 0.09863714873790741, + "learning_rate": 0.002, + "loss": 2.344, + "step": 146650 + }, + { + "epoch": 0.5669465448191616, + "grad_norm": 0.12337260693311691, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 146660 + }, + { + "epoch": 0.5669852020225449, + "grad_norm": 0.11037690192461014, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 146670 + }, + { + "epoch": 0.5670238592259281, + "grad_norm": 0.11719117313623428, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 146680 + }, + { + "epoch": 0.5670625164293115, + "grad_norm": 0.10750693827867508, + "learning_rate": 0.002, + "loss": 2.339, + "step": 146690 + }, + { + "epoch": 0.5671011736326947, + "grad_norm": 0.10234736651182175, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 146700 + }, + { + "epoch": 0.567139830836078, + "grad_norm": 0.1213693618774414, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 146710 + }, + { + "epoch": 0.5671784880394612, + "grad_norm": 0.11592577397823334, + "learning_rate": 0.002, + "loss": 2.323, + "step": 146720 + }, + { + "epoch": 0.5672171452428445, + "grad_norm": 0.11350953578948975, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 146730 + }, + { + "epoch": 0.5672558024462279, + "grad_norm": 0.10382720828056335, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 146740 + }, + { + "epoch": 0.5672944596496111, + "grad_norm": 0.13582183420658112, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 146750 + }, + { + "epoch": 0.5673331168529944, + "grad_norm": 0.10660482197999954, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 146760 + }, + { + "epoch": 0.5673717740563776, + "grad_norm": 0.10936098545789719, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 146770 + }, + { + "epoch": 0.567410431259761, + "grad_norm": 0.15151578187942505, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 146780 + }, + { + "epoch": 0.5674490884631442, + "grad_norm": 0.11021112650632858, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 146790 + }, + { + "epoch": 0.5674877456665275, + "grad_norm": 0.10559472441673279, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 146800 + }, + { + "epoch": 0.5675264028699107, + "grad_norm": 0.11798600107431412, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 146810 + }, + { + "epoch": 0.5675650600732941, + "grad_norm": 0.10911992937326431, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 146820 + }, + { + "epoch": 0.5676037172766774, + "grad_norm": 0.1071673184633255, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 146830 + }, + { + "epoch": 0.5676423744800606, + "grad_norm": 0.11145402491092682, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 146840 + }, + { + "epoch": 0.5676810316834439, + "grad_norm": 0.10432492196559906, + "learning_rate": 0.002, + "loss": 2.34, + "step": 146850 + }, + { + "epoch": 0.5677196888868272, + "grad_norm": 0.13461947441101074, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 146860 + }, + { + "epoch": 0.5677583460902105, + "grad_norm": 0.10047348588705063, + "learning_rate": 0.002, + "loss": 2.343, + "step": 146870 + }, + { + "epoch": 0.5677970032935937, + "grad_norm": 0.11872837692499161, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 146880 + }, + { + "epoch": 0.567835660496977, + "grad_norm": 0.13711833953857422, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 146890 + }, + { + "epoch": 0.5678743177003602, + "grad_norm": 0.12299994379281998, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 146900 + }, + { + "epoch": 0.5679129749037436, + "grad_norm": 0.09880630671977997, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 146910 + }, + { + "epoch": 0.5679516321071268, + "grad_norm": 0.11891564726829529, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 146920 + }, + { + "epoch": 0.5679902893105101, + "grad_norm": 0.1166873499751091, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 146930 + }, + { + "epoch": 0.5680289465138934, + "grad_norm": 0.09265412390232086, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 146940 + }, + { + "epoch": 0.5680676037172767, + "grad_norm": 0.10010464489459991, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 146950 + }, + { + "epoch": 0.56810626092066, + "grad_norm": 0.10351064801216125, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 146960 + }, + { + "epoch": 0.5681449181240432, + "grad_norm": 0.1153855249285698, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 146970 + }, + { + "epoch": 0.5681835753274265, + "grad_norm": 0.14736634492874146, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 146980 + }, + { + "epoch": 0.5682222325308098, + "grad_norm": 0.1133892834186554, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 146990 + }, + { + "epoch": 0.5682608897341931, + "grad_norm": 0.1024647057056427, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 147000 + }, + { + "epoch": 0.5682995469375763, + "grad_norm": 0.10568360984325409, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 147010 + }, + { + "epoch": 0.5683382041409596, + "grad_norm": 0.10185234993696213, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 147020 + }, + { + "epoch": 0.568376861344343, + "grad_norm": 0.10574902594089508, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 147030 + }, + { + "epoch": 0.5684155185477262, + "grad_norm": 0.098385289311409, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 147040 + }, + { + "epoch": 0.5684541757511095, + "grad_norm": 0.10299337655305862, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 147050 + }, + { + "epoch": 0.5684928329544927, + "grad_norm": 0.10877186805009842, + "learning_rate": 0.002, + "loss": 2.345, + "step": 147060 + }, + { + "epoch": 0.5685314901578761, + "grad_norm": 0.13419096171855927, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 147070 + }, + { + "epoch": 0.5685701473612593, + "grad_norm": 0.10985522717237473, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 147080 + }, + { + "epoch": 0.5686088045646426, + "grad_norm": 0.12040858715772629, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 147090 + }, + { + "epoch": 0.5686474617680258, + "grad_norm": 0.12113548815250397, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 147100 + }, + { + "epoch": 0.5686861189714091, + "grad_norm": 0.09761956334114075, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 147110 + }, + { + "epoch": 0.5687247761747924, + "grad_norm": 0.09393589943647385, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 147120 + }, + { + "epoch": 0.5687634333781757, + "grad_norm": 0.10053534805774689, + "learning_rate": 0.002, + "loss": 2.3695, + "step": 147130 + }, + { + "epoch": 0.568802090581559, + "grad_norm": 0.10242559760808945, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 147140 + }, + { + "epoch": 0.5688407477849422, + "grad_norm": 0.2468280792236328, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 147150 + }, + { + "epoch": 0.5688794049883256, + "grad_norm": 0.09995192289352417, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 147160 + }, + { + "epoch": 0.5689180621917088, + "grad_norm": 0.11786067485809326, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 147170 + }, + { + "epoch": 0.5689567193950921, + "grad_norm": 0.10817831009626389, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 147180 + }, + { + "epoch": 0.5689953765984753, + "grad_norm": 0.10672974586486816, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 147190 + }, + { + "epoch": 0.5690340338018587, + "grad_norm": 0.1374005824327469, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 147200 + }, + { + "epoch": 0.5690726910052419, + "grad_norm": 0.0980212464928627, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 147210 + }, + { + "epoch": 0.5691113482086252, + "grad_norm": 0.10079023987054825, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 147220 + }, + { + "epoch": 0.5691500054120084, + "grad_norm": 0.1042499914765358, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 147230 + }, + { + "epoch": 0.5691886626153918, + "grad_norm": 0.11340388655662537, + "learning_rate": 0.002, + "loss": 2.362, + "step": 147240 + }, + { + "epoch": 0.5692273198187751, + "grad_norm": 0.09811675548553467, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 147250 + }, + { + "epoch": 0.5692659770221583, + "grad_norm": 0.10510072857141495, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 147260 + }, + { + "epoch": 0.5693046342255416, + "grad_norm": 0.11670279502868652, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 147270 + }, + { + "epoch": 0.5693432914289248, + "grad_norm": 0.1188029870390892, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 147280 + }, + { + "epoch": 0.5693819486323082, + "grad_norm": 0.10574861615896225, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 147290 + }, + { + "epoch": 0.5694206058356914, + "grad_norm": 0.11340762674808502, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 147300 + }, + { + "epoch": 0.5694592630390747, + "grad_norm": 0.09961926937103271, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 147310 + }, + { + "epoch": 0.5694979202424579, + "grad_norm": 0.12730400264263153, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 147320 + }, + { + "epoch": 0.5695365774458413, + "grad_norm": 0.1308182179927826, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 147330 + }, + { + "epoch": 0.5695752346492245, + "grad_norm": 0.10637819021940231, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 147340 + }, + { + "epoch": 0.5696138918526078, + "grad_norm": 0.1019187867641449, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 147350 + }, + { + "epoch": 0.569652549055991, + "grad_norm": 0.10890976339578629, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 147360 + }, + { + "epoch": 0.5696912062593744, + "grad_norm": 0.10864552855491638, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 147370 + }, + { + "epoch": 0.5697298634627577, + "grad_norm": 0.10701534152030945, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 147380 + }, + { + "epoch": 0.5697685206661409, + "grad_norm": 0.09928543865680695, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 147390 + }, + { + "epoch": 0.5698071778695242, + "grad_norm": 0.11758770048618317, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 147400 + }, + { + "epoch": 0.5698458350729075, + "grad_norm": 0.10023277997970581, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 147410 + }, + { + "epoch": 0.5698844922762908, + "grad_norm": 0.11018215864896774, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 147420 + }, + { + "epoch": 0.569923149479674, + "grad_norm": 0.10372017323970795, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 147430 + }, + { + "epoch": 0.5699618066830573, + "grad_norm": 0.0958276018500328, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 147440 + }, + { + "epoch": 0.5700004638864405, + "grad_norm": 0.11901126801967621, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 147450 + }, + { + "epoch": 0.5700391210898239, + "grad_norm": 0.11974016577005386, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 147460 + }, + { + "epoch": 0.5700777782932072, + "grad_norm": 0.11292359977960587, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 147470 + }, + { + "epoch": 0.5701164354965904, + "grad_norm": 0.10017193853855133, + "learning_rate": 0.002, + "loss": 2.346, + "step": 147480 + }, + { + "epoch": 0.5701550926999737, + "grad_norm": 0.10776569694280624, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 147490 + }, + { + "epoch": 0.570193749903357, + "grad_norm": 0.11120834201574326, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 147500 + }, + { + "epoch": 0.5702324071067403, + "grad_norm": 0.1190958097577095, + "learning_rate": 0.002, + "loss": 2.346, + "step": 147510 + }, + { + "epoch": 0.5702710643101235, + "grad_norm": 0.09756813198328018, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 147520 + }, + { + "epoch": 0.5703097215135068, + "grad_norm": 0.11825324594974518, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 147530 + }, + { + "epoch": 0.5703483787168901, + "grad_norm": 0.1010400578379631, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 147540 + }, + { + "epoch": 0.5703870359202734, + "grad_norm": 0.09621983021497726, + "learning_rate": 0.002, + "loss": 2.337, + "step": 147550 + }, + { + "epoch": 0.5704256931236567, + "grad_norm": 0.11062465608119965, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 147560 + }, + { + "epoch": 0.5704643503270399, + "grad_norm": 0.11987130343914032, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 147570 + }, + { + "epoch": 0.5705030075304233, + "grad_norm": 0.11454615741968155, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 147580 + }, + { + "epoch": 0.5705416647338065, + "grad_norm": 0.1135939434170723, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 147590 + }, + { + "epoch": 0.5705803219371898, + "grad_norm": 0.11390845477581024, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 147600 + }, + { + "epoch": 0.570618979140573, + "grad_norm": 0.12084183841943741, + "learning_rate": 0.002, + "loss": 2.34, + "step": 147610 + }, + { + "epoch": 0.5706576363439564, + "grad_norm": 0.09719133377075195, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 147620 + }, + { + "epoch": 0.5706962935473396, + "grad_norm": 0.10637233406305313, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 147630 + }, + { + "epoch": 0.5707349507507229, + "grad_norm": 0.10290329903364182, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 147640 + }, + { + "epoch": 0.5707736079541061, + "grad_norm": 0.1075134128332138, + "learning_rate": 0.002, + "loss": 2.368, + "step": 147650 + }, + { + "epoch": 0.5708122651574894, + "grad_norm": 0.10845005512237549, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 147660 + }, + { + "epoch": 0.5708509223608728, + "grad_norm": 0.14120826125144958, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 147670 + }, + { + "epoch": 0.570889579564256, + "grad_norm": 0.11323338747024536, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 147680 + }, + { + "epoch": 0.5709282367676393, + "grad_norm": 0.0938161090016365, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 147690 + }, + { + "epoch": 0.5709668939710225, + "grad_norm": 0.09869852662086487, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 147700 + }, + { + "epoch": 0.5710055511744059, + "grad_norm": 0.12468905746936798, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 147710 + }, + { + "epoch": 0.5710442083777891, + "grad_norm": 0.10178691893815994, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 147720 + }, + { + "epoch": 0.5710828655811724, + "grad_norm": 0.1340477019548416, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 147730 + }, + { + "epoch": 0.5711215227845556, + "grad_norm": 0.11438913643360138, + "learning_rate": 0.002, + "loss": 2.336, + "step": 147740 + }, + { + "epoch": 0.571160179987939, + "grad_norm": 0.107391357421875, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 147750 + }, + { + "epoch": 0.5711988371913223, + "grad_norm": 0.11345870047807693, + "learning_rate": 0.002, + "loss": 2.339, + "step": 147760 + }, + { + "epoch": 0.5712374943947055, + "grad_norm": 0.10152135789394379, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 147770 + }, + { + "epoch": 0.5712761515980888, + "grad_norm": 0.11594166606664658, + "learning_rate": 0.002, + "loss": 2.345, + "step": 147780 + }, + { + "epoch": 0.5713148088014721, + "grad_norm": 0.11304232478141785, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 147790 + }, + { + "epoch": 0.5713534660048554, + "grad_norm": 0.11002659797668457, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 147800 + }, + { + "epoch": 0.5713921232082386, + "grad_norm": 0.0949237123131752, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 147810 + }, + { + "epoch": 0.5714307804116219, + "grad_norm": 0.13494773209095, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 147820 + }, + { + "epoch": 0.5714694376150051, + "grad_norm": 0.10871291905641556, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 147830 + }, + { + "epoch": 0.5715080948183885, + "grad_norm": 0.11340344697237015, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 147840 + }, + { + "epoch": 0.5715467520217717, + "grad_norm": 0.09544314444065094, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 147850 + }, + { + "epoch": 0.571585409225155, + "grad_norm": 0.10063931345939636, + "learning_rate": 0.002, + "loss": 2.343, + "step": 147860 + }, + { + "epoch": 0.5716240664285382, + "grad_norm": 0.1033964529633522, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 147870 + }, + { + "epoch": 0.5716627236319216, + "grad_norm": 0.10400796681642532, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 147880 + }, + { + "epoch": 0.5717013808353049, + "grad_norm": 0.10402462631464005, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 147890 + }, + { + "epoch": 0.5717400380386881, + "grad_norm": 0.10717406123876572, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 147900 + }, + { + "epoch": 0.5717786952420714, + "grad_norm": 0.10979738086462021, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 147910 + }, + { + "epoch": 0.5718173524454547, + "grad_norm": 0.1012766882777214, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 147920 + }, + { + "epoch": 0.571856009648838, + "grad_norm": 0.14146389067173004, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 147930 + }, + { + "epoch": 0.5718946668522212, + "grad_norm": 0.1255309134721756, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 147940 + }, + { + "epoch": 0.5719333240556045, + "grad_norm": 0.09301314502954483, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 147950 + }, + { + "epoch": 0.5719719812589878, + "grad_norm": 0.09817476570606232, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 147960 + }, + { + "epoch": 0.5720106384623711, + "grad_norm": 0.123959019780159, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 147970 + }, + { + "epoch": 0.5720492956657544, + "grad_norm": 0.09742964059114456, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 147980 + }, + { + "epoch": 0.5720879528691376, + "grad_norm": 0.10809384286403656, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 147990 + }, + { + "epoch": 0.5721266100725209, + "grad_norm": 0.11448068916797638, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 148000 + }, + { + "epoch": 0.5721652672759042, + "grad_norm": 0.1087169274687767, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 148010 + }, + { + "epoch": 0.5722039244792875, + "grad_norm": 0.10386857390403748, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 148020 + }, + { + "epoch": 0.5722425816826707, + "grad_norm": 0.10609959810972214, + "learning_rate": 0.002, + "loss": 2.341, + "step": 148030 + }, + { + "epoch": 0.572281238886054, + "grad_norm": 0.12076544761657715, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 148040 + }, + { + "epoch": 0.5723198960894373, + "grad_norm": 0.10421552509069443, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 148050 + }, + { + "epoch": 0.5723585532928206, + "grad_norm": 0.11697299778461456, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 148060 + }, + { + "epoch": 0.5723972104962038, + "grad_norm": 0.09293560683727264, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 148070 + }, + { + "epoch": 0.5724358676995871, + "grad_norm": 0.11079003661870956, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 148080 + }, + { + "epoch": 0.5724745249029705, + "grad_norm": 0.14255714416503906, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 148090 + }, + { + "epoch": 0.5725131821063537, + "grad_norm": 0.09715547412633896, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 148100 + }, + { + "epoch": 0.572551839309737, + "grad_norm": 0.10531293600797653, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 148110 + }, + { + "epoch": 0.5725904965131202, + "grad_norm": 0.12235134840011597, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 148120 + }, + { + "epoch": 0.5726291537165036, + "grad_norm": 0.1346643716096878, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 148130 + }, + { + "epoch": 0.5726678109198868, + "grad_norm": 0.12076713144779205, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 148140 + }, + { + "epoch": 0.5727064681232701, + "grad_norm": 0.1027338057756424, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 148150 + }, + { + "epoch": 0.5727451253266533, + "grad_norm": 0.09555674344301224, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 148160 + }, + { + "epoch": 0.5727837825300367, + "grad_norm": 0.09910275042057037, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 148170 + }, + { + "epoch": 0.57282243973342, + "grad_norm": 0.1158229410648346, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 148180 + }, + { + "epoch": 0.5728610969368032, + "grad_norm": 0.10328315943479538, + "learning_rate": 0.002, + "loss": 2.355, + "step": 148190 + }, + { + "epoch": 0.5728997541401865, + "grad_norm": 0.10658963769674301, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 148200 + }, + { + "epoch": 0.5729384113435697, + "grad_norm": 0.12963686883449554, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 148210 + }, + { + "epoch": 0.5729770685469531, + "grad_norm": 0.10582420974969864, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 148220 + }, + { + "epoch": 0.5730157257503363, + "grad_norm": 0.10931842029094696, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 148230 + }, + { + "epoch": 0.5730543829537196, + "grad_norm": 0.1136174127459526, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 148240 + }, + { + "epoch": 0.5730930401571028, + "grad_norm": 0.09869039803743362, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 148250 + }, + { + "epoch": 0.5731316973604862, + "grad_norm": 0.10541972517967224, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 148260 + }, + { + "epoch": 0.5731703545638694, + "grad_norm": 0.10672339051961899, + "learning_rate": 0.002, + "loss": 2.345, + "step": 148270 + }, + { + "epoch": 0.5732090117672527, + "grad_norm": 0.11145348846912384, + "learning_rate": 0.002, + "loss": 2.341, + "step": 148280 + }, + { + "epoch": 0.573247668970636, + "grad_norm": 0.09722515940666199, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 148290 + }, + { + "epoch": 0.5732863261740193, + "grad_norm": 0.10645853728055954, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 148300 + }, + { + "epoch": 0.5733249833774026, + "grad_norm": 0.11253643035888672, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 148310 + }, + { + "epoch": 0.5733636405807858, + "grad_norm": 0.10229386389255524, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 148320 + }, + { + "epoch": 0.5734022977841691, + "grad_norm": 0.10213582217693329, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 148330 + }, + { + "epoch": 0.5734409549875524, + "grad_norm": 0.10059046000242233, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 148340 + }, + { + "epoch": 0.5734796121909357, + "grad_norm": 0.09339335560798645, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 148350 + }, + { + "epoch": 0.5735182693943189, + "grad_norm": 0.10184424370527267, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 148360 + }, + { + "epoch": 0.5735569265977022, + "grad_norm": 0.09910251945257187, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 148370 + }, + { + "epoch": 0.5735955838010854, + "grad_norm": 0.11430943757295609, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 148380 + }, + { + "epoch": 0.5736342410044688, + "grad_norm": 0.09297601878643036, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 148390 + }, + { + "epoch": 0.5736728982078521, + "grad_norm": 0.11601737886667252, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 148400 + }, + { + "epoch": 0.5737115554112353, + "grad_norm": 0.11303595453500748, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 148410 + }, + { + "epoch": 0.5737502126146186, + "grad_norm": 0.11738861352205276, + "learning_rate": 0.002, + "loss": 2.3636, + "step": 148420 + }, + { + "epoch": 0.5737888698180019, + "grad_norm": 0.09818091988563538, + "learning_rate": 0.002, + "loss": 2.347, + "step": 148430 + }, + { + "epoch": 0.5738275270213852, + "grad_norm": 0.1266518533229828, + "learning_rate": 0.002, + "loss": 2.351, + "step": 148440 + }, + { + "epoch": 0.5738661842247684, + "grad_norm": 0.10527455806732178, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 148450 + }, + { + "epoch": 0.5739048414281517, + "grad_norm": 0.13432396948337555, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 148460 + }, + { + "epoch": 0.573943498631535, + "grad_norm": 0.09773876518011093, + "learning_rate": 0.002, + "loss": 2.333, + "step": 148470 + }, + { + "epoch": 0.5739821558349183, + "grad_norm": 0.10255984216928482, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 148480 + }, + { + "epoch": 0.5740208130383015, + "grad_norm": 0.12200841307640076, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 148490 + }, + { + "epoch": 0.5740594702416848, + "grad_norm": 0.10463497787714005, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 148500 + }, + { + "epoch": 0.5740981274450682, + "grad_norm": 0.12294262647628784, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 148510 + }, + { + "epoch": 0.5741367846484514, + "grad_norm": 0.0980486273765564, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 148520 + }, + { + "epoch": 0.5741754418518347, + "grad_norm": 0.10178186744451523, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 148530 + }, + { + "epoch": 0.5742140990552179, + "grad_norm": 0.10752306878566742, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 148540 + }, + { + "epoch": 0.5742527562586013, + "grad_norm": 0.1123095452785492, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 148550 + }, + { + "epoch": 0.5742914134619845, + "grad_norm": 0.09954951703548431, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 148560 + }, + { + "epoch": 0.5743300706653678, + "grad_norm": 0.09189984202384949, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 148570 + }, + { + "epoch": 0.574368727868751, + "grad_norm": 0.5664579272270203, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 148580 + }, + { + "epoch": 0.5744073850721343, + "grad_norm": 0.12876535952091217, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 148590 + }, + { + "epoch": 0.5744460422755177, + "grad_norm": 0.11784474551677704, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 148600 + }, + { + "epoch": 0.5744846994789009, + "grad_norm": 0.13473601639270782, + "learning_rate": 0.002, + "loss": 2.343, + "step": 148610 + }, + { + "epoch": 0.5745233566822842, + "grad_norm": 0.11781080812215805, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 148620 + }, + { + "epoch": 0.5745620138856674, + "grad_norm": 0.1007075086236, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 148630 + }, + { + "epoch": 0.5746006710890508, + "grad_norm": 0.09426712244749069, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 148640 + }, + { + "epoch": 0.574639328292434, + "grad_norm": 0.11516745388507843, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 148650 + }, + { + "epoch": 0.5746779854958173, + "grad_norm": 0.09972114115953445, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 148660 + }, + { + "epoch": 0.5747166426992005, + "grad_norm": 0.10529178380966187, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 148670 + }, + { + "epoch": 0.5747552999025839, + "grad_norm": 0.14836226403713226, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 148680 + }, + { + "epoch": 0.5747939571059671, + "grad_norm": 0.12221290916204453, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 148690 + }, + { + "epoch": 0.5748326143093504, + "grad_norm": 0.11130478233098984, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 148700 + }, + { + "epoch": 0.5748712715127337, + "grad_norm": 0.09555680304765701, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 148710 + }, + { + "epoch": 0.574909928716117, + "grad_norm": 0.12878166139125824, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 148720 + }, + { + "epoch": 0.5749485859195003, + "grad_norm": 0.10222994536161423, + "learning_rate": 0.002, + "loss": 2.35, + "step": 148730 + }, + { + "epoch": 0.5749872431228835, + "grad_norm": 0.13735540211200714, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 148740 + }, + { + "epoch": 0.5750259003262668, + "grad_norm": 0.104596346616745, + "learning_rate": 0.002, + "loss": 2.348, + "step": 148750 + }, + { + "epoch": 0.57506455752965, + "grad_norm": 0.09962372481822968, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 148760 + }, + { + "epoch": 0.5751032147330334, + "grad_norm": 0.10563892126083374, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 148770 + }, + { + "epoch": 0.5751418719364166, + "grad_norm": 0.12737879157066345, + "learning_rate": 0.002, + "loss": 2.352, + "step": 148780 + }, + { + "epoch": 0.5751805291397999, + "grad_norm": 0.09857510775327682, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 148790 + }, + { + "epoch": 0.5752191863431831, + "grad_norm": 0.10216040164232254, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 148800 + }, + { + "epoch": 0.5752578435465665, + "grad_norm": 0.10757684707641602, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 148810 + }, + { + "epoch": 0.5752965007499498, + "grad_norm": 0.09681134670972824, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 148820 + }, + { + "epoch": 0.575335157953333, + "grad_norm": 0.10559449344873428, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 148830 + }, + { + "epoch": 0.5753738151567163, + "grad_norm": 0.1202688217163086, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 148840 + }, + { + "epoch": 0.5754124723600996, + "grad_norm": 0.12340883910655975, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 148850 + }, + { + "epoch": 0.5754511295634829, + "grad_norm": 0.09787190705537796, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 148860 + }, + { + "epoch": 0.5754897867668661, + "grad_norm": 0.10570527613162994, + "learning_rate": 0.002, + "loss": 2.343, + "step": 148870 + }, + { + "epoch": 0.5755284439702494, + "grad_norm": 0.09977946430444717, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 148880 + }, + { + "epoch": 0.5755671011736327, + "grad_norm": 0.10107558965682983, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 148890 + }, + { + "epoch": 0.575605758377016, + "grad_norm": 0.11597032845020294, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 148900 + }, + { + "epoch": 0.5756444155803992, + "grad_norm": 0.10900797694921494, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 148910 + }, + { + "epoch": 0.5756830727837825, + "grad_norm": 0.1132872998714447, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 148920 + }, + { + "epoch": 0.5757217299871658, + "grad_norm": 0.11002641916275024, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 148930 + }, + { + "epoch": 0.5757603871905491, + "grad_norm": 0.11283540725708008, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 148940 + }, + { + "epoch": 0.5757990443939324, + "grad_norm": 0.10150940716266632, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 148950 + }, + { + "epoch": 0.5758377015973156, + "grad_norm": 0.14698892831802368, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 148960 + }, + { + "epoch": 0.5758763588006989, + "grad_norm": 0.10251089185476303, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 148970 + }, + { + "epoch": 0.5759150160040822, + "grad_norm": 0.09909605979919434, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 148980 + }, + { + "epoch": 0.5759536732074655, + "grad_norm": 0.10927826166152954, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 148990 + }, + { + "epoch": 0.5759923304108487, + "grad_norm": 0.10843043774366379, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 149000 + }, + { + "epoch": 0.576030987614232, + "grad_norm": 0.10785437375307083, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 149010 + }, + { + "epoch": 0.5760696448176154, + "grad_norm": 0.09829997271299362, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 149020 + }, + { + "epoch": 0.5761083020209986, + "grad_norm": 0.09375440329313278, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 149030 + }, + { + "epoch": 0.5761469592243819, + "grad_norm": 0.11929398030042648, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 149040 + }, + { + "epoch": 0.5761856164277651, + "grad_norm": 0.09872215986251831, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 149050 + }, + { + "epoch": 0.5762242736311485, + "grad_norm": 0.10986842960119247, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 149060 + }, + { + "epoch": 0.5762629308345317, + "grad_norm": 0.0990709587931633, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 149070 + }, + { + "epoch": 0.576301588037915, + "grad_norm": 0.12177325785160065, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 149080 + }, + { + "epoch": 0.5763402452412982, + "grad_norm": 0.11904153972864151, + "learning_rate": 0.002, + "loss": 2.354, + "step": 149090 + }, + { + "epoch": 0.5763789024446816, + "grad_norm": 0.09998807311058044, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 149100 + }, + { + "epoch": 0.5764175596480648, + "grad_norm": 0.12162663042545319, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 149110 + }, + { + "epoch": 0.5764562168514481, + "grad_norm": 0.11092780530452728, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 149120 + }, + { + "epoch": 0.5764948740548314, + "grad_norm": 0.09261737763881683, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 149130 + }, + { + "epoch": 0.5765335312582146, + "grad_norm": 0.1161278486251831, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 149140 + }, + { + "epoch": 0.576572188461598, + "grad_norm": 0.09691427648067474, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 149150 + }, + { + "epoch": 0.5766108456649812, + "grad_norm": 0.10035521537065506, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 149160 + }, + { + "epoch": 0.5766495028683645, + "grad_norm": 0.10514352470636368, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 149170 + }, + { + "epoch": 0.5766881600717477, + "grad_norm": 0.21642756462097168, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 149180 + }, + { + "epoch": 0.5767268172751311, + "grad_norm": 0.12869301438331604, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 149190 + }, + { + "epoch": 0.5767654744785143, + "grad_norm": 0.1055729016661644, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 149200 + }, + { + "epoch": 0.5768041316818976, + "grad_norm": 0.0956469178199768, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 149210 + }, + { + "epoch": 0.5768427888852808, + "grad_norm": 0.10121183097362518, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 149220 + }, + { + "epoch": 0.5768814460886642, + "grad_norm": 0.12185820937156677, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 149230 + }, + { + "epoch": 0.5769201032920475, + "grad_norm": 0.11856206506490707, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 149240 + }, + { + "epoch": 0.5769587604954307, + "grad_norm": 0.10712940990924835, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 149250 + }, + { + "epoch": 0.576997417698814, + "grad_norm": 0.11829926818609238, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 149260 + }, + { + "epoch": 0.5770360749021973, + "grad_norm": 0.1022658720612526, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 149270 + }, + { + "epoch": 0.5770747321055806, + "grad_norm": 0.11594463884830475, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 149280 + }, + { + "epoch": 0.5771133893089638, + "grad_norm": 0.11002032458782196, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 149290 + }, + { + "epoch": 0.5771520465123471, + "grad_norm": 0.09113029390573502, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 149300 + }, + { + "epoch": 0.5771907037157303, + "grad_norm": 0.11730547249317169, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 149310 + }, + { + "epoch": 0.5772293609191137, + "grad_norm": 0.09948255866765976, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 149320 + }, + { + "epoch": 0.577268018122497, + "grad_norm": 0.11743677407503128, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 149330 + }, + { + "epoch": 0.5773066753258802, + "grad_norm": 0.12467943131923676, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 149340 + }, + { + "epoch": 0.5773453325292635, + "grad_norm": 0.1314239203929901, + "learning_rate": 0.002, + "loss": 2.36, + "step": 149350 + }, + { + "epoch": 0.5773839897326468, + "grad_norm": 0.1049962267279625, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 149360 + }, + { + "epoch": 0.5774226469360301, + "grad_norm": 0.11080177128314972, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 149370 + }, + { + "epoch": 0.5774613041394133, + "grad_norm": 0.0954049751162529, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 149380 + }, + { + "epoch": 0.5774999613427966, + "grad_norm": 0.12053569406270981, + "learning_rate": 0.002, + "loss": 2.344, + "step": 149390 + }, + { + "epoch": 0.5775386185461799, + "grad_norm": 0.10770398378372192, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 149400 + }, + { + "epoch": 0.5775772757495632, + "grad_norm": 0.10702679306268692, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 149410 + }, + { + "epoch": 0.5776159329529464, + "grad_norm": 0.09334122389554977, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 149420 + }, + { + "epoch": 0.5776545901563297, + "grad_norm": 0.11252539604902267, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 149430 + }, + { + "epoch": 0.5776932473597131, + "grad_norm": 0.1189262792468071, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 149440 + }, + { + "epoch": 0.5777319045630963, + "grad_norm": 0.1360866278409958, + "learning_rate": 0.002, + "loss": 2.341, + "step": 149450 + }, + { + "epoch": 0.5777705617664796, + "grad_norm": 0.0987911969423294, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 149460 + }, + { + "epoch": 0.5778092189698628, + "grad_norm": 0.11495771259069443, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 149470 + }, + { + "epoch": 0.5778478761732462, + "grad_norm": 0.1220431998372078, + "learning_rate": 0.002, + "loss": 2.343, + "step": 149480 + }, + { + "epoch": 0.5778865333766294, + "grad_norm": 0.12294565141201019, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 149490 + }, + { + "epoch": 0.5779251905800127, + "grad_norm": 0.10714684426784515, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 149500 + }, + { + "epoch": 0.5779638477833959, + "grad_norm": 0.11005298048257828, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 149510 + }, + { + "epoch": 0.5780025049867792, + "grad_norm": 0.08794310688972473, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 149520 + }, + { + "epoch": 0.5780411621901625, + "grad_norm": 0.1080562099814415, + "learning_rate": 0.002, + "loss": 2.359, + "step": 149530 + }, + { + "epoch": 0.5780798193935458, + "grad_norm": 0.14582188427448273, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 149540 + }, + { + "epoch": 0.578118476596929, + "grad_norm": 0.12180215120315552, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 149550 + }, + { + "epoch": 0.5781571338003123, + "grad_norm": 0.10159006714820862, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 149560 + }, + { + "epoch": 0.5781957910036957, + "grad_norm": 0.10227713733911514, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 149570 + }, + { + "epoch": 0.5782344482070789, + "grad_norm": 0.10744677484035492, + "learning_rate": 0.002, + "loss": 2.337, + "step": 149580 + }, + { + "epoch": 0.5782731054104622, + "grad_norm": 0.09768327325582504, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 149590 + }, + { + "epoch": 0.5783117626138454, + "grad_norm": 0.10872442275285721, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 149600 + }, + { + "epoch": 0.5783504198172288, + "grad_norm": 0.10671534389257431, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 149610 + }, + { + "epoch": 0.578389077020612, + "grad_norm": 0.14711715281009674, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 149620 + }, + { + "epoch": 0.5784277342239953, + "grad_norm": 0.09580583870410919, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 149630 + }, + { + "epoch": 0.5784663914273785, + "grad_norm": 0.0945882648229599, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 149640 + }, + { + "epoch": 0.5785050486307619, + "grad_norm": 0.13525830209255219, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 149650 + }, + { + "epoch": 0.5785437058341452, + "grad_norm": 0.10206573456525803, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 149660 + }, + { + "epoch": 0.5785823630375284, + "grad_norm": 0.09938966482877731, + "learning_rate": 0.002, + "loss": 2.332, + "step": 149670 + }, + { + "epoch": 0.5786210202409117, + "grad_norm": 0.10354302078485489, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 149680 + }, + { + "epoch": 0.5786596774442949, + "grad_norm": 0.10915887355804443, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 149690 + }, + { + "epoch": 0.5786983346476783, + "grad_norm": 0.09858160465955734, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 149700 + }, + { + "epoch": 0.5787369918510615, + "grad_norm": 0.09634033590555191, + "learning_rate": 0.002, + "loss": 2.342, + "step": 149710 + }, + { + "epoch": 0.5787756490544448, + "grad_norm": 0.10882976651191711, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 149720 + }, + { + "epoch": 0.578814306257828, + "grad_norm": 0.12693949043750763, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 149730 + }, + { + "epoch": 0.5788529634612114, + "grad_norm": 0.1207302138209343, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 149740 + }, + { + "epoch": 0.5788916206645947, + "grad_norm": 0.1069578304886818, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 149750 + }, + { + "epoch": 0.5789302778679779, + "grad_norm": 0.10116532444953918, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 149760 + }, + { + "epoch": 0.5789689350713612, + "grad_norm": 0.13171598315238953, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 149770 + }, + { + "epoch": 0.5790075922747445, + "grad_norm": 0.11328830569982529, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 149780 + }, + { + "epoch": 0.5790462494781278, + "grad_norm": 0.10096880793571472, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 149790 + }, + { + "epoch": 0.579084906681511, + "grad_norm": 0.11313828825950623, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 149800 + }, + { + "epoch": 0.5791235638848943, + "grad_norm": 0.09104368090629578, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 149810 + }, + { + "epoch": 0.5791622210882776, + "grad_norm": 0.11752974987030029, + "learning_rate": 0.002, + "loss": 2.343, + "step": 149820 + }, + { + "epoch": 0.5792008782916609, + "grad_norm": 0.1244005411863327, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 149830 + }, + { + "epoch": 0.5792395354950441, + "grad_norm": 0.10278856009244919, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 149840 + }, + { + "epoch": 0.5792781926984274, + "grad_norm": 0.10532396286725998, + "learning_rate": 0.002, + "loss": 2.336, + "step": 149850 + }, + { + "epoch": 0.5793168499018106, + "grad_norm": 0.0982581302523613, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 149860 + }, + { + "epoch": 0.579355507105194, + "grad_norm": 0.10512799769639969, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 149870 + }, + { + "epoch": 0.5793941643085773, + "grad_norm": 0.11000046879053116, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 149880 + }, + { + "epoch": 0.5794328215119605, + "grad_norm": 0.11731689423322678, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 149890 + }, + { + "epoch": 0.5794714787153438, + "grad_norm": 0.10304244607686996, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 149900 + }, + { + "epoch": 0.5795101359187271, + "grad_norm": 0.10752136260271072, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 149910 + }, + { + "epoch": 0.5795487931221104, + "grad_norm": 0.10894905775785446, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 149920 + }, + { + "epoch": 0.5795874503254936, + "grad_norm": 0.09900854527950287, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 149930 + }, + { + "epoch": 0.5796261075288769, + "grad_norm": 0.10060838609933853, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 149940 + }, + { + "epoch": 0.5796647647322603, + "grad_norm": 0.0998382493853569, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 149950 + }, + { + "epoch": 0.5797034219356435, + "grad_norm": 0.09682802110910416, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 149960 + }, + { + "epoch": 0.5797420791390268, + "grad_norm": 0.1006351038813591, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 149970 + }, + { + "epoch": 0.57978073634241, + "grad_norm": 0.1387142390012741, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 149980 + }, + { + "epoch": 0.5798193935457934, + "grad_norm": 0.11638208478689194, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 149990 + }, + { + "epoch": 0.5798580507491766, + "grad_norm": 0.0977184846997261, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 150000 + }, + { + "epoch": 0.5798967079525599, + "grad_norm": 0.13405273854732513, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 150010 + }, + { + "epoch": 0.5799353651559431, + "grad_norm": 0.09380652010440826, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 150020 + }, + { + "epoch": 0.5799740223593265, + "grad_norm": 0.10067908465862274, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 150030 + }, + { + "epoch": 0.5800126795627097, + "grad_norm": 0.10465842485427856, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 150040 + }, + { + "epoch": 0.580051336766093, + "grad_norm": 0.12888339161872864, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 150050 + }, + { + "epoch": 0.5800899939694762, + "grad_norm": 0.11086847633123398, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 150060 + }, + { + "epoch": 0.5801286511728595, + "grad_norm": 0.1055656149983406, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 150070 + }, + { + "epoch": 0.5801673083762429, + "grad_norm": 0.10147044062614441, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 150080 + }, + { + "epoch": 0.5802059655796261, + "grad_norm": 0.09590164572000504, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 150090 + }, + { + "epoch": 0.5802446227830094, + "grad_norm": 0.11232464015483856, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 150100 + }, + { + "epoch": 0.5802832799863926, + "grad_norm": 0.11140336096286774, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 150110 + }, + { + "epoch": 0.580321937189776, + "grad_norm": 0.12357457727193832, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 150120 + }, + { + "epoch": 0.5803605943931592, + "grad_norm": 0.10456366837024689, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 150130 + }, + { + "epoch": 0.5803992515965425, + "grad_norm": 0.11628726124763489, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 150140 + }, + { + "epoch": 0.5804379087999257, + "grad_norm": 0.10711704194545746, + "learning_rate": 0.002, + "loss": 2.352, + "step": 150150 + }, + { + "epoch": 0.5804765660033091, + "grad_norm": 0.11025909334421158, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 150160 + }, + { + "epoch": 0.5805152232066924, + "grad_norm": 0.11634379625320435, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 150170 + }, + { + "epoch": 0.5805538804100756, + "grad_norm": 0.10299687832593918, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 150180 + }, + { + "epoch": 0.5805925376134589, + "grad_norm": 0.11979073286056519, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 150190 + }, + { + "epoch": 0.5806311948168422, + "grad_norm": 0.10962650924921036, + "learning_rate": 0.002, + "loss": 2.336, + "step": 150200 + }, + { + "epoch": 0.5806698520202255, + "grad_norm": 0.09615093469619751, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 150210 + }, + { + "epoch": 0.5807085092236087, + "grad_norm": 0.12243734300136566, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 150220 + }, + { + "epoch": 0.580747166426992, + "grad_norm": 0.10823040455579758, + "learning_rate": 0.002, + "loss": 2.343, + "step": 150230 + }, + { + "epoch": 0.5807858236303752, + "grad_norm": 0.09089124947786331, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 150240 + }, + { + "epoch": 0.5808244808337586, + "grad_norm": 0.14514900743961334, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 150250 + }, + { + "epoch": 0.5808631380371418, + "grad_norm": 0.10589548200368881, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 150260 + }, + { + "epoch": 0.5809017952405251, + "grad_norm": 0.1101815328001976, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 150270 + }, + { + "epoch": 0.5809404524439084, + "grad_norm": 0.10971608012914658, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 150280 + }, + { + "epoch": 0.5809791096472917, + "grad_norm": 0.11273403465747833, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 150290 + }, + { + "epoch": 0.581017766850675, + "grad_norm": 0.1006007120013237, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 150300 + }, + { + "epoch": 0.5810564240540582, + "grad_norm": 0.10495468974113464, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 150310 + }, + { + "epoch": 0.5810950812574415, + "grad_norm": 0.1348286271095276, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 150320 + }, + { + "epoch": 0.5811337384608248, + "grad_norm": 0.09644333273172379, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 150330 + }, + { + "epoch": 0.5811723956642081, + "grad_norm": 0.1011551171541214, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 150340 + }, + { + "epoch": 0.5812110528675913, + "grad_norm": 0.102486751973629, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 150350 + }, + { + "epoch": 0.5812497100709746, + "grad_norm": 0.10425101220607758, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 150360 + }, + { + "epoch": 0.581288367274358, + "grad_norm": 0.11448422074317932, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 150370 + }, + { + "epoch": 0.5813270244777412, + "grad_norm": 0.11644108593463898, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 150380 + }, + { + "epoch": 0.5813656816811245, + "grad_norm": 0.11712195724248886, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 150390 + }, + { + "epoch": 0.5814043388845077, + "grad_norm": 0.11532865464687347, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 150400 + }, + { + "epoch": 0.5814429960878911, + "grad_norm": 0.09724882245063782, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 150410 + }, + { + "epoch": 0.5814816532912743, + "grad_norm": 0.09928759932518005, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 150420 + }, + { + "epoch": 0.5815203104946576, + "grad_norm": 0.11150534451007843, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 150430 + }, + { + "epoch": 0.5815589676980408, + "grad_norm": 0.10577598959207535, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 150440 + }, + { + "epoch": 0.5815976249014241, + "grad_norm": 0.10753431171178818, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 150450 + }, + { + "epoch": 0.5816362821048074, + "grad_norm": 0.10999209433794022, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 150460 + }, + { + "epoch": 0.5816749393081907, + "grad_norm": 0.11631323397159576, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 150470 + }, + { + "epoch": 0.581713596511574, + "grad_norm": 0.10544676333665848, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 150480 + }, + { + "epoch": 0.5817522537149572, + "grad_norm": 0.12632738053798676, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 150490 + }, + { + "epoch": 0.5817909109183406, + "grad_norm": 0.11686243116855621, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 150500 + }, + { + "epoch": 0.5818295681217238, + "grad_norm": 0.09398223459720612, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 150510 + }, + { + "epoch": 0.5818682253251071, + "grad_norm": 0.1024533212184906, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 150520 + }, + { + "epoch": 0.5819068825284903, + "grad_norm": 0.1734480857849121, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 150530 + }, + { + "epoch": 0.5819455397318737, + "grad_norm": 0.10755108296871185, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 150540 + }, + { + "epoch": 0.5819841969352569, + "grad_norm": 0.10732606053352356, + "learning_rate": 0.002, + "loss": 2.355, + "step": 150550 + }, + { + "epoch": 0.5820228541386402, + "grad_norm": 0.10095525532960892, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 150560 + }, + { + "epoch": 0.5820615113420234, + "grad_norm": 0.0937095358967781, + "learning_rate": 0.002, + "loss": 2.323, + "step": 150570 + }, + { + "epoch": 0.5821001685454068, + "grad_norm": 0.10207615792751312, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 150580 + }, + { + "epoch": 0.5821388257487901, + "grad_norm": 0.09388964623212814, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 150590 + }, + { + "epoch": 0.5821774829521733, + "grad_norm": 0.10827864706516266, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 150600 + }, + { + "epoch": 0.5822161401555566, + "grad_norm": 0.10733626037836075, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 150610 + }, + { + "epoch": 0.5822547973589398, + "grad_norm": 0.10135097801685333, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 150620 + }, + { + "epoch": 0.5822934545623232, + "grad_norm": 0.11220720410346985, + "learning_rate": 0.002, + "loss": 2.351, + "step": 150630 + }, + { + "epoch": 0.5823321117657064, + "grad_norm": 0.1044202595949173, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 150640 + }, + { + "epoch": 0.5823707689690897, + "grad_norm": 0.10599269717931747, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 150650 + }, + { + "epoch": 0.5824094261724729, + "grad_norm": 0.1008022353053093, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 150660 + }, + { + "epoch": 0.5824480833758563, + "grad_norm": 0.11797475069761276, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 150670 + }, + { + "epoch": 0.5824867405792395, + "grad_norm": 0.10903535038232803, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 150680 + }, + { + "epoch": 0.5825253977826228, + "grad_norm": 0.14368629455566406, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 150690 + }, + { + "epoch": 0.582564054986006, + "grad_norm": 0.10055279731750488, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 150700 + }, + { + "epoch": 0.5826027121893894, + "grad_norm": 0.11629831790924072, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 150710 + }, + { + "epoch": 0.5826413693927727, + "grad_norm": 0.10161113739013672, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 150720 + }, + { + "epoch": 0.5826800265961559, + "grad_norm": 0.10987081378698349, + "learning_rate": 0.002, + "loss": 2.345, + "step": 150730 + }, + { + "epoch": 0.5827186837995392, + "grad_norm": 0.18283596634864807, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 150740 + }, + { + "epoch": 0.5827573410029225, + "grad_norm": 0.09929069131612778, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 150750 + }, + { + "epoch": 0.5827959982063058, + "grad_norm": 0.10900171846151352, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 150760 + }, + { + "epoch": 0.582834655409689, + "grad_norm": 0.11824193596839905, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 150770 + }, + { + "epoch": 0.5828733126130723, + "grad_norm": 0.10111094266176224, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 150780 + }, + { + "epoch": 0.5829119698164555, + "grad_norm": 0.11064330488443375, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 150790 + }, + { + "epoch": 0.5829506270198389, + "grad_norm": 0.11837726086378098, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 150800 + }, + { + "epoch": 0.5829892842232222, + "grad_norm": 0.11166118830442429, + "learning_rate": 0.002, + "loss": 2.3685, + "step": 150810 + }, + { + "epoch": 0.5830279414266054, + "grad_norm": 0.09941312670707703, + "learning_rate": 0.002, + "loss": 2.34, + "step": 150820 + }, + { + "epoch": 0.5830665986299887, + "grad_norm": 0.12270691245794296, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 150830 + }, + { + "epoch": 0.583105255833372, + "grad_norm": 0.11068408936262131, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 150840 + }, + { + "epoch": 0.5831439130367553, + "grad_norm": 0.1082112044095993, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 150850 + }, + { + "epoch": 0.5831825702401385, + "grad_norm": 0.11955466866493225, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 150860 + }, + { + "epoch": 0.5832212274435218, + "grad_norm": 0.11465366184711456, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 150870 + }, + { + "epoch": 0.5832598846469051, + "grad_norm": 0.10802769660949707, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 150880 + }, + { + "epoch": 0.5832985418502884, + "grad_norm": 0.10408146679401398, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 150890 + }, + { + "epoch": 0.5833371990536717, + "grad_norm": 0.13040228188037872, + "learning_rate": 0.002, + "loss": 2.335, + "step": 150900 + }, + { + "epoch": 0.5833758562570549, + "grad_norm": 0.11632059514522552, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 150910 + }, + { + "epoch": 0.5834145134604383, + "grad_norm": 0.10285551100969315, + "learning_rate": 0.002, + "loss": 2.344, + "step": 150920 + }, + { + "epoch": 0.5834531706638215, + "grad_norm": 0.11648505181074142, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 150930 + }, + { + "epoch": 0.5834918278672048, + "grad_norm": 0.09681139141321182, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 150940 + }, + { + "epoch": 0.583530485070588, + "grad_norm": 0.11876310408115387, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 150950 + }, + { + "epoch": 0.5835691422739714, + "grad_norm": 0.09588060528039932, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 150960 + }, + { + "epoch": 0.5836077994773546, + "grad_norm": 0.11005954444408417, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 150970 + }, + { + "epoch": 0.5836464566807379, + "grad_norm": 0.09692657738924026, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 150980 + }, + { + "epoch": 0.5836851138841211, + "grad_norm": 0.12167064845561981, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 150990 + }, + { + "epoch": 0.5837237710875044, + "grad_norm": 0.12240065634250641, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 151000 + }, + { + "epoch": 0.5837624282908878, + "grad_norm": 0.09758885204792023, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 151010 + }, + { + "epoch": 0.583801085494271, + "grad_norm": 0.11564390361309052, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 151020 + }, + { + "epoch": 0.5838397426976543, + "grad_norm": 0.11047053337097168, + "learning_rate": 0.002, + "loss": 2.356, + "step": 151030 + }, + { + "epoch": 0.5838783999010375, + "grad_norm": 0.11683172732591629, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 151040 + }, + { + "epoch": 0.5839170571044209, + "grad_norm": 0.09342314302921295, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 151050 + }, + { + "epoch": 0.5839557143078041, + "grad_norm": 0.10871961712837219, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 151060 + }, + { + "epoch": 0.5839943715111874, + "grad_norm": 0.09939336031675339, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 151070 + }, + { + "epoch": 0.5840330287145706, + "grad_norm": 0.0856621041893959, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 151080 + }, + { + "epoch": 0.584071685917954, + "grad_norm": 0.11879793554544449, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 151090 + }, + { + "epoch": 0.5841103431213372, + "grad_norm": 0.10782366245985031, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 151100 + }, + { + "epoch": 0.5841490003247205, + "grad_norm": 0.10675476491451263, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 151110 + }, + { + "epoch": 0.5841876575281038, + "grad_norm": 0.1117832288146019, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 151120 + }, + { + "epoch": 0.5842263147314871, + "grad_norm": 0.10932677239179611, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 151130 + }, + { + "epoch": 0.5842649719348704, + "grad_norm": 0.11666686087846756, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 151140 + }, + { + "epoch": 0.5843036291382536, + "grad_norm": 0.09197304397821426, + "learning_rate": 0.002, + "loss": 2.338, + "step": 151150 + }, + { + "epoch": 0.5843422863416369, + "grad_norm": 0.11431872099637985, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 151160 + }, + { + "epoch": 0.5843809435450201, + "grad_norm": 0.1133253276348114, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 151170 + }, + { + "epoch": 0.5844196007484035, + "grad_norm": 0.10361841320991516, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 151180 + }, + { + "epoch": 0.5844582579517867, + "grad_norm": 0.10608696192502975, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 151190 + }, + { + "epoch": 0.58449691515517, + "grad_norm": 0.10624140501022339, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 151200 + }, + { + "epoch": 0.5845355723585532, + "grad_norm": 0.10633480548858643, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 151210 + }, + { + "epoch": 0.5845742295619366, + "grad_norm": 0.09925577044487, + "learning_rate": 0.002, + "loss": 2.352, + "step": 151220 + }, + { + "epoch": 0.5846128867653199, + "grad_norm": 0.1234511286020279, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 151230 + }, + { + "epoch": 0.5846515439687031, + "grad_norm": 0.10758979618549347, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 151240 + }, + { + "epoch": 0.5846902011720864, + "grad_norm": 0.09353959560394287, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 151250 + }, + { + "epoch": 0.5847288583754697, + "grad_norm": 0.12607234716415405, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 151260 + }, + { + "epoch": 0.584767515578853, + "grad_norm": 0.13077405095100403, + "learning_rate": 0.002, + "loss": 2.349, + "step": 151270 + }, + { + "epoch": 0.5848061727822362, + "grad_norm": 0.10392604023218155, + "learning_rate": 0.002, + "loss": 2.341, + "step": 151280 + }, + { + "epoch": 0.5848448299856195, + "grad_norm": 0.13296332955360413, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 151290 + }, + { + "epoch": 0.5848834871890028, + "grad_norm": 0.09580894559621811, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 151300 + }, + { + "epoch": 0.5849221443923861, + "grad_norm": 0.1118881031870842, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 151310 + }, + { + "epoch": 0.5849608015957694, + "grad_norm": 0.10997623950242996, + "learning_rate": 0.002, + "loss": 2.355, + "step": 151320 + }, + { + "epoch": 0.5849994587991526, + "grad_norm": 0.09829279035329819, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 151330 + }, + { + "epoch": 0.5850381160025359, + "grad_norm": 0.12762117385864258, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 151340 + }, + { + "epoch": 0.5850767732059192, + "grad_norm": 0.10648144781589508, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 151350 + }, + { + "epoch": 0.5851154304093025, + "grad_norm": 0.1168905720114708, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 151360 + }, + { + "epoch": 0.5851540876126857, + "grad_norm": 0.11103395372629166, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 151370 + }, + { + "epoch": 0.585192744816069, + "grad_norm": 0.09544747322797775, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 151380 + }, + { + "epoch": 0.5852314020194523, + "grad_norm": 0.1289357841014862, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 151390 + }, + { + "epoch": 0.5852700592228356, + "grad_norm": 0.10271959006786346, + "learning_rate": 0.002, + "loss": 2.344, + "step": 151400 + }, + { + "epoch": 0.5853087164262188, + "grad_norm": 0.11833061277866364, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 151410 + }, + { + "epoch": 0.5853473736296021, + "grad_norm": 0.11826982349157333, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 151420 + }, + { + "epoch": 0.5853860308329855, + "grad_norm": 0.11418145149946213, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 151430 + }, + { + "epoch": 0.5854246880363687, + "grad_norm": 0.10659082233905792, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 151440 + }, + { + "epoch": 0.585463345239752, + "grad_norm": 0.09980858862400055, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 151450 + }, + { + "epoch": 0.5855020024431352, + "grad_norm": 0.10436010360717773, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 151460 + }, + { + "epoch": 0.5855406596465186, + "grad_norm": 0.10832711309194565, + "learning_rate": 0.002, + "loss": 2.358, + "step": 151470 + }, + { + "epoch": 0.5855793168499018, + "grad_norm": 0.11657103151082993, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 151480 + }, + { + "epoch": 0.5856179740532851, + "grad_norm": 0.10700567066669464, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 151490 + }, + { + "epoch": 0.5856566312566683, + "grad_norm": 0.10693954676389694, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 151500 + }, + { + "epoch": 0.5856952884600517, + "grad_norm": 0.10214036703109741, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 151510 + }, + { + "epoch": 0.585733945663435, + "grad_norm": 0.10415514558553696, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 151520 + }, + { + "epoch": 0.5857726028668182, + "grad_norm": 0.11231443285942078, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 151530 + }, + { + "epoch": 0.5858112600702015, + "grad_norm": 0.10299165546894073, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 151540 + }, + { + "epoch": 0.5858499172735847, + "grad_norm": 0.08635806292295456, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 151550 + }, + { + "epoch": 0.5858885744769681, + "grad_norm": 0.10262977331876755, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 151560 + }, + { + "epoch": 0.5859272316803513, + "grad_norm": 0.10760878771543503, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 151570 + }, + { + "epoch": 0.5859658888837346, + "grad_norm": 0.1183459609746933, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 151580 + }, + { + "epoch": 0.5860045460871178, + "grad_norm": 0.1195802241563797, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 151590 + }, + { + "epoch": 0.5860432032905012, + "grad_norm": 0.09660229086875916, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 151600 + }, + { + "epoch": 0.5860818604938844, + "grad_norm": 0.11083091050386429, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 151610 + }, + { + "epoch": 0.5861205176972677, + "grad_norm": 0.09059544652700424, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 151620 + }, + { + "epoch": 0.586159174900651, + "grad_norm": 0.10028998553752899, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 151630 + }, + { + "epoch": 0.5861978321040343, + "grad_norm": 0.11900894343852997, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 151640 + }, + { + "epoch": 0.5862364893074176, + "grad_norm": 0.10602059215307236, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 151650 + }, + { + "epoch": 0.5862751465108008, + "grad_norm": 0.10835479944944382, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 151660 + }, + { + "epoch": 0.5863138037141841, + "grad_norm": 0.09900346398353577, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 151670 + }, + { + "epoch": 0.5863524609175674, + "grad_norm": 0.10289320349693298, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 151680 + }, + { + "epoch": 0.5863911181209507, + "grad_norm": 0.12360788136720657, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 151690 + }, + { + "epoch": 0.5864297753243339, + "grad_norm": 0.0973445475101471, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 151700 + }, + { + "epoch": 0.5864684325277172, + "grad_norm": 0.10253198444843292, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 151710 + }, + { + "epoch": 0.5865070897311004, + "grad_norm": 0.2577599585056305, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 151720 + }, + { + "epoch": 0.5865457469344838, + "grad_norm": 0.13467490673065186, + "learning_rate": 0.002, + "loss": 2.348, + "step": 151730 + }, + { + "epoch": 0.586584404137867, + "grad_norm": 0.09774551540613174, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 151740 + }, + { + "epoch": 0.5866230613412503, + "grad_norm": 0.10804309695959091, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 151750 + }, + { + "epoch": 0.5866617185446336, + "grad_norm": 0.10399198532104492, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 151760 + }, + { + "epoch": 0.5867003757480169, + "grad_norm": 0.11586041003465652, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 151770 + }, + { + "epoch": 0.5867390329514002, + "grad_norm": 0.11312390863895416, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 151780 + }, + { + "epoch": 0.5867776901547834, + "grad_norm": 0.10324378311634064, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 151790 + }, + { + "epoch": 0.5868163473581667, + "grad_norm": 0.08976956456899643, + "learning_rate": 0.002, + "loss": 2.341, + "step": 151800 + }, + { + "epoch": 0.58685500456155, + "grad_norm": 0.10378418862819672, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 151810 + }, + { + "epoch": 0.5868936617649333, + "grad_norm": 0.11894010752439499, + "learning_rate": 0.002, + "loss": 2.358, + "step": 151820 + }, + { + "epoch": 0.5869323189683165, + "grad_norm": 0.11267915368080139, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 151830 + }, + { + "epoch": 0.5869709761716998, + "grad_norm": 0.1251029670238495, + "learning_rate": 0.002, + "loss": 2.35, + "step": 151840 + }, + { + "epoch": 0.5870096333750832, + "grad_norm": 0.11871767044067383, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 151850 + }, + { + "epoch": 0.5870482905784664, + "grad_norm": 0.11793194711208344, + "learning_rate": 0.002, + "loss": 2.347, + "step": 151860 + }, + { + "epoch": 0.5870869477818497, + "grad_norm": 0.10210611671209335, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 151870 + }, + { + "epoch": 0.5871256049852329, + "grad_norm": 0.09311977028846741, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 151880 + }, + { + "epoch": 0.5871642621886163, + "grad_norm": 0.09820962697267532, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 151890 + }, + { + "epoch": 0.5872029193919995, + "grad_norm": 0.11874396353960037, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 151900 + }, + { + "epoch": 0.5872415765953828, + "grad_norm": 0.1036624014377594, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 151910 + }, + { + "epoch": 0.587280233798766, + "grad_norm": 0.11919917911291122, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 151920 + }, + { + "epoch": 0.5873188910021493, + "grad_norm": 0.12413368374109268, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 151930 + }, + { + "epoch": 0.5873575482055327, + "grad_norm": 0.11070192605257034, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 151940 + }, + { + "epoch": 0.5873962054089159, + "grad_norm": 0.11603335291147232, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 151950 + }, + { + "epoch": 0.5874348626122992, + "grad_norm": 0.09753865003585815, + "learning_rate": 0.002, + "loss": 2.351, + "step": 151960 + }, + { + "epoch": 0.5874735198156824, + "grad_norm": 0.13039851188659668, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 151970 + }, + { + "epoch": 0.5875121770190658, + "grad_norm": 0.09103492647409439, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 151980 + }, + { + "epoch": 0.587550834222449, + "grad_norm": 0.1167258694767952, + "learning_rate": 0.002, + "loss": 2.331, + "step": 151990 + }, + { + "epoch": 0.5875894914258323, + "grad_norm": 0.11920642852783203, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 152000 + }, + { + "epoch": 0.5876281486292155, + "grad_norm": 0.10152848064899445, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 152010 + }, + { + "epoch": 0.5876668058325989, + "grad_norm": 0.10228315740823746, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 152020 + }, + { + "epoch": 0.5877054630359821, + "grad_norm": 0.1047983169555664, + "learning_rate": 0.002, + "loss": 2.355, + "step": 152030 + }, + { + "epoch": 0.5877441202393654, + "grad_norm": 0.11148735135793686, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 152040 + }, + { + "epoch": 0.5877827774427486, + "grad_norm": 0.10150796920061111, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 152050 + }, + { + "epoch": 0.587821434646132, + "grad_norm": 0.11171367019414902, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 152060 + }, + { + "epoch": 0.5878600918495153, + "grad_norm": 0.09901855885982513, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 152070 + }, + { + "epoch": 0.5878987490528985, + "grad_norm": 0.10765265673398972, + "learning_rate": 0.002, + "loss": 2.359, + "step": 152080 + }, + { + "epoch": 0.5879374062562818, + "grad_norm": 0.12273464351892471, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 152090 + }, + { + "epoch": 0.587976063459665, + "grad_norm": 0.1203690692782402, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 152100 + }, + { + "epoch": 0.5880147206630484, + "grad_norm": 0.12574422359466553, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 152110 + }, + { + "epoch": 0.5880533778664316, + "grad_norm": 0.09954094886779785, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 152120 + }, + { + "epoch": 0.5880920350698149, + "grad_norm": 0.09980536252260208, + "learning_rate": 0.002, + "loss": 2.34, + "step": 152130 + }, + { + "epoch": 0.5881306922731981, + "grad_norm": 0.10849287360906601, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 152140 + }, + { + "epoch": 0.5881693494765815, + "grad_norm": 0.11069836467504501, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 152150 + }, + { + "epoch": 0.5882080066799648, + "grad_norm": 0.09529554843902588, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 152160 + }, + { + "epoch": 0.588246663883348, + "grad_norm": 0.10251989960670471, + "learning_rate": 0.002, + "loss": 2.327, + "step": 152170 + }, + { + "epoch": 0.5882853210867313, + "grad_norm": 0.10750257223844528, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 152180 + }, + { + "epoch": 0.5883239782901146, + "grad_norm": 0.1048988401889801, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 152190 + }, + { + "epoch": 0.5883626354934979, + "grad_norm": 0.10716889053583145, + "learning_rate": 0.002, + "loss": 2.334, + "step": 152200 + }, + { + "epoch": 0.5884012926968811, + "grad_norm": 0.09506648778915405, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 152210 + }, + { + "epoch": 0.5884399499002644, + "grad_norm": 0.12282132357358932, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 152220 + }, + { + "epoch": 0.5884786071036477, + "grad_norm": 0.1068616658449173, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 152230 + }, + { + "epoch": 0.588517264307031, + "grad_norm": 0.1003294587135315, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 152240 + }, + { + "epoch": 0.5885559215104142, + "grad_norm": 0.10827738791704178, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 152250 + }, + { + "epoch": 0.5885945787137975, + "grad_norm": 0.10909453779459, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 152260 + }, + { + "epoch": 0.5886332359171808, + "grad_norm": 0.1163821890950203, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 152270 + }, + { + "epoch": 0.5886718931205641, + "grad_norm": 0.09663323312997818, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 152280 + }, + { + "epoch": 0.5887105503239474, + "grad_norm": 0.0876787081360817, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 152290 + }, + { + "epoch": 0.5887492075273306, + "grad_norm": 0.09570583701133728, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 152300 + }, + { + "epoch": 0.5887878647307139, + "grad_norm": 0.3451126217842102, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 152310 + }, + { + "epoch": 0.5888265219340972, + "grad_norm": 0.11119550466537476, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 152320 + }, + { + "epoch": 0.5888651791374805, + "grad_norm": 0.10163173079490662, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 152330 + }, + { + "epoch": 0.5889038363408637, + "grad_norm": 0.10424763709306717, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 152340 + }, + { + "epoch": 0.588942493544247, + "grad_norm": 0.10825823992490768, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 152350 + }, + { + "epoch": 0.5889811507476304, + "grad_norm": 0.1267893761396408, + "learning_rate": 0.002, + "loss": 2.366, + "step": 152360 + }, + { + "epoch": 0.5890198079510136, + "grad_norm": 0.10046001523733139, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 152370 + }, + { + "epoch": 0.5890584651543969, + "grad_norm": 0.09802465885877609, + "learning_rate": 0.002, + "loss": 2.345, + "step": 152380 + }, + { + "epoch": 0.5890971223577801, + "grad_norm": 0.09976851940155029, + "learning_rate": 0.002, + "loss": 2.354, + "step": 152390 + }, + { + "epoch": 0.5891357795611635, + "grad_norm": 0.11055140197277069, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 152400 + }, + { + "epoch": 0.5891744367645467, + "grad_norm": 0.10662383586168289, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 152410 + }, + { + "epoch": 0.58921309396793, + "grad_norm": 0.11698868870735168, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 152420 + }, + { + "epoch": 0.5892517511713132, + "grad_norm": 0.10271777957677841, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 152430 + }, + { + "epoch": 0.5892904083746966, + "grad_norm": 0.10462356358766556, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 152440 + }, + { + "epoch": 0.5893290655780798, + "grad_norm": 0.13692766427993774, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 152450 + }, + { + "epoch": 0.5893677227814631, + "grad_norm": 0.09228157997131348, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 152460 + }, + { + "epoch": 0.5894063799848464, + "grad_norm": 0.1338038593530655, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 152470 + }, + { + "epoch": 0.5894450371882296, + "grad_norm": 0.10304766148328781, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 152480 + }, + { + "epoch": 0.589483694391613, + "grad_norm": 0.12008009105920792, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 152490 + }, + { + "epoch": 0.5895223515949962, + "grad_norm": 0.11367745697498322, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 152500 + }, + { + "epoch": 0.5895610087983795, + "grad_norm": 0.10777866095304489, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 152510 + }, + { + "epoch": 0.5895996660017627, + "grad_norm": 0.09322947263717651, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 152520 + }, + { + "epoch": 0.5896383232051461, + "grad_norm": 0.10844715684652328, + "learning_rate": 0.002, + "loss": 2.33, + "step": 152530 + }, + { + "epoch": 0.5896769804085293, + "grad_norm": 0.11750151962041855, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 152540 + }, + { + "epoch": 0.5897156376119126, + "grad_norm": 0.09563729166984558, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 152550 + }, + { + "epoch": 0.5897542948152958, + "grad_norm": 0.13296782970428467, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 152560 + }, + { + "epoch": 0.5897929520186792, + "grad_norm": 0.11598195135593414, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 152570 + }, + { + "epoch": 0.5898316092220625, + "grad_norm": 0.11028944700956345, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 152580 + }, + { + "epoch": 0.5898702664254457, + "grad_norm": 0.10078743100166321, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 152590 + }, + { + "epoch": 0.589908923628829, + "grad_norm": 0.11776949465274811, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 152600 + }, + { + "epoch": 0.5899475808322123, + "grad_norm": 0.10884489864110947, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 152610 + }, + { + "epoch": 0.5899862380355956, + "grad_norm": 0.10422182828187943, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 152620 + }, + { + "epoch": 0.5900248952389788, + "grad_norm": 0.10939929634332657, + "learning_rate": 0.002, + "loss": 2.34, + "step": 152630 + }, + { + "epoch": 0.5900635524423621, + "grad_norm": 0.10515302419662476, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 152640 + }, + { + "epoch": 0.5901022096457453, + "grad_norm": 0.10827838629484177, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 152650 + }, + { + "epoch": 0.5901408668491287, + "grad_norm": 0.09868855774402618, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 152660 + }, + { + "epoch": 0.590179524052512, + "grad_norm": 0.11079439520835876, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 152670 + }, + { + "epoch": 0.5902181812558952, + "grad_norm": 0.12535911798477173, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 152680 + }, + { + "epoch": 0.5902568384592785, + "grad_norm": 0.24511437118053436, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 152690 + }, + { + "epoch": 0.5902954956626618, + "grad_norm": 0.10732803493738174, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 152700 + }, + { + "epoch": 0.5903341528660451, + "grad_norm": 0.113482765853405, + "learning_rate": 0.002, + "loss": 2.362, + "step": 152710 + }, + { + "epoch": 0.5903728100694283, + "grad_norm": 0.0953284204006195, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 152720 + }, + { + "epoch": 0.5904114672728116, + "grad_norm": 0.12417623400688171, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 152730 + }, + { + "epoch": 0.5904501244761949, + "grad_norm": 0.10904087871313095, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 152740 + }, + { + "epoch": 0.5904887816795782, + "grad_norm": 0.11377087235450745, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 152750 + }, + { + "epoch": 0.5905274388829614, + "grad_norm": 0.09927655011415482, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 152760 + }, + { + "epoch": 0.5905660960863447, + "grad_norm": 0.12110207229852676, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 152770 + }, + { + "epoch": 0.5906047532897281, + "grad_norm": 0.1045595183968544, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 152780 + }, + { + "epoch": 0.5906434104931113, + "grad_norm": 0.09562791883945465, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 152790 + }, + { + "epoch": 0.5906820676964946, + "grad_norm": 0.09046997874975204, + "learning_rate": 0.002, + "loss": 2.349, + "step": 152800 + }, + { + "epoch": 0.5907207248998778, + "grad_norm": 0.12062862515449524, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 152810 + }, + { + "epoch": 0.5907593821032612, + "grad_norm": 0.11804115027189255, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 152820 + }, + { + "epoch": 0.5907980393066444, + "grad_norm": 0.10457627475261688, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 152830 + }, + { + "epoch": 0.5908366965100277, + "grad_norm": 0.09671007841825485, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 152840 + }, + { + "epoch": 0.5908753537134109, + "grad_norm": 0.11309293657541275, + "learning_rate": 0.002, + "loss": 2.351, + "step": 152850 + }, + { + "epoch": 0.5909140109167942, + "grad_norm": 0.11583933979272842, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 152860 + }, + { + "epoch": 0.5909526681201775, + "grad_norm": 0.10668080300092697, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 152870 + }, + { + "epoch": 0.5909913253235608, + "grad_norm": 0.09829465299844742, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 152880 + }, + { + "epoch": 0.591029982526944, + "grad_norm": 0.11730373650789261, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 152890 + }, + { + "epoch": 0.5910686397303273, + "grad_norm": 0.11244652420282364, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 152900 + }, + { + "epoch": 0.5911072969337107, + "grad_norm": 0.10793368518352509, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 152910 + }, + { + "epoch": 0.5911459541370939, + "grad_norm": 0.09984087944030762, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 152920 + }, + { + "epoch": 0.5911846113404772, + "grad_norm": 0.11790379136800766, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 152930 + }, + { + "epoch": 0.5912232685438604, + "grad_norm": 0.11779079586267471, + "learning_rate": 0.002, + "loss": 2.334, + "step": 152940 + }, + { + "epoch": 0.5912619257472438, + "grad_norm": 0.11105469614267349, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 152950 + }, + { + "epoch": 0.591300582950627, + "grad_norm": 0.11927516013383865, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 152960 + }, + { + "epoch": 0.5913392401540103, + "grad_norm": 0.12462148070335388, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 152970 + }, + { + "epoch": 0.5913778973573935, + "grad_norm": 0.11625771969556808, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 152980 + }, + { + "epoch": 0.5914165545607769, + "grad_norm": 0.10062477737665176, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 152990 + }, + { + "epoch": 0.5914552117641602, + "grad_norm": 0.10655297338962555, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 153000 + }, + { + "epoch": 0.5914938689675434, + "grad_norm": 0.11452042311429977, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 153010 + }, + { + "epoch": 0.5915325261709267, + "grad_norm": 0.10042541474103928, + "learning_rate": 0.002, + "loss": 2.348, + "step": 153020 + }, + { + "epoch": 0.5915711833743099, + "grad_norm": 0.09965793043375015, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 153030 + }, + { + "epoch": 0.5916098405776933, + "grad_norm": 0.10968238115310669, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 153040 + }, + { + "epoch": 0.5916484977810765, + "grad_norm": 0.12298446893692017, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 153050 + }, + { + "epoch": 0.5916871549844598, + "grad_norm": 0.12933231890201569, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 153060 + }, + { + "epoch": 0.591725812187843, + "grad_norm": 0.09767874330282211, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 153070 + }, + { + "epoch": 0.5917644693912264, + "grad_norm": 0.11389636248350143, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 153080 + }, + { + "epoch": 0.5918031265946097, + "grad_norm": 0.09770804643630981, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 153090 + }, + { + "epoch": 0.5918417837979929, + "grad_norm": 0.09838932752609253, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 153100 + }, + { + "epoch": 0.5918804410013762, + "grad_norm": 0.12737208604812622, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 153110 + }, + { + "epoch": 0.5919190982047595, + "grad_norm": 0.14221762120723724, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 153120 + }, + { + "epoch": 0.5919577554081428, + "grad_norm": 0.10190019011497498, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 153130 + }, + { + "epoch": 0.591996412611526, + "grad_norm": 0.12370285391807556, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 153140 + }, + { + "epoch": 0.5920350698149093, + "grad_norm": 0.0968901664018631, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 153150 + }, + { + "epoch": 0.5920737270182926, + "grad_norm": 0.10463910549879074, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 153160 + }, + { + "epoch": 0.5921123842216759, + "grad_norm": 0.11504076421260834, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 153170 + }, + { + "epoch": 0.5921510414250591, + "grad_norm": 0.110844686627388, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 153180 + }, + { + "epoch": 0.5921896986284424, + "grad_norm": 0.10349141061306, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 153190 + }, + { + "epoch": 0.5922283558318256, + "grad_norm": 0.1068180575966835, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 153200 + }, + { + "epoch": 0.592267013035209, + "grad_norm": 0.10640236735343933, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 153210 + }, + { + "epoch": 0.5923056702385923, + "grad_norm": 0.12542495131492615, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 153220 + }, + { + "epoch": 0.5923443274419755, + "grad_norm": 0.12914881110191345, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 153230 + }, + { + "epoch": 0.5923829846453588, + "grad_norm": 0.11109361052513123, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 153240 + }, + { + "epoch": 0.5924216418487421, + "grad_norm": 0.09706513583660126, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 153250 + }, + { + "epoch": 0.5924602990521254, + "grad_norm": 0.10857724398374557, + "learning_rate": 0.002, + "loss": 2.3067, + "step": 153260 + }, + { + "epoch": 0.5924989562555086, + "grad_norm": 0.11418808996677399, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 153270 + }, + { + "epoch": 0.5925376134588919, + "grad_norm": 0.11678769439458847, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 153280 + }, + { + "epoch": 0.5925762706622753, + "grad_norm": 0.134332537651062, + "learning_rate": 0.002, + "loss": 2.338, + "step": 153290 + }, + { + "epoch": 0.5926149278656585, + "grad_norm": 0.10128011554479599, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 153300 + }, + { + "epoch": 0.5926535850690418, + "grad_norm": 0.1074603945016861, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 153310 + }, + { + "epoch": 0.592692242272425, + "grad_norm": 0.1251911222934723, + "learning_rate": 0.002, + "loss": 2.353, + "step": 153320 + }, + { + "epoch": 0.5927308994758084, + "grad_norm": 0.09642443805932999, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 153330 + }, + { + "epoch": 0.5927695566791916, + "grad_norm": 0.11492682993412018, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 153340 + }, + { + "epoch": 0.5928082138825749, + "grad_norm": 0.0965128168463707, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 153350 + }, + { + "epoch": 0.5928468710859581, + "grad_norm": 0.12206316739320755, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 153360 + }, + { + "epoch": 0.5928855282893415, + "grad_norm": 0.10395307838916779, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 153370 + }, + { + "epoch": 0.5929241854927247, + "grad_norm": 0.12461967021226883, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 153380 + }, + { + "epoch": 0.592962842696108, + "grad_norm": 0.11842044442892075, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 153390 + }, + { + "epoch": 0.5930014998994912, + "grad_norm": 0.1287367045879364, + "learning_rate": 0.002, + "loss": 2.33, + "step": 153400 + }, + { + "epoch": 0.5930401571028745, + "grad_norm": 0.11194679886102676, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 153410 + }, + { + "epoch": 0.5930788143062579, + "grad_norm": 0.10873754322528839, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 153420 + }, + { + "epoch": 0.5931174715096411, + "grad_norm": 0.10627902299165726, + "learning_rate": 0.002, + "loss": 2.33, + "step": 153430 + }, + { + "epoch": 0.5931561287130244, + "grad_norm": 0.123879075050354, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 153440 + }, + { + "epoch": 0.5931947859164076, + "grad_norm": 0.09307146072387695, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 153450 + }, + { + "epoch": 0.593233443119791, + "grad_norm": 0.09851375222206116, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 153460 + }, + { + "epoch": 0.5932721003231742, + "grad_norm": 0.11978261172771454, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 153470 + }, + { + "epoch": 0.5933107575265575, + "grad_norm": 0.09971176832914352, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 153480 + }, + { + "epoch": 0.5933494147299407, + "grad_norm": 0.11914601922035217, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 153490 + }, + { + "epoch": 0.5933880719333241, + "grad_norm": 0.09303689748048782, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 153500 + }, + { + "epoch": 0.5934267291367074, + "grad_norm": 0.10256947576999664, + "learning_rate": 0.002, + "loss": 2.341, + "step": 153510 + }, + { + "epoch": 0.5934653863400906, + "grad_norm": 0.13114187121391296, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 153520 + }, + { + "epoch": 0.5935040435434739, + "grad_norm": 0.0972515270113945, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 153530 + }, + { + "epoch": 0.5935427007468572, + "grad_norm": 0.11419462412595749, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 153540 + }, + { + "epoch": 0.5935813579502405, + "grad_norm": 0.10986484587192535, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 153550 + }, + { + "epoch": 0.5936200151536237, + "grad_norm": 0.09035072475671768, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 153560 + }, + { + "epoch": 0.593658672357007, + "grad_norm": 0.11297523230314255, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 153570 + }, + { + "epoch": 0.5936973295603902, + "grad_norm": 0.11120027303695679, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 153580 + }, + { + "epoch": 0.5937359867637736, + "grad_norm": 0.08788348734378815, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 153590 + }, + { + "epoch": 0.5937746439671568, + "grad_norm": 0.10375424474477768, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 153600 + }, + { + "epoch": 0.5938133011705401, + "grad_norm": 0.1176786869764328, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 153610 + }, + { + "epoch": 0.5938519583739233, + "grad_norm": 0.09632980823516846, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 153620 + }, + { + "epoch": 0.5938906155773067, + "grad_norm": 0.09856001287698746, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 153630 + }, + { + "epoch": 0.59392927278069, + "grad_norm": 0.11624328047037125, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 153640 + }, + { + "epoch": 0.5939679299840732, + "grad_norm": 0.102755106985569, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 153650 + }, + { + "epoch": 0.5940065871874565, + "grad_norm": 0.10109134018421173, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 153660 + }, + { + "epoch": 0.5940452443908398, + "grad_norm": 0.10665129125118256, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 153670 + }, + { + "epoch": 0.5940839015942231, + "grad_norm": 0.1010684221982956, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 153680 + }, + { + "epoch": 0.5941225587976063, + "grad_norm": 0.10469356924295425, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 153690 + }, + { + "epoch": 0.5941612160009896, + "grad_norm": 0.12337398529052734, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 153700 + }, + { + "epoch": 0.594199873204373, + "grad_norm": 0.09571670740842819, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 153710 + }, + { + "epoch": 0.5942385304077562, + "grad_norm": 0.11873577535152435, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 153720 + }, + { + "epoch": 0.5942771876111395, + "grad_norm": 0.11242678761482239, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 153730 + }, + { + "epoch": 0.5943158448145227, + "grad_norm": 0.10509809851646423, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 153740 + }, + { + "epoch": 0.594354502017906, + "grad_norm": 0.10575684159994125, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 153750 + }, + { + "epoch": 0.5943931592212893, + "grad_norm": 0.12107384204864502, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 153760 + }, + { + "epoch": 0.5944318164246726, + "grad_norm": 0.10724588483572006, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 153770 + }, + { + "epoch": 0.5944704736280558, + "grad_norm": 0.10052936524152756, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 153780 + }, + { + "epoch": 0.5945091308314391, + "grad_norm": 0.10812316834926605, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 153790 + }, + { + "epoch": 0.5945477880348224, + "grad_norm": 0.09277022629976273, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 153800 + }, + { + "epoch": 0.5945864452382057, + "grad_norm": 0.09093903750181198, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 153810 + }, + { + "epoch": 0.594625102441589, + "grad_norm": 0.11882476508617401, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 153820 + }, + { + "epoch": 0.5946637596449722, + "grad_norm": 0.09744821488857269, + "learning_rate": 0.002, + "loss": 2.333, + "step": 153830 + }, + { + "epoch": 0.5947024168483556, + "grad_norm": 0.1002984419465065, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 153840 + }, + { + "epoch": 0.5947410740517388, + "grad_norm": 0.10048675537109375, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 153850 + }, + { + "epoch": 0.5947797312551221, + "grad_norm": 0.12754768133163452, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 153860 + }, + { + "epoch": 0.5948183884585053, + "grad_norm": 0.13870207965373993, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 153870 + }, + { + "epoch": 0.5948570456618887, + "grad_norm": 0.0983053594827652, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 153880 + }, + { + "epoch": 0.5948957028652719, + "grad_norm": 0.10627438127994537, + "learning_rate": 0.002, + "loss": 2.342, + "step": 153890 + }, + { + "epoch": 0.5949343600686552, + "grad_norm": 0.10147716104984283, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 153900 + }, + { + "epoch": 0.5949730172720384, + "grad_norm": 0.11695457249879837, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 153910 + }, + { + "epoch": 0.5950116744754218, + "grad_norm": 0.10442083328962326, + "learning_rate": 0.002, + "loss": 2.336, + "step": 153920 + }, + { + "epoch": 0.5950503316788051, + "grad_norm": 0.10922045260667801, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 153930 + }, + { + "epoch": 0.5950889888821883, + "grad_norm": 0.09638384729623795, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 153940 + }, + { + "epoch": 0.5951276460855716, + "grad_norm": 0.09987162798643112, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 153950 + }, + { + "epoch": 0.5951663032889548, + "grad_norm": 0.11333297193050385, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 153960 + }, + { + "epoch": 0.5952049604923382, + "grad_norm": 0.11005302518606186, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 153970 + }, + { + "epoch": 0.5952436176957214, + "grad_norm": 0.09839659184217453, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 153980 + }, + { + "epoch": 0.5952822748991047, + "grad_norm": 0.1176500916481018, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 153990 + }, + { + "epoch": 0.5953209321024879, + "grad_norm": 0.1214519664645195, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 154000 + }, + { + "epoch": 0.5953595893058713, + "grad_norm": 0.10236170887947083, + "learning_rate": 0.002, + "loss": 2.352, + "step": 154010 + }, + { + "epoch": 0.5953982465092545, + "grad_norm": 0.11794347316026688, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 154020 + }, + { + "epoch": 0.5954369037126378, + "grad_norm": 0.1112416610121727, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 154030 + }, + { + "epoch": 0.595475560916021, + "grad_norm": 0.09213864058256149, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 154040 + }, + { + "epoch": 0.5955142181194044, + "grad_norm": 0.11961939930915833, + "learning_rate": 0.002, + "loss": 2.341, + "step": 154050 + }, + { + "epoch": 0.5955528753227877, + "grad_norm": 0.10888458788394928, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 154060 + }, + { + "epoch": 0.5955915325261709, + "grad_norm": 0.13337060809135437, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 154070 + }, + { + "epoch": 0.5956301897295542, + "grad_norm": 0.12271896749734879, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 154080 + }, + { + "epoch": 0.5956688469329375, + "grad_norm": 0.114785335958004, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 154090 + }, + { + "epoch": 0.5957075041363208, + "grad_norm": 0.11995737999677658, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 154100 + }, + { + "epoch": 0.595746161339704, + "grad_norm": 0.11258647590875626, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 154110 + }, + { + "epoch": 0.5957848185430873, + "grad_norm": 0.12628565728664398, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 154120 + }, + { + "epoch": 0.5958234757464705, + "grad_norm": 0.1209024041891098, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 154130 + }, + { + "epoch": 0.5958621329498539, + "grad_norm": 0.10436531901359558, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 154140 + }, + { + "epoch": 0.5959007901532372, + "grad_norm": 0.11716852337121964, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 154150 + }, + { + "epoch": 0.5959394473566204, + "grad_norm": 0.10218223929405212, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 154160 + }, + { + "epoch": 0.5959781045600037, + "grad_norm": 0.12400393187999725, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 154170 + }, + { + "epoch": 0.596016761763387, + "grad_norm": 0.1067901998758316, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 154180 + }, + { + "epoch": 0.5960554189667703, + "grad_norm": 0.10009181499481201, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 154190 + }, + { + "epoch": 0.5960940761701535, + "grad_norm": 0.1042267307639122, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 154200 + }, + { + "epoch": 0.5961327333735368, + "grad_norm": 0.1324894279241562, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 154210 + }, + { + "epoch": 0.5961713905769201, + "grad_norm": 0.11904510855674744, + "learning_rate": 0.002, + "loss": 2.346, + "step": 154220 + }, + { + "epoch": 0.5962100477803034, + "grad_norm": 0.0989355519413948, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 154230 + }, + { + "epoch": 0.5962487049836867, + "grad_norm": 0.12672419846057892, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 154240 + }, + { + "epoch": 0.5962873621870699, + "grad_norm": 0.11545150727033615, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 154250 + }, + { + "epoch": 0.5963260193904533, + "grad_norm": 0.11972194910049438, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 154260 + }, + { + "epoch": 0.5963646765938365, + "grad_norm": 0.13377872109413147, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 154270 + }, + { + "epoch": 0.5964033337972198, + "grad_norm": 0.10976868122816086, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 154280 + }, + { + "epoch": 0.596441991000603, + "grad_norm": 0.1025083065032959, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 154290 + }, + { + "epoch": 0.5964806482039864, + "grad_norm": 0.09834298491477966, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 154300 + }, + { + "epoch": 0.5965193054073696, + "grad_norm": 0.12274927645921707, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 154310 + }, + { + "epoch": 0.5965579626107529, + "grad_norm": 0.09800335764884949, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 154320 + }, + { + "epoch": 0.5965966198141361, + "grad_norm": 0.09520439058542252, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 154330 + }, + { + "epoch": 0.5966352770175194, + "grad_norm": 0.10089229792356491, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 154340 + }, + { + "epoch": 0.5966739342209028, + "grad_norm": 0.11390987038612366, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 154350 + }, + { + "epoch": 0.596712591424286, + "grad_norm": 0.09597501158714294, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 154360 + }, + { + "epoch": 0.5967512486276693, + "grad_norm": 0.12074465304613113, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 154370 + }, + { + "epoch": 0.5967899058310525, + "grad_norm": 0.13768284022808075, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 154380 + }, + { + "epoch": 0.5968285630344359, + "grad_norm": 0.11336904764175415, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 154390 + }, + { + "epoch": 0.5968672202378191, + "grad_norm": 0.1107833981513977, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 154400 + }, + { + "epoch": 0.5969058774412024, + "grad_norm": 0.10331069678068161, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 154410 + }, + { + "epoch": 0.5969445346445856, + "grad_norm": 0.11662972718477249, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 154420 + }, + { + "epoch": 0.596983191847969, + "grad_norm": 0.10128601640462875, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 154430 + }, + { + "epoch": 0.5970218490513522, + "grad_norm": 0.09762250632047653, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 154440 + }, + { + "epoch": 0.5970605062547355, + "grad_norm": 0.10796613991260529, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 154450 + }, + { + "epoch": 0.5970991634581188, + "grad_norm": 0.13601264357566833, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 154460 + }, + { + "epoch": 0.5971378206615021, + "grad_norm": 0.10176949948072433, + "learning_rate": 0.002, + "loss": 2.3762, + "step": 154470 + }, + { + "epoch": 0.5971764778648854, + "grad_norm": 0.10595223307609558, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 154480 + }, + { + "epoch": 0.5972151350682686, + "grad_norm": 0.10171402990818024, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 154490 + }, + { + "epoch": 0.5972537922716519, + "grad_norm": 0.11845794320106506, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 154500 + }, + { + "epoch": 0.5972924494750351, + "grad_norm": 0.09843221306800842, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 154510 + }, + { + "epoch": 0.5973311066784185, + "grad_norm": 0.09988819062709808, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 154520 + }, + { + "epoch": 0.5973697638818017, + "grad_norm": 0.11028525233268738, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 154530 + }, + { + "epoch": 0.597408421085185, + "grad_norm": 0.10710286349058151, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 154540 + }, + { + "epoch": 0.5974470782885682, + "grad_norm": 0.11657308787107468, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 154550 + }, + { + "epoch": 0.5974857354919516, + "grad_norm": 0.1102280244231224, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 154560 + }, + { + "epoch": 0.5975243926953349, + "grad_norm": 0.10331825911998749, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 154570 + }, + { + "epoch": 0.5975630498987181, + "grad_norm": 0.11934579163789749, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 154580 + }, + { + "epoch": 0.5976017071021014, + "grad_norm": 0.11135943233966827, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 154590 + }, + { + "epoch": 0.5976403643054847, + "grad_norm": 0.1203981563448906, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 154600 + }, + { + "epoch": 0.597679021508868, + "grad_norm": 0.0986235961318016, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 154610 + }, + { + "epoch": 0.5977176787122512, + "grad_norm": 0.09534791857004166, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 154620 + }, + { + "epoch": 0.5977563359156345, + "grad_norm": 0.09276114404201508, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 154630 + }, + { + "epoch": 0.5977949931190178, + "grad_norm": 0.11678782850503922, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 154640 + }, + { + "epoch": 0.5978336503224011, + "grad_norm": 0.4351901412010193, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 154650 + }, + { + "epoch": 0.5978723075257844, + "grad_norm": 0.1325816512107849, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 154660 + }, + { + "epoch": 0.5979109647291676, + "grad_norm": 0.11587052047252655, + "learning_rate": 0.002, + "loss": 2.342, + "step": 154670 + }, + { + "epoch": 0.5979496219325509, + "grad_norm": 0.12498050928115845, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 154680 + }, + { + "epoch": 0.5979882791359342, + "grad_norm": 0.11049990355968475, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 154690 + }, + { + "epoch": 0.5980269363393175, + "grad_norm": 0.11567365378141403, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 154700 + }, + { + "epoch": 0.5980655935427007, + "grad_norm": 0.10405659675598145, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 154710 + }, + { + "epoch": 0.598104250746084, + "grad_norm": 0.11374452710151672, + "learning_rate": 0.002, + "loss": 2.352, + "step": 154720 + }, + { + "epoch": 0.5981429079494673, + "grad_norm": 0.12257920950651169, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 154730 + }, + { + "epoch": 0.5981815651528506, + "grad_norm": 0.10406675189733505, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 154740 + }, + { + "epoch": 0.5982202223562338, + "grad_norm": 0.13149400055408478, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 154750 + }, + { + "epoch": 0.5982588795596171, + "grad_norm": 0.1053181141614914, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 154760 + }, + { + "epoch": 0.5982975367630005, + "grad_norm": 0.10634828358888626, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 154770 + }, + { + "epoch": 0.5983361939663837, + "grad_norm": 0.10084746778011322, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 154780 + }, + { + "epoch": 0.598374851169767, + "grad_norm": 0.09804457426071167, + "learning_rate": 0.002, + "loss": 2.353, + "step": 154790 + }, + { + "epoch": 0.5984135083731502, + "grad_norm": 0.12280680984258652, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 154800 + }, + { + "epoch": 0.5984521655765336, + "grad_norm": 0.11038525402545929, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 154810 + }, + { + "epoch": 0.5984908227799168, + "grad_norm": 0.10350499302148819, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 154820 + }, + { + "epoch": 0.5985294799833001, + "grad_norm": 0.08930511027574539, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 154830 + }, + { + "epoch": 0.5985681371866833, + "grad_norm": 0.13034844398498535, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 154840 + }, + { + "epoch": 0.5986067943900667, + "grad_norm": 0.10703147202730179, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 154850 + }, + { + "epoch": 0.59864545159345, + "grad_norm": 0.10099921375513077, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 154860 + }, + { + "epoch": 0.5986841087968332, + "grad_norm": 0.1223965436220169, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 154870 + }, + { + "epoch": 0.5987227660002165, + "grad_norm": 0.13722404837608337, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 154880 + }, + { + "epoch": 0.5987614232035997, + "grad_norm": 0.13416677713394165, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 154890 + }, + { + "epoch": 0.5988000804069831, + "grad_norm": 0.11614516377449036, + "learning_rate": 0.002, + "loss": 2.34, + "step": 154900 + }, + { + "epoch": 0.5988387376103663, + "grad_norm": 0.10839620977640152, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 154910 + }, + { + "epoch": 0.5988773948137496, + "grad_norm": 0.10474540293216705, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 154920 + }, + { + "epoch": 0.5989160520171328, + "grad_norm": 0.12745654582977295, + "learning_rate": 0.002, + "loss": 2.348, + "step": 154930 + }, + { + "epoch": 0.5989547092205162, + "grad_norm": 0.09551647305488586, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 154940 + }, + { + "epoch": 0.5989933664238994, + "grad_norm": 0.1009441688656807, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 154950 + }, + { + "epoch": 0.5990320236272827, + "grad_norm": 0.10916905105113983, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 154960 + }, + { + "epoch": 0.599070680830666, + "grad_norm": 0.10929003357887268, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 154970 + }, + { + "epoch": 0.5991093380340493, + "grad_norm": 0.10747554153203964, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 154980 + }, + { + "epoch": 0.5991479952374326, + "grad_norm": 0.12056456506252289, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 154990 + }, + { + "epoch": 0.5991866524408158, + "grad_norm": 0.13263089954853058, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 155000 + }, + { + "epoch": 0.5992253096441991, + "grad_norm": 0.10803119838237762, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 155010 + }, + { + "epoch": 0.5992639668475824, + "grad_norm": 0.10046329349279404, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 155020 + }, + { + "epoch": 0.5993026240509657, + "grad_norm": 0.13293473422527313, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 155030 + }, + { + "epoch": 0.5993412812543489, + "grad_norm": 0.1141979917883873, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 155040 + }, + { + "epoch": 0.5993799384577322, + "grad_norm": 0.100059375166893, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 155050 + }, + { + "epoch": 0.5994185956611154, + "grad_norm": 0.11091131716966629, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 155060 + }, + { + "epoch": 0.5994572528644988, + "grad_norm": 0.1243298128247261, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 155070 + }, + { + "epoch": 0.599495910067882, + "grad_norm": 0.11147356033325195, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 155080 + }, + { + "epoch": 0.5995345672712653, + "grad_norm": 0.1063820868730545, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 155090 + }, + { + "epoch": 0.5995732244746486, + "grad_norm": 0.117628313601017, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 155100 + }, + { + "epoch": 0.5996118816780319, + "grad_norm": 0.1024029403924942, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 155110 + }, + { + "epoch": 0.5996505388814152, + "grad_norm": 0.10223310440778732, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 155120 + }, + { + "epoch": 0.5996891960847984, + "grad_norm": 0.12325169891119003, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 155130 + }, + { + "epoch": 0.5997278532881817, + "grad_norm": 0.11169067770242691, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 155140 + }, + { + "epoch": 0.599766510491565, + "grad_norm": 0.10746439546346664, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 155150 + }, + { + "epoch": 0.5998051676949483, + "grad_norm": 0.09545475989580154, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 155160 + }, + { + "epoch": 0.5998438248983315, + "grad_norm": 0.10252842307090759, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 155170 + }, + { + "epoch": 0.5998824821017148, + "grad_norm": 0.10515642911195755, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 155180 + }, + { + "epoch": 0.5999211393050982, + "grad_norm": 0.09995514154434204, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 155190 + }, + { + "epoch": 0.5999597965084814, + "grad_norm": 0.10166479647159576, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 155200 + }, + { + "epoch": 0.5999984537118647, + "grad_norm": 0.10672900080680847, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 155210 + }, + { + "epoch": 0.6000371109152479, + "grad_norm": 0.12125857174396515, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 155220 + }, + { + "epoch": 0.6000757681186313, + "grad_norm": 0.11408720165491104, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 155230 + }, + { + "epoch": 0.6001144253220145, + "grad_norm": 0.11315006017684937, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 155240 + }, + { + "epoch": 0.6001530825253978, + "grad_norm": 0.11115438491106033, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 155250 + }, + { + "epoch": 0.600191739728781, + "grad_norm": 0.11406593769788742, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 155260 + }, + { + "epoch": 0.6002303969321643, + "grad_norm": 0.11923495680093765, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 155270 + }, + { + "epoch": 0.6002690541355477, + "grad_norm": 0.10646390169858932, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 155280 + }, + { + "epoch": 0.6003077113389309, + "grad_norm": 0.11619962006807327, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 155290 + }, + { + "epoch": 0.6003463685423142, + "grad_norm": 0.09947273135185242, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 155300 + }, + { + "epoch": 0.6003850257456974, + "grad_norm": 0.10065237432718277, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 155310 + }, + { + "epoch": 0.6004236829490808, + "grad_norm": 0.10713927447795868, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 155320 + }, + { + "epoch": 0.600462340152464, + "grad_norm": 0.12866121530532837, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 155330 + }, + { + "epoch": 0.6005009973558473, + "grad_norm": 0.1103544682264328, + "learning_rate": 0.002, + "loss": 2.363, + "step": 155340 + }, + { + "epoch": 0.6005396545592305, + "grad_norm": 0.1077926978468895, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 155350 + }, + { + "epoch": 0.6005783117626139, + "grad_norm": 0.11506489664316177, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 155360 + }, + { + "epoch": 0.6006169689659971, + "grad_norm": 0.09376713633537292, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 155370 + }, + { + "epoch": 0.6006556261693804, + "grad_norm": 0.12837748229503632, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 155380 + }, + { + "epoch": 0.6006942833727636, + "grad_norm": 0.1213526576757431, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 155390 + }, + { + "epoch": 0.600732940576147, + "grad_norm": 0.09528365731239319, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 155400 + }, + { + "epoch": 0.6007715977795303, + "grad_norm": 0.09534302353858948, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 155410 + }, + { + "epoch": 0.6008102549829135, + "grad_norm": 0.13553263247013092, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 155420 + }, + { + "epoch": 0.6008489121862968, + "grad_norm": 0.11701808869838715, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 155430 + }, + { + "epoch": 0.60088756938968, + "grad_norm": 0.10585474222898483, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 155440 + }, + { + "epoch": 0.6009262265930634, + "grad_norm": 0.1210872009396553, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 155450 + }, + { + "epoch": 0.6009648837964466, + "grad_norm": 0.11863139271736145, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 155460 + }, + { + "epoch": 0.6010035409998299, + "grad_norm": 0.12181145697832108, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 155470 + }, + { + "epoch": 0.6010421982032131, + "grad_norm": 0.09719795733690262, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 155480 + }, + { + "epoch": 0.6010808554065965, + "grad_norm": 0.09969165176153183, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 155490 + }, + { + "epoch": 0.6011195126099798, + "grad_norm": 0.12080059945583344, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 155500 + }, + { + "epoch": 0.601158169813363, + "grad_norm": 0.10750175267457962, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 155510 + }, + { + "epoch": 0.6011968270167463, + "grad_norm": 0.11188539862632751, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 155520 + }, + { + "epoch": 0.6012354842201296, + "grad_norm": 0.12191398441791534, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 155530 + }, + { + "epoch": 0.6012741414235129, + "grad_norm": 0.10276643931865692, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 155540 + }, + { + "epoch": 0.6013127986268961, + "grad_norm": 0.1120799109339714, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 155550 + }, + { + "epoch": 0.6013514558302794, + "grad_norm": 0.11369843035936356, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 155560 + }, + { + "epoch": 0.6013901130336627, + "grad_norm": 0.13047267496585846, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 155570 + }, + { + "epoch": 0.601428770237046, + "grad_norm": 0.10796220600605011, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 155580 + }, + { + "epoch": 0.6014674274404292, + "grad_norm": 0.09764180332422256, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 155590 + }, + { + "epoch": 0.6015060846438125, + "grad_norm": 0.11820457130670547, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 155600 + }, + { + "epoch": 0.6015447418471958, + "grad_norm": 0.11059782654047012, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 155610 + }, + { + "epoch": 0.6015833990505791, + "grad_norm": 0.0945165678858757, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 155620 + }, + { + "epoch": 0.6016220562539624, + "grad_norm": 0.1834547221660614, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 155630 + }, + { + "epoch": 0.6016607134573456, + "grad_norm": 0.12191552668809891, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 155640 + }, + { + "epoch": 0.6016993706607289, + "grad_norm": 0.1166863664984703, + "learning_rate": 0.002, + "loss": 2.356, + "step": 155650 + }, + { + "epoch": 0.6017380278641122, + "grad_norm": 0.09542674571275711, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 155660 + }, + { + "epoch": 0.6017766850674955, + "grad_norm": 0.11937938630580902, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 155670 + }, + { + "epoch": 0.6018153422708787, + "grad_norm": 0.1050414890050888, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 155680 + }, + { + "epoch": 0.601853999474262, + "grad_norm": 0.10865213721990585, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 155690 + }, + { + "epoch": 0.6018926566776454, + "grad_norm": 0.11925183981657028, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 155700 + }, + { + "epoch": 0.6019313138810286, + "grad_norm": 0.10119999945163727, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 155710 + }, + { + "epoch": 0.6019699710844119, + "grad_norm": 0.10889890044927597, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 155720 + }, + { + "epoch": 0.6020086282877951, + "grad_norm": 0.10714955627918243, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 155730 + }, + { + "epoch": 0.6020472854911785, + "grad_norm": 0.12309153378009796, + "learning_rate": 0.002, + "loss": 2.361, + "step": 155740 + }, + { + "epoch": 0.6020859426945617, + "grad_norm": 0.09973911941051483, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 155750 + }, + { + "epoch": 0.602124599897945, + "grad_norm": 0.13327579200267792, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 155760 + }, + { + "epoch": 0.6021632571013282, + "grad_norm": 0.10561807453632355, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 155770 + }, + { + "epoch": 0.6022019143047116, + "grad_norm": 0.10778845101594925, + "learning_rate": 0.002, + "loss": 2.348, + "step": 155780 + }, + { + "epoch": 0.6022405715080948, + "grad_norm": 0.1271054595708847, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 155790 + }, + { + "epoch": 0.6022792287114781, + "grad_norm": 0.10283089429140091, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 155800 + }, + { + "epoch": 0.6023178859148614, + "grad_norm": 0.16596998274326324, + "learning_rate": 0.002, + "loss": 2.357, + "step": 155810 + }, + { + "epoch": 0.6023565431182446, + "grad_norm": 0.1061653345823288, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 155820 + }, + { + "epoch": 0.602395200321628, + "grad_norm": 0.10739660263061523, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 155830 + }, + { + "epoch": 0.6024338575250112, + "grad_norm": 0.11349905282258987, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 155840 + }, + { + "epoch": 0.6024725147283945, + "grad_norm": 0.11594951897859573, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 155850 + }, + { + "epoch": 0.6025111719317777, + "grad_norm": 0.11781451106071472, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 155860 + }, + { + "epoch": 0.6025498291351611, + "grad_norm": 0.10783499479293823, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 155870 + }, + { + "epoch": 0.6025884863385443, + "grad_norm": 0.0937013030052185, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 155880 + }, + { + "epoch": 0.6026271435419276, + "grad_norm": 0.11056476086378098, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 155890 + }, + { + "epoch": 0.6026658007453108, + "grad_norm": 0.1398683488368988, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 155900 + }, + { + "epoch": 0.6027044579486942, + "grad_norm": 0.10671201348304749, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 155910 + }, + { + "epoch": 0.6027431151520775, + "grad_norm": 0.10434763878583908, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 155920 + }, + { + "epoch": 0.6027817723554607, + "grad_norm": 0.10570269823074341, + "learning_rate": 0.002, + "loss": 2.332, + "step": 155930 + }, + { + "epoch": 0.602820429558844, + "grad_norm": 0.08993524312973022, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 155940 + }, + { + "epoch": 0.6028590867622273, + "grad_norm": 0.11433251202106476, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 155950 + }, + { + "epoch": 0.6028977439656106, + "grad_norm": 0.12297467142343521, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 155960 + }, + { + "epoch": 0.6029364011689938, + "grad_norm": 0.09785819798707962, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 155970 + }, + { + "epoch": 0.6029750583723771, + "grad_norm": 0.10338174551725388, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 155980 + }, + { + "epoch": 0.6030137155757603, + "grad_norm": 0.13314157724380493, + "learning_rate": 0.002, + "loss": 2.363, + "step": 155990 + }, + { + "epoch": 0.6030523727791437, + "grad_norm": 0.1302737146615982, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 156000 + }, + { + "epoch": 0.603091029982527, + "grad_norm": 0.13459540903568268, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 156010 + }, + { + "epoch": 0.6031296871859102, + "grad_norm": 0.1265462189912796, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 156020 + }, + { + "epoch": 0.6031683443892935, + "grad_norm": 0.13730959594249725, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 156030 + }, + { + "epoch": 0.6032070015926768, + "grad_norm": 0.09791308641433716, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 156040 + }, + { + "epoch": 0.6032456587960601, + "grad_norm": 0.09639491885900497, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 156050 + }, + { + "epoch": 0.6032843159994433, + "grad_norm": 0.11070328950881958, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 156060 + }, + { + "epoch": 0.6033229732028266, + "grad_norm": 0.10137160122394562, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 156070 + }, + { + "epoch": 0.6033616304062099, + "grad_norm": 0.11534697562456131, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 156080 + }, + { + "epoch": 0.6034002876095932, + "grad_norm": 0.1556681990623474, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 156090 + }, + { + "epoch": 0.6034389448129764, + "grad_norm": 0.11398902535438538, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 156100 + }, + { + "epoch": 0.6034776020163597, + "grad_norm": 0.09806044399738312, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 156110 + }, + { + "epoch": 0.6035162592197431, + "grad_norm": 0.11352315545082092, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 156120 + }, + { + "epoch": 0.6035549164231263, + "grad_norm": 0.09236298501491547, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 156130 + }, + { + "epoch": 0.6035935736265096, + "grad_norm": 0.0992632508277893, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 156140 + }, + { + "epoch": 0.6036322308298928, + "grad_norm": 0.1060241162776947, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 156150 + }, + { + "epoch": 0.6036708880332761, + "grad_norm": 0.10750989615917206, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 156160 + }, + { + "epoch": 0.6037095452366594, + "grad_norm": 0.11849751323461533, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 156170 + }, + { + "epoch": 0.6037482024400427, + "grad_norm": 0.10138081014156342, + "learning_rate": 0.002, + "loss": 2.341, + "step": 156180 + }, + { + "epoch": 0.6037868596434259, + "grad_norm": 0.10642571747303009, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 156190 + }, + { + "epoch": 0.6038255168468092, + "grad_norm": 0.1288752406835556, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 156200 + }, + { + "epoch": 0.6038641740501925, + "grad_norm": 0.09953559935092926, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 156210 + }, + { + "epoch": 0.6039028312535758, + "grad_norm": 0.10579574853181839, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 156220 + }, + { + "epoch": 0.603941488456959, + "grad_norm": 0.0933934897184372, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 156230 + }, + { + "epoch": 0.6039801456603423, + "grad_norm": 0.10033083707094193, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 156240 + }, + { + "epoch": 0.6040188028637257, + "grad_norm": 0.10382883250713348, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 156250 + }, + { + "epoch": 0.6040574600671089, + "grad_norm": 0.10549649596214294, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 156260 + }, + { + "epoch": 0.6040961172704922, + "grad_norm": 0.11817780882120132, + "learning_rate": 0.002, + "loss": 2.371, + "step": 156270 + }, + { + "epoch": 0.6041347744738754, + "grad_norm": 0.12325211614370346, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 156280 + }, + { + "epoch": 0.6041734316772588, + "grad_norm": 0.1172434538602829, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 156290 + }, + { + "epoch": 0.604212088880642, + "grad_norm": 0.09819969534873962, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 156300 + }, + { + "epoch": 0.6042507460840253, + "grad_norm": 0.10299625247716904, + "learning_rate": 0.002, + "loss": 2.355, + "step": 156310 + }, + { + "epoch": 0.6042894032874085, + "grad_norm": 0.09782897680997849, + "learning_rate": 0.002, + "loss": 2.3641, + "step": 156320 + }, + { + "epoch": 0.6043280604907919, + "grad_norm": 0.11144860088825226, + "learning_rate": 0.002, + "loss": 2.349, + "step": 156330 + }, + { + "epoch": 0.6043667176941752, + "grad_norm": 0.11498149484395981, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 156340 + }, + { + "epoch": 0.6044053748975584, + "grad_norm": 0.09959711879491806, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 156350 + }, + { + "epoch": 0.6044440321009417, + "grad_norm": 0.10747111588716507, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 156360 + }, + { + "epoch": 0.6044826893043249, + "grad_norm": 0.09978053718805313, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 156370 + }, + { + "epoch": 0.6045213465077083, + "grad_norm": 0.10574757307767868, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 156380 + }, + { + "epoch": 0.6045600037110915, + "grad_norm": 0.1058548167347908, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 156390 + }, + { + "epoch": 0.6045986609144748, + "grad_norm": 0.11262711137533188, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 156400 + }, + { + "epoch": 0.604637318117858, + "grad_norm": 0.10832460969686508, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 156410 + }, + { + "epoch": 0.6046759753212414, + "grad_norm": 0.10032720118761063, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 156420 + }, + { + "epoch": 0.6047146325246247, + "grad_norm": 0.1258467733860016, + "learning_rate": 0.002, + "loss": 2.337, + "step": 156430 + }, + { + "epoch": 0.6047532897280079, + "grad_norm": 0.10651124268770218, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 156440 + }, + { + "epoch": 0.6047919469313912, + "grad_norm": 0.1075170561671257, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 156450 + }, + { + "epoch": 0.6048306041347745, + "grad_norm": 0.10806979238986969, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 156460 + }, + { + "epoch": 0.6048692613381578, + "grad_norm": 0.11116418987512589, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 156470 + }, + { + "epoch": 0.604907918541541, + "grad_norm": 0.10847268253564835, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 156480 + }, + { + "epoch": 0.6049465757449243, + "grad_norm": 0.12186453491449356, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 156490 + }, + { + "epoch": 0.6049852329483076, + "grad_norm": 0.09700154513120651, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 156500 + }, + { + "epoch": 0.6050238901516909, + "grad_norm": 0.09895280003547668, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 156510 + }, + { + "epoch": 0.6050625473550741, + "grad_norm": 0.10787780582904816, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 156520 + }, + { + "epoch": 0.6051012045584574, + "grad_norm": 0.08980587869882584, + "learning_rate": 0.002, + "loss": 2.347, + "step": 156530 + }, + { + "epoch": 0.6051398617618406, + "grad_norm": 0.09867847710847855, + "learning_rate": 0.002, + "loss": 2.334, + "step": 156540 + }, + { + "epoch": 0.605178518965224, + "grad_norm": 0.110805444419384, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 156550 + }, + { + "epoch": 0.6052171761686073, + "grad_norm": 0.10935091972351074, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 156560 + }, + { + "epoch": 0.6052558333719905, + "grad_norm": 0.1027233675122261, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 156570 + }, + { + "epoch": 0.6052944905753738, + "grad_norm": 0.11280585825443268, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 156580 + }, + { + "epoch": 0.6053331477787571, + "grad_norm": 0.09640748798847198, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 156590 + }, + { + "epoch": 0.6053718049821404, + "grad_norm": 0.09245883673429489, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 156600 + }, + { + "epoch": 0.6054104621855236, + "grad_norm": 0.16029566526412964, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 156610 + }, + { + "epoch": 0.6054491193889069, + "grad_norm": 0.10584169626235962, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 156620 + }, + { + "epoch": 0.6054877765922902, + "grad_norm": 0.10097134113311768, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 156630 + }, + { + "epoch": 0.6055264337956735, + "grad_norm": 0.10749845951795578, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 156640 + }, + { + "epoch": 0.6055650909990568, + "grad_norm": 0.10039525479078293, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 156650 + }, + { + "epoch": 0.60560374820244, + "grad_norm": 0.1171375960111618, + "learning_rate": 0.002, + "loss": 2.3654, + "step": 156660 + }, + { + "epoch": 0.6056424054058234, + "grad_norm": 0.10131317377090454, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 156670 + }, + { + "epoch": 0.6056810626092066, + "grad_norm": 0.10574357956647873, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 156680 + }, + { + "epoch": 0.6057197198125899, + "grad_norm": 0.12090755254030228, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 156690 + }, + { + "epoch": 0.6057583770159731, + "grad_norm": 0.1064232811331749, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 156700 + }, + { + "epoch": 0.6057970342193565, + "grad_norm": 0.09819698333740234, + "learning_rate": 0.002, + "loss": 2.32, + "step": 156710 + }, + { + "epoch": 0.6058356914227397, + "grad_norm": 0.12896254658699036, + "learning_rate": 0.002, + "loss": 2.3706, + "step": 156720 + }, + { + "epoch": 0.605874348626123, + "grad_norm": 0.10714369267225266, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 156730 + }, + { + "epoch": 0.6059130058295062, + "grad_norm": 0.11655072122812271, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 156740 + }, + { + "epoch": 0.6059516630328895, + "grad_norm": 0.10611128062009811, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 156750 + }, + { + "epoch": 0.6059903202362729, + "grad_norm": 0.09566417336463928, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 156760 + }, + { + "epoch": 0.6060289774396561, + "grad_norm": 0.09997794032096863, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 156770 + }, + { + "epoch": 0.6060676346430394, + "grad_norm": 0.12219233810901642, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 156780 + }, + { + "epoch": 0.6061062918464226, + "grad_norm": 0.1029295101761818, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 156790 + }, + { + "epoch": 0.606144949049806, + "grad_norm": 0.09928140044212341, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 156800 + }, + { + "epoch": 0.6061836062531892, + "grad_norm": 0.11081463098526001, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 156810 + }, + { + "epoch": 0.6062222634565725, + "grad_norm": 0.11391083896160126, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 156820 + }, + { + "epoch": 0.6062609206599557, + "grad_norm": 0.12047885358333588, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 156830 + }, + { + "epoch": 0.6062995778633391, + "grad_norm": 0.1044929251074791, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 156840 + }, + { + "epoch": 0.6063382350667224, + "grad_norm": 0.1222505047917366, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 156850 + }, + { + "epoch": 0.6063768922701056, + "grad_norm": 0.10657400637865067, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 156860 + }, + { + "epoch": 0.6064155494734889, + "grad_norm": 0.10880149900913239, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 156870 + }, + { + "epoch": 0.6064542066768722, + "grad_norm": 0.10814648121595383, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 156880 + }, + { + "epoch": 0.6064928638802555, + "grad_norm": 0.09671936929225922, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 156890 + }, + { + "epoch": 0.6065315210836387, + "grad_norm": 0.10830371081829071, + "learning_rate": 0.002, + "loss": 2.352, + "step": 156900 + }, + { + "epoch": 0.606570178287022, + "grad_norm": 0.1003207340836525, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 156910 + }, + { + "epoch": 0.6066088354904052, + "grad_norm": 0.12952585518360138, + "learning_rate": 0.002, + "loss": 2.341, + "step": 156920 + }, + { + "epoch": 0.6066474926937886, + "grad_norm": 0.09259293973445892, + "learning_rate": 0.002, + "loss": 2.347, + "step": 156930 + }, + { + "epoch": 0.6066861498971718, + "grad_norm": 0.1007172018289566, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 156940 + }, + { + "epoch": 0.6067248071005551, + "grad_norm": 0.13747857511043549, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 156950 + }, + { + "epoch": 0.6067634643039383, + "grad_norm": 0.11612991988658905, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 156960 + }, + { + "epoch": 0.6068021215073217, + "grad_norm": 0.11666108667850494, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 156970 + }, + { + "epoch": 0.606840778710705, + "grad_norm": 0.10534953325986862, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 156980 + }, + { + "epoch": 0.6068794359140882, + "grad_norm": 0.11357153207063675, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 156990 + }, + { + "epoch": 0.6069180931174715, + "grad_norm": 0.08578240126371384, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 157000 + }, + { + "epoch": 0.6069567503208548, + "grad_norm": 0.12134891003370285, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 157010 + }, + { + "epoch": 0.6069954075242381, + "grad_norm": 0.10584051162004471, + "learning_rate": 0.002, + "loss": 2.358, + "step": 157020 + }, + { + "epoch": 0.6070340647276213, + "grad_norm": 0.11070246249437332, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 157030 + }, + { + "epoch": 0.6070727219310046, + "grad_norm": 0.11073992401361465, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 157040 + }, + { + "epoch": 0.607111379134388, + "grad_norm": 0.10716310143470764, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 157050 + }, + { + "epoch": 0.6071500363377712, + "grad_norm": 0.11177654564380646, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 157060 + }, + { + "epoch": 0.6071886935411545, + "grad_norm": 0.10450034588575363, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 157070 + }, + { + "epoch": 0.6072273507445377, + "grad_norm": 0.12878698110580444, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 157080 + }, + { + "epoch": 0.607266007947921, + "grad_norm": 0.10073508322238922, + "learning_rate": 0.002, + "loss": 2.353, + "step": 157090 + }, + { + "epoch": 0.6073046651513043, + "grad_norm": 0.1201476901769638, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 157100 + }, + { + "epoch": 0.6073433223546876, + "grad_norm": 0.10388737916946411, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 157110 + }, + { + "epoch": 0.6073819795580708, + "grad_norm": 0.09778716415166855, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 157120 + }, + { + "epoch": 0.6074206367614541, + "grad_norm": 0.12194199860095978, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 157130 + }, + { + "epoch": 0.6074592939648374, + "grad_norm": 0.11156753450632095, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 157140 + }, + { + "epoch": 0.6074979511682207, + "grad_norm": 0.10753699392080307, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 157150 + }, + { + "epoch": 0.607536608371604, + "grad_norm": 0.10721046477556229, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 157160 + }, + { + "epoch": 0.6075752655749872, + "grad_norm": 0.10429911315441132, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 157170 + }, + { + "epoch": 0.6076139227783706, + "grad_norm": 0.12225025147199631, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 157180 + }, + { + "epoch": 0.6076525799817538, + "grad_norm": 0.08854345232248306, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 157190 + }, + { + "epoch": 0.6076912371851371, + "grad_norm": 0.09663832187652588, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 157200 + }, + { + "epoch": 0.6077298943885203, + "grad_norm": 0.09602214395999908, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 157210 + }, + { + "epoch": 0.6077685515919037, + "grad_norm": 0.11233019083738327, + "learning_rate": 0.002, + "loss": 2.34, + "step": 157220 + }, + { + "epoch": 0.6078072087952869, + "grad_norm": 0.10633435100317001, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 157230 + }, + { + "epoch": 0.6078458659986702, + "grad_norm": 0.09058328717947006, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 157240 + }, + { + "epoch": 0.6078845232020534, + "grad_norm": 0.10827624797821045, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 157250 + }, + { + "epoch": 0.6079231804054368, + "grad_norm": 0.11266107112169266, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 157260 + }, + { + "epoch": 0.60796183760882, + "grad_norm": 0.10475073009729385, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 157270 + }, + { + "epoch": 0.6080004948122033, + "grad_norm": 0.09929769486188889, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 157280 + }, + { + "epoch": 0.6080391520155866, + "grad_norm": 0.11319760233163834, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 157290 + }, + { + "epoch": 0.6080778092189698, + "grad_norm": 0.10194125026464462, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 157300 + }, + { + "epoch": 0.6081164664223532, + "grad_norm": 0.12336062639951706, + "learning_rate": 0.002, + "loss": 2.353, + "step": 157310 + }, + { + "epoch": 0.6081551236257364, + "grad_norm": 0.11531714349985123, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 157320 + }, + { + "epoch": 0.6081937808291197, + "grad_norm": 0.10148004442453384, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 157330 + }, + { + "epoch": 0.6082324380325029, + "grad_norm": 0.09875989705324173, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 157340 + }, + { + "epoch": 0.6082710952358863, + "grad_norm": 0.119626984000206, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 157350 + }, + { + "epoch": 0.6083097524392695, + "grad_norm": 0.11117689311504364, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 157360 + }, + { + "epoch": 0.6083484096426528, + "grad_norm": 0.12111735343933105, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 157370 + }, + { + "epoch": 0.608387066846036, + "grad_norm": 0.09148456901311874, + "learning_rate": 0.002, + "loss": 2.342, + "step": 157380 + }, + { + "epoch": 0.6084257240494194, + "grad_norm": 0.11267891526222229, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 157390 + }, + { + "epoch": 0.6084643812528027, + "grad_norm": 0.1054447740316391, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 157400 + }, + { + "epoch": 0.6085030384561859, + "grad_norm": 0.10002760589122772, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 157410 + }, + { + "epoch": 0.6085416956595692, + "grad_norm": 0.10492801666259766, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 157420 + }, + { + "epoch": 0.6085803528629525, + "grad_norm": 0.12571987509727478, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 157430 + }, + { + "epoch": 0.6086190100663358, + "grad_norm": 0.11996311694383621, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 157440 + }, + { + "epoch": 0.608657667269719, + "grad_norm": 0.10160239785909653, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 157450 + }, + { + "epoch": 0.6086963244731023, + "grad_norm": 0.0983286201953888, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 157460 + }, + { + "epoch": 0.6087349816764855, + "grad_norm": 0.10145774483680725, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 157470 + }, + { + "epoch": 0.6087736388798689, + "grad_norm": 0.13767632842063904, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 157480 + }, + { + "epoch": 0.6088122960832522, + "grad_norm": 0.11958499997854233, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 157490 + }, + { + "epoch": 0.6088509532866354, + "grad_norm": 0.10636449605226517, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 157500 + }, + { + "epoch": 0.6088896104900187, + "grad_norm": 0.1382298618555069, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 157510 + }, + { + "epoch": 0.608928267693402, + "grad_norm": 0.11528230458498001, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 157520 + }, + { + "epoch": 0.6089669248967853, + "grad_norm": 0.11574528366327286, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 157530 + }, + { + "epoch": 0.6090055821001685, + "grad_norm": 0.10157489031553268, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 157540 + }, + { + "epoch": 0.6090442393035518, + "grad_norm": 0.09453226625919342, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 157550 + }, + { + "epoch": 0.6090828965069351, + "grad_norm": 0.10535252839326859, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 157560 + }, + { + "epoch": 0.6091215537103184, + "grad_norm": 0.09338109195232391, + "learning_rate": 0.002, + "loss": 2.323, + "step": 157570 + }, + { + "epoch": 0.6091602109137016, + "grad_norm": 0.1233820989727974, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 157580 + }, + { + "epoch": 0.6091988681170849, + "grad_norm": 0.09621302038431168, + "learning_rate": 0.002, + "loss": 2.347, + "step": 157590 + }, + { + "epoch": 0.6092375253204683, + "grad_norm": 0.11600583046674728, + "learning_rate": 0.002, + "loss": 2.35, + "step": 157600 + }, + { + "epoch": 0.6092761825238515, + "grad_norm": 0.10407842695713043, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 157610 + }, + { + "epoch": 0.6093148397272348, + "grad_norm": 0.08800975233316422, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 157620 + }, + { + "epoch": 0.609353496930618, + "grad_norm": 0.1047428622841835, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 157630 + }, + { + "epoch": 0.6093921541340014, + "grad_norm": 0.1025058701634407, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 157640 + }, + { + "epoch": 0.6094308113373846, + "grad_norm": 0.10395365953445435, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 157650 + }, + { + "epoch": 0.6094694685407679, + "grad_norm": 0.13296107947826385, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 157660 + }, + { + "epoch": 0.6095081257441511, + "grad_norm": 0.11022595316171646, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 157670 + }, + { + "epoch": 0.6095467829475344, + "grad_norm": 0.1114407628774643, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 157680 + }, + { + "epoch": 0.6095854401509178, + "grad_norm": 0.11167177557945251, + "learning_rate": 0.002, + "loss": 2.341, + "step": 157690 + }, + { + "epoch": 0.609624097354301, + "grad_norm": 0.1167573407292366, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 157700 + }, + { + "epoch": 0.6096627545576843, + "grad_norm": 0.10351765155792236, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 157710 + }, + { + "epoch": 0.6097014117610675, + "grad_norm": 0.11204945296049118, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 157720 + }, + { + "epoch": 0.6097400689644509, + "grad_norm": 0.12831169366836548, + "learning_rate": 0.002, + "loss": 2.342, + "step": 157730 + }, + { + "epoch": 0.6097787261678341, + "grad_norm": 0.11609380692243576, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 157740 + }, + { + "epoch": 0.6098173833712174, + "grad_norm": 0.0960511639714241, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 157750 + }, + { + "epoch": 0.6098560405746006, + "grad_norm": 0.11027969419956207, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 157760 + }, + { + "epoch": 0.609894697777984, + "grad_norm": 0.1651405245065689, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 157770 + }, + { + "epoch": 0.6099333549813672, + "grad_norm": 0.12306205928325653, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 157780 + }, + { + "epoch": 0.6099720121847505, + "grad_norm": 0.10880153626203537, + "learning_rate": 0.002, + "loss": 2.33, + "step": 157790 + }, + { + "epoch": 0.6100106693881338, + "grad_norm": 0.10684796422719955, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 157800 + }, + { + "epoch": 0.6100493265915171, + "grad_norm": 0.0998319610953331, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 157810 + }, + { + "epoch": 0.6100879837949004, + "grad_norm": 0.1425325572490692, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 157820 + }, + { + "epoch": 0.6101266409982836, + "grad_norm": 0.09976795315742493, + "learning_rate": 0.002, + "loss": 2.349, + "step": 157830 + }, + { + "epoch": 0.6101652982016669, + "grad_norm": 0.10247869044542313, + "learning_rate": 0.002, + "loss": 2.321, + "step": 157840 + }, + { + "epoch": 0.6102039554050501, + "grad_norm": 0.10698070377111435, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 157850 + }, + { + "epoch": 0.6102426126084335, + "grad_norm": 0.11454866081476212, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 157860 + }, + { + "epoch": 0.6102812698118167, + "grad_norm": 0.09456183016300201, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 157870 + }, + { + "epoch": 0.6103199270152, + "grad_norm": 0.10078510642051697, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 157880 + }, + { + "epoch": 0.6103585842185832, + "grad_norm": 0.14035581052303314, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 157890 + }, + { + "epoch": 0.6103972414219666, + "grad_norm": 0.09534963965415955, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 157900 + }, + { + "epoch": 0.6104358986253499, + "grad_norm": 0.09866109490394592, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 157910 + }, + { + "epoch": 0.6104745558287331, + "grad_norm": 0.10259261727333069, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 157920 + }, + { + "epoch": 0.6105132130321164, + "grad_norm": 0.1016649454832077, + "learning_rate": 0.002, + "loss": 2.337, + "step": 157930 + }, + { + "epoch": 0.6105518702354997, + "grad_norm": 0.10683011263608932, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 157940 + }, + { + "epoch": 0.610590527438883, + "grad_norm": 0.11511304974555969, + "learning_rate": 0.002, + "loss": 2.338, + "step": 157950 + }, + { + "epoch": 0.6106291846422662, + "grad_norm": 0.09644953161478043, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 157960 + }, + { + "epoch": 0.6106678418456495, + "grad_norm": 0.13170172274112701, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 157970 + }, + { + "epoch": 0.6107064990490328, + "grad_norm": 0.13730081915855408, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 157980 + }, + { + "epoch": 0.6107451562524161, + "grad_norm": 0.0939185619354248, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 157990 + }, + { + "epoch": 0.6107838134557994, + "grad_norm": 0.11765972524881363, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 158000 + }, + { + "epoch": 0.6108224706591826, + "grad_norm": 0.10731218010187149, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 158010 + }, + { + "epoch": 0.6108611278625659, + "grad_norm": 0.10578600317239761, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 158020 + }, + { + "epoch": 0.6108997850659492, + "grad_norm": 0.097139373421669, + "learning_rate": 0.002, + "loss": 2.357, + "step": 158030 + }, + { + "epoch": 0.6109384422693325, + "grad_norm": 0.10506080090999603, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 158040 + }, + { + "epoch": 0.6109770994727157, + "grad_norm": 0.10108962655067444, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 158050 + }, + { + "epoch": 0.611015756676099, + "grad_norm": 0.10646820813417435, + "learning_rate": 0.002, + "loss": 2.344, + "step": 158060 + }, + { + "epoch": 0.6110544138794823, + "grad_norm": 0.11607225984334946, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 158070 + }, + { + "epoch": 0.6110930710828656, + "grad_norm": 0.1082080751657486, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 158080 + }, + { + "epoch": 0.6111317282862488, + "grad_norm": 0.0966542437672615, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 158090 + }, + { + "epoch": 0.6111703854896321, + "grad_norm": 0.10189072787761688, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 158100 + }, + { + "epoch": 0.6112090426930155, + "grad_norm": 0.09674588590860367, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 158110 + }, + { + "epoch": 0.6112476998963987, + "grad_norm": 0.10019045323133469, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 158120 + }, + { + "epoch": 0.611286357099782, + "grad_norm": 0.09782741218805313, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 158130 + }, + { + "epoch": 0.6113250143031652, + "grad_norm": 0.1437297761440277, + "learning_rate": 0.002, + "loss": 2.334, + "step": 158140 + }, + { + "epoch": 0.6113636715065486, + "grad_norm": 0.09866262972354889, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 158150 + }, + { + "epoch": 0.6114023287099318, + "grad_norm": 0.10425934195518494, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 158160 + }, + { + "epoch": 0.6114409859133151, + "grad_norm": 0.0972549319267273, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 158170 + }, + { + "epoch": 0.6114796431166983, + "grad_norm": 0.10543369501829147, + "learning_rate": 0.002, + "loss": 2.345, + "step": 158180 + }, + { + "epoch": 0.6115183003200817, + "grad_norm": 0.1036418080329895, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 158190 + }, + { + "epoch": 0.611556957523465, + "grad_norm": 0.14457623660564423, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 158200 + }, + { + "epoch": 0.6115956147268482, + "grad_norm": 0.11930687725543976, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 158210 + }, + { + "epoch": 0.6116342719302315, + "grad_norm": 0.1089979037642479, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 158220 + }, + { + "epoch": 0.6116729291336147, + "grad_norm": 0.12308035790920258, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 158230 + }, + { + "epoch": 0.6117115863369981, + "grad_norm": 0.10787247121334076, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 158240 + }, + { + "epoch": 0.6117502435403813, + "grad_norm": 0.11869463324546814, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 158250 + }, + { + "epoch": 0.6117889007437646, + "grad_norm": 0.11029456555843353, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 158260 + }, + { + "epoch": 0.6118275579471478, + "grad_norm": 0.1282932013273239, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 158270 + }, + { + "epoch": 0.6118662151505312, + "grad_norm": 0.1101757362484932, + "learning_rate": 0.002, + "loss": 2.329, + "step": 158280 + }, + { + "epoch": 0.6119048723539144, + "grad_norm": 0.11456190049648285, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 158290 + }, + { + "epoch": 0.6119435295572977, + "grad_norm": 0.13458718359470367, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 158300 + }, + { + "epoch": 0.611982186760681, + "grad_norm": 0.10693209618330002, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 158310 + }, + { + "epoch": 0.6120208439640643, + "grad_norm": 0.09521332383155823, + "learning_rate": 0.002, + "loss": 2.343, + "step": 158320 + }, + { + "epoch": 0.6120595011674476, + "grad_norm": 0.09104505181312561, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 158330 + }, + { + "epoch": 0.6120981583708308, + "grad_norm": 0.1118229329586029, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 158340 + }, + { + "epoch": 0.6121368155742141, + "grad_norm": 0.09892457723617554, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 158350 + }, + { + "epoch": 0.6121754727775974, + "grad_norm": 0.10420117527246475, + "learning_rate": 0.002, + "loss": 2.36, + "step": 158360 + }, + { + "epoch": 0.6122141299809807, + "grad_norm": 0.10320697724819183, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 158370 + }, + { + "epoch": 0.6122527871843639, + "grad_norm": 0.08881300687789917, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 158380 + }, + { + "epoch": 0.6122914443877472, + "grad_norm": 0.10302989929914474, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 158390 + }, + { + "epoch": 0.6123301015911304, + "grad_norm": 0.12415283173322678, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 158400 + }, + { + "epoch": 0.6123687587945138, + "grad_norm": 0.09723592549562454, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 158410 + }, + { + "epoch": 0.612407415997897, + "grad_norm": 0.10486268252134323, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 158420 + }, + { + "epoch": 0.6124460732012803, + "grad_norm": 0.10682319849729538, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 158430 + }, + { + "epoch": 0.6124847304046636, + "grad_norm": 0.09781024605035782, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 158440 + }, + { + "epoch": 0.6125233876080469, + "grad_norm": 0.11337437480688095, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 158450 + }, + { + "epoch": 0.6125620448114302, + "grad_norm": 0.10655515640974045, + "learning_rate": 0.002, + "loss": 2.341, + "step": 158460 + }, + { + "epoch": 0.6126007020148134, + "grad_norm": 0.12020324915647507, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 158470 + }, + { + "epoch": 0.6126393592181967, + "grad_norm": 0.09384416043758392, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 158480 + }, + { + "epoch": 0.61267801642158, + "grad_norm": 0.10570283234119415, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 158490 + }, + { + "epoch": 0.6127166736249633, + "grad_norm": 0.13045048713684082, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 158500 + }, + { + "epoch": 0.6127553308283465, + "grad_norm": 0.10316865146160126, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 158510 + }, + { + "epoch": 0.6127939880317298, + "grad_norm": 0.10627957433462143, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 158520 + }, + { + "epoch": 0.6128326452351132, + "grad_norm": 0.12966708838939667, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 158530 + }, + { + "epoch": 0.6128713024384964, + "grad_norm": 0.11290135234594345, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 158540 + }, + { + "epoch": 0.6129099596418797, + "grad_norm": 0.09105128049850464, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 158550 + }, + { + "epoch": 0.6129486168452629, + "grad_norm": 0.10901861637830734, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 158560 + }, + { + "epoch": 0.6129872740486463, + "grad_norm": 0.10790381580591202, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 158570 + }, + { + "epoch": 0.6130259312520295, + "grad_norm": 0.10326814651489258, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 158580 + }, + { + "epoch": 0.6130645884554128, + "grad_norm": 0.11023823916912079, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 158590 + }, + { + "epoch": 0.613103245658796, + "grad_norm": 0.13924099504947662, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 158600 + }, + { + "epoch": 0.6131419028621793, + "grad_norm": 0.11610828340053558, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 158610 + }, + { + "epoch": 0.6131805600655627, + "grad_norm": 0.11992853134870529, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 158620 + }, + { + "epoch": 0.6132192172689459, + "grad_norm": 0.11580061167478561, + "learning_rate": 0.002, + "loss": 2.357, + "step": 158630 + }, + { + "epoch": 0.6132578744723292, + "grad_norm": 0.11151894181966782, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 158640 + }, + { + "epoch": 0.6132965316757124, + "grad_norm": 0.10860782116651535, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 158650 + }, + { + "epoch": 0.6133351888790958, + "grad_norm": 0.09790189564228058, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 158660 + }, + { + "epoch": 0.613373846082479, + "grad_norm": 0.13056792318820953, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 158670 + }, + { + "epoch": 0.6134125032858623, + "grad_norm": 0.09393236041069031, + "learning_rate": 0.002, + "loss": 2.351, + "step": 158680 + }, + { + "epoch": 0.6134511604892455, + "grad_norm": 0.10080570727586746, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 158690 + }, + { + "epoch": 0.6134898176926289, + "grad_norm": 0.11203580349683762, + "learning_rate": 0.002, + "loss": 2.336, + "step": 158700 + }, + { + "epoch": 0.6135284748960121, + "grad_norm": 0.11879559606313705, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 158710 + }, + { + "epoch": 0.6135671320993954, + "grad_norm": 0.102769635617733, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 158720 + }, + { + "epoch": 0.6136057893027786, + "grad_norm": 0.12622100114822388, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 158730 + }, + { + "epoch": 0.613644446506162, + "grad_norm": 0.09503379464149475, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 158740 + }, + { + "epoch": 0.6136831037095453, + "grad_norm": 0.12669788300991058, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 158750 + }, + { + "epoch": 0.6137217609129285, + "grad_norm": 0.11435175687074661, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 158760 + }, + { + "epoch": 0.6137604181163118, + "grad_norm": 0.10128527134656906, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 158770 + }, + { + "epoch": 0.613799075319695, + "grad_norm": 0.10439524799585342, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 158780 + }, + { + "epoch": 0.6138377325230784, + "grad_norm": 0.09249827265739441, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 158790 + }, + { + "epoch": 0.6138763897264616, + "grad_norm": 0.1345771849155426, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 158800 + }, + { + "epoch": 0.6139150469298449, + "grad_norm": 0.09422943741083145, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 158810 + }, + { + "epoch": 0.6139537041332281, + "grad_norm": 0.10849691182374954, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 158820 + }, + { + "epoch": 0.6139923613366115, + "grad_norm": 0.11163496226072311, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 158830 + }, + { + "epoch": 0.6140310185399948, + "grad_norm": 0.10774195939302444, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 158840 + }, + { + "epoch": 0.614069675743378, + "grad_norm": 0.13317424058914185, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 158850 + }, + { + "epoch": 0.6141083329467613, + "grad_norm": 0.10984144359827042, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 158860 + }, + { + "epoch": 0.6141469901501446, + "grad_norm": 0.10041702538728714, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 158870 + }, + { + "epoch": 0.6141856473535279, + "grad_norm": 0.10012153536081314, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 158880 + }, + { + "epoch": 0.6142243045569111, + "grad_norm": 0.11682833731174469, + "learning_rate": 0.002, + "loss": 2.3662, + "step": 158890 + }, + { + "epoch": 0.6142629617602944, + "grad_norm": 0.10764049738645554, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 158900 + }, + { + "epoch": 0.6143016189636777, + "grad_norm": 0.10517425090074539, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 158910 + }, + { + "epoch": 0.614340276167061, + "grad_norm": 0.10669191181659698, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 158920 + }, + { + "epoch": 0.6143789333704442, + "grad_norm": 0.10772227495908737, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 158930 + }, + { + "epoch": 0.6144175905738275, + "grad_norm": 0.0992896556854248, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 158940 + }, + { + "epoch": 0.6144562477772108, + "grad_norm": 0.09629623591899872, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 158950 + }, + { + "epoch": 0.6144949049805941, + "grad_norm": 0.09165239334106445, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 158960 + }, + { + "epoch": 0.6145335621839774, + "grad_norm": 0.12956255674362183, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 158970 + }, + { + "epoch": 0.6145722193873606, + "grad_norm": 0.1039843037724495, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 158980 + }, + { + "epoch": 0.6146108765907439, + "grad_norm": 0.11637865006923676, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 158990 + }, + { + "epoch": 0.6146495337941272, + "grad_norm": 0.114281065762043, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 159000 + }, + { + "epoch": 0.6146881909975105, + "grad_norm": 0.12161819636821747, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 159010 + }, + { + "epoch": 0.6147268482008937, + "grad_norm": 0.10355739295482635, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 159020 + }, + { + "epoch": 0.614765505404277, + "grad_norm": 0.1096806526184082, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 159030 + }, + { + "epoch": 0.6148041626076604, + "grad_norm": 0.11160244047641754, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 159040 + }, + { + "epoch": 0.6148428198110436, + "grad_norm": 0.11055063456296921, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 159050 + }, + { + "epoch": 0.6148814770144269, + "grad_norm": 0.10469084978103638, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 159060 + }, + { + "epoch": 0.6149201342178101, + "grad_norm": 0.121566042304039, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 159070 + }, + { + "epoch": 0.6149587914211935, + "grad_norm": 0.11239653825759888, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 159080 + }, + { + "epoch": 0.6149974486245767, + "grad_norm": 0.09685921669006348, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 159090 + }, + { + "epoch": 0.61503610582796, + "grad_norm": 0.13571828603744507, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 159100 + }, + { + "epoch": 0.6150747630313432, + "grad_norm": 0.10878980159759521, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 159110 + }, + { + "epoch": 0.6151134202347266, + "grad_norm": 0.09652600437402725, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 159120 + }, + { + "epoch": 0.6151520774381098, + "grad_norm": 0.103079654276371, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 159130 + }, + { + "epoch": 0.6151907346414931, + "grad_norm": 0.09848003089427948, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 159140 + }, + { + "epoch": 0.6152293918448763, + "grad_norm": 0.1262648105621338, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 159150 + }, + { + "epoch": 0.6152680490482596, + "grad_norm": 0.1000993624329567, + "learning_rate": 0.002, + "loss": 2.332, + "step": 159160 + }, + { + "epoch": 0.615306706251643, + "grad_norm": 0.12465333938598633, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 159170 + }, + { + "epoch": 0.6153453634550262, + "grad_norm": 0.11284952610731125, + "learning_rate": 0.002, + "loss": 2.3647, + "step": 159180 + }, + { + "epoch": 0.6153840206584095, + "grad_norm": 0.11351795494556427, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 159190 + }, + { + "epoch": 0.6154226778617927, + "grad_norm": 0.10991384088993073, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 159200 + }, + { + "epoch": 0.6154613350651761, + "grad_norm": 0.11290508508682251, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 159210 + }, + { + "epoch": 0.6154999922685593, + "grad_norm": 0.11408476531505585, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 159220 + }, + { + "epoch": 0.6155386494719426, + "grad_norm": 0.10141675174236298, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 159230 + }, + { + "epoch": 0.6155773066753258, + "grad_norm": 0.12357048690319061, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 159240 + }, + { + "epoch": 0.6156159638787092, + "grad_norm": 0.09966486692428589, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 159250 + }, + { + "epoch": 0.6156546210820925, + "grad_norm": 0.11067885905504227, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 159260 + }, + { + "epoch": 0.6156932782854757, + "grad_norm": 0.12197203934192657, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 159270 + }, + { + "epoch": 0.615731935488859, + "grad_norm": 0.10108727961778641, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 159280 + }, + { + "epoch": 0.6157705926922423, + "grad_norm": 0.09892028570175171, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 159290 + }, + { + "epoch": 0.6158092498956256, + "grad_norm": 0.1011892557144165, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 159300 + }, + { + "epoch": 0.6158479070990088, + "grad_norm": 0.1394670456647873, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 159310 + }, + { + "epoch": 0.6158865643023921, + "grad_norm": 0.11167936027050018, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 159320 + }, + { + "epoch": 0.6159252215057753, + "grad_norm": 0.10931015014648438, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 159330 + }, + { + "epoch": 0.6159638787091587, + "grad_norm": 0.10654870420694351, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 159340 + }, + { + "epoch": 0.616002535912542, + "grad_norm": 0.09276770800352097, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 159350 + }, + { + "epoch": 0.6160411931159252, + "grad_norm": 0.09629074484109879, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 159360 + }, + { + "epoch": 0.6160798503193085, + "grad_norm": 0.10154861956834793, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 159370 + }, + { + "epoch": 0.6161185075226918, + "grad_norm": 0.09723439067602158, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 159380 + }, + { + "epoch": 0.6161571647260751, + "grad_norm": 0.25782158970832825, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 159390 + }, + { + "epoch": 0.6161958219294583, + "grad_norm": 0.10921783745288849, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 159400 + }, + { + "epoch": 0.6162344791328416, + "grad_norm": 0.09872537851333618, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 159410 + }, + { + "epoch": 0.6162731363362249, + "grad_norm": 0.10222181677818298, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 159420 + }, + { + "epoch": 0.6163117935396082, + "grad_norm": 0.1002846285700798, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 159430 + }, + { + "epoch": 0.6163504507429914, + "grad_norm": 0.1227022334933281, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 159440 + }, + { + "epoch": 0.6163891079463747, + "grad_norm": 0.10258164256811142, + "learning_rate": 0.002, + "loss": 2.357, + "step": 159450 + }, + { + "epoch": 0.6164277651497581, + "grad_norm": 0.13086551427841187, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 159460 + }, + { + "epoch": 0.6164664223531413, + "grad_norm": 0.1075073629617691, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 159470 + }, + { + "epoch": 0.6165050795565246, + "grad_norm": 0.09900263696908951, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 159480 + }, + { + "epoch": 0.6165437367599078, + "grad_norm": 0.09153147041797638, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 159490 + }, + { + "epoch": 0.6165823939632911, + "grad_norm": 0.11093005537986755, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 159500 + }, + { + "epoch": 0.6166210511666744, + "grad_norm": 0.11436476558446884, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 159510 + }, + { + "epoch": 0.6166597083700577, + "grad_norm": 0.10130810737609863, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 159520 + }, + { + "epoch": 0.6166983655734409, + "grad_norm": 0.16572950780391693, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 159530 + }, + { + "epoch": 0.6167370227768242, + "grad_norm": 0.1082151010632515, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 159540 + }, + { + "epoch": 0.6167756799802075, + "grad_norm": 0.10624217987060547, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 159550 + }, + { + "epoch": 0.6168143371835908, + "grad_norm": 0.1009274423122406, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 159560 + }, + { + "epoch": 0.616852994386974, + "grad_norm": 0.12295461446046829, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 159570 + }, + { + "epoch": 0.6168916515903573, + "grad_norm": 0.10738131403923035, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 159580 + }, + { + "epoch": 0.6169303087937407, + "grad_norm": 0.10055730491876602, + "learning_rate": 0.002, + "loss": 2.3634, + "step": 159590 + }, + { + "epoch": 0.6169689659971239, + "grad_norm": 0.10193420946598053, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 159600 + }, + { + "epoch": 0.6170076232005072, + "grad_norm": 0.09353253990411758, + "learning_rate": 0.002, + "loss": 2.349, + "step": 159610 + }, + { + "epoch": 0.6170462804038904, + "grad_norm": 0.10531756281852722, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 159620 + }, + { + "epoch": 0.6170849376072738, + "grad_norm": 0.11639406532049179, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 159630 + }, + { + "epoch": 0.617123594810657, + "grad_norm": 0.2310658097267151, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 159640 + }, + { + "epoch": 0.6171622520140403, + "grad_norm": 1.0232207775115967, + "learning_rate": 0.002, + "loss": 2.372, + "step": 159650 + }, + { + "epoch": 0.6172009092174235, + "grad_norm": 0.2197166085243225, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 159660 + }, + { + "epoch": 0.6172395664208069, + "grad_norm": 0.17389705777168274, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 159670 + }, + { + "epoch": 0.6172782236241902, + "grad_norm": 0.10403891652822495, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 159680 + }, + { + "epoch": 0.6173168808275734, + "grad_norm": 0.10457657277584076, + "learning_rate": 0.002, + "loss": 2.348, + "step": 159690 + }, + { + "epoch": 0.6173555380309567, + "grad_norm": 0.10556718707084656, + "learning_rate": 0.002, + "loss": 2.341, + "step": 159700 + }, + { + "epoch": 0.6173941952343399, + "grad_norm": 0.11601948738098145, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 159710 + }, + { + "epoch": 0.6174328524377233, + "grad_norm": 0.1278046816587448, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 159720 + }, + { + "epoch": 0.6174715096411065, + "grad_norm": 0.10599952191114426, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 159730 + }, + { + "epoch": 0.6175101668444898, + "grad_norm": 0.10278228670358658, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 159740 + }, + { + "epoch": 0.617548824047873, + "grad_norm": 0.10085756331682205, + "learning_rate": 0.002, + "loss": 2.352, + "step": 159750 + }, + { + "epoch": 0.6175874812512564, + "grad_norm": 0.13335645198822021, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 159760 + }, + { + "epoch": 0.6176261384546397, + "grad_norm": 0.11378223448991776, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 159770 + }, + { + "epoch": 0.6176647956580229, + "grad_norm": 0.10508736968040466, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 159780 + }, + { + "epoch": 0.6177034528614062, + "grad_norm": 0.10805348306894302, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 159790 + }, + { + "epoch": 0.6177421100647895, + "grad_norm": 0.10574381053447723, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 159800 + }, + { + "epoch": 0.6177807672681728, + "grad_norm": 0.11227120459079742, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 159810 + }, + { + "epoch": 0.617819424471556, + "grad_norm": 0.11124691367149353, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 159820 + }, + { + "epoch": 0.6178580816749393, + "grad_norm": 0.11466017365455627, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 159830 + }, + { + "epoch": 0.6178967388783226, + "grad_norm": 0.11591002345085144, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 159840 + }, + { + "epoch": 0.6179353960817059, + "grad_norm": 0.09957937896251678, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 159850 + }, + { + "epoch": 0.6179740532850891, + "grad_norm": 0.1121915876865387, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 159860 + }, + { + "epoch": 0.6180127104884724, + "grad_norm": 0.09734398871660233, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 159870 + }, + { + "epoch": 0.6180513676918556, + "grad_norm": 0.08913884311914444, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 159880 + }, + { + "epoch": 0.618090024895239, + "grad_norm": 0.12423180788755417, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 159890 + }, + { + "epoch": 0.6181286820986223, + "grad_norm": 0.1204834133386612, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 159900 + }, + { + "epoch": 0.6181673393020055, + "grad_norm": 0.09985199570655823, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 159910 + }, + { + "epoch": 0.6182059965053888, + "grad_norm": 0.1060582771897316, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 159920 + }, + { + "epoch": 0.6182446537087721, + "grad_norm": 0.11842060089111328, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 159930 + }, + { + "epoch": 0.6182833109121554, + "grad_norm": 0.09812531620264053, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 159940 + }, + { + "epoch": 0.6183219681155386, + "grad_norm": 0.10997223854064941, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 159950 + }, + { + "epoch": 0.6183606253189219, + "grad_norm": 0.1334807425737381, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 159960 + }, + { + "epoch": 0.6183992825223052, + "grad_norm": 0.1621674746274948, + "learning_rate": 0.002, + "loss": 2.351, + "step": 159970 + }, + { + "epoch": 0.6184379397256885, + "grad_norm": 0.10352502018213272, + "learning_rate": 0.002, + "loss": 2.337, + "step": 159980 + }, + { + "epoch": 0.6184765969290718, + "grad_norm": 0.10434258729219437, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 159990 + }, + { + "epoch": 0.618515254132455, + "grad_norm": 0.08978249132633209, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 160000 + }, + { + "epoch": 0.6185539113358384, + "grad_norm": 0.09855789691209793, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 160010 + }, + { + "epoch": 0.6185925685392216, + "grad_norm": 0.11199983209371567, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 160020 + }, + { + "epoch": 0.6186312257426049, + "grad_norm": 0.1278078407049179, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 160030 + }, + { + "epoch": 0.6186698829459881, + "grad_norm": 0.10843171924352646, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 160040 + }, + { + "epoch": 0.6187085401493715, + "grad_norm": 0.11941742151975632, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 160050 + }, + { + "epoch": 0.6187471973527547, + "grad_norm": 0.1163121685385704, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 160060 + }, + { + "epoch": 0.618785854556138, + "grad_norm": 0.09469784796237946, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 160070 + }, + { + "epoch": 0.6188245117595212, + "grad_norm": 0.1268109381198883, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 160080 + }, + { + "epoch": 0.6188631689629045, + "grad_norm": 0.09837017208337784, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 160090 + }, + { + "epoch": 0.6189018261662879, + "grad_norm": 0.15756139159202576, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 160100 + }, + { + "epoch": 0.6189404833696711, + "grad_norm": 0.11236690729856491, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 160110 + }, + { + "epoch": 0.6189791405730544, + "grad_norm": 0.10446345806121826, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 160120 + }, + { + "epoch": 0.6190177977764376, + "grad_norm": 0.11776310205459595, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 160130 + }, + { + "epoch": 0.619056454979821, + "grad_norm": 0.1054246798157692, + "learning_rate": 0.002, + "loss": 2.341, + "step": 160140 + }, + { + "epoch": 0.6190951121832042, + "grad_norm": 0.10899261385202408, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 160150 + }, + { + "epoch": 0.6191337693865875, + "grad_norm": 0.1234220638871193, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 160160 + }, + { + "epoch": 0.6191724265899707, + "grad_norm": 0.09885939955711365, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 160170 + }, + { + "epoch": 0.6192110837933541, + "grad_norm": 0.10989391803741455, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 160180 + }, + { + "epoch": 0.6192497409967374, + "grad_norm": 0.1203991025686264, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 160190 + }, + { + "epoch": 0.6192883982001206, + "grad_norm": 0.1000213697552681, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 160200 + }, + { + "epoch": 0.6193270554035039, + "grad_norm": 0.11637286096811295, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 160210 + }, + { + "epoch": 0.6193657126068872, + "grad_norm": 0.11612696945667267, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 160220 + }, + { + "epoch": 0.6194043698102705, + "grad_norm": 0.10938242822885513, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 160230 + }, + { + "epoch": 0.6194430270136537, + "grad_norm": 0.10004662722349167, + "learning_rate": 0.002, + "loss": 2.336, + "step": 160240 + }, + { + "epoch": 0.619481684217037, + "grad_norm": 0.09453094005584717, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 160250 + }, + { + "epoch": 0.6195203414204202, + "grad_norm": 0.11575914174318314, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 160260 + }, + { + "epoch": 0.6195589986238036, + "grad_norm": 0.1202038824558258, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 160270 + }, + { + "epoch": 0.6195976558271868, + "grad_norm": 0.11634848266839981, + "learning_rate": 0.002, + "loss": 2.349, + "step": 160280 + }, + { + "epoch": 0.6196363130305701, + "grad_norm": 0.09766193479299545, + "learning_rate": 0.002, + "loss": 2.342, + "step": 160290 + }, + { + "epoch": 0.6196749702339533, + "grad_norm": 0.13540469110012054, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 160300 + }, + { + "epoch": 0.6197136274373367, + "grad_norm": 0.11753330379724503, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 160310 + }, + { + "epoch": 0.61975228464072, + "grad_norm": 0.11685632914304733, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 160320 + }, + { + "epoch": 0.6197909418441032, + "grad_norm": 0.12577968835830688, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 160330 + }, + { + "epoch": 0.6198295990474865, + "grad_norm": 0.10916007310152054, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 160340 + }, + { + "epoch": 0.6198682562508698, + "grad_norm": 0.09644216299057007, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 160350 + }, + { + "epoch": 0.6199069134542531, + "grad_norm": 0.11642160266637802, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 160360 + }, + { + "epoch": 0.6199455706576363, + "grad_norm": 0.11780939996242523, + "learning_rate": 0.002, + "loss": 2.342, + "step": 160370 + }, + { + "epoch": 0.6199842278610196, + "grad_norm": 0.10777207463979721, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 160380 + }, + { + "epoch": 0.620022885064403, + "grad_norm": 0.09815847128629684, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 160390 + }, + { + "epoch": 0.6200615422677862, + "grad_norm": 0.10464230924844742, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 160400 + }, + { + "epoch": 0.6201001994711695, + "grad_norm": 0.0949997529387474, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 160410 + }, + { + "epoch": 0.6201388566745527, + "grad_norm": 0.10524674504995346, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 160420 + }, + { + "epoch": 0.620177513877936, + "grad_norm": 0.09734108299016953, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 160430 + }, + { + "epoch": 0.6202161710813193, + "grad_norm": 0.11617741733789444, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 160440 + }, + { + "epoch": 0.6202548282847026, + "grad_norm": 0.10688306391239166, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 160450 + }, + { + "epoch": 0.6202934854880858, + "grad_norm": 0.10394486784934998, + "learning_rate": 0.002, + "loss": 2.337, + "step": 160460 + }, + { + "epoch": 0.6203321426914691, + "grad_norm": 0.10238736122846603, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 160470 + }, + { + "epoch": 0.6203707998948524, + "grad_norm": 0.11343356966972351, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 160480 + }, + { + "epoch": 0.6204094570982357, + "grad_norm": 0.11415211856365204, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 160490 + }, + { + "epoch": 0.620448114301619, + "grad_norm": 0.12570852041244507, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 160500 + }, + { + "epoch": 0.6204867715050022, + "grad_norm": 0.11392834037542343, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 160510 + }, + { + "epoch": 0.6205254287083856, + "grad_norm": 0.10089188814163208, + "learning_rate": 0.002, + "loss": 2.361, + "step": 160520 + }, + { + "epoch": 0.6205640859117688, + "grad_norm": 0.10702072083950043, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 160530 + }, + { + "epoch": 0.6206027431151521, + "grad_norm": 0.13690462708473206, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 160540 + }, + { + "epoch": 0.6206414003185353, + "grad_norm": 0.09837660938501358, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 160550 + }, + { + "epoch": 0.6206800575219187, + "grad_norm": 0.10660859942436218, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 160560 + }, + { + "epoch": 0.6207187147253019, + "grad_norm": 0.10659363120794296, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 160570 + }, + { + "epoch": 0.6207573719286852, + "grad_norm": 0.13043168187141418, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 160580 + }, + { + "epoch": 0.6207960291320684, + "grad_norm": 0.11595991253852844, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 160590 + }, + { + "epoch": 0.6208346863354518, + "grad_norm": 0.10297106951475143, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 160600 + }, + { + "epoch": 0.620873343538835, + "grad_norm": 0.10592272877693176, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 160610 + }, + { + "epoch": 0.6209120007422183, + "grad_norm": 0.12115828692913055, + "learning_rate": 0.002, + "loss": 2.344, + "step": 160620 + }, + { + "epoch": 0.6209506579456016, + "grad_norm": 0.12772081792354584, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 160630 + }, + { + "epoch": 0.6209893151489848, + "grad_norm": 0.10656872391700745, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 160640 + }, + { + "epoch": 0.6210279723523682, + "grad_norm": 0.12006162106990814, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 160650 + }, + { + "epoch": 0.6210666295557514, + "grad_norm": 0.1134830042719841, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 160660 + }, + { + "epoch": 0.6211052867591347, + "grad_norm": 0.09791508316993713, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 160670 + }, + { + "epoch": 0.6211439439625179, + "grad_norm": 0.1255888044834137, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 160680 + }, + { + "epoch": 0.6211826011659013, + "grad_norm": 0.09975399821996689, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 160690 + }, + { + "epoch": 0.6212212583692845, + "grad_norm": 0.11866630613803864, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 160700 + }, + { + "epoch": 0.6212599155726678, + "grad_norm": 0.09884458780288696, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 160710 + }, + { + "epoch": 0.621298572776051, + "grad_norm": 0.0869886502623558, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 160720 + }, + { + "epoch": 0.6213372299794344, + "grad_norm": 0.11433932930231094, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 160730 + }, + { + "epoch": 0.6213758871828177, + "grad_norm": 0.09779713302850723, + "learning_rate": 0.002, + "loss": 2.336, + "step": 160740 + }, + { + "epoch": 0.6214145443862009, + "grad_norm": 0.125816211104393, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 160750 + }, + { + "epoch": 0.6214532015895842, + "grad_norm": 0.10931852459907532, + "learning_rate": 0.002, + "loss": 2.358, + "step": 160760 + }, + { + "epoch": 0.6214918587929675, + "grad_norm": 0.10296256095170975, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 160770 + }, + { + "epoch": 0.6215305159963508, + "grad_norm": 0.10576235502958298, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 160780 + }, + { + "epoch": 0.621569173199734, + "grad_norm": 0.1022237166762352, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 160790 + }, + { + "epoch": 0.6216078304031173, + "grad_norm": 0.11737734824419022, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 160800 + }, + { + "epoch": 0.6216464876065005, + "grad_norm": 0.10755621641874313, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 160810 + }, + { + "epoch": 0.6216851448098839, + "grad_norm": 0.12070505321025848, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 160820 + }, + { + "epoch": 0.6217238020132672, + "grad_norm": 0.0936044454574585, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 160830 + }, + { + "epoch": 0.6217624592166504, + "grad_norm": 0.09852291643619537, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 160840 + }, + { + "epoch": 0.6218011164200337, + "grad_norm": 0.09821783006191254, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 160850 + }, + { + "epoch": 0.621839773623417, + "grad_norm": 0.10328206419944763, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 160860 + }, + { + "epoch": 0.6218784308268003, + "grad_norm": 0.09731963276863098, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 160870 + }, + { + "epoch": 0.6219170880301835, + "grad_norm": 0.11714418232440948, + "learning_rate": 0.002, + "loss": 2.344, + "step": 160880 + }, + { + "epoch": 0.6219557452335668, + "grad_norm": 0.10939300060272217, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 160890 + }, + { + "epoch": 0.6219944024369501, + "grad_norm": 0.11220026016235352, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 160900 + }, + { + "epoch": 0.6220330596403334, + "grad_norm": 0.11689543724060059, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 160910 + }, + { + "epoch": 0.6220717168437166, + "grad_norm": 0.12207639217376709, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 160920 + }, + { + "epoch": 0.6221103740470999, + "grad_norm": 0.1033955067396164, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 160930 + }, + { + "epoch": 0.6221490312504833, + "grad_norm": 0.09711343795061111, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 160940 + }, + { + "epoch": 0.6221876884538665, + "grad_norm": 0.10052355378866196, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 160950 + }, + { + "epoch": 0.6222263456572498, + "grad_norm": 0.12841641902923584, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 160960 + }, + { + "epoch": 0.622265002860633, + "grad_norm": 0.1063733622431755, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 160970 + }, + { + "epoch": 0.6223036600640164, + "grad_norm": 0.11179991811513901, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 160980 + }, + { + "epoch": 0.6223423172673996, + "grad_norm": 0.10117033869028091, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 160990 + }, + { + "epoch": 0.6223809744707829, + "grad_norm": 0.11976698786020279, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 161000 + }, + { + "epoch": 0.6224196316741661, + "grad_norm": 0.09096930921077728, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 161010 + }, + { + "epoch": 0.6224582888775494, + "grad_norm": 0.23469658195972443, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 161020 + }, + { + "epoch": 0.6224969460809328, + "grad_norm": 0.11037572473287582, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 161030 + }, + { + "epoch": 0.622535603284316, + "grad_norm": 0.10270857810974121, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 161040 + }, + { + "epoch": 0.6225742604876993, + "grad_norm": 0.1143588274717331, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 161050 + }, + { + "epoch": 0.6226129176910825, + "grad_norm": 0.11809398233890533, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 161060 + }, + { + "epoch": 0.6226515748944659, + "grad_norm": 0.10981971025466919, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 161070 + }, + { + "epoch": 0.6226902320978491, + "grad_norm": 0.09778551012277603, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 161080 + }, + { + "epoch": 0.6227288893012324, + "grad_norm": 0.10928311198949814, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 161090 + }, + { + "epoch": 0.6227675465046156, + "grad_norm": 0.1072680875658989, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 161100 + }, + { + "epoch": 0.622806203707999, + "grad_norm": 0.22394028306007385, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 161110 + }, + { + "epoch": 0.6228448609113822, + "grad_norm": 0.10629655420780182, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 161120 + }, + { + "epoch": 0.6228835181147655, + "grad_norm": 0.09425670653581619, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 161130 + }, + { + "epoch": 0.6229221753181488, + "grad_norm": 0.10633596777915955, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 161140 + }, + { + "epoch": 0.6229608325215321, + "grad_norm": 0.10779435187578201, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 161150 + }, + { + "epoch": 0.6229994897249154, + "grad_norm": 0.12639832496643066, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 161160 + }, + { + "epoch": 0.6230381469282986, + "grad_norm": 0.11051980406045914, + "learning_rate": 0.002, + "loss": 2.357, + "step": 161170 + }, + { + "epoch": 0.6230768041316819, + "grad_norm": 0.09174934774637222, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 161180 + }, + { + "epoch": 0.6231154613350651, + "grad_norm": 0.11634857207536697, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 161190 + }, + { + "epoch": 0.6231541185384485, + "grad_norm": 0.10571297258138657, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 161200 + }, + { + "epoch": 0.6231927757418317, + "grad_norm": 0.09662492573261261, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 161210 + }, + { + "epoch": 0.623231432945215, + "grad_norm": 0.10626404732465744, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 161220 + }, + { + "epoch": 0.6232700901485982, + "grad_norm": 0.10699160397052765, + "learning_rate": 0.002, + "loss": 2.347, + "step": 161230 + }, + { + "epoch": 0.6233087473519816, + "grad_norm": 0.10607574135065079, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 161240 + }, + { + "epoch": 0.6233474045553649, + "grad_norm": 0.10362876951694489, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 161250 + }, + { + "epoch": 0.6233860617587481, + "grad_norm": 0.10847746580839157, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 161260 + }, + { + "epoch": 0.6234247189621314, + "grad_norm": 0.13175007700920105, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 161270 + }, + { + "epoch": 0.6234633761655147, + "grad_norm": 0.10136456042528152, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 161280 + }, + { + "epoch": 0.623502033368898, + "grad_norm": 0.09823547303676605, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 161290 + }, + { + "epoch": 0.6235406905722812, + "grad_norm": 0.12443459033966064, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 161300 + }, + { + "epoch": 0.6235793477756645, + "grad_norm": 0.12469448894262314, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 161310 + }, + { + "epoch": 0.6236180049790478, + "grad_norm": 0.11748787015676498, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 161320 + }, + { + "epoch": 0.6236566621824311, + "grad_norm": 0.12585628032684326, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 161330 + }, + { + "epoch": 0.6236953193858144, + "grad_norm": 0.2920219302177429, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 161340 + }, + { + "epoch": 0.6237339765891976, + "grad_norm": 0.09079938381910324, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 161350 + }, + { + "epoch": 0.6237726337925809, + "grad_norm": 0.12725763022899628, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 161360 + }, + { + "epoch": 0.6238112909959642, + "grad_norm": 0.1162029504776001, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 161370 + }, + { + "epoch": 0.6238499481993475, + "grad_norm": 0.10406900942325592, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 161380 + }, + { + "epoch": 0.6238886054027307, + "grad_norm": 0.10797575861215591, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 161390 + }, + { + "epoch": 0.623927262606114, + "grad_norm": 0.1059408187866211, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 161400 + }, + { + "epoch": 0.6239659198094973, + "grad_norm": 0.09478353708982468, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 161410 + }, + { + "epoch": 0.6240045770128806, + "grad_norm": 0.1414909064769745, + "learning_rate": 0.002, + "loss": 2.345, + "step": 161420 + }, + { + "epoch": 0.6240432342162638, + "grad_norm": 0.10855178534984589, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 161430 + }, + { + "epoch": 0.6240818914196471, + "grad_norm": 0.09712377935647964, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 161440 + }, + { + "epoch": 0.6241205486230305, + "grad_norm": 0.09870415180921555, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 161450 + }, + { + "epoch": 0.6241592058264137, + "grad_norm": 0.1139429435133934, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 161460 + }, + { + "epoch": 0.624197863029797, + "grad_norm": 0.10842197388410568, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 161470 + }, + { + "epoch": 0.6242365202331802, + "grad_norm": 0.12628144025802612, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 161480 + }, + { + "epoch": 0.6242751774365636, + "grad_norm": 0.11068233102560043, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 161490 + }, + { + "epoch": 0.6243138346399468, + "grad_norm": 0.12281452119350433, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 161500 + }, + { + "epoch": 0.6243524918433301, + "grad_norm": 0.10054417699575424, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 161510 + }, + { + "epoch": 0.6243911490467133, + "grad_norm": 0.10512567311525345, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 161520 + }, + { + "epoch": 0.6244298062500967, + "grad_norm": 0.10423226654529572, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 161530 + }, + { + "epoch": 0.62446846345348, + "grad_norm": 0.11877862364053726, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 161540 + }, + { + "epoch": 0.6245071206568632, + "grad_norm": 0.10388844460248947, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 161550 + }, + { + "epoch": 0.6245457778602465, + "grad_norm": 0.094859778881073, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 161560 + }, + { + "epoch": 0.6245844350636297, + "grad_norm": 0.12273389846086502, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 161570 + }, + { + "epoch": 0.6246230922670131, + "grad_norm": 0.142424076795578, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 161580 + }, + { + "epoch": 0.6246617494703963, + "grad_norm": 0.1083383709192276, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 161590 + }, + { + "epoch": 0.6247004066737796, + "grad_norm": 0.10823900252580643, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 161600 + }, + { + "epoch": 0.6247390638771628, + "grad_norm": 0.10325956344604492, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 161610 + }, + { + "epoch": 0.6247777210805462, + "grad_norm": 0.1117747575044632, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 161620 + }, + { + "epoch": 0.6248163782839294, + "grad_norm": 0.10606677085161209, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 161630 + }, + { + "epoch": 0.6248550354873127, + "grad_norm": 0.11204840987920761, + "learning_rate": 0.002, + "loss": 2.354, + "step": 161640 + }, + { + "epoch": 0.624893692690696, + "grad_norm": 0.1343681663274765, + "learning_rate": 0.002, + "loss": 2.354, + "step": 161650 + }, + { + "epoch": 0.6249323498940793, + "grad_norm": 0.11574757844209671, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 161660 + }, + { + "epoch": 0.6249710070974626, + "grad_norm": 0.11187739670276642, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 161670 + }, + { + "epoch": 0.6250096643008458, + "grad_norm": 0.11739975214004517, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 161680 + }, + { + "epoch": 0.6250483215042291, + "grad_norm": 0.1051754280924797, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 161690 + }, + { + "epoch": 0.6250869787076124, + "grad_norm": 0.09944341331720352, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 161700 + }, + { + "epoch": 0.6251256359109957, + "grad_norm": 0.11645132303237915, + "learning_rate": 0.002, + "loss": 2.356, + "step": 161710 + }, + { + "epoch": 0.6251642931143789, + "grad_norm": 0.10068979114294052, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 161720 + }, + { + "epoch": 0.6252029503177622, + "grad_norm": 0.10064230114221573, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 161730 + }, + { + "epoch": 0.6252416075211454, + "grad_norm": 0.12956196069717407, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 161740 + }, + { + "epoch": 0.6252802647245288, + "grad_norm": 0.10924475640058517, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 161750 + }, + { + "epoch": 0.625318921927912, + "grad_norm": 0.10532253235578537, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 161760 + }, + { + "epoch": 0.6253575791312953, + "grad_norm": 0.09834955632686615, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 161770 + }, + { + "epoch": 0.6253962363346786, + "grad_norm": 0.10091876238584518, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 161780 + }, + { + "epoch": 0.6254348935380619, + "grad_norm": 0.10023108124732971, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 161790 + }, + { + "epoch": 0.6254735507414452, + "grad_norm": 0.14204120635986328, + "learning_rate": 0.002, + "loss": 2.345, + "step": 161800 + }, + { + "epoch": 0.6255122079448284, + "grad_norm": 0.12205145508050919, + "learning_rate": 0.002, + "loss": 2.342, + "step": 161810 + }, + { + "epoch": 0.6255508651482117, + "grad_norm": 0.10635034739971161, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 161820 + }, + { + "epoch": 0.625589522351595, + "grad_norm": 0.11630839854478836, + "learning_rate": 0.002, + "loss": 2.353, + "step": 161830 + }, + { + "epoch": 0.6256281795549783, + "grad_norm": 0.10341896116733551, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 161840 + }, + { + "epoch": 0.6256668367583615, + "grad_norm": 0.1114698201417923, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 161850 + }, + { + "epoch": 0.6257054939617448, + "grad_norm": 0.10270850360393524, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 161860 + }, + { + "epoch": 0.6257441511651282, + "grad_norm": 0.12893211841583252, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 161870 + }, + { + "epoch": 0.6257828083685114, + "grad_norm": 0.11013567447662354, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 161880 + }, + { + "epoch": 0.6258214655718947, + "grad_norm": 0.09540493786334991, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 161890 + }, + { + "epoch": 0.6258601227752779, + "grad_norm": 0.08586122840642929, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 161900 + }, + { + "epoch": 0.6258987799786612, + "grad_norm": 0.10693126171827316, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 161910 + }, + { + "epoch": 0.6259374371820445, + "grad_norm": 0.1023019403219223, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 161920 + }, + { + "epoch": 0.6259760943854278, + "grad_norm": 0.14287428557872772, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 161930 + }, + { + "epoch": 0.626014751588811, + "grad_norm": 0.10729516297578812, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 161940 + }, + { + "epoch": 0.6260534087921943, + "grad_norm": 0.11237427592277527, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 161950 + }, + { + "epoch": 0.6260920659955777, + "grad_norm": 0.09579712897539139, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 161960 + }, + { + "epoch": 0.6261307231989609, + "grad_norm": 0.12017855048179626, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 161970 + }, + { + "epoch": 0.6261693804023442, + "grad_norm": 0.09329648315906525, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 161980 + }, + { + "epoch": 0.6262080376057274, + "grad_norm": 0.1064738929271698, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 161990 + }, + { + "epoch": 0.6262466948091108, + "grad_norm": 0.10892489552497864, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 162000 + }, + { + "epoch": 0.626285352012494, + "grad_norm": 0.08925356715917587, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 162010 + }, + { + "epoch": 0.6263240092158773, + "grad_norm": 0.10236651450395584, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 162020 + }, + { + "epoch": 0.6263626664192605, + "grad_norm": 0.10449860244989395, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 162030 + }, + { + "epoch": 0.6264013236226439, + "grad_norm": 0.10375801473855972, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 162040 + }, + { + "epoch": 0.6264399808260271, + "grad_norm": 0.1058206558227539, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 162050 + }, + { + "epoch": 0.6264786380294104, + "grad_norm": 0.12303949892520905, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 162060 + }, + { + "epoch": 0.6265172952327936, + "grad_norm": 0.09614730626344681, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 162070 + }, + { + "epoch": 0.626555952436177, + "grad_norm": 0.10775220394134521, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 162080 + }, + { + "epoch": 0.6265946096395603, + "grad_norm": 0.10548651218414307, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 162090 + }, + { + "epoch": 0.6266332668429435, + "grad_norm": 0.133774533867836, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 162100 + }, + { + "epoch": 0.6266719240463268, + "grad_norm": 0.10769365727901459, + "learning_rate": 0.002, + "loss": 2.333, + "step": 162110 + }, + { + "epoch": 0.62671058124971, + "grad_norm": 0.11196187883615494, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 162120 + }, + { + "epoch": 0.6267492384530934, + "grad_norm": 0.09751760959625244, + "learning_rate": 0.002, + "loss": 2.35, + "step": 162130 + }, + { + "epoch": 0.6267878956564766, + "grad_norm": 0.10065893828868866, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 162140 + }, + { + "epoch": 0.6268265528598599, + "grad_norm": 0.11453534662723541, + "learning_rate": 0.002, + "loss": 2.352, + "step": 162150 + }, + { + "epoch": 0.6268652100632431, + "grad_norm": 0.11050843447446823, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 162160 + }, + { + "epoch": 0.6269038672666265, + "grad_norm": 0.09605526179075241, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 162170 + }, + { + "epoch": 0.6269425244700098, + "grad_norm": 0.11402832716703415, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 162180 + }, + { + "epoch": 0.626981181673393, + "grad_norm": 0.10346776247024536, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 162190 + }, + { + "epoch": 0.6270198388767763, + "grad_norm": 0.11273309588432312, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 162200 + }, + { + "epoch": 0.6270584960801596, + "grad_norm": 0.11330553144216537, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 162210 + }, + { + "epoch": 0.6270971532835429, + "grad_norm": 0.11245008558034897, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 162220 + }, + { + "epoch": 0.6271358104869261, + "grad_norm": 0.0939001739025116, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 162230 + }, + { + "epoch": 0.6271744676903094, + "grad_norm": 0.11301296949386597, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 162240 + }, + { + "epoch": 0.6272131248936927, + "grad_norm": 0.10336881130933762, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 162250 + }, + { + "epoch": 0.627251782097076, + "grad_norm": 0.12161832302808762, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 162260 + }, + { + "epoch": 0.6272904393004592, + "grad_norm": 0.0986507460474968, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 162270 + }, + { + "epoch": 0.6273290965038425, + "grad_norm": 0.09472258388996124, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 162280 + }, + { + "epoch": 0.6273677537072258, + "grad_norm": 0.10754972696304321, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 162290 + }, + { + "epoch": 0.6274064109106091, + "grad_norm": 0.11803829669952393, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 162300 + }, + { + "epoch": 0.6274450681139924, + "grad_norm": 0.0967666506767273, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 162310 + }, + { + "epoch": 0.6274837253173756, + "grad_norm": 0.11755374073982239, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 162320 + }, + { + "epoch": 0.6275223825207589, + "grad_norm": 0.09747770428657532, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 162330 + }, + { + "epoch": 0.6275610397241422, + "grad_norm": 0.10866741091012955, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 162340 + }, + { + "epoch": 0.6275996969275255, + "grad_norm": 0.11812674254179001, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 162350 + }, + { + "epoch": 0.6276383541309087, + "grad_norm": 0.09401613473892212, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 162360 + }, + { + "epoch": 0.627677011334292, + "grad_norm": 0.11415007710456848, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 162370 + }, + { + "epoch": 0.6277156685376754, + "grad_norm": 0.10338752716779709, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 162380 + }, + { + "epoch": 0.6277543257410586, + "grad_norm": 0.13331128656864166, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 162390 + }, + { + "epoch": 0.6277929829444419, + "grad_norm": 0.12205469608306885, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 162400 + }, + { + "epoch": 0.6278316401478251, + "grad_norm": 0.11239419132471085, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 162410 + }, + { + "epoch": 0.6278702973512085, + "grad_norm": 0.10969708114862442, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 162420 + }, + { + "epoch": 0.6279089545545917, + "grad_norm": 0.1146257221698761, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 162430 + }, + { + "epoch": 0.627947611757975, + "grad_norm": 0.1019849181175232, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 162440 + }, + { + "epoch": 0.6279862689613582, + "grad_norm": 0.09386495500802994, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 162450 + }, + { + "epoch": 0.6280249261647416, + "grad_norm": 0.10513816773891449, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 162460 + }, + { + "epoch": 0.6280635833681248, + "grad_norm": 0.11198500543832779, + "learning_rate": 0.002, + "loss": 2.343, + "step": 162470 + }, + { + "epoch": 0.6281022405715081, + "grad_norm": 0.09839422255754471, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 162480 + }, + { + "epoch": 0.6281408977748913, + "grad_norm": 0.09457303583621979, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 162490 + }, + { + "epoch": 0.6281795549782746, + "grad_norm": 0.0936691090464592, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 162500 + }, + { + "epoch": 0.628218212181658, + "grad_norm": 0.1124194860458374, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 162510 + }, + { + "epoch": 0.6282568693850412, + "grad_norm": 0.12158659845590591, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 162520 + }, + { + "epoch": 0.6282955265884245, + "grad_norm": 0.09474817663431168, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 162530 + }, + { + "epoch": 0.6283341837918077, + "grad_norm": 0.09250221401453018, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 162540 + }, + { + "epoch": 0.6283728409951911, + "grad_norm": 0.09708566963672638, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 162550 + }, + { + "epoch": 0.6284114981985743, + "grad_norm": 0.09655487537384033, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 162560 + }, + { + "epoch": 0.6284501554019576, + "grad_norm": 0.0928114727139473, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 162570 + }, + { + "epoch": 0.6284888126053408, + "grad_norm": 0.10528218001127243, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 162580 + }, + { + "epoch": 0.6285274698087242, + "grad_norm": 0.11332085728645325, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 162590 + }, + { + "epoch": 0.6285661270121075, + "grad_norm": 0.11296750605106354, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 162600 + }, + { + "epoch": 0.6286047842154907, + "grad_norm": 0.09918206930160522, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 162610 + }, + { + "epoch": 0.628643441418874, + "grad_norm": 0.11857914924621582, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 162620 + }, + { + "epoch": 0.6286820986222573, + "grad_norm": 0.11675361543893814, + "learning_rate": 0.002, + "loss": 2.341, + "step": 162630 + }, + { + "epoch": 0.6287207558256406, + "grad_norm": 0.13692381978034973, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 162640 + }, + { + "epoch": 0.6287594130290238, + "grad_norm": 0.08462470024824142, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 162650 + }, + { + "epoch": 0.6287980702324071, + "grad_norm": 0.11189759522676468, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 162660 + }, + { + "epoch": 0.6288367274357903, + "grad_norm": 0.12786071002483368, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 162670 + }, + { + "epoch": 0.6288753846391737, + "grad_norm": 0.09691540151834488, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 162680 + }, + { + "epoch": 0.628914041842557, + "grad_norm": 0.10311124473810196, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 162690 + }, + { + "epoch": 0.6289526990459402, + "grad_norm": 0.10573244094848633, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 162700 + }, + { + "epoch": 0.6289913562493235, + "grad_norm": 0.10980737209320068, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 162710 + }, + { + "epoch": 0.6290300134527068, + "grad_norm": 0.1420067548751831, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 162720 + }, + { + "epoch": 0.6290686706560901, + "grad_norm": 0.10817544907331467, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 162730 + }, + { + "epoch": 0.6291073278594733, + "grad_norm": 0.11977480351924896, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 162740 + }, + { + "epoch": 0.6291459850628566, + "grad_norm": 0.10848201811313629, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 162750 + }, + { + "epoch": 0.6291846422662399, + "grad_norm": 0.12340037524700165, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 162760 + }, + { + "epoch": 0.6292232994696232, + "grad_norm": 0.10151165723800659, + "learning_rate": 0.002, + "loss": 2.343, + "step": 162770 + }, + { + "epoch": 0.6292619566730064, + "grad_norm": 0.094622902572155, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 162780 + }, + { + "epoch": 0.6293006138763897, + "grad_norm": 0.10345442593097687, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 162790 + }, + { + "epoch": 0.629339271079773, + "grad_norm": 0.11360262334346771, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 162800 + }, + { + "epoch": 0.6293779282831563, + "grad_norm": 0.11832479387521744, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 162810 + }, + { + "epoch": 0.6294165854865396, + "grad_norm": 0.11140234768390656, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 162820 + }, + { + "epoch": 0.6294552426899228, + "grad_norm": 0.16921010613441467, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 162830 + }, + { + "epoch": 0.6294938998933061, + "grad_norm": 0.1355878859758377, + "learning_rate": 0.002, + "loss": 2.35, + "step": 162840 + }, + { + "epoch": 0.6295325570966894, + "grad_norm": 0.10031003504991531, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 162850 + }, + { + "epoch": 0.6295712143000727, + "grad_norm": 0.09619584679603577, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 162860 + }, + { + "epoch": 0.6296098715034559, + "grad_norm": 0.11223511397838593, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 162870 + }, + { + "epoch": 0.6296485287068392, + "grad_norm": 0.11276748776435852, + "learning_rate": 0.002, + "loss": 2.335, + "step": 162880 + }, + { + "epoch": 0.6296871859102225, + "grad_norm": 0.10060160607099533, + "learning_rate": 0.002, + "loss": 2.347, + "step": 162890 + }, + { + "epoch": 0.6297258431136058, + "grad_norm": 0.09993769228458405, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 162900 + }, + { + "epoch": 0.629764500316989, + "grad_norm": 0.09578732401132584, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 162910 + }, + { + "epoch": 0.6298031575203723, + "grad_norm": 0.16651290655136108, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 162920 + }, + { + "epoch": 0.6298418147237557, + "grad_norm": 0.10034544765949249, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 162930 + }, + { + "epoch": 0.6298804719271389, + "grad_norm": 0.11658070236444473, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 162940 + }, + { + "epoch": 0.6299191291305222, + "grad_norm": 0.08569231629371643, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 162950 + }, + { + "epoch": 0.6299577863339054, + "grad_norm": 0.760219395160675, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 162960 + }, + { + "epoch": 0.6299964435372888, + "grad_norm": 0.11948889493942261, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 162970 + }, + { + "epoch": 0.630035100740672, + "grad_norm": 0.09244943410158157, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 162980 + }, + { + "epoch": 0.6300737579440553, + "grad_norm": 0.08750931918621063, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 162990 + }, + { + "epoch": 0.6301124151474385, + "grad_norm": 0.09580099582672119, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 163000 + }, + { + "epoch": 0.6301510723508219, + "grad_norm": 0.1356991082429886, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 163010 + }, + { + "epoch": 0.6301897295542052, + "grad_norm": 0.10146520286798477, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 163020 + }, + { + "epoch": 0.6302283867575884, + "grad_norm": 0.11479398608207703, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 163030 + }, + { + "epoch": 0.6302670439609717, + "grad_norm": 0.0942661389708519, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 163040 + }, + { + "epoch": 0.6303057011643549, + "grad_norm": 0.10510151088237762, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 163050 + }, + { + "epoch": 0.6303443583677383, + "grad_norm": 0.12045959383249283, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 163060 + }, + { + "epoch": 0.6303830155711215, + "grad_norm": 0.10102223604917526, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 163070 + }, + { + "epoch": 0.6304216727745048, + "grad_norm": 0.10106303542852402, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 163080 + }, + { + "epoch": 0.630460329977888, + "grad_norm": 0.09535668045282364, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 163090 + }, + { + "epoch": 0.6304989871812714, + "grad_norm": 0.09568269550800323, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 163100 + }, + { + "epoch": 0.6305376443846546, + "grad_norm": 0.10704615712165833, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 163110 + }, + { + "epoch": 0.6305763015880379, + "grad_norm": 0.1080525666475296, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 163120 + }, + { + "epoch": 0.6306149587914212, + "grad_norm": 0.11370661854743958, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 163130 + }, + { + "epoch": 0.6306536159948045, + "grad_norm": 0.11887659877538681, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 163140 + }, + { + "epoch": 0.6306922731981878, + "grad_norm": 0.09894140809774399, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 163150 + }, + { + "epoch": 0.630730930401571, + "grad_norm": 0.0998791828751564, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 163160 + }, + { + "epoch": 0.6307695876049543, + "grad_norm": 0.11118055880069733, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 163170 + }, + { + "epoch": 0.6308082448083376, + "grad_norm": 0.10285613685846329, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 163180 + }, + { + "epoch": 0.6308469020117209, + "grad_norm": 0.11359003186225891, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 163190 + }, + { + "epoch": 0.6308855592151041, + "grad_norm": 0.1001129075884819, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 163200 + }, + { + "epoch": 0.6309242164184874, + "grad_norm": 0.09381992369890213, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 163210 + }, + { + "epoch": 0.6309628736218706, + "grad_norm": 0.1008177399635315, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 163220 + }, + { + "epoch": 0.631001530825254, + "grad_norm": 0.09887038916349411, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 163230 + }, + { + "epoch": 0.6310401880286373, + "grad_norm": 0.10974892228841782, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 163240 + }, + { + "epoch": 0.6310788452320205, + "grad_norm": 0.12041390687227249, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 163250 + }, + { + "epoch": 0.6311175024354038, + "grad_norm": 0.0948445051908493, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 163260 + }, + { + "epoch": 0.6311561596387871, + "grad_norm": 0.132850781083107, + "learning_rate": 0.002, + "loss": 2.36, + "step": 163270 + }, + { + "epoch": 0.6311948168421704, + "grad_norm": 0.11287672817707062, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 163280 + }, + { + "epoch": 0.6312334740455536, + "grad_norm": 0.10684788227081299, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 163290 + }, + { + "epoch": 0.6312721312489369, + "grad_norm": 0.10461094975471497, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 163300 + }, + { + "epoch": 0.6313107884523202, + "grad_norm": 0.09377182275056839, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 163310 + }, + { + "epoch": 0.6313494456557035, + "grad_norm": 0.09944086521863937, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 163320 + }, + { + "epoch": 0.6313881028590868, + "grad_norm": 0.10178162902593613, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 163330 + }, + { + "epoch": 0.63142676006247, + "grad_norm": 0.1384466588497162, + "learning_rate": 0.002, + "loss": 2.356, + "step": 163340 + }, + { + "epoch": 0.6314654172658534, + "grad_norm": 0.10060542821884155, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 163350 + }, + { + "epoch": 0.6315040744692366, + "grad_norm": 0.1142500638961792, + "learning_rate": 0.002, + "loss": 2.346, + "step": 163360 + }, + { + "epoch": 0.6315427316726199, + "grad_norm": 0.11058689653873444, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 163370 + }, + { + "epoch": 0.6315813888760031, + "grad_norm": 0.10664571076631546, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 163380 + }, + { + "epoch": 0.6316200460793865, + "grad_norm": 0.1178840771317482, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 163390 + }, + { + "epoch": 0.6316587032827697, + "grad_norm": 0.09117285162210464, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 163400 + }, + { + "epoch": 0.631697360486153, + "grad_norm": 0.11692478507757187, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 163410 + }, + { + "epoch": 0.6317360176895362, + "grad_norm": 0.1247502863407135, + "learning_rate": 0.002, + "loss": 2.335, + "step": 163420 + }, + { + "epoch": 0.6317746748929195, + "grad_norm": 0.11090654134750366, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 163430 + }, + { + "epoch": 0.6318133320963029, + "grad_norm": 0.11316248774528503, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 163440 + }, + { + "epoch": 0.6318519892996861, + "grad_norm": 0.101323701441288, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 163450 + }, + { + "epoch": 0.6318906465030694, + "grad_norm": 0.10291837900876999, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 163460 + }, + { + "epoch": 0.6319293037064526, + "grad_norm": 0.1195756047964096, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 163470 + }, + { + "epoch": 0.631967960909836, + "grad_norm": 0.10091858357191086, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 163480 + }, + { + "epoch": 0.6320066181132192, + "grad_norm": 0.10238576680421829, + "learning_rate": 0.002, + "loss": 2.334, + "step": 163490 + }, + { + "epoch": 0.6320452753166025, + "grad_norm": 0.11800894141197205, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 163500 + }, + { + "epoch": 0.6320839325199857, + "grad_norm": 0.10410700738430023, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 163510 + }, + { + "epoch": 0.6321225897233691, + "grad_norm": 0.11911267787218094, + "learning_rate": 0.002, + "loss": 2.35, + "step": 163520 + }, + { + "epoch": 0.6321612469267524, + "grad_norm": 0.10908859968185425, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 163530 + }, + { + "epoch": 0.6321999041301356, + "grad_norm": 0.10330608487129211, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 163540 + }, + { + "epoch": 0.6322385613335189, + "grad_norm": 0.11335937678813934, + "learning_rate": 0.002, + "loss": 2.353, + "step": 163550 + }, + { + "epoch": 0.6322772185369022, + "grad_norm": 0.10461365431547165, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 163560 + }, + { + "epoch": 0.6323158757402855, + "grad_norm": 0.0950535461306572, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 163570 + }, + { + "epoch": 0.6323545329436687, + "grad_norm": 0.09518333524465561, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 163580 + }, + { + "epoch": 0.632393190147052, + "grad_norm": 0.11334793269634247, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 163590 + }, + { + "epoch": 0.6324318473504352, + "grad_norm": 0.13851404190063477, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 163600 + }, + { + "epoch": 0.6324705045538186, + "grad_norm": 0.09821225702762604, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 163610 + }, + { + "epoch": 0.6325091617572018, + "grad_norm": 0.11290562897920609, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 163620 + }, + { + "epoch": 0.6325478189605851, + "grad_norm": 0.09985516965389252, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 163630 + }, + { + "epoch": 0.6325864761639683, + "grad_norm": 0.11000210791826248, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 163640 + }, + { + "epoch": 0.6326251333673517, + "grad_norm": 0.11401853710412979, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 163650 + }, + { + "epoch": 0.632663790570735, + "grad_norm": 0.1564873307943344, + "learning_rate": 0.002, + "loss": 2.346, + "step": 163660 + }, + { + "epoch": 0.6327024477741182, + "grad_norm": 0.10557052493095398, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 163670 + }, + { + "epoch": 0.6327411049775015, + "grad_norm": 0.09930504113435745, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 163680 + }, + { + "epoch": 0.6327797621808848, + "grad_norm": 0.10658255964517593, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 163690 + }, + { + "epoch": 0.6328184193842681, + "grad_norm": 0.09009906649589539, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 163700 + }, + { + "epoch": 0.6328570765876513, + "grad_norm": 0.09801620244979858, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 163710 + }, + { + "epoch": 0.6328957337910346, + "grad_norm": 0.09540320932865143, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 163720 + }, + { + "epoch": 0.632934390994418, + "grad_norm": 0.11449936777353287, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 163730 + }, + { + "epoch": 0.6329730481978012, + "grad_norm": 0.10777530819177628, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 163740 + }, + { + "epoch": 0.6330117054011845, + "grad_norm": 0.12651008367538452, + "learning_rate": 0.002, + "loss": 2.342, + "step": 163750 + }, + { + "epoch": 0.6330503626045677, + "grad_norm": 0.10534662008285522, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 163760 + }, + { + "epoch": 0.633089019807951, + "grad_norm": 0.10595740377902985, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 163770 + }, + { + "epoch": 0.6331276770113343, + "grad_norm": 0.11781991273164749, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 163780 + }, + { + "epoch": 0.6331663342147176, + "grad_norm": 0.10847979784011841, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 163790 + }, + { + "epoch": 0.6332049914181008, + "grad_norm": 0.11100934445858002, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 163800 + }, + { + "epoch": 0.6332436486214841, + "grad_norm": 0.09589594602584839, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 163810 + }, + { + "epoch": 0.6332823058248674, + "grad_norm": 0.08635708689689636, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 163820 + }, + { + "epoch": 0.6333209630282507, + "grad_norm": 0.09627924114465714, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 163830 + }, + { + "epoch": 0.633359620231634, + "grad_norm": 0.10940419882535934, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 163840 + }, + { + "epoch": 0.6333982774350172, + "grad_norm": 0.10659973323345184, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 163850 + }, + { + "epoch": 0.6334369346384006, + "grad_norm": 0.10410846024751663, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 163860 + }, + { + "epoch": 0.6334755918417838, + "grad_norm": 0.08976782858371735, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 163870 + }, + { + "epoch": 0.6335142490451671, + "grad_norm": 0.11145444214344025, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 163880 + }, + { + "epoch": 0.6335529062485503, + "grad_norm": 0.10460666567087173, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 163890 + }, + { + "epoch": 0.6335915634519337, + "grad_norm": 0.09775000810623169, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 163900 + }, + { + "epoch": 0.6336302206553169, + "grad_norm": 0.10219895094633102, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 163910 + }, + { + "epoch": 0.6336688778587002, + "grad_norm": 0.13126534223556519, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 163920 + }, + { + "epoch": 0.6337075350620834, + "grad_norm": 0.15083256363868713, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 163930 + }, + { + "epoch": 0.6337461922654668, + "grad_norm": 0.13783420622348785, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 163940 + }, + { + "epoch": 0.63378484946885, + "grad_norm": 0.105369433760643, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 163950 + }, + { + "epoch": 0.6338235066722333, + "grad_norm": 0.11018861085176468, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 163960 + }, + { + "epoch": 0.6338621638756166, + "grad_norm": 0.11167337745428085, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 163970 + }, + { + "epoch": 0.6339008210789998, + "grad_norm": 0.09401069581508636, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 163980 + }, + { + "epoch": 0.6339394782823832, + "grad_norm": 0.10706432163715363, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 163990 + }, + { + "epoch": 0.6339781354857664, + "grad_norm": 0.10177291929721832, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 164000 + }, + { + "epoch": 0.6340167926891497, + "grad_norm": 0.10011303424835205, + "learning_rate": 0.002, + "loss": 2.335, + "step": 164010 + }, + { + "epoch": 0.6340554498925329, + "grad_norm": 0.13315925002098083, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 164020 + }, + { + "epoch": 0.6340941070959163, + "grad_norm": 0.1010168194770813, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 164030 + }, + { + "epoch": 0.6341327642992995, + "grad_norm": 0.11664897203445435, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 164040 + }, + { + "epoch": 0.6341714215026828, + "grad_norm": 0.10335059463977814, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 164050 + }, + { + "epoch": 0.634210078706066, + "grad_norm": 0.11125491559505463, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 164060 + }, + { + "epoch": 0.6342487359094494, + "grad_norm": 0.10294213891029358, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 164070 + }, + { + "epoch": 0.6342873931128327, + "grad_norm": 0.10268981754779816, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 164080 + }, + { + "epoch": 0.6343260503162159, + "grad_norm": 0.11555498093366623, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 164090 + }, + { + "epoch": 0.6343647075195992, + "grad_norm": 0.09391045570373535, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 164100 + }, + { + "epoch": 0.6344033647229825, + "grad_norm": 0.08841327577829361, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 164110 + }, + { + "epoch": 0.6344420219263658, + "grad_norm": 0.11089881509542465, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 164120 + }, + { + "epoch": 0.634480679129749, + "grad_norm": 0.10194240510463715, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 164130 + }, + { + "epoch": 0.6345193363331323, + "grad_norm": 0.10533329099416733, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 164140 + }, + { + "epoch": 0.6345579935365155, + "grad_norm": 0.10327567905187607, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 164150 + }, + { + "epoch": 0.6345966507398989, + "grad_norm": 0.12457548081874847, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 164160 + }, + { + "epoch": 0.6346353079432822, + "grad_norm": 0.14414572715759277, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 164170 + }, + { + "epoch": 0.6346739651466654, + "grad_norm": 0.11866335570812225, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 164180 + }, + { + "epoch": 0.6347126223500487, + "grad_norm": 0.09757451713085175, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 164190 + }, + { + "epoch": 0.634751279553432, + "grad_norm": 0.1049044206738472, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 164200 + }, + { + "epoch": 0.6347899367568153, + "grad_norm": 0.08824124187231064, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 164210 + }, + { + "epoch": 0.6348285939601985, + "grad_norm": 0.10676911473274231, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 164220 + }, + { + "epoch": 0.6348672511635818, + "grad_norm": 0.11019159853458405, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 164230 + }, + { + "epoch": 0.6349059083669651, + "grad_norm": 0.13544712960720062, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 164240 + }, + { + "epoch": 0.6349445655703484, + "grad_norm": 0.10107675194740295, + "learning_rate": 0.002, + "loss": 2.337, + "step": 164250 + }, + { + "epoch": 0.6349832227737316, + "grad_norm": 0.10352107882499695, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 164260 + }, + { + "epoch": 0.6350218799771149, + "grad_norm": 0.14161401987075806, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 164270 + }, + { + "epoch": 0.6350605371804983, + "grad_norm": 0.11117195338010788, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 164280 + }, + { + "epoch": 0.6350991943838815, + "grad_norm": 0.0933212861418724, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 164290 + }, + { + "epoch": 0.6351378515872648, + "grad_norm": 0.10913147032260895, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 164300 + }, + { + "epoch": 0.635176508790648, + "grad_norm": 0.08877578377723694, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 164310 + }, + { + "epoch": 0.6352151659940314, + "grad_norm": 0.11947460472583771, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 164320 + }, + { + "epoch": 0.6352538231974146, + "grad_norm": 0.09575257450342178, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 164330 + }, + { + "epoch": 0.6352924804007979, + "grad_norm": 0.09766148775815964, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 164340 + }, + { + "epoch": 0.6353311376041811, + "grad_norm": 0.12030192464590073, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 164350 + }, + { + "epoch": 0.6353697948075644, + "grad_norm": 0.10689179599285126, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 164360 + }, + { + "epoch": 0.6354084520109478, + "grad_norm": 0.10224936157464981, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 164370 + }, + { + "epoch": 0.635447109214331, + "grad_norm": 0.094263955950737, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 164380 + }, + { + "epoch": 0.6354857664177143, + "grad_norm": 0.10004042834043503, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 164390 + }, + { + "epoch": 0.6355244236210975, + "grad_norm": 0.10117456316947937, + "learning_rate": 0.002, + "loss": 2.35, + "step": 164400 + }, + { + "epoch": 0.6355630808244809, + "grad_norm": 0.10049059987068176, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 164410 + }, + { + "epoch": 0.6356017380278641, + "grad_norm": 0.11376563459634781, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 164420 + }, + { + "epoch": 0.6356403952312474, + "grad_norm": 0.10315565019845963, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 164430 + }, + { + "epoch": 0.6356790524346306, + "grad_norm": 0.10755404829978943, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 164440 + }, + { + "epoch": 0.635717709638014, + "grad_norm": 0.0966995358467102, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 164450 + }, + { + "epoch": 0.6357563668413972, + "grad_norm": 0.11110610514879227, + "learning_rate": 0.002, + "loss": 2.351, + "step": 164460 + }, + { + "epoch": 0.6357950240447805, + "grad_norm": 0.11405228823423386, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 164470 + }, + { + "epoch": 0.6358336812481638, + "grad_norm": 0.12278889864683151, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 164480 + }, + { + "epoch": 0.6358723384515471, + "grad_norm": 0.10765815526247025, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 164490 + }, + { + "epoch": 0.6359109956549304, + "grad_norm": 0.1147020012140274, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 164500 + }, + { + "epoch": 0.6359496528583136, + "grad_norm": 0.09279278665781021, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 164510 + }, + { + "epoch": 0.6359883100616969, + "grad_norm": 0.09570721536874771, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 164520 + }, + { + "epoch": 0.6360269672650801, + "grad_norm": 0.11539546400308609, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 164530 + }, + { + "epoch": 0.6360656244684635, + "grad_norm": 0.09704439342021942, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 164540 + }, + { + "epoch": 0.6361042816718467, + "grad_norm": 0.11590418964624405, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 164550 + }, + { + "epoch": 0.63614293887523, + "grad_norm": 0.10154294222593307, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 164560 + }, + { + "epoch": 0.6361815960786132, + "grad_norm": 0.09837421029806137, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 164570 + }, + { + "epoch": 0.6362202532819966, + "grad_norm": 0.10751167684793472, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 164580 + }, + { + "epoch": 0.6362589104853799, + "grad_norm": 0.09590762853622437, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 164590 + }, + { + "epoch": 0.6362975676887631, + "grad_norm": 0.11032095551490784, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 164600 + }, + { + "epoch": 0.6363362248921464, + "grad_norm": 0.09885178506374359, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 164610 + }, + { + "epoch": 0.6363748820955297, + "grad_norm": 0.12746669352054596, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 164620 + }, + { + "epoch": 0.636413539298913, + "grad_norm": 0.09944802522659302, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 164630 + }, + { + "epoch": 0.6364521965022962, + "grad_norm": 0.11385858803987503, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 164640 + }, + { + "epoch": 0.6364908537056795, + "grad_norm": 0.1174749806523323, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 164650 + }, + { + "epoch": 0.6365295109090628, + "grad_norm": 0.10102608799934387, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 164660 + }, + { + "epoch": 0.6365681681124461, + "grad_norm": 0.1062927097082138, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 164670 + }, + { + "epoch": 0.6366068253158293, + "grad_norm": 0.10197556763887405, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 164680 + }, + { + "epoch": 0.6366454825192126, + "grad_norm": 0.11311152577400208, + "learning_rate": 0.002, + "loss": 2.342, + "step": 164690 + }, + { + "epoch": 0.6366841397225959, + "grad_norm": 0.10863665491342545, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 164700 + }, + { + "epoch": 0.6367227969259792, + "grad_norm": 0.09699777513742447, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 164710 + }, + { + "epoch": 0.6367614541293625, + "grad_norm": 0.11128968000411987, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 164720 + }, + { + "epoch": 0.6368001113327457, + "grad_norm": 0.10367298126220703, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 164730 + }, + { + "epoch": 0.636838768536129, + "grad_norm": 0.111191026866436, + "learning_rate": 0.002, + "loss": 2.345, + "step": 164740 + }, + { + "epoch": 0.6368774257395123, + "grad_norm": 0.10926955193281174, + "learning_rate": 0.002, + "loss": 2.33, + "step": 164750 + }, + { + "epoch": 0.6369160829428956, + "grad_norm": 0.12507140636444092, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 164760 + }, + { + "epoch": 0.6369547401462788, + "grad_norm": 0.09888520836830139, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 164770 + }, + { + "epoch": 0.6369933973496621, + "grad_norm": 0.11516008526086807, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 164780 + }, + { + "epoch": 0.6370320545530455, + "grad_norm": 0.10190610587596893, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 164790 + }, + { + "epoch": 0.6370707117564287, + "grad_norm": 0.08981244266033173, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 164800 + }, + { + "epoch": 0.637109368959812, + "grad_norm": 0.1259339600801468, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 164810 + }, + { + "epoch": 0.6371480261631952, + "grad_norm": 0.11346227675676346, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 164820 + }, + { + "epoch": 0.6371866833665786, + "grad_norm": 0.10800295323133469, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 164830 + }, + { + "epoch": 0.6372253405699618, + "grad_norm": 0.1250246912240982, + "learning_rate": 0.002, + "loss": 2.353, + "step": 164840 + }, + { + "epoch": 0.6372639977733451, + "grad_norm": 0.10640424489974976, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 164850 + }, + { + "epoch": 0.6373026549767283, + "grad_norm": 0.12365873903036118, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 164860 + }, + { + "epoch": 0.6373413121801117, + "grad_norm": 0.113923080265522, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 164870 + }, + { + "epoch": 0.637379969383495, + "grad_norm": 0.13655415177345276, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 164880 + }, + { + "epoch": 0.6374186265868782, + "grad_norm": 0.10525278747081757, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 164890 + }, + { + "epoch": 0.6374572837902615, + "grad_norm": 0.09903251379728317, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 164900 + }, + { + "epoch": 0.6374959409936447, + "grad_norm": 0.09784550219774246, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 164910 + }, + { + "epoch": 0.6375345981970281, + "grad_norm": 0.121519073843956, + "learning_rate": 0.002, + "loss": 2.3741, + "step": 164920 + }, + { + "epoch": 0.6375732554004113, + "grad_norm": 0.14645813405513763, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 164930 + }, + { + "epoch": 0.6376119126037946, + "grad_norm": 0.09632168710231781, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 164940 + }, + { + "epoch": 0.6376505698071778, + "grad_norm": 0.09478174895048141, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 164950 + }, + { + "epoch": 0.6376892270105612, + "grad_norm": 0.11825351417064667, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 164960 + }, + { + "epoch": 0.6377278842139444, + "grad_norm": 0.1066097617149353, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 164970 + }, + { + "epoch": 0.6377665414173277, + "grad_norm": 0.10353917628526688, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 164980 + }, + { + "epoch": 0.637805198620711, + "grad_norm": 0.11960267275571823, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 164990 + }, + { + "epoch": 0.6378438558240943, + "grad_norm": 0.10730654746294022, + "learning_rate": 0.002, + "loss": 2.338, + "step": 165000 + }, + { + "epoch": 0.6378825130274776, + "grad_norm": 0.10325822979211807, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 165010 + }, + { + "epoch": 0.6379211702308608, + "grad_norm": 0.09659511595964432, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 165020 + }, + { + "epoch": 0.6379598274342441, + "grad_norm": 0.11972470581531525, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 165030 + }, + { + "epoch": 0.6379984846376274, + "grad_norm": 0.13168546557426453, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 165040 + }, + { + "epoch": 0.6380371418410107, + "grad_norm": 0.09514293074607849, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 165050 + }, + { + "epoch": 0.6380757990443939, + "grad_norm": 0.09249638766050339, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 165060 + }, + { + "epoch": 0.6381144562477772, + "grad_norm": 0.11831273138523102, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 165070 + }, + { + "epoch": 0.6381531134511604, + "grad_norm": 0.10477405041456223, + "learning_rate": 0.002, + "loss": 2.343, + "step": 165080 + }, + { + "epoch": 0.6381917706545438, + "grad_norm": 0.13984504342079163, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 165090 + }, + { + "epoch": 0.638230427857927, + "grad_norm": 0.09976739436388016, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 165100 + }, + { + "epoch": 0.6382690850613103, + "grad_norm": 0.10496357828378677, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 165110 + }, + { + "epoch": 0.6383077422646936, + "grad_norm": 0.11283092200756073, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 165120 + }, + { + "epoch": 0.6383463994680769, + "grad_norm": 0.09938319772481918, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 165130 + }, + { + "epoch": 0.6383850566714602, + "grad_norm": 0.0918104350566864, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 165140 + }, + { + "epoch": 0.6384237138748434, + "grad_norm": 0.12190679460763931, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 165150 + }, + { + "epoch": 0.6384623710782267, + "grad_norm": 0.09865977615118027, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 165160 + }, + { + "epoch": 0.63850102828161, + "grad_norm": 0.10292075574398041, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 165170 + }, + { + "epoch": 0.6385396854849933, + "grad_norm": 0.11105905473232269, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 165180 + }, + { + "epoch": 0.6385783426883765, + "grad_norm": 0.09727694094181061, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 165190 + }, + { + "epoch": 0.6386169998917598, + "grad_norm": 0.09970822930335999, + "learning_rate": 0.002, + "loss": 2.329, + "step": 165200 + }, + { + "epoch": 0.6386556570951432, + "grad_norm": 0.09458530694246292, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 165210 + }, + { + "epoch": 0.6386943142985264, + "grad_norm": 0.09014523029327393, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 165220 + }, + { + "epoch": 0.6387329715019097, + "grad_norm": 0.10275470465421677, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 165230 + }, + { + "epoch": 0.6387716287052929, + "grad_norm": 0.09910184890031815, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 165240 + }, + { + "epoch": 0.6388102859086762, + "grad_norm": 0.10784036666154861, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 165250 + }, + { + "epoch": 0.6388489431120595, + "grad_norm": 0.18909576535224915, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 165260 + }, + { + "epoch": 0.6388876003154428, + "grad_norm": 0.10532589256763458, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 165270 + }, + { + "epoch": 0.638926257518826, + "grad_norm": 0.11066906899213791, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 165280 + }, + { + "epoch": 0.6389649147222093, + "grad_norm": 0.1000770553946495, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 165290 + }, + { + "epoch": 0.6390035719255927, + "grad_norm": 0.1069023460149765, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 165300 + }, + { + "epoch": 0.6390422291289759, + "grad_norm": 0.08948520570993423, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 165310 + }, + { + "epoch": 0.6390808863323592, + "grad_norm": 0.11200417578220367, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 165320 + }, + { + "epoch": 0.6391195435357424, + "grad_norm": 0.09061744809150696, + "learning_rate": 0.002, + "loss": 2.342, + "step": 165330 + }, + { + "epoch": 0.6391582007391258, + "grad_norm": 0.11405932158231735, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 165340 + }, + { + "epoch": 0.639196857942509, + "grad_norm": 0.2837398648262024, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 165350 + }, + { + "epoch": 0.6392355151458923, + "grad_norm": 0.12070345133543015, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 165360 + }, + { + "epoch": 0.6392741723492755, + "grad_norm": 0.11351531744003296, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 165370 + }, + { + "epoch": 0.6393128295526589, + "grad_norm": 0.11273518204689026, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 165380 + }, + { + "epoch": 0.6393514867560421, + "grad_norm": 0.12190520763397217, + "learning_rate": 0.002, + "loss": 2.342, + "step": 165390 + }, + { + "epoch": 0.6393901439594254, + "grad_norm": 0.11630602180957794, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 165400 + }, + { + "epoch": 0.6394288011628086, + "grad_norm": 0.10664429515600204, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 165410 + }, + { + "epoch": 0.639467458366192, + "grad_norm": 0.10132356733083725, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 165420 + }, + { + "epoch": 0.6395061155695753, + "grad_norm": 0.10533542931079865, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 165430 + }, + { + "epoch": 0.6395447727729585, + "grad_norm": 0.09856123477220535, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 165440 + }, + { + "epoch": 0.6395834299763418, + "grad_norm": 0.2605516314506531, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 165450 + }, + { + "epoch": 0.639622087179725, + "grad_norm": 0.11958436667919159, + "learning_rate": 0.002, + "loss": 2.349, + "step": 165460 + }, + { + "epoch": 0.6396607443831084, + "grad_norm": 0.11529088765382767, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 165470 + }, + { + "epoch": 0.6396994015864916, + "grad_norm": 0.10149786621332169, + "learning_rate": 0.002, + "loss": 2.338, + "step": 165480 + }, + { + "epoch": 0.6397380587898749, + "grad_norm": 0.11168461292982101, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 165490 + }, + { + "epoch": 0.6397767159932581, + "grad_norm": 0.1082560122013092, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 165500 + }, + { + "epoch": 0.6398153731966415, + "grad_norm": 0.12643194198608398, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 165510 + }, + { + "epoch": 0.6398540304000248, + "grad_norm": 0.1188395693898201, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 165520 + }, + { + "epoch": 0.639892687603408, + "grad_norm": 0.11413159221410751, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 165530 + }, + { + "epoch": 0.6399313448067913, + "grad_norm": 0.10644764453172684, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 165540 + }, + { + "epoch": 0.6399700020101746, + "grad_norm": 0.1004197746515274, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 165550 + }, + { + "epoch": 0.6400086592135579, + "grad_norm": 0.14543674886226654, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 165560 + }, + { + "epoch": 0.6400473164169411, + "grad_norm": 0.09863635152578354, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 165570 + }, + { + "epoch": 0.6400859736203244, + "grad_norm": 0.1266559362411499, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 165580 + }, + { + "epoch": 0.6401246308237077, + "grad_norm": 0.11178171634674072, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 165590 + }, + { + "epoch": 0.640163288027091, + "grad_norm": 0.10830844193696976, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 165600 + }, + { + "epoch": 0.6402019452304742, + "grad_norm": 0.10719065368175507, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 165610 + }, + { + "epoch": 0.6402406024338575, + "grad_norm": 0.16013024747371674, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 165620 + }, + { + "epoch": 0.6402792596372407, + "grad_norm": 0.11934558302164078, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 165630 + }, + { + "epoch": 0.6403179168406241, + "grad_norm": 0.09629912674427032, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 165640 + }, + { + "epoch": 0.6403565740440074, + "grad_norm": 0.11514745652675629, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 165650 + }, + { + "epoch": 0.6403952312473906, + "grad_norm": 0.09105557948350906, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 165660 + }, + { + "epoch": 0.6404338884507739, + "grad_norm": 0.16520722210407257, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 165670 + }, + { + "epoch": 0.6404725456541572, + "grad_norm": 0.09158007055521011, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 165680 + }, + { + "epoch": 0.6405112028575405, + "grad_norm": 0.1115182489156723, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 165690 + }, + { + "epoch": 0.6405498600609237, + "grad_norm": 0.13768655061721802, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 165700 + }, + { + "epoch": 0.640588517264307, + "grad_norm": 0.2795492112636566, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 165710 + }, + { + "epoch": 0.6406271744676904, + "grad_norm": 0.11476332694292068, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 165720 + }, + { + "epoch": 0.6406658316710736, + "grad_norm": 0.13985703885555267, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 165730 + }, + { + "epoch": 0.6407044888744569, + "grad_norm": 0.10406840592622757, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 165740 + }, + { + "epoch": 0.6407431460778401, + "grad_norm": 0.10520946234464645, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 165750 + }, + { + "epoch": 0.6407818032812235, + "grad_norm": 0.10424433648586273, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 165760 + }, + { + "epoch": 0.6408204604846067, + "grad_norm": 0.11177675426006317, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 165770 + }, + { + "epoch": 0.64085911768799, + "grad_norm": 0.09933862835168839, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 165780 + }, + { + "epoch": 0.6408977748913732, + "grad_norm": 0.09627087414264679, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 165790 + }, + { + "epoch": 0.6409364320947566, + "grad_norm": 0.11053643375635147, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 165800 + }, + { + "epoch": 0.6409750892981398, + "grad_norm": 0.10794644802808762, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 165810 + }, + { + "epoch": 0.6410137465015231, + "grad_norm": 0.10086505860090256, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 165820 + }, + { + "epoch": 0.6410524037049063, + "grad_norm": 0.11320238560438156, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 165830 + }, + { + "epoch": 0.6410910609082896, + "grad_norm": 0.09917449206113815, + "learning_rate": 0.002, + "loss": 2.352, + "step": 165840 + }, + { + "epoch": 0.641129718111673, + "grad_norm": 0.12727132439613342, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 165850 + }, + { + "epoch": 0.6411683753150562, + "grad_norm": 0.10197830945253372, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 165860 + }, + { + "epoch": 0.6412070325184395, + "grad_norm": 0.089323990046978, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 165870 + }, + { + "epoch": 0.6412456897218227, + "grad_norm": 0.10101732611656189, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 165880 + }, + { + "epoch": 0.6412843469252061, + "grad_norm": 0.08730916678905487, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 165890 + }, + { + "epoch": 0.6413230041285893, + "grad_norm": 0.10433012247085571, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 165900 + }, + { + "epoch": 0.6413616613319726, + "grad_norm": 0.15657569468021393, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 165910 + }, + { + "epoch": 0.6414003185353558, + "grad_norm": 0.11098662763834, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 165920 + }, + { + "epoch": 0.6414389757387392, + "grad_norm": 0.09086654335260391, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 165930 + }, + { + "epoch": 0.6414776329421225, + "grad_norm": 0.10442627221345901, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 165940 + }, + { + "epoch": 0.6415162901455057, + "grad_norm": 0.09077071398496628, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 165950 + }, + { + "epoch": 0.641554947348889, + "grad_norm": 0.11129163950681686, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 165960 + }, + { + "epoch": 0.6415936045522723, + "grad_norm": 0.12598635256290436, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 165970 + }, + { + "epoch": 0.6416322617556556, + "grad_norm": 0.13409005105495453, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 165980 + }, + { + "epoch": 0.6416709189590388, + "grad_norm": 0.17931459844112396, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 165990 + }, + { + "epoch": 0.6417095761624221, + "grad_norm": 0.10173191130161285, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 166000 + }, + { + "epoch": 0.6417482333658053, + "grad_norm": 0.10050991922616959, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 166010 + }, + { + "epoch": 0.6417868905691887, + "grad_norm": 0.11818138509988785, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 166020 + }, + { + "epoch": 0.641825547772572, + "grad_norm": 0.10185743868350983, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 166030 + }, + { + "epoch": 0.6418642049759552, + "grad_norm": 0.11149696260690689, + "learning_rate": 0.002, + "loss": 2.333, + "step": 166040 + }, + { + "epoch": 0.6419028621793385, + "grad_norm": 0.13266867399215698, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 166050 + }, + { + "epoch": 0.6419415193827218, + "grad_norm": 0.12067516148090363, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 166060 + }, + { + "epoch": 0.6419801765861051, + "grad_norm": 0.09225700795650482, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 166070 + }, + { + "epoch": 0.6420188337894883, + "grad_norm": 0.13785415887832642, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 166080 + }, + { + "epoch": 0.6420574909928716, + "grad_norm": 0.09486342966556549, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 166090 + }, + { + "epoch": 0.6420961481962549, + "grad_norm": 0.1279897838830948, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 166100 + }, + { + "epoch": 0.6421348053996382, + "grad_norm": 0.11228784918785095, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 166110 + }, + { + "epoch": 0.6421734626030214, + "grad_norm": 0.11330057680606842, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 166120 + }, + { + "epoch": 0.6422121198064047, + "grad_norm": 0.11201409995555878, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 166130 + }, + { + "epoch": 0.642250777009788, + "grad_norm": 0.10510092228651047, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 166140 + }, + { + "epoch": 0.6422894342131713, + "grad_norm": 0.09827280789613724, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 166150 + }, + { + "epoch": 0.6423280914165546, + "grad_norm": 0.11704771965742111, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 166160 + }, + { + "epoch": 0.6423667486199378, + "grad_norm": 0.12008139491081238, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 166170 + }, + { + "epoch": 0.6424054058233211, + "grad_norm": 0.11050605028867722, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 166180 + }, + { + "epoch": 0.6424440630267044, + "grad_norm": 0.09580128639936447, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 166190 + }, + { + "epoch": 0.6424827202300877, + "grad_norm": 0.1491931825876236, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 166200 + }, + { + "epoch": 0.6425213774334709, + "grad_norm": 0.09168663620948792, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 166210 + }, + { + "epoch": 0.6425600346368542, + "grad_norm": 0.09992379695177078, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 166220 + }, + { + "epoch": 0.6425986918402375, + "grad_norm": 0.10679803043603897, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 166230 + }, + { + "epoch": 0.6426373490436208, + "grad_norm": 0.10552909970283508, + "learning_rate": 0.002, + "loss": 2.347, + "step": 166240 + }, + { + "epoch": 0.642676006247004, + "grad_norm": 0.20289833843708038, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 166250 + }, + { + "epoch": 0.6427146634503873, + "grad_norm": 0.13337108492851257, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 166260 + }, + { + "epoch": 0.6427533206537707, + "grad_norm": 0.10712134838104248, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 166270 + }, + { + "epoch": 0.6427919778571539, + "grad_norm": 0.11020371317863464, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 166280 + }, + { + "epoch": 0.6428306350605372, + "grad_norm": 0.11662837862968445, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 166290 + }, + { + "epoch": 0.6428692922639204, + "grad_norm": 0.10071861743927002, + "learning_rate": 0.002, + "loss": 2.34, + "step": 166300 + }, + { + "epoch": 0.6429079494673038, + "grad_norm": 0.11239639669656754, + "learning_rate": 0.002, + "loss": 2.335, + "step": 166310 + }, + { + "epoch": 0.642946606670687, + "grad_norm": 0.1093963086605072, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 166320 + }, + { + "epoch": 0.6429852638740703, + "grad_norm": 0.1184404194355011, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 166330 + }, + { + "epoch": 0.6430239210774535, + "grad_norm": 0.11267964541912079, + "learning_rate": 0.002, + "loss": 2.334, + "step": 166340 + }, + { + "epoch": 0.6430625782808369, + "grad_norm": 0.10850871354341507, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 166350 + }, + { + "epoch": 0.6431012354842202, + "grad_norm": 0.10646646469831467, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 166360 + }, + { + "epoch": 0.6431398926876034, + "grad_norm": 0.10959392786026001, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 166370 + }, + { + "epoch": 0.6431785498909867, + "grad_norm": 0.10171080380678177, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 166380 + }, + { + "epoch": 0.6432172070943699, + "grad_norm": 0.12380674481391907, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 166390 + }, + { + "epoch": 0.6432558642977533, + "grad_norm": 0.11397700756788254, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 166400 + }, + { + "epoch": 0.6432945215011365, + "grad_norm": 0.12687024474143982, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 166410 + }, + { + "epoch": 0.6433331787045198, + "grad_norm": 0.11587760597467422, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 166420 + }, + { + "epoch": 0.643371835907903, + "grad_norm": 0.09550909698009491, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 166430 + }, + { + "epoch": 0.6434104931112864, + "grad_norm": 0.10184428095817566, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 166440 + }, + { + "epoch": 0.6434491503146696, + "grad_norm": 0.09556592255830765, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 166450 + }, + { + "epoch": 0.6434878075180529, + "grad_norm": 0.10573387145996094, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 166460 + }, + { + "epoch": 0.6435264647214362, + "grad_norm": 0.10674691945314407, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 166470 + }, + { + "epoch": 0.6435651219248195, + "grad_norm": 0.12763813138008118, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 166480 + }, + { + "epoch": 0.6436037791282028, + "grad_norm": 0.10722203552722931, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 166490 + }, + { + "epoch": 0.643642436331586, + "grad_norm": 0.10920672863721848, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 166500 + }, + { + "epoch": 0.6436810935349693, + "grad_norm": 0.09678395837545395, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 166510 + }, + { + "epoch": 0.6437197507383526, + "grad_norm": 0.10371027141809464, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 166520 + }, + { + "epoch": 0.6437584079417359, + "grad_norm": 0.09537974745035172, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 166530 + }, + { + "epoch": 0.6437970651451191, + "grad_norm": 0.12189342826604843, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 166540 + }, + { + "epoch": 0.6438357223485024, + "grad_norm": 0.0994722917675972, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 166550 + }, + { + "epoch": 0.6438743795518856, + "grad_norm": 0.1094537302851677, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 166560 + }, + { + "epoch": 0.643913036755269, + "grad_norm": 0.1033482551574707, + "learning_rate": 0.002, + "loss": 2.349, + "step": 166570 + }, + { + "epoch": 0.6439516939586523, + "grad_norm": 0.08849822729825974, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 166580 + }, + { + "epoch": 0.6439903511620355, + "grad_norm": 0.11044814437627792, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 166590 + }, + { + "epoch": 0.6440290083654188, + "grad_norm": 0.11067377775907516, + "learning_rate": 0.002, + "loss": 2.333, + "step": 166600 + }, + { + "epoch": 0.6440676655688021, + "grad_norm": 0.10998781770467758, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 166610 + }, + { + "epoch": 0.6441063227721854, + "grad_norm": 0.10315155982971191, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 166620 + }, + { + "epoch": 0.6441449799755686, + "grad_norm": 0.09741552919149399, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 166630 + }, + { + "epoch": 0.6441836371789519, + "grad_norm": 0.14576038718223572, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 166640 + }, + { + "epoch": 0.6442222943823352, + "grad_norm": 0.09207435697317123, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 166650 + }, + { + "epoch": 0.6442609515857185, + "grad_norm": 0.10076536238193512, + "learning_rate": 0.002, + "loss": 2.336, + "step": 166660 + }, + { + "epoch": 0.6442996087891018, + "grad_norm": 0.11621806025505066, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 166670 + }, + { + "epoch": 0.644338265992485, + "grad_norm": 0.11521206051111221, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 166680 + }, + { + "epoch": 0.6443769231958684, + "grad_norm": 0.10904458165168762, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 166690 + }, + { + "epoch": 0.6444155803992516, + "grad_norm": 0.14390254020690918, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 166700 + }, + { + "epoch": 0.6444542376026349, + "grad_norm": 0.1041063517332077, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 166710 + }, + { + "epoch": 0.6444928948060181, + "grad_norm": 0.10243967920541763, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 166720 + }, + { + "epoch": 0.6445315520094015, + "grad_norm": 0.10894256830215454, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 166730 + }, + { + "epoch": 0.6445702092127847, + "grad_norm": 0.14673057198524475, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 166740 + }, + { + "epoch": 0.644608866416168, + "grad_norm": 0.09796229004859924, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 166750 + }, + { + "epoch": 0.6446475236195512, + "grad_norm": 0.09875568747520447, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 166760 + }, + { + "epoch": 0.6446861808229345, + "grad_norm": 0.09696167707443237, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 166770 + }, + { + "epoch": 0.6447248380263179, + "grad_norm": 0.12166699767112732, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 166780 + }, + { + "epoch": 0.6447634952297011, + "grad_norm": 0.12227968871593475, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 166790 + }, + { + "epoch": 0.6448021524330844, + "grad_norm": 0.11464644968509674, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 166800 + }, + { + "epoch": 0.6448408096364676, + "grad_norm": 0.14891226589679718, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 166810 + }, + { + "epoch": 0.644879466839851, + "grad_norm": 0.1151174008846283, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 166820 + }, + { + "epoch": 0.6449181240432342, + "grad_norm": 0.09968895465135574, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 166830 + }, + { + "epoch": 0.6449567812466175, + "grad_norm": 0.12719690799713135, + "learning_rate": 0.002, + "loss": 2.34, + "step": 166840 + }, + { + "epoch": 0.6449954384500007, + "grad_norm": 0.1118369996547699, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 166850 + }, + { + "epoch": 0.6450340956533841, + "grad_norm": 0.15037807822227478, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 166860 + }, + { + "epoch": 0.6450727528567674, + "grad_norm": 0.09736327826976776, + "learning_rate": 0.002, + "loss": 2.345, + "step": 166870 + }, + { + "epoch": 0.6451114100601506, + "grad_norm": 0.14185789227485657, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 166880 + }, + { + "epoch": 0.6451500672635339, + "grad_norm": 0.10070722550153732, + "learning_rate": 0.002, + "loss": 2.341, + "step": 166890 + }, + { + "epoch": 0.6451887244669172, + "grad_norm": 0.09284228831529617, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 166900 + }, + { + "epoch": 0.6452273816703005, + "grad_norm": 0.12558911740779877, + "learning_rate": 0.002, + "loss": 2.37, + "step": 166910 + }, + { + "epoch": 0.6452660388736837, + "grad_norm": 0.10748089849948883, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 166920 + }, + { + "epoch": 0.645304696077067, + "grad_norm": 0.09128913283348083, + "learning_rate": 0.002, + "loss": 2.343, + "step": 166930 + }, + { + "epoch": 0.6453433532804502, + "grad_norm": 0.09370754659175873, + "learning_rate": 0.002, + "loss": 2.338, + "step": 166940 + }, + { + "epoch": 0.6453820104838336, + "grad_norm": 0.10552221536636353, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 166950 + }, + { + "epoch": 0.6454206676872168, + "grad_norm": 0.10560479760169983, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 166960 + }, + { + "epoch": 0.6454593248906001, + "grad_norm": 0.12228506058454514, + "learning_rate": 0.002, + "loss": 2.343, + "step": 166970 + }, + { + "epoch": 0.6454979820939833, + "grad_norm": 0.10752666741609573, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 166980 + }, + { + "epoch": 0.6455366392973667, + "grad_norm": 0.13183359801769257, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 166990 + }, + { + "epoch": 0.64557529650075, + "grad_norm": 0.09106297045946121, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 167000 + }, + { + "epoch": 0.6456139537041332, + "grad_norm": 0.09969896823167801, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 167010 + }, + { + "epoch": 0.6456526109075165, + "grad_norm": 0.10386525094509125, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 167020 + }, + { + "epoch": 0.6456912681108998, + "grad_norm": 0.1330537647008896, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 167030 + }, + { + "epoch": 0.6457299253142831, + "grad_norm": 0.1207583099603653, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 167040 + }, + { + "epoch": 0.6457685825176663, + "grad_norm": 0.11657993495464325, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 167050 + }, + { + "epoch": 0.6458072397210496, + "grad_norm": 0.1179221123456955, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 167060 + }, + { + "epoch": 0.645845896924433, + "grad_norm": 0.11940892040729523, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 167070 + }, + { + "epoch": 0.6458845541278162, + "grad_norm": 0.10305957496166229, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 167080 + }, + { + "epoch": 0.6459232113311995, + "grad_norm": 0.09387367963790894, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 167090 + }, + { + "epoch": 0.6459618685345827, + "grad_norm": 0.09949074685573578, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 167100 + }, + { + "epoch": 0.646000525737966, + "grad_norm": 0.11801401525735855, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 167110 + }, + { + "epoch": 0.6460391829413493, + "grad_norm": 0.11283881217241287, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 167120 + }, + { + "epoch": 0.6460778401447326, + "grad_norm": 0.10136590152978897, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 167130 + }, + { + "epoch": 0.6461164973481158, + "grad_norm": 0.09200512617826462, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 167140 + }, + { + "epoch": 0.6461551545514991, + "grad_norm": 0.10584022849798203, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 167150 + }, + { + "epoch": 0.6461938117548824, + "grad_norm": 0.11500915884971619, + "learning_rate": 0.002, + "loss": 2.3668, + "step": 167160 + }, + { + "epoch": 0.6462324689582657, + "grad_norm": 0.11622696369886398, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 167170 + }, + { + "epoch": 0.646271126161649, + "grad_norm": 0.1187836229801178, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 167180 + }, + { + "epoch": 0.6463097833650322, + "grad_norm": 0.12957388162612915, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 167190 + }, + { + "epoch": 0.6463484405684156, + "grad_norm": 0.10623416304588318, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 167200 + }, + { + "epoch": 0.6463870977717988, + "grad_norm": 0.09525856375694275, + "learning_rate": 0.002, + "loss": 2.341, + "step": 167210 + }, + { + "epoch": 0.6464257549751821, + "grad_norm": 0.09404592216014862, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 167220 + }, + { + "epoch": 0.6464644121785653, + "grad_norm": 0.11730389297008514, + "learning_rate": 0.002, + "loss": 2.342, + "step": 167230 + }, + { + "epoch": 0.6465030693819487, + "grad_norm": 0.1096835732460022, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 167240 + }, + { + "epoch": 0.6465417265853319, + "grad_norm": 0.11142761260271072, + "learning_rate": 0.002, + "loss": 2.347, + "step": 167250 + }, + { + "epoch": 0.6465803837887152, + "grad_norm": 0.09524443000555038, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 167260 + }, + { + "epoch": 0.6466190409920984, + "grad_norm": 0.09275458008050919, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 167270 + }, + { + "epoch": 0.6466576981954818, + "grad_norm": 0.11118104308843613, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 167280 + }, + { + "epoch": 0.646696355398865, + "grad_norm": 0.10477935522794724, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 167290 + }, + { + "epoch": 0.6467350126022483, + "grad_norm": 0.11114238947629929, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 167300 + }, + { + "epoch": 0.6467736698056316, + "grad_norm": 0.09935715049505234, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 167310 + }, + { + "epoch": 0.6468123270090148, + "grad_norm": 0.09258892387151718, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 167320 + }, + { + "epoch": 0.6468509842123982, + "grad_norm": 0.10509154200553894, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 167330 + }, + { + "epoch": 0.6468896414157814, + "grad_norm": 0.11609136313199997, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 167340 + }, + { + "epoch": 0.6469282986191647, + "grad_norm": 0.10558400303125381, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 167350 + }, + { + "epoch": 0.6469669558225479, + "grad_norm": 0.0904054194688797, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 167360 + }, + { + "epoch": 0.6470056130259313, + "grad_norm": 0.1114594042301178, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 167370 + }, + { + "epoch": 0.6470442702293145, + "grad_norm": 0.09686373919248581, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 167380 + }, + { + "epoch": 0.6470829274326978, + "grad_norm": 0.1125197783112526, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 167390 + }, + { + "epoch": 0.647121584636081, + "grad_norm": 0.09414125978946686, + "learning_rate": 0.002, + "loss": 2.3669, + "step": 167400 + }, + { + "epoch": 0.6471602418394644, + "grad_norm": 0.10086499899625778, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 167410 + }, + { + "epoch": 0.6471988990428477, + "grad_norm": 0.1413799226284027, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 167420 + }, + { + "epoch": 0.6472375562462309, + "grad_norm": 0.11475475877523422, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 167430 + }, + { + "epoch": 0.6472762134496142, + "grad_norm": 0.10616303235292435, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 167440 + }, + { + "epoch": 0.6473148706529975, + "grad_norm": 0.10366956889629364, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 167450 + }, + { + "epoch": 0.6473535278563808, + "grad_norm": 0.10474330931901932, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 167460 + }, + { + "epoch": 0.647392185059764, + "grad_norm": 0.10938055068254471, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 167470 + }, + { + "epoch": 0.6474308422631473, + "grad_norm": 0.0967249721288681, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 167480 + }, + { + "epoch": 0.6474694994665305, + "grad_norm": 0.11261042207479477, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 167490 + }, + { + "epoch": 0.6475081566699139, + "grad_norm": 0.10691956430673599, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 167500 + }, + { + "epoch": 0.6475468138732972, + "grad_norm": 0.11203692853450775, + "learning_rate": 0.002, + "loss": 2.362, + "step": 167510 + }, + { + "epoch": 0.6475854710766804, + "grad_norm": 0.09567520767450333, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 167520 + }, + { + "epoch": 0.6476241282800637, + "grad_norm": 0.12334605306386948, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 167530 + }, + { + "epoch": 0.647662785483447, + "grad_norm": 0.11762816458940506, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 167540 + }, + { + "epoch": 0.6477014426868303, + "grad_norm": 0.09287743270397186, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 167550 + }, + { + "epoch": 0.6477400998902135, + "grad_norm": 0.1160036027431488, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 167560 + }, + { + "epoch": 0.6477787570935968, + "grad_norm": 0.1122221052646637, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 167570 + }, + { + "epoch": 0.6478174142969801, + "grad_norm": 0.11325201392173767, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 167580 + }, + { + "epoch": 0.6478560715003634, + "grad_norm": 0.09645060449838638, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 167590 + }, + { + "epoch": 0.6478947287037466, + "grad_norm": 0.10355105996131897, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 167600 + }, + { + "epoch": 0.6479333859071299, + "grad_norm": 0.09704997390508652, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 167610 + }, + { + "epoch": 0.6479720431105133, + "grad_norm": 0.10785232484340668, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 167620 + }, + { + "epoch": 0.6480107003138965, + "grad_norm": 0.1302575320005417, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 167630 + }, + { + "epoch": 0.6480493575172798, + "grad_norm": 0.09805729240179062, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 167640 + }, + { + "epoch": 0.648088014720663, + "grad_norm": 0.10799106955528259, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 167650 + }, + { + "epoch": 0.6481266719240463, + "grad_norm": 0.11254876106977463, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 167660 + }, + { + "epoch": 0.6481653291274296, + "grad_norm": 0.10297144204378128, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 167670 + }, + { + "epoch": 0.6482039863308129, + "grad_norm": 0.09445956349372864, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 167680 + }, + { + "epoch": 0.6482426435341961, + "grad_norm": 0.11523937433958054, + "learning_rate": 0.002, + "loss": 2.3632, + "step": 167690 + }, + { + "epoch": 0.6482813007375794, + "grad_norm": 0.12582455575466156, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 167700 + }, + { + "epoch": 0.6483199579409628, + "grad_norm": 0.11468572169542313, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 167710 + }, + { + "epoch": 0.648358615144346, + "grad_norm": 0.09803872555494308, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 167720 + }, + { + "epoch": 0.6483972723477293, + "grad_norm": 0.09609941393136978, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 167730 + }, + { + "epoch": 0.6484359295511125, + "grad_norm": 0.10531718283891678, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 167740 + }, + { + "epoch": 0.6484745867544959, + "grad_norm": 0.1036820337176323, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 167750 + }, + { + "epoch": 0.6485132439578791, + "grad_norm": 0.09775065630674362, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 167760 + }, + { + "epoch": 0.6485519011612624, + "grad_norm": 0.10095041245222092, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 167770 + }, + { + "epoch": 0.6485905583646456, + "grad_norm": 0.10327846556901932, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 167780 + }, + { + "epoch": 0.648629215568029, + "grad_norm": 0.2384766787290573, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 167790 + }, + { + "epoch": 0.6486678727714122, + "grad_norm": 0.09711385518312454, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 167800 + }, + { + "epoch": 0.6487065299747955, + "grad_norm": 0.1092362031340599, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 167810 + }, + { + "epoch": 0.6487451871781788, + "grad_norm": 0.11628155410289764, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 167820 + }, + { + "epoch": 0.6487838443815621, + "grad_norm": 0.10958966612815857, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 167830 + }, + { + "epoch": 0.6488225015849454, + "grad_norm": 0.11430975794792175, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 167840 + }, + { + "epoch": 0.6488611587883286, + "grad_norm": 0.08860432356595993, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 167850 + }, + { + "epoch": 0.6488998159917119, + "grad_norm": 0.10471168160438538, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 167860 + }, + { + "epoch": 0.6489384731950951, + "grad_norm": 0.10139449685811996, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 167870 + }, + { + "epoch": 0.6489771303984785, + "grad_norm": 0.10843594372272491, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 167880 + }, + { + "epoch": 0.6490157876018617, + "grad_norm": 0.10248102247714996, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 167890 + }, + { + "epoch": 0.649054444805245, + "grad_norm": 0.08800366520881653, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 167900 + }, + { + "epoch": 0.6490931020086282, + "grad_norm": 0.1202792376279831, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 167910 + }, + { + "epoch": 0.6491317592120116, + "grad_norm": 0.11822064220905304, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 167920 + }, + { + "epoch": 0.6491704164153949, + "grad_norm": 0.09871813654899597, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 167930 + }, + { + "epoch": 0.6492090736187781, + "grad_norm": 0.09461401402950287, + "learning_rate": 0.002, + "loss": 2.344, + "step": 167940 + }, + { + "epoch": 0.6492477308221614, + "grad_norm": 0.1045580804347992, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 167950 + }, + { + "epoch": 0.6492863880255447, + "grad_norm": 0.14083321392536163, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 167960 + }, + { + "epoch": 0.649325045228928, + "grad_norm": 0.1170283779501915, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 167970 + }, + { + "epoch": 0.6493637024323112, + "grad_norm": 0.1271386593580246, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 167980 + }, + { + "epoch": 0.6494023596356945, + "grad_norm": 0.12174142152070999, + "learning_rate": 0.002, + "loss": 2.349, + "step": 167990 + }, + { + "epoch": 0.6494410168390778, + "grad_norm": 0.09934654831886292, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 168000 + }, + { + "epoch": 0.6494796740424611, + "grad_norm": 0.08649374544620514, + "learning_rate": 0.002, + "loss": 2.353, + "step": 168010 + }, + { + "epoch": 0.6495183312458443, + "grad_norm": 0.1037168875336647, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 168020 + }, + { + "epoch": 0.6495569884492276, + "grad_norm": 0.10774770379066467, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 168030 + }, + { + "epoch": 0.6495956456526109, + "grad_norm": 0.10287889838218689, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 168040 + }, + { + "epoch": 0.6496343028559942, + "grad_norm": 0.09221872687339783, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 168050 + }, + { + "epoch": 0.6496729600593775, + "grad_norm": 0.11805696040391922, + "learning_rate": 0.002, + "loss": 2.336, + "step": 168060 + }, + { + "epoch": 0.6497116172627607, + "grad_norm": 0.10697631537914276, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 168070 + }, + { + "epoch": 0.649750274466144, + "grad_norm": 0.11242430657148361, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 168080 + }, + { + "epoch": 0.6497889316695273, + "grad_norm": 0.11488054692745209, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 168090 + }, + { + "epoch": 0.6498275888729106, + "grad_norm": 0.11183663457632065, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 168100 + }, + { + "epoch": 0.6498662460762938, + "grad_norm": 0.10771051794290543, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 168110 + }, + { + "epoch": 0.6499049032796771, + "grad_norm": 0.1694340854883194, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 168120 + }, + { + "epoch": 0.6499435604830605, + "grad_norm": 0.10719171166419983, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 168130 + }, + { + "epoch": 0.6499822176864437, + "grad_norm": 0.09480391442775726, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 168140 + }, + { + "epoch": 0.650020874889827, + "grad_norm": 0.14159810543060303, + "learning_rate": 0.002, + "loss": 2.348, + "step": 168150 + }, + { + "epoch": 0.6500595320932102, + "grad_norm": 0.10143940150737762, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 168160 + }, + { + "epoch": 0.6500981892965936, + "grad_norm": 0.11819176375865936, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 168170 + }, + { + "epoch": 0.6501368464999768, + "grad_norm": 0.11603454500436783, + "learning_rate": 0.002, + "loss": 2.346, + "step": 168180 + }, + { + "epoch": 0.6501755037033601, + "grad_norm": 0.12313628941774368, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 168190 + }, + { + "epoch": 0.6502141609067433, + "grad_norm": 0.09967542439699173, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 168200 + }, + { + "epoch": 0.6502528181101267, + "grad_norm": 0.10996179282665253, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 168210 + }, + { + "epoch": 0.65029147531351, + "grad_norm": 0.099667027592659, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 168220 + }, + { + "epoch": 0.6503301325168932, + "grad_norm": 0.14640873670578003, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 168230 + }, + { + "epoch": 0.6503687897202765, + "grad_norm": 0.10246577858924866, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 168240 + }, + { + "epoch": 0.6504074469236597, + "grad_norm": 0.09082327038049698, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 168250 + }, + { + "epoch": 0.6504461041270431, + "grad_norm": 0.1145891472697258, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 168260 + }, + { + "epoch": 0.6504847613304263, + "grad_norm": 0.10838694870471954, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 168270 + }, + { + "epoch": 0.6505234185338096, + "grad_norm": 0.12250230461359024, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 168280 + }, + { + "epoch": 0.6505620757371928, + "grad_norm": 0.09471901506185532, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 168290 + }, + { + "epoch": 0.6506007329405762, + "grad_norm": 0.1212347075343132, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 168300 + }, + { + "epoch": 0.6506393901439594, + "grad_norm": 0.09806258231401443, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 168310 + }, + { + "epoch": 0.6506780473473427, + "grad_norm": 0.10392429679632187, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 168320 + }, + { + "epoch": 0.6507167045507259, + "grad_norm": 0.10079951584339142, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 168330 + }, + { + "epoch": 0.6507553617541093, + "grad_norm": 0.10576821863651276, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 168340 + }, + { + "epoch": 0.6507940189574926, + "grad_norm": 0.09604281932115555, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 168350 + }, + { + "epoch": 0.6508326761608758, + "grad_norm": 0.12223909050226212, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 168360 + }, + { + "epoch": 0.6508713333642591, + "grad_norm": 0.1080576702952385, + "learning_rate": 0.002, + "loss": 2.345, + "step": 168370 + }, + { + "epoch": 0.6509099905676424, + "grad_norm": 0.10412755608558655, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 168380 + }, + { + "epoch": 0.6509486477710257, + "grad_norm": 0.09832601994276047, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 168390 + }, + { + "epoch": 0.6509873049744089, + "grad_norm": 0.09842384606599808, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 168400 + }, + { + "epoch": 0.6510259621777922, + "grad_norm": 0.11502683907747269, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 168410 + }, + { + "epoch": 0.6510646193811754, + "grad_norm": 0.11557365208864212, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 168420 + }, + { + "epoch": 0.6511032765845588, + "grad_norm": 0.124544657766819, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 168430 + }, + { + "epoch": 0.651141933787942, + "grad_norm": 0.10851182788610458, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 168440 + }, + { + "epoch": 0.6511805909913253, + "grad_norm": 0.10926005244255066, + "learning_rate": 0.002, + "loss": 2.341, + "step": 168450 + }, + { + "epoch": 0.6512192481947086, + "grad_norm": 0.11285897344350815, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 168460 + }, + { + "epoch": 0.6512579053980919, + "grad_norm": 0.1205969750881195, + "learning_rate": 0.002, + "loss": 2.332, + "step": 168470 + }, + { + "epoch": 0.6512965626014752, + "grad_norm": 0.09737059473991394, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 168480 + }, + { + "epoch": 0.6513352198048584, + "grad_norm": 0.10460642725229263, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 168490 + }, + { + "epoch": 0.6513738770082417, + "grad_norm": 0.10728586465120316, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 168500 + }, + { + "epoch": 0.651412534211625, + "grad_norm": 0.11106391996145248, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 168510 + }, + { + "epoch": 0.6514511914150083, + "grad_norm": 0.0917786955833435, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 168520 + }, + { + "epoch": 0.6514898486183915, + "grad_norm": 0.10599620640277863, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 168530 + }, + { + "epoch": 0.6515285058217748, + "grad_norm": 0.10357921570539474, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 168540 + }, + { + "epoch": 0.6515671630251582, + "grad_norm": 0.13063235580921173, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 168550 + }, + { + "epoch": 0.6516058202285414, + "grad_norm": 0.23453116416931152, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 168560 + }, + { + "epoch": 0.6516444774319247, + "grad_norm": 0.11451299488544464, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 168570 + }, + { + "epoch": 0.6516831346353079, + "grad_norm": 0.10291838645935059, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 168580 + }, + { + "epoch": 0.6517217918386912, + "grad_norm": 0.11585838347673416, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 168590 + }, + { + "epoch": 0.6517604490420745, + "grad_norm": 0.0969030112028122, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 168600 + }, + { + "epoch": 0.6517991062454578, + "grad_norm": 0.15430507063865662, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 168610 + }, + { + "epoch": 0.651837763448841, + "grad_norm": 0.12934225797653198, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 168620 + }, + { + "epoch": 0.6518764206522243, + "grad_norm": 0.1302899420261383, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 168630 + }, + { + "epoch": 0.6519150778556076, + "grad_norm": 0.0999523252248764, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 168640 + }, + { + "epoch": 0.6519537350589909, + "grad_norm": 0.1182340458035469, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 168650 + }, + { + "epoch": 0.6519923922623742, + "grad_norm": 0.11040201783180237, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 168660 + }, + { + "epoch": 0.6520310494657574, + "grad_norm": 0.11541494727134705, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 168670 + }, + { + "epoch": 0.6520697066691408, + "grad_norm": 0.12044411152601242, + "learning_rate": 0.002, + "loss": 2.343, + "step": 168680 + }, + { + "epoch": 0.652108363872524, + "grad_norm": 0.09319040924310684, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 168690 + }, + { + "epoch": 0.6521470210759073, + "grad_norm": 0.10696112364530563, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 168700 + }, + { + "epoch": 0.6521856782792905, + "grad_norm": 0.09727698564529419, + "learning_rate": 0.002, + "loss": 2.324, + "step": 168710 + }, + { + "epoch": 0.6522243354826739, + "grad_norm": 0.11124209314584732, + "learning_rate": 0.002, + "loss": 2.358, + "step": 168720 + }, + { + "epoch": 0.6522629926860571, + "grad_norm": 0.10262446105480194, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 168730 + }, + { + "epoch": 0.6523016498894404, + "grad_norm": 0.11032116413116455, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 168740 + }, + { + "epoch": 0.6523403070928236, + "grad_norm": 0.125161811709404, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 168750 + }, + { + "epoch": 0.652378964296207, + "grad_norm": 0.09789597988128662, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 168760 + }, + { + "epoch": 0.6524176214995903, + "grad_norm": 0.09521053731441498, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 168770 + }, + { + "epoch": 0.6524562787029735, + "grad_norm": 0.13877107203006744, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 168780 + }, + { + "epoch": 0.6524949359063568, + "grad_norm": 0.1110723614692688, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 168790 + }, + { + "epoch": 0.65253359310974, + "grad_norm": 0.11743714660406113, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 168800 + }, + { + "epoch": 0.6525722503131234, + "grad_norm": 0.1045524850487709, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 168810 + }, + { + "epoch": 0.6526109075165066, + "grad_norm": 0.10068535804748535, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 168820 + }, + { + "epoch": 0.6526495647198899, + "grad_norm": 0.08899284899234772, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 168830 + }, + { + "epoch": 0.6526882219232731, + "grad_norm": 0.140655517578125, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 168840 + }, + { + "epoch": 0.6527268791266565, + "grad_norm": 0.10260647535324097, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 168850 + }, + { + "epoch": 0.6527655363300398, + "grad_norm": 0.10964271426200867, + "learning_rate": 0.002, + "loss": 2.344, + "step": 168860 + }, + { + "epoch": 0.652804193533423, + "grad_norm": 0.11483105272054672, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 168870 + }, + { + "epoch": 0.6528428507368063, + "grad_norm": 0.11260164529085159, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 168880 + }, + { + "epoch": 0.6528815079401896, + "grad_norm": 0.11363021284341812, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 168890 + }, + { + "epoch": 0.6529201651435729, + "grad_norm": 0.10534073412418365, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 168900 + }, + { + "epoch": 0.6529588223469561, + "grad_norm": 0.12031086534261703, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 168910 + }, + { + "epoch": 0.6529974795503394, + "grad_norm": 0.10896068066358566, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 168920 + }, + { + "epoch": 0.6530361367537227, + "grad_norm": 0.1259516328573227, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 168930 + }, + { + "epoch": 0.653074793957106, + "grad_norm": 0.110020712018013, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 168940 + }, + { + "epoch": 0.6531134511604892, + "grad_norm": 0.14240184426307678, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 168950 + }, + { + "epoch": 0.6531521083638725, + "grad_norm": 0.0922495424747467, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 168960 + }, + { + "epoch": 0.6531907655672557, + "grad_norm": 0.08859805017709732, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 168970 + }, + { + "epoch": 0.6532294227706391, + "grad_norm": 0.09676842391490936, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 168980 + }, + { + "epoch": 0.6532680799740224, + "grad_norm": 0.11511459946632385, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 168990 + }, + { + "epoch": 0.6533067371774056, + "grad_norm": 0.09584780037403107, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 169000 + }, + { + "epoch": 0.6533453943807889, + "grad_norm": 0.1151767149567604, + "learning_rate": 0.002, + "loss": 2.333, + "step": 169010 + }, + { + "epoch": 0.6533840515841722, + "grad_norm": 0.09524775296449661, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 169020 + }, + { + "epoch": 0.6534227087875555, + "grad_norm": 0.10495606064796448, + "learning_rate": 0.002, + "loss": 2.331, + "step": 169030 + }, + { + "epoch": 0.6534613659909387, + "grad_norm": 0.09548933058977127, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 169040 + }, + { + "epoch": 0.653500023194322, + "grad_norm": 0.1085919663310051, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 169050 + }, + { + "epoch": 0.6535386803977054, + "grad_norm": 0.09393031895160675, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 169060 + }, + { + "epoch": 0.6535773376010886, + "grad_norm": 0.11786603927612305, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 169070 + }, + { + "epoch": 0.6536159948044719, + "grad_norm": 0.09170527011156082, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 169080 + }, + { + "epoch": 0.6536546520078551, + "grad_norm": 0.11640550196170807, + "learning_rate": 0.002, + "loss": 2.329, + "step": 169090 + }, + { + "epoch": 0.6536933092112385, + "grad_norm": 0.10119421035051346, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 169100 + }, + { + "epoch": 0.6537319664146217, + "grad_norm": 0.1151321604847908, + "learning_rate": 0.002, + "loss": 2.342, + "step": 169110 + }, + { + "epoch": 0.653770623618005, + "grad_norm": 0.11418864130973816, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 169120 + }, + { + "epoch": 0.6538092808213882, + "grad_norm": 0.10545522719621658, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 169130 + }, + { + "epoch": 0.6538479380247716, + "grad_norm": 0.09957364946603775, + "learning_rate": 0.002, + "loss": 2.341, + "step": 169140 + }, + { + "epoch": 0.6538865952281548, + "grad_norm": 0.12598615884780884, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 169150 + }, + { + "epoch": 0.6539252524315381, + "grad_norm": 0.1004445031285286, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 169160 + }, + { + "epoch": 0.6539639096349213, + "grad_norm": 0.09190630912780762, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 169170 + }, + { + "epoch": 0.6540025668383046, + "grad_norm": 0.12551818788051605, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 169180 + }, + { + "epoch": 0.654041224041688, + "grad_norm": 0.09470973163843155, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 169190 + }, + { + "epoch": 0.6540798812450712, + "grad_norm": 0.11399231106042862, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 169200 + }, + { + "epoch": 0.6541185384484545, + "grad_norm": 0.09847266227006912, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 169210 + }, + { + "epoch": 0.6541571956518377, + "grad_norm": 0.1182786300778389, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 169220 + }, + { + "epoch": 0.6541958528552211, + "grad_norm": 0.11805164813995361, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 169230 + }, + { + "epoch": 0.6542345100586043, + "grad_norm": 0.11220698058605194, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 169240 + }, + { + "epoch": 0.6542731672619876, + "grad_norm": 0.11356706917285919, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 169250 + }, + { + "epoch": 0.6543118244653708, + "grad_norm": 0.09796137362718582, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 169260 + }, + { + "epoch": 0.6543504816687542, + "grad_norm": 0.11808452755212784, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 169270 + }, + { + "epoch": 0.6543891388721375, + "grad_norm": 0.17121228575706482, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 169280 + }, + { + "epoch": 0.6544277960755207, + "grad_norm": 0.1005435660481453, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 169290 + }, + { + "epoch": 0.654466453278904, + "grad_norm": 0.10762804001569748, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 169300 + }, + { + "epoch": 0.6545051104822873, + "grad_norm": 0.10820998251438141, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 169310 + }, + { + "epoch": 0.6545437676856706, + "grad_norm": 0.10625746846199036, + "learning_rate": 0.002, + "loss": 2.354, + "step": 169320 + }, + { + "epoch": 0.6545824248890538, + "grad_norm": 0.09775619208812714, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 169330 + }, + { + "epoch": 0.6546210820924371, + "grad_norm": 0.09884735196828842, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 169340 + }, + { + "epoch": 0.6546597392958203, + "grad_norm": 0.11361068487167358, + "learning_rate": 0.002, + "loss": 2.359, + "step": 169350 + }, + { + "epoch": 0.6546983964992037, + "grad_norm": 0.09546619653701782, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 169360 + }, + { + "epoch": 0.654737053702587, + "grad_norm": 0.503311038017273, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 169370 + }, + { + "epoch": 0.6547757109059702, + "grad_norm": 0.1377723515033722, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 169380 + }, + { + "epoch": 0.6548143681093535, + "grad_norm": 0.10469796508550644, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 169390 + }, + { + "epoch": 0.6548530253127368, + "grad_norm": 0.09249204397201538, + "learning_rate": 0.002, + "loss": 2.342, + "step": 169400 + }, + { + "epoch": 0.6548916825161201, + "grad_norm": 0.11732906848192215, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 169410 + }, + { + "epoch": 0.6549303397195033, + "grad_norm": 0.1086328774690628, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 169420 + }, + { + "epoch": 0.6549689969228866, + "grad_norm": 0.10182340443134308, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 169430 + }, + { + "epoch": 0.6550076541262699, + "grad_norm": 0.12364275008440018, + "learning_rate": 0.002, + "loss": 2.352, + "step": 169440 + }, + { + "epoch": 0.6550463113296532, + "grad_norm": 0.1127183586359024, + "learning_rate": 0.002, + "loss": 2.349, + "step": 169450 + }, + { + "epoch": 0.6550849685330364, + "grad_norm": 0.08764702081680298, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 169460 + }, + { + "epoch": 0.6551236257364197, + "grad_norm": 0.11021017283201218, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 169470 + }, + { + "epoch": 0.655162282939803, + "grad_norm": 0.1042870506644249, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 169480 + }, + { + "epoch": 0.6552009401431863, + "grad_norm": 0.10292533040046692, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 169490 + }, + { + "epoch": 0.6552395973465696, + "grad_norm": 0.12277467548847198, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 169500 + }, + { + "epoch": 0.6552782545499528, + "grad_norm": 0.09520641714334488, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 169510 + }, + { + "epoch": 0.6553169117533361, + "grad_norm": 0.11056701838970184, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 169520 + }, + { + "epoch": 0.6553555689567194, + "grad_norm": 0.09635456651449203, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 169530 + }, + { + "epoch": 0.6553942261601027, + "grad_norm": 0.10798116773366928, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 169540 + }, + { + "epoch": 0.6554328833634859, + "grad_norm": 0.09781666100025177, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 169550 + }, + { + "epoch": 0.6554715405668692, + "grad_norm": 0.0961836725473404, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 169560 + }, + { + "epoch": 0.6555101977702525, + "grad_norm": 0.1179947480559349, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 169570 + }, + { + "epoch": 0.6555488549736358, + "grad_norm": 0.11416078358888626, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 169580 + }, + { + "epoch": 0.655587512177019, + "grad_norm": 0.10990830510854721, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 169590 + }, + { + "epoch": 0.6556261693804023, + "grad_norm": 0.12073973566293716, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 169600 + }, + { + "epoch": 0.6556648265837857, + "grad_norm": 0.1154635101556778, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 169610 + }, + { + "epoch": 0.6557034837871689, + "grad_norm": 0.10131113976240158, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 169620 + }, + { + "epoch": 0.6557421409905522, + "grad_norm": 0.09605452418327332, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 169630 + }, + { + "epoch": 0.6557807981939354, + "grad_norm": 0.12022049725055695, + "learning_rate": 0.002, + "loss": 2.345, + "step": 169640 + }, + { + "epoch": 0.6558194553973188, + "grad_norm": 0.11077900975942612, + "learning_rate": 0.002, + "loss": 2.325, + "step": 169650 + }, + { + "epoch": 0.655858112600702, + "grad_norm": 0.10479994863271713, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 169660 + }, + { + "epoch": 0.6558967698040853, + "grad_norm": 0.10238562524318695, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 169670 + }, + { + "epoch": 0.6559354270074685, + "grad_norm": 0.09943419694900513, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 169680 + }, + { + "epoch": 0.6559740842108519, + "grad_norm": 0.10164353251457214, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 169690 + }, + { + "epoch": 0.6560127414142352, + "grad_norm": 0.10022822767496109, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 169700 + }, + { + "epoch": 0.6560513986176184, + "grad_norm": 0.10680403560400009, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 169710 + }, + { + "epoch": 0.6560900558210017, + "grad_norm": 0.10432382673025131, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 169720 + }, + { + "epoch": 0.6561287130243849, + "grad_norm": 0.11037509143352509, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 169730 + }, + { + "epoch": 0.6561673702277683, + "grad_norm": 0.10722117871046066, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 169740 + }, + { + "epoch": 0.6562060274311515, + "grad_norm": 0.12687820196151733, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 169750 + }, + { + "epoch": 0.6562446846345348, + "grad_norm": 0.10140392184257507, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 169760 + }, + { + "epoch": 0.656283341837918, + "grad_norm": 0.1151595190167427, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 169770 + }, + { + "epoch": 0.6563219990413014, + "grad_norm": 0.0952373743057251, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 169780 + }, + { + "epoch": 0.6563606562446846, + "grad_norm": 0.11280129104852676, + "learning_rate": 0.002, + "loss": 2.351, + "step": 169790 + }, + { + "epoch": 0.6563993134480679, + "grad_norm": 0.11494222283363342, + "learning_rate": 0.002, + "loss": 2.334, + "step": 169800 + }, + { + "epoch": 0.6564379706514512, + "grad_norm": 0.10257001966238022, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 169810 + }, + { + "epoch": 0.6564766278548345, + "grad_norm": 0.08688849210739136, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 169820 + }, + { + "epoch": 0.6565152850582178, + "grad_norm": 0.10008734464645386, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 169830 + }, + { + "epoch": 0.656553942261601, + "grad_norm": 0.12060262262821198, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 169840 + }, + { + "epoch": 0.6565925994649843, + "grad_norm": 0.10228583961725235, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 169850 + }, + { + "epoch": 0.6566312566683676, + "grad_norm": 0.11394084990024567, + "learning_rate": 0.002, + "loss": 2.357, + "step": 169860 + }, + { + "epoch": 0.6566699138717509, + "grad_norm": 0.10401419550180435, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 169870 + }, + { + "epoch": 0.6567085710751341, + "grad_norm": 0.10975929349660873, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 169880 + }, + { + "epoch": 0.6567472282785174, + "grad_norm": 0.10642538219690323, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 169890 + }, + { + "epoch": 0.6567858854819006, + "grad_norm": 0.1057378351688385, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 169900 + }, + { + "epoch": 0.656824542685284, + "grad_norm": 0.10456452518701553, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 169910 + }, + { + "epoch": 0.6568631998886673, + "grad_norm": 0.09889863431453705, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 169920 + }, + { + "epoch": 0.6569018570920505, + "grad_norm": 0.09749070554971695, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 169930 + }, + { + "epoch": 0.6569405142954338, + "grad_norm": 0.10418283939361572, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 169940 + }, + { + "epoch": 0.6569791714988171, + "grad_norm": 0.12010916322469711, + "learning_rate": 0.002, + "loss": 2.354, + "step": 169950 + }, + { + "epoch": 0.6570178287022004, + "grad_norm": 0.09374318271875381, + "learning_rate": 0.002, + "loss": 2.344, + "step": 169960 + }, + { + "epoch": 0.6570564859055836, + "grad_norm": 0.11701393872499466, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 169970 + }, + { + "epoch": 0.6570951431089669, + "grad_norm": 0.11446920782327652, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 169980 + }, + { + "epoch": 0.6571338003123502, + "grad_norm": 0.10390263050794601, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 169990 + }, + { + "epoch": 0.6571724575157335, + "grad_norm": 0.10065855830907822, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 170000 + }, + { + "epoch": 0.6572111147191168, + "grad_norm": 0.10264160484075546, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 170010 + }, + { + "epoch": 0.6572497719225, + "grad_norm": 0.10007256269454956, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 170020 + }, + { + "epoch": 0.6572884291258834, + "grad_norm": 0.11538249254226685, + "learning_rate": 0.002, + "loss": 2.344, + "step": 170030 + }, + { + "epoch": 0.6573270863292666, + "grad_norm": 0.12416301667690277, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 170040 + }, + { + "epoch": 0.6573657435326499, + "grad_norm": 0.10510220378637314, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 170050 + }, + { + "epoch": 0.6574044007360331, + "grad_norm": 0.10308302938938141, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 170060 + }, + { + "epoch": 0.6574430579394164, + "grad_norm": 0.11111033707857132, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 170070 + }, + { + "epoch": 0.6574817151427997, + "grad_norm": 0.11000286042690277, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 170080 + }, + { + "epoch": 0.657520372346183, + "grad_norm": 0.1113034263253212, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 170090 + }, + { + "epoch": 0.6575590295495662, + "grad_norm": 0.10262025892734528, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 170100 + }, + { + "epoch": 0.6575976867529495, + "grad_norm": 0.09241412580013275, + "learning_rate": 0.002, + "loss": 2.347, + "step": 170110 + }, + { + "epoch": 0.6576363439563329, + "grad_norm": 0.10361220687627792, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 170120 + }, + { + "epoch": 0.6576750011597161, + "grad_norm": 0.0940166786313057, + "learning_rate": 0.002, + "loss": 2.343, + "step": 170130 + }, + { + "epoch": 0.6577136583630994, + "grad_norm": 0.10688845813274384, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 170140 + }, + { + "epoch": 0.6577523155664826, + "grad_norm": 0.0985884964466095, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 170150 + }, + { + "epoch": 0.657790972769866, + "grad_norm": 0.11543738096952438, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 170160 + }, + { + "epoch": 0.6578296299732492, + "grad_norm": 0.11746984720230103, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 170170 + }, + { + "epoch": 0.6578682871766325, + "grad_norm": 0.08940320461988449, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 170180 + }, + { + "epoch": 0.6579069443800157, + "grad_norm": 0.13141517341136932, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 170190 + }, + { + "epoch": 0.6579456015833991, + "grad_norm": 0.1139146015048027, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 170200 + }, + { + "epoch": 0.6579842587867824, + "grad_norm": 0.09153608232736588, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 170210 + }, + { + "epoch": 0.6580229159901656, + "grad_norm": 0.08851674199104309, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 170220 + }, + { + "epoch": 0.6580615731935489, + "grad_norm": 0.09489694982767105, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 170230 + }, + { + "epoch": 0.6581002303969322, + "grad_norm": 0.0882200300693512, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 170240 + }, + { + "epoch": 0.6581388876003155, + "grad_norm": 0.12556059658527374, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 170250 + }, + { + "epoch": 0.6581775448036987, + "grad_norm": 0.11310429126024246, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 170260 + }, + { + "epoch": 0.658216202007082, + "grad_norm": 0.10799125581979752, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 170270 + }, + { + "epoch": 0.6582548592104652, + "grad_norm": 0.10793901234865189, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 170280 + }, + { + "epoch": 0.6582935164138486, + "grad_norm": 0.11244485527276993, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 170290 + }, + { + "epoch": 0.6583321736172318, + "grad_norm": 0.08430363982915878, + "learning_rate": 0.002, + "loss": 2.331, + "step": 170300 + }, + { + "epoch": 0.6583708308206151, + "grad_norm": 0.10967878997325897, + "learning_rate": 0.002, + "loss": 2.348, + "step": 170310 + }, + { + "epoch": 0.6584094880239983, + "grad_norm": 0.10456710308790207, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 170320 + }, + { + "epoch": 0.6584481452273817, + "grad_norm": 0.1308537870645523, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 170330 + }, + { + "epoch": 0.658486802430765, + "grad_norm": 0.09279914945363998, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 170340 + }, + { + "epoch": 0.6585254596341482, + "grad_norm": 0.10395380854606628, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 170350 + }, + { + "epoch": 0.6585641168375315, + "grad_norm": 0.09720392525196075, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 170360 + }, + { + "epoch": 0.6586027740409148, + "grad_norm": 0.12921123206615448, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 170370 + }, + { + "epoch": 0.6586414312442981, + "grad_norm": 0.11742226034402847, + "learning_rate": 0.002, + "loss": 2.362, + "step": 170380 + }, + { + "epoch": 0.6586800884476813, + "grad_norm": 0.0947229340672493, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 170390 + }, + { + "epoch": 0.6587187456510646, + "grad_norm": 0.1013537347316742, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 170400 + }, + { + "epoch": 0.658757402854448, + "grad_norm": 0.11041922867298126, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 170410 + }, + { + "epoch": 0.6587960600578312, + "grad_norm": 0.12972447276115417, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 170420 + }, + { + "epoch": 0.6588347172612145, + "grad_norm": 0.10491415858268738, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 170430 + }, + { + "epoch": 0.6588733744645977, + "grad_norm": 0.11179272085428238, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 170440 + }, + { + "epoch": 0.658912031667981, + "grad_norm": 0.09766574203968048, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 170450 + }, + { + "epoch": 0.6589506888713643, + "grad_norm": 0.09233935177326202, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 170460 + }, + { + "epoch": 0.6589893460747476, + "grad_norm": 0.10109974443912506, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 170470 + }, + { + "epoch": 0.6590280032781308, + "grad_norm": 0.10457310825586319, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 170480 + }, + { + "epoch": 0.6590666604815141, + "grad_norm": 0.13295739889144897, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 170490 + }, + { + "epoch": 0.6591053176848974, + "grad_norm": 0.09190351516008377, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 170500 + }, + { + "epoch": 0.6591439748882807, + "grad_norm": 0.1040644571185112, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 170510 + }, + { + "epoch": 0.659182632091664, + "grad_norm": 0.10038863122463226, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 170520 + }, + { + "epoch": 0.6592212892950472, + "grad_norm": 0.1172012984752655, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 170530 + }, + { + "epoch": 0.6592599464984306, + "grad_norm": 0.10850688070058823, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 170540 + }, + { + "epoch": 0.6592986037018138, + "grad_norm": 0.09408782422542572, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 170550 + }, + { + "epoch": 0.6593372609051971, + "grad_norm": 0.12638218700885773, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 170560 + }, + { + "epoch": 0.6593759181085803, + "grad_norm": 0.10627342760562897, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 170570 + }, + { + "epoch": 0.6594145753119637, + "grad_norm": 0.16851374506950378, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 170580 + }, + { + "epoch": 0.6594532325153469, + "grad_norm": 0.08803575485944748, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 170590 + }, + { + "epoch": 0.6594918897187302, + "grad_norm": 0.11330971121788025, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 170600 + }, + { + "epoch": 0.6595305469221134, + "grad_norm": 0.09639140963554382, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 170610 + }, + { + "epoch": 0.6595692041254968, + "grad_norm": 0.10504119098186493, + "learning_rate": 0.002, + "loss": 2.335, + "step": 170620 + }, + { + "epoch": 0.65960786132888, + "grad_norm": 0.2254355549812317, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 170630 + }, + { + "epoch": 0.6596465185322633, + "grad_norm": 0.09327276796102524, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 170640 + }, + { + "epoch": 0.6596851757356466, + "grad_norm": 0.09641090780496597, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 170650 + }, + { + "epoch": 0.6597238329390298, + "grad_norm": 0.1110328882932663, + "learning_rate": 0.002, + "loss": 2.364, + "step": 170660 + }, + { + "epoch": 0.6597624901424132, + "grad_norm": 0.10514163970947266, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 170670 + }, + { + "epoch": 0.6598011473457964, + "grad_norm": 0.1033729761838913, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 170680 + }, + { + "epoch": 0.6598398045491797, + "grad_norm": 0.09916379302740097, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 170690 + }, + { + "epoch": 0.6598784617525629, + "grad_norm": 0.09037516266107559, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 170700 + }, + { + "epoch": 0.6599171189559463, + "grad_norm": 0.0995062068104744, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 170710 + }, + { + "epoch": 0.6599557761593295, + "grad_norm": 0.1171247586607933, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 170720 + }, + { + "epoch": 0.6599944333627128, + "grad_norm": 0.09908228367567062, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 170730 + }, + { + "epoch": 0.660033090566096, + "grad_norm": 0.10033340752124786, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 170740 + }, + { + "epoch": 0.6600717477694794, + "grad_norm": 0.10300584137439728, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 170750 + }, + { + "epoch": 0.6601104049728627, + "grad_norm": 0.09993764758110046, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 170760 + }, + { + "epoch": 0.6601490621762459, + "grad_norm": 0.1555805802345276, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 170770 + }, + { + "epoch": 0.6601877193796292, + "grad_norm": 0.12053221464157104, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 170780 + }, + { + "epoch": 0.6602263765830125, + "grad_norm": 0.10223250091075897, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 170790 + }, + { + "epoch": 0.6602650337863958, + "grad_norm": 0.10473880916833878, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 170800 + }, + { + "epoch": 0.660303690989779, + "grad_norm": 0.11674695461988449, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 170810 + }, + { + "epoch": 0.6603423481931623, + "grad_norm": 0.10591929405927658, + "learning_rate": 0.002, + "loss": 2.344, + "step": 170820 + }, + { + "epoch": 0.6603810053965455, + "grad_norm": 0.10028078407049179, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 170830 + }, + { + "epoch": 0.6604196625999289, + "grad_norm": 0.08663002401590347, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 170840 + }, + { + "epoch": 0.6604583198033122, + "grad_norm": 0.11164186894893646, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 170850 + }, + { + "epoch": 0.6604969770066954, + "grad_norm": 0.12362227588891983, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 170860 + }, + { + "epoch": 0.6605356342100787, + "grad_norm": 0.09875123202800751, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 170870 + }, + { + "epoch": 0.660574291413462, + "grad_norm": 0.10698625445365906, + "learning_rate": 0.002, + "loss": 2.327, + "step": 170880 + }, + { + "epoch": 0.6606129486168453, + "grad_norm": 0.0939648225903511, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 170890 + }, + { + "epoch": 0.6606516058202285, + "grad_norm": 0.1279655396938324, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 170900 + }, + { + "epoch": 0.6606902630236118, + "grad_norm": 0.11866834759712219, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 170910 + }, + { + "epoch": 0.6607289202269951, + "grad_norm": 0.09348573535680771, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 170920 + }, + { + "epoch": 0.6607675774303784, + "grad_norm": 0.1124269962310791, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 170930 + }, + { + "epoch": 0.6608062346337616, + "grad_norm": 0.12030279636383057, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 170940 + }, + { + "epoch": 0.6608448918371449, + "grad_norm": 0.10367617756128311, + "learning_rate": 0.002, + "loss": 2.321, + "step": 170950 + }, + { + "epoch": 0.6608835490405283, + "grad_norm": 0.09243983775377274, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 170960 + }, + { + "epoch": 0.6609222062439115, + "grad_norm": 0.09867145121097565, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 170970 + }, + { + "epoch": 0.6609608634472948, + "grad_norm": 0.09240375459194183, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 170980 + }, + { + "epoch": 0.660999520650678, + "grad_norm": 0.0999855175614357, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 170990 + }, + { + "epoch": 0.6610381778540613, + "grad_norm": 0.14586104452610016, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 171000 + }, + { + "epoch": 0.6610768350574446, + "grad_norm": 0.11599107831716537, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 171010 + }, + { + "epoch": 0.6611154922608279, + "grad_norm": 0.099213145673275, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 171020 + }, + { + "epoch": 0.6611541494642111, + "grad_norm": 0.10795638710260391, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 171030 + }, + { + "epoch": 0.6611928066675944, + "grad_norm": 0.19326281547546387, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 171040 + }, + { + "epoch": 0.6612314638709778, + "grad_norm": 0.09383906424045563, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 171050 + }, + { + "epoch": 0.661270121074361, + "grad_norm": 0.11155370622873306, + "learning_rate": 0.002, + "loss": 2.3658, + "step": 171060 + }, + { + "epoch": 0.6613087782777443, + "grad_norm": 0.09720378369092941, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 171070 + }, + { + "epoch": 0.6613474354811275, + "grad_norm": 0.10207916796207428, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 171080 + }, + { + "epoch": 0.6613860926845109, + "grad_norm": 0.09510542452335358, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 171090 + }, + { + "epoch": 0.6614247498878941, + "grad_norm": 0.10385362058877945, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 171100 + }, + { + "epoch": 0.6614634070912774, + "grad_norm": 0.09923926740884781, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 171110 + }, + { + "epoch": 0.6615020642946606, + "grad_norm": 0.10726913809776306, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 171120 + }, + { + "epoch": 0.661540721498044, + "grad_norm": 0.13127969205379486, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 171130 + }, + { + "epoch": 0.6615793787014272, + "grad_norm": 0.10140059143304825, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 171140 + }, + { + "epoch": 0.6616180359048105, + "grad_norm": 0.10477566719055176, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 171150 + }, + { + "epoch": 0.6616566931081937, + "grad_norm": 0.13156050443649292, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 171160 + }, + { + "epoch": 0.6616953503115771, + "grad_norm": 0.09503421187400818, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 171170 + }, + { + "epoch": 0.6617340075149604, + "grad_norm": 0.11637887358665466, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 171180 + }, + { + "epoch": 0.6617726647183436, + "grad_norm": 0.0924702063202858, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 171190 + }, + { + "epoch": 0.6618113219217269, + "grad_norm": 0.11030198633670807, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 171200 + }, + { + "epoch": 0.6618499791251101, + "grad_norm": 0.11644049733877182, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 171210 + }, + { + "epoch": 0.6618886363284935, + "grad_norm": 0.10431084781885147, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 171220 + }, + { + "epoch": 0.6619272935318767, + "grad_norm": 0.12342498451471329, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 171230 + }, + { + "epoch": 0.66196595073526, + "grad_norm": 0.08883868902921677, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 171240 + }, + { + "epoch": 0.6620046079386432, + "grad_norm": 0.10376245528459549, + "learning_rate": 0.002, + "loss": 2.352, + "step": 171250 + }, + { + "epoch": 0.6620432651420266, + "grad_norm": 0.12346050143241882, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 171260 + }, + { + "epoch": 0.6620819223454099, + "grad_norm": 0.09900625050067902, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 171270 + }, + { + "epoch": 0.6621205795487931, + "grad_norm": 0.11180734634399414, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 171280 + }, + { + "epoch": 0.6621592367521764, + "grad_norm": 0.11599894613027573, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 171290 + }, + { + "epoch": 0.6621978939555597, + "grad_norm": 0.09575050324201584, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 171300 + }, + { + "epoch": 0.662236551158943, + "grad_norm": 0.10637818276882172, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 171310 + }, + { + "epoch": 0.6622752083623262, + "grad_norm": 0.10875699669122696, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 171320 + }, + { + "epoch": 0.6623138655657095, + "grad_norm": 0.09395532310009003, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 171330 + }, + { + "epoch": 0.6623525227690928, + "grad_norm": 0.08983360975980759, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 171340 + }, + { + "epoch": 0.6623911799724761, + "grad_norm": 0.08850152045488358, + "learning_rate": 0.002, + "loss": 2.343, + "step": 171350 + }, + { + "epoch": 0.6624298371758593, + "grad_norm": 0.10465314239263535, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 171360 + }, + { + "epoch": 0.6624684943792426, + "grad_norm": 0.1074366495013237, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 171370 + }, + { + "epoch": 0.6625071515826259, + "grad_norm": 0.11158204078674316, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 171380 + }, + { + "epoch": 0.6625458087860092, + "grad_norm": 0.3130476772785187, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 171390 + }, + { + "epoch": 0.6625844659893925, + "grad_norm": 0.10336320102214813, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 171400 + }, + { + "epoch": 0.6626231231927757, + "grad_norm": 0.09524131566286087, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 171410 + }, + { + "epoch": 0.662661780396159, + "grad_norm": 0.0908019170165062, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 171420 + }, + { + "epoch": 0.6627004375995423, + "grad_norm": 0.1123889610171318, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 171430 + }, + { + "epoch": 0.6627390948029256, + "grad_norm": 0.10293354839086533, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 171440 + }, + { + "epoch": 0.6627777520063088, + "grad_norm": 0.12804663181304932, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 171450 + }, + { + "epoch": 0.6628164092096921, + "grad_norm": 0.12000278383493423, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 171460 + }, + { + "epoch": 0.6628550664130755, + "grad_norm": 0.1008700430393219, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 171470 + }, + { + "epoch": 0.6628937236164587, + "grad_norm": 0.11298047751188278, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 171480 + }, + { + "epoch": 0.662932380819842, + "grad_norm": 0.09088566899299622, + "learning_rate": 0.002, + "loss": 2.333, + "step": 171490 + }, + { + "epoch": 0.6629710380232252, + "grad_norm": 0.10908161848783493, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 171500 + }, + { + "epoch": 0.6630096952266086, + "grad_norm": 0.10217082500457764, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 171510 + }, + { + "epoch": 0.6630483524299918, + "grad_norm": 0.10757473111152649, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 171520 + }, + { + "epoch": 0.6630870096333751, + "grad_norm": 0.11873800307512283, + "learning_rate": 0.002, + "loss": 2.336, + "step": 171530 + }, + { + "epoch": 0.6631256668367583, + "grad_norm": 0.10032789409160614, + "learning_rate": 0.002, + "loss": 2.345, + "step": 171540 + }, + { + "epoch": 0.6631643240401417, + "grad_norm": 0.1108940839767456, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 171550 + }, + { + "epoch": 0.663202981243525, + "grad_norm": 0.11462477594614029, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 171560 + }, + { + "epoch": 0.6632416384469082, + "grad_norm": 0.11187411844730377, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 171570 + }, + { + "epoch": 0.6632802956502915, + "grad_norm": 0.10882826149463654, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 171580 + }, + { + "epoch": 0.6633189528536747, + "grad_norm": 0.11319388449192047, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 171590 + }, + { + "epoch": 0.6633576100570581, + "grad_norm": 0.1004195362329483, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 171600 + }, + { + "epoch": 0.6633962672604413, + "grad_norm": 0.09757034480571747, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 171610 + }, + { + "epoch": 0.6634349244638246, + "grad_norm": 0.10731105506420135, + "learning_rate": 0.002, + "loss": 2.332, + "step": 171620 + }, + { + "epoch": 0.6634735816672078, + "grad_norm": 0.1167788878083229, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 171630 + }, + { + "epoch": 0.6635122388705912, + "grad_norm": 0.11580292135477066, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 171640 + }, + { + "epoch": 0.6635508960739744, + "grad_norm": 0.09653709828853607, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 171650 + }, + { + "epoch": 0.6635895532773577, + "grad_norm": 0.09271356463432312, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 171660 + }, + { + "epoch": 0.6636282104807409, + "grad_norm": 0.1072278544306755, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 171670 + }, + { + "epoch": 0.6636668676841243, + "grad_norm": 0.11917044967412949, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 171680 + }, + { + "epoch": 0.6637055248875076, + "grad_norm": 0.10324475914239883, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 171690 + }, + { + "epoch": 0.6637441820908908, + "grad_norm": 0.11788350343704224, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 171700 + }, + { + "epoch": 0.6637828392942741, + "grad_norm": 0.10315144807100296, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 171710 + }, + { + "epoch": 0.6638214964976574, + "grad_norm": 0.12169231474399567, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 171720 + }, + { + "epoch": 0.6638601537010407, + "grad_norm": 0.08901426941156387, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 171730 + }, + { + "epoch": 0.6638988109044239, + "grad_norm": 0.11607123911380768, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 171740 + }, + { + "epoch": 0.6639374681078072, + "grad_norm": 0.111125148832798, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 171750 + }, + { + "epoch": 0.6639761253111904, + "grad_norm": 0.10658658295869827, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 171760 + }, + { + "epoch": 0.6640147825145738, + "grad_norm": 0.09600212424993515, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 171770 + }, + { + "epoch": 0.664053439717957, + "grad_norm": 0.09580216556787491, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 171780 + }, + { + "epoch": 0.6640920969213403, + "grad_norm": 0.11963184922933578, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 171790 + }, + { + "epoch": 0.6641307541247236, + "grad_norm": 0.09414113312959671, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 171800 + }, + { + "epoch": 0.6641694113281069, + "grad_norm": 0.12084329128265381, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 171810 + }, + { + "epoch": 0.6642080685314902, + "grad_norm": 0.09752669930458069, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 171820 + }, + { + "epoch": 0.6642467257348734, + "grad_norm": 0.11089248955249786, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 171830 + }, + { + "epoch": 0.6642853829382567, + "grad_norm": 0.12359671294689178, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 171840 + }, + { + "epoch": 0.66432404014164, + "grad_norm": 0.09718482196331024, + "learning_rate": 0.002, + "loss": 2.333, + "step": 171850 + }, + { + "epoch": 0.6643626973450233, + "grad_norm": 0.09817170351743698, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 171860 + }, + { + "epoch": 0.6644013545484065, + "grad_norm": 0.09323927760124207, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 171870 + }, + { + "epoch": 0.6644400117517898, + "grad_norm": 0.1100674495100975, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 171880 + }, + { + "epoch": 0.6644786689551732, + "grad_norm": 0.10748188197612762, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 171890 + }, + { + "epoch": 0.6645173261585564, + "grad_norm": 0.12465234100818634, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 171900 + }, + { + "epoch": 0.6645559833619397, + "grad_norm": 0.09937615692615509, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 171910 + }, + { + "epoch": 0.6645946405653229, + "grad_norm": 0.09921646118164062, + "learning_rate": 0.002, + "loss": 2.343, + "step": 171920 + }, + { + "epoch": 0.6646332977687062, + "grad_norm": 0.11704915761947632, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 171930 + }, + { + "epoch": 0.6646719549720895, + "grad_norm": 0.10025927424430847, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 171940 + }, + { + "epoch": 0.6647106121754728, + "grad_norm": 0.134184330701828, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 171950 + }, + { + "epoch": 0.664749269378856, + "grad_norm": 0.09934668242931366, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 171960 + }, + { + "epoch": 0.6647879265822393, + "grad_norm": 0.12052752077579498, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 171970 + }, + { + "epoch": 0.6648265837856226, + "grad_norm": 0.12109331041574478, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 171980 + }, + { + "epoch": 0.6648652409890059, + "grad_norm": 0.10054156184196472, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 171990 + }, + { + "epoch": 0.6649038981923892, + "grad_norm": 0.09870258718729019, + "learning_rate": 0.002, + "loss": 2.345, + "step": 172000 + }, + { + "epoch": 0.6649425553957724, + "grad_norm": 0.11486545205116272, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 172010 + }, + { + "epoch": 0.6649812125991558, + "grad_norm": 0.11269167810678482, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 172020 + }, + { + "epoch": 0.665019869802539, + "grad_norm": 0.09960546344518661, + "learning_rate": 0.002, + "loss": 2.352, + "step": 172030 + }, + { + "epoch": 0.6650585270059223, + "grad_norm": 0.09361325949430466, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 172040 + }, + { + "epoch": 0.6650971842093055, + "grad_norm": 0.10352955013513565, + "learning_rate": 0.002, + "loss": 2.3642, + "step": 172050 + }, + { + "epoch": 0.6651358414126889, + "grad_norm": 0.1031767800450325, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 172060 + }, + { + "epoch": 0.6651744986160721, + "grad_norm": 0.09600525349378586, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 172070 + }, + { + "epoch": 0.6652131558194554, + "grad_norm": 0.10943388938903809, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 172080 + }, + { + "epoch": 0.6652518130228386, + "grad_norm": 0.11022301763296127, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 172090 + }, + { + "epoch": 0.665290470226222, + "grad_norm": 0.09631247818470001, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 172100 + }, + { + "epoch": 0.6653291274296053, + "grad_norm": 0.23882737755775452, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 172110 + }, + { + "epoch": 0.6653677846329885, + "grad_norm": 0.100875124335289, + "learning_rate": 0.002, + "loss": 2.352, + "step": 172120 + }, + { + "epoch": 0.6654064418363718, + "grad_norm": 0.11418143659830093, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 172130 + }, + { + "epoch": 0.665445099039755, + "grad_norm": 0.10449163615703583, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 172140 + }, + { + "epoch": 0.6654837562431384, + "grad_norm": 0.1039036363363266, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 172150 + }, + { + "epoch": 0.6655224134465216, + "grad_norm": 0.09626703709363937, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 172160 + }, + { + "epoch": 0.6655610706499049, + "grad_norm": 0.10677660256624222, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 172170 + }, + { + "epoch": 0.6655997278532881, + "grad_norm": 0.09619175642728806, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 172180 + }, + { + "epoch": 0.6656383850566715, + "grad_norm": 0.12057211995124817, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 172190 + }, + { + "epoch": 0.6656770422600548, + "grad_norm": 0.10427337139844894, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 172200 + }, + { + "epoch": 0.665715699463438, + "grad_norm": 0.10566114634275436, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 172210 + }, + { + "epoch": 0.6657543566668213, + "grad_norm": 0.10993968695402145, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 172220 + }, + { + "epoch": 0.6657930138702046, + "grad_norm": 0.09533580392599106, + "learning_rate": 0.002, + "loss": 2.34, + "step": 172230 + }, + { + "epoch": 0.6658316710735879, + "grad_norm": 0.09474946558475494, + "learning_rate": 0.002, + "loss": 2.32, + "step": 172240 + }, + { + "epoch": 0.6658703282769711, + "grad_norm": 0.12730206549167633, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 172250 + }, + { + "epoch": 0.6659089854803544, + "grad_norm": 0.10522949695587158, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 172260 + }, + { + "epoch": 0.6659476426837377, + "grad_norm": 0.12083995342254639, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 172270 + }, + { + "epoch": 0.665986299887121, + "grad_norm": 0.11327656358480453, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 172280 + }, + { + "epoch": 0.6660249570905042, + "grad_norm": 0.1076776310801506, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 172290 + }, + { + "epoch": 0.6660636142938875, + "grad_norm": 0.11642111092805862, + "learning_rate": 0.002, + "loss": 2.326, + "step": 172300 + }, + { + "epoch": 0.6661022714972707, + "grad_norm": 0.09679548442363739, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 172310 + }, + { + "epoch": 0.6661409287006541, + "grad_norm": 0.13206057250499725, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 172320 + }, + { + "epoch": 0.6661795859040374, + "grad_norm": 0.09222513437271118, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 172330 + }, + { + "epoch": 0.6662182431074206, + "grad_norm": 0.09885145723819733, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 172340 + }, + { + "epoch": 0.6662569003108039, + "grad_norm": 0.09937558323144913, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 172350 + }, + { + "epoch": 0.6662955575141872, + "grad_norm": 0.11098925769329071, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 172360 + }, + { + "epoch": 0.6663342147175705, + "grad_norm": 0.11017122119665146, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 172370 + }, + { + "epoch": 0.6663728719209537, + "grad_norm": 0.09626687318086624, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 172380 + }, + { + "epoch": 0.666411529124337, + "grad_norm": 0.10198944061994553, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 172390 + }, + { + "epoch": 0.6664501863277204, + "grad_norm": 0.10744412243366241, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 172400 + }, + { + "epoch": 0.6664888435311036, + "grad_norm": 0.09577515721321106, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 172410 + }, + { + "epoch": 0.6665275007344869, + "grad_norm": 0.1001209020614624, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 172420 + }, + { + "epoch": 0.6665661579378701, + "grad_norm": 0.09737569838762283, + "learning_rate": 0.002, + "loss": 2.357, + "step": 172430 + }, + { + "epoch": 0.6666048151412535, + "grad_norm": 0.1063157245516777, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 172440 + }, + { + "epoch": 0.6666434723446367, + "grad_norm": 0.1438492089509964, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 172450 + }, + { + "epoch": 0.66668212954802, + "grad_norm": 0.1074848547577858, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 172460 + }, + { + "epoch": 0.6667207867514032, + "grad_norm": 0.10906050354242325, + "learning_rate": 0.002, + "loss": 2.352, + "step": 172470 + }, + { + "epoch": 0.6667594439547866, + "grad_norm": 0.10571140795946121, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 172480 + }, + { + "epoch": 0.6667981011581698, + "grad_norm": 0.10705535113811493, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 172490 + }, + { + "epoch": 0.6668367583615531, + "grad_norm": 0.1198195368051529, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 172500 + }, + { + "epoch": 0.6668754155649363, + "grad_norm": 0.10089991241693497, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 172510 + }, + { + "epoch": 0.6669140727683196, + "grad_norm": 0.10191459208726883, + "learning_rate": 0.002, + "loss": 2.354, + "step": 172520 + }, + { + "epoch": 0.666952729971703, + "grad_norm": 0.09125278890132904, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 172530 + }, + { + "epoch": 0.6669913871750862, + "grad_norm": 0.12424398213624954, + "learning_rate": 0.002, + "loss": 2.354, + "step": 172540 + }, + { + "epoch": 0.6670300443784695, + "grad_norm": 0.09108825773000717, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 172550 + }, + { + "epoch": 0.6670687015818527, + "grad_norm": 0.10158373415470123, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 172560 + }, + { + "epoch": 0.6671073587852361, + "grad_norm": 0.126356303691864, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 172570 + }, + { + "epoch": 0.6671460159886193, + "grad_norm": 0.11853097379207611, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 172580 + }, + { + "epoch": 0.6671846731920026, + "grad_norm": 0.09849544614553452, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 172590 + }, + { + "epoch": 0.6672233303953858, + "grad_norm": 0.09688036143779755, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 172600 + }, + { + "epoch": 0.6672619875987692, + "grad_norm": 0.10107052326202393, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 172610 + }, + { + "epoch": 0.6673006448021525, + "grad_norm": 0.10965628921985626, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 172620 + }, + { + "epoch": 0.6673393020055357, + "grad_norm": 0.10272455960512161, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 172630 + }, + { + "epoch": 0.667377959208919, + "grad_norm": 0.10162177681922913, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 172640 + }, + { + "epoch": 0.6674166164123023, + "grad_norm": 0.10011301189661026, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 172650 + }, + { + "epoch": 0.6674552736156856, + "grad_norm": 0.1002359688282013, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 172660 + }, + { + "epoch": 0.6674939308190688, + "grad_norm": 0.2552264928817749, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 172670 + }, + { + "epoch": 0.6675325880224521, + "grad_norm": 0.09233082830905914, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 172680 + }, + { + "epoch": 0.6675712452258353, + "grad_norm": 0.09360641241073608, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 172690 + }, + { + "epoch": 0.6676099024292187, + "grad_norm": 0.10834074020385742, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 172700 + }, + { + "epoch": 0.667648559632602, + "grad_norm": 0.10256896913051605, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 172710 + }, + { + "epoch": 0.6676872168359852, + "grad_norm": 0.10912622511386871, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 172720 + }, + { + "epoch": 0.6677258740393684, + "grad_norm": 0.10098609328269958, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 172730 + }, + { + "epoch": 0.6677645312427518, + "grad_norm": 0.10524442046880722, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 172740 + }, + { + "epoch": 0.6678031884461351, + "grad_norm": 0.10921610891819, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 172750 + }, + { + "epoch": 0.6678418456495183, + "grad_norm": 0.15663886070251465, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 172760 + }, + { + "epoch": 0.6678805028529016, + "grad_norm": 0.09868650138378143, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 172770 + }, + { + "epoch": 0.6679191600562849, + "grad_norm": 0.13061143457889557, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 172780 + }, + { + "epoch": 0.6679578172596682, + "grad_norm": 0.10689136385917664, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 172790 + }, + { + "epoch": 0.6679964744630514, + "grad_norm": 0.10942581295967102, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 172800 + }, + { + "epoch": 0.6680351316664347, + "grad_norm": 0.09561222046613693, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 172810 + }, + { + "epoch": 0.668073788869818, + "grad_norm": 0.10102529078722, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 172820 + }, + { + "epoch": 0.6681124460732013, + "grad_norm": 0.10421686619520187, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 172830 + }, + { + "epoch": 0.6681511032765846, + "grad_norm": 0.11196110397577286, + "learning_rate": 0.002, + "loss": 2.337, + "step": 172840 + }, + { + "epoch": 0.6681897604799678, + "grad_norm": 0.10805104672908783, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 172850 + }, + { + "epoch": 0.6682284176833511, + "grad_norm": 0.10558205097913742, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 172860 + }, + { + "epoch": 0.6682670748867344, + "grad_norm": 0.10156062245368958, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 172870 + }, + { + "epoch": 0.6683057320901177, + "grad_norm": 0.09688648581504822, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 172880 + }, + { + "epoch": 0.6683443892935009, + "grad_norm": 0.10593561083078384, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 172890 + }, + { + "epoch": 0.6683830464968842, + "grad_norm": 0.10871266573667526, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 172900 + }, + { + "epoch": 0.6684217037002675, + "grad_norm": 0.0923452377319336, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 172910 + }, + { + "epoch": 0.6684603609036508, + "grad_norm": 0.12133041024208069, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 172920 + }, + { + "epoch": 0.668499018107034, + "grad_norm": 0.11136896908283234, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 172930 + }, + { + "epoch": 0.6685376753104173, + "grad_norm": 0.140518918633461, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 172940 + }, + { + "epoch": 0.6685763325138007, + "grad_norm": 0.10117638111114502, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 172950 + }, + { + "epoch": 0.6686149897171839, + "grad_norm": 0.09949972480535507, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 172960 + }, + { + "epoch": 0.6686536469205672, + "grad_norm": 0.10655559599399567, + "learning_rate": 0.002, + "loss": 2.343, + "step": 172970 + }, + { + "epoch": 0.6686923041239504, + "grad_norm": 0.12371232360601425, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 172980 + }, + { + "epoch": 0.6687309613273338, + "grad_norm": 0.12231665849685669, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 172990 + }, + { + "epoch": 0.668769618530717, + "grad_norm": 0.09841171652078629, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 173000 + }, + { + "epoch": 0.6688082757341003, + "grad_norm": 0.11069756746292114, + "learning_rate": 0.002, + "loss": 2.346, + "step": 173010 + }, + { + "epoch": 0.6688469329374835, + "grad_norm": 0.10788854956626892, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 173020 + }, + { + "epoch": 0.6688855901408669, + "grad_norm": 0.10781175643205643, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 173030 + }, + { + "epoch": 0.6689242473442502, + "grad_norm": 0.11309197545051575, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 173040 + }, + { + "epoch": 0.6689629045476334, + "grad_norm": 0.09210246801376343, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 173050 + }, + { + "epoch": 0.6690015617510167, + "grad_norm": 0.105115607380867, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 173060 + }, + { + "epoch": 0.6690402189543999, + "grad_norm": 0.1119607463479042, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 173070 + }, + { + "epoch": 0.6690788761577833, + "grad_norm": 0.10662411153316498, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 173080 + }, + { + "epoch": 0.6691175333611665, + "grad_norm": 0.0970088467001915, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 173090 + }, + { + "epoch": 0.6691561905645498, + "grad_norm": 0.1164088249206543, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 173100 + }, + { + "epoch": 0.669194847767933, + "grad_norm": 0.14752018451690674, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 173110 + }, + { + "epoch": 0.6692335049713164, + "grad_norm": 0.11193527281284332, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 173120 + }, + { + "epoch": 0.6692721621746996, + "grad_norm": 0.21462298929691315, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 173130 + }, + { + "epoch": 0.6693108193780829, + "grad_norm": 0.09348655492067337, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 173140 + }, + { + "epoch": 0.6693494765814662, + "grad_norm": 0.10245306044816971, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 173150 + }, + { + "epoch": 0.6693881337848495, + "grad_norm": 0.10220210999250412, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 173160 + }, + { + "epoch": 0.6694267909882328, + "grad_norm": 0.1089777946472168, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 173170 + }, + { + "epoch": 0.669465448191616, + "grad_norm": 0.09728941321372986, + "learning_rate": 0.002, + "loss": 2.333, + "step": 173180 + }, + { + "epoch": 0.6695041053949993, + "grad_norm": 0.10754229873418808, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 173190 + }, + { + "epoch": 0.6695427625983826, + "grad_norm": 0.11171595752239227, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 173200 + }, + { + "epoch": 0.6695814198017659, + "grad_norm": 0.0972415879368782, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 173210 + }, + { + "epoch": 0.6696200770051491, + "grad_norm": 0.09847305715084076, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 173220 + }, + { + "epoch": 0.6696587342085324, + "grad_norm": 0.10625992715358734, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 173230 + }, + { + "epoch": 0.6696973914119156, + "grad_norm": 0.09553809463977814, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 173240 + }, + { + "epoch": 0.669736048615299, + "grad_norm": 0.10626339167356491, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 173250 + }, + { + "epoch": 0.6697747058186823, + "grad_norm": 0.10143054276704788, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 173260 + }, + { + "epoch": 0.6698133630220655, + "grad_norm": 0.10228617489337921, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 173270 + }, + { + "epoch": 0.6698520202254488, + "grad_norm": 0.12096145004034042, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 173280 + }, + { + "epoch": 0.6698906774288321, + "grad_norm": 0.09476984292268753, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 173290 + }, + { + "epoch": 0.6699293346322154, + "grad_norm": 0.11676125973463058, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 173300 + }, + { + "epoch": 0.6699679918355986, + "grad_norm": 0.09415993094444275, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 173310 + }, + { + "epoch": 0.6700066490389819, + "grad_norm": 0.09608684480190277, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 173320 + }, + { + "epoch": 0.6700453062423652, + "grad_norm": 0.11301315575838089, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 173330 + }, + { + "epoch": 0.6700839634457485, + "grad_norm": 0.10460449755191803, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 173340 + }, + { + "epoch": 0.6701226206491318, + "grad_norm": 0.09276396781206131, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 173350 + }, + { + "epoch": 0.670161277852515, + "grad_norm": 0.12138626724481583, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 173360 + }, + { + "epoch": 0.6701999350558984, + "grad_norm": 0.10992854088544846, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 173370 + }, + { + "epoch": 0.6702385922592816, + "grad_norm": 0.0991344228386879, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 173380 + }, + { + "epoch": 0.6702772494626649, + "grad_norm": 0.10722635686397552, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 173390 + }, + { + "epoch": 0.6703159066660481, + "grad_norm": 0.09343632310628891, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 173400 + }, + { + "epoch": 0.6703545638694314, + "grad_norm": 0.12453175336122513, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 173410 + }, + { + "epoch": 0.6703932210728147, + "grad_norm": 0.11105192452669144, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 173420 + }, + { + "epoch": 0.670431878276198, + "grad_norm": 0.10980962961912155, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 173430 + }, + { + "epoch": 0.6704705354795812, + "grad_norm": 0.09669952839612961, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 173440 + }, + { + "epoch": 0.6705091926829645, + "grad_norm": 0.11269879341125488, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 173450 + }, + { + "epoch": 0.6705478498863479, + "grad_norm": 0.09396583586931229, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 173460 + }, + { + "epoch": 0.6705865070897311, + "grad_norm": 0.09842613339424133, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 173470 + }, + { + "epoch": 0.6706251642931144, + "grad_norm": 0.09631490707397461, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 173480 + }, + { + "epoch": 0.6706638214964976, + "grad_norm": 0.11753473430871964, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 173490 + }, + { + "epoch": 0.670702478699881, + "grad_norm": 0.1194978803396225, + "learning_rate": 0.002, + "loss": 2.355, + "step": 173500 + }, + { + "epoch": 0.6707411359032642, + "grad_norm": 0.10833332687616348, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 173510 + }, + { + "epoch": 0.6707797931066475, + "grad_norm": 0.09782851487398148, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 173520 + }, + { + "epoch": 0.6708184503100307, + "grad_norm": 0.10231298208236694, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 173530 + }, + { + "epoch": 0.6708571075134141, + "grad_norm": 0.09388585388660431, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 173540 + }, + { + "epoch": 0.6708957647167973, + "grad_norm": 0.10473788529634476, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 173550 + }, + { + "epoch": 0.6709344219201806, + "grad_norm": 0.09349475800991058, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 173560 + }, + { + "epoch": 0.6709730791235639, + "grad_norm": 0.12404922395944595, + "learning_rate": 0.002, + "loss": 2.336, + "step": 173570 + }, + { + "epoch": 0.6710117363269472, + "grad_norm": 0.12190348654985428, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 173580 + }, + { + "epoch": 0.6710503935303305, + "grad_norm": 0.10820824652910233, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 173590 + }, + { + "epoch": 0.6710890507337137, + "grad_norm": 0.11660218983888626, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 173600 + }, + { + "epoch": 0.671127707937097, + "grad_norm": 0.09696482867002487, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 173610 + }, + { + "epoch": 0.6711663651404802, + "grad_norm": 0.12469976395368576, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 173620 + }, + { + "epoch": 0.6712050223438636, + "grad_norm": 0.09914630651473999, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 173630 + }, + { + "epoch": 0.6712436795472468, + "grad_norm": 0.11698149144649506, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 173640 + }, + { + "epoch": 0.6712823367506301, + "grad_norm": 0.10479546338319778, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 173650 + }, + { + "epoch": 0.6713209939540133, + "grad_norm": 0.09075053781270981, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 173660 + }, + { + "epoch": 0.6713596511573967, + "grad_norm": 0.1083453819155693, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 173670 + }, + { + "epoch": 0.67139830836078, + "grad_norm": 0.24330176413059235, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 173680 + }, + { + "epoch": 0.6714369655641632, + "grad_norm": 0.10410798341035843, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 173690 + }, + { + "epoch": 0.6714756227675465, + "grad_norm": 0.1088162288069725, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 173700 + }, + { + "epoch": 0.6715142799709298, + "grad_norm": 0.09707775712013245, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 173710 + }, + { + "epoch": 0.6715529371743131, + "grad_norm": 0.10446558892726898, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 173720 + }, + { + "epoch": 0.6715915943776963, + "grad_norm": 0.10093081742525101, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 173730 + }, + { + "epoch": 0.6716302515810796, + "grad_norm": 0.08895033597946167, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 173740 + }, + { + "epoch": 0.671668908784463, + "grad_norm": 0.11400709301233292, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 173750 + }, + { + "epoch": 0.6717075659878462, + "grad_norm": 0.10750409215688705, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 173760 + }, + { + "epoch": 0.6717462231912295, + "grad_norm": 0.09366723150014877, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 173770 + }, + { + "epoch": 0.6717848803946127, + "grad_norm": 0.11019507795572281, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 173780 + }, + { + "epoch": 0.671823537597996, + "grad_norm": 0.10667601972818375, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 173790 + }, + { + "epoch": 0.6718621948013793, + "grad_norm": 0.11151214689016342, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 173800 + }, + { + "epoch": 0.6719008520047626, + "grad_norm": 0.11602430790662766, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 173810 + }, + { + "epoch": 0.6719395092081458, + "grad_norm": 0.08965712040662766, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 173820 + }, + { + "epoch": 0.6719781664115291, + "grad_norm": 0.09411187469959259, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 173830 + }, + { + "epoch": 0.6720168236149124, + "grad_norm": 0.10649342834949493, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 173840 + }, + { + "epoch": 0.6720554808182957, + "grad_norm": 0.10887311398983002, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 173850 + }, + { + "epoch": 0.6720941380216789, + "grad_norm": 0.09790990501642227, + "learning_rate": 0.002, + "loss": 2.333, + "step": 173860 + }, + { + "epoch": 0.6721327952250622, + "grad_norm": 0.11011838912963867, + "learning_rate": 0.002, + "loss": 2.342, + "step": 173870 + }, + { + "epoch": 0.6721714524284456, + "grad_norm": 0.11605843901634216, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 173880 + }, + { + "epoch": 0.6722101096318288, + "grad_norm": 0.09449160844087601, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 173890 + }, + { + "epoch": 0.6722487668352121, + "grad_norm": 0.10037367790937424, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 173900 + }, + { + "epoch": 0.6722874240385953, + "grad_norm": 0.09971857815980911, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 173910 + }, + { + "epoch": 0.6723260812419787, + "grad_norm": 0.12663498520851135, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 173920 + }, + { + "epoch": 0.6723647384453619, + "grad_norm": 0.10218092054128647, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 173930 + }, + { + "epoch": 0.6724033956487452, + "grad_norm": 0.09552987664937973, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 173940 + }, + { + "epoch": 0.6724420528521284, + "grad_norm": 0.1135910302400589, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 173950 + }, + { + "epoch": 0.6724807100555118, + "grad_norm": 0.09681475907564163, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 173960 + }, + { + "epoch": 0.672519367258895, + "grad_norm": 0.10417526960372925, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 173970 + }, + { + "epoch": 0.6725580244622783, + "grad_norm": 0.09279265254735947, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 173980 + }, + { + "epoch": 0.6725966816656616, + "grad_norm": 0.09781613945960999, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 173990 + }, + { + "epoch": 0.6726353388690448, + "grad_norm": 0.11745069175958633, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 174000 + }, + { + "epoch": 0.6726739960724282, + "grad_norm": 0.09635234624147415, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 174010 + }, + { + "epoch": 0.6727126532758114, + "grad_norm": 0.11234599351882935, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 174020 + }, + { + "epoch": 0.6727513104791947, + "grad_norm": 0.09256234019994736, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 174030 + }, + { + "epoch": 0.6727899676825779, + "grad_norm": 0.1221209242939949, + "learning_rate": 0.002, + "loss": 2.345, + "step": 174040 + }, + { + "epoch": 0.6728286248859613, + "grad_norm": 0.1162743866443634, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 174050 + }, + { + "epoch": 0.6728672820893445, + "grad_norm": 0.09238558262586594, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 174060 + }, + { + "epoch": 0.6729059392927278, + "grad_norm": 0.1003466248512268, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 174070 + }, + { + "epoch": 0.672944596496111, + "grad_norm": 0.11341916769742966, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 174080 + }, + { + "epoch": 0.6729832536994944, + "grad_norm": 0.11505437642335892, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 174090 + }, + { + "epoch": 0.6730219109028777, + "grad_norm": 0.13397817313671112, + "learning_rate": 0.002, + "loss": 2.343, + "step": 174100 + }, + { + "epoch": 0.6730605681062609, + "grad_norm": 0.1059814989566803, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 174110 + }, + { + "epoch": 0.6730992253096442, + "grad_norm": 0.11259469389915466, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 174120 + }, + { + "epoch": 0.6731378825130275, + "grad_norm": 0.10442819446325302, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 174130 + }, + { + "epoch": 0.6731765397164108, + "grad_norm": 0.09504423290491104, + "learning_rate": 0.002, + "loss": 2.344, + "step": 174140 + }, + { + "epoch": 0.673215196919794, + "grad_norm": 0.12814587354660034, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 174150 + }, + { + "epoch": 0.6732538541231773, + "grad_norm": 0.09581288695335388, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 174160 + }, + { + "epoch": 0.6732925113265605, + "grad_norm": 0.09748073667287827, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 174170 + }, + { + "epoch": 0.6733311685299439, + "grad_norm": 0.10420890897512436, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 174180 + }, + { + "epoch": 0.6733698257333272, + "grad_norm": 0.10162430256605148, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 174190 + }, + { + "epoch": 0.6734084829367104, + "grad_norm": 0.10959216952323914, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 174200 + }, + { + "epoch": 0.6734471401400937, + "grad_norm": 0.10551372915506363, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 174210 + }, + { + "epoch": 0.673485797343477, + "grad_norm": 0.09912195056676865, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 174220 + }, + { + "epoch": 0.6735244545468603, + "grad_norm": 0.09719519317150116, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 174230 + }, + { + "epoch": 0.6735631117502435, + "grad_norm": 0.09635747969150543, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 174240 + }, + { + "epoch": 0.6736017689536268, + "grad_norm": 0.09930815547704697, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 174250 + }, + { + "epoch": 0.6736404261570101, + "grad_norm": 0.10854223370552063, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 174260 + }, + { + "epoch": 0.6736790833603934, + "grad_norm": 0.141081765294075, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 174270 + }, + { + "epoch": 0.6737177405637766, + "grad_norm": 0.09559612721204758, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 174280 + }, + { + "epoch": 0.6737563977671599, + "grad_norm": 0.11618802696466446, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 174290 + }, + { + "epoch": 0.6737950549705433, + "grad_norm": 0.10047265142202377, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 174300 + }, + { + "epoch": 0.6738337121739265, + "grad_norm": 0.11988501995801926, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 174310 + }, + { + "epoch": 0.6738723693773098, + "grad_norm": 0.11306623369455338, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 174320 + }, + { + "epoch": 0.673911026580693, + "grad_norm": 0.09925418347120285, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 174330 + }, + { + "epoch": 0.6739496837840763, + "grad_norm": 0.12130644172430038, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 174340 + }, + { + "epoch": 0.6739883409874596, + "grad_norm": 0.14711587131023407, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 174350 + }, + { + "epoch": 0.6740269981908429, + "grad_norm": 0.1304912120103836, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 174360 + }, + { + "epoch": 0.6740656553942261, + "grad_norm": 0.10425528883934021, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 174370 + }, + { + "epoch": 0.6741043125976094, + "grad_norm": 0.10731105506420135, + "learning_rate": 0.002, + "loss": 2.353, + "step": 174380 + }, + { + "epoch": 0.6741429698009928, + "grad_norm": 0.09718817472457886, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 174390 + }, + { + "epoch": 0.674181627004376, + "grad_norm": 0.14790160953998566, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 174400 + }, + { + "epoch": 0.6742202842077593, + "grad_norm": 0.1125716120004654, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 174410 + }, + { + "epoch": 0.6742589414111425, + "grad_norm": 0.0974559560418129, + "learning_rate": 0.002, + "loss": 2.351, + "step": 174420 + }, + { + "epoch": 0.6742975986145259, + "grad_norm": 0.11191441118717194, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 174430 + }, + { + "epoch": 0.6743362558179091, + "grad_norm": 0.11979559808969498, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 174440 + }, + { + "epoch": 0.6743749130212924, + "grad_norm": 0.13155923783779144, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 174450 + }, + { + "epoch": 0.6744135702246756, + "grad_norm": 0.10687007009983063, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 174460 + }, + { + "epoch": 0.674452227428059, + "grad_norm": 0.08892308175563812, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 174470 + }, + { + "epoch": 0.6744908846314422, + "grad_norm": 0.10634169727563858, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 174480 + }, + { + "epoch": 0.6745295418348255, + "grad_norm": 0.9842195510864258, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 174490 + }, + { + "epoch": 0.6745681990382087, + "grad_norm": 0.10620087385177612, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 174500 + }, + { + "epoch": 0.6746068562415921, + "grad_norm": 0.10536065697669983, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 174510 + }, + { + "epoch": 0.6746455134449754, + "grad_norm": 0.09353658556938171, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 174520 + }, + { + "epoch": 0.6746841706483586, + "grad_norm": 0.10854317247867584, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 174530 + }, + { + "epoch": 0.6747228278517419, + "grad_norm": 0.11007989197969437, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 174540 + }, + { + "epoch": 0.6747614850551251, + "grad_norm": 0.10677378624677658, + "learning_rate": 0.002, + "loss": 2.351, + "step": 174550 + }, + { + "epoch": 0.6748001422585085, + "grad_norm": 0.10250411182641983, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 174560 + }, + { + "epoch": 0.6748387994618917, + "grad_norm": 0.12104775756597519, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 174570 + }, + { + "epoch": 0.674877456665275, + "grad_norm": 0.10298202931880951, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 174580 + }, + { + "epoch": 0.6749161138686582, + "grad_norm": 0.15690723061561584, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 174590 + }, + { + "epoch": 0.6749547710720416, + "grad_norm": 0.11443834751844406, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 174600 + }, + { + "epoch": 0.6749934282754249, + "grad_norm": 0.0889236256480217, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 174610 + }, + { + "epoch": 0.6750320854788081, + "grad_norm": 0.10543763637542725, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 174620 + }, + { + "epoch": 0.6750707426821914, + "grad_norm": 0.12079984694719315, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 174630 + }, + { + "epoch": 0.6751093998855747, + "grad_norm": 0.08959334343671799, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 174640 + }, + { + "epoch": 0.675148057088958, + "grad_norm": 0.09968244284391403, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 174650 + }, + { + "epoch": 0.6751867142923412, + "grad_norm": 0.09839235991239548, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 174660 + }, + { + "epoch": 0.6752253714957245, + "grad_norm": 0.12109307199716568, + "learning_rate": 0.002, + "loss": 2.341, + "step": 174670 + }, + { + "epoch": 0.6752640286991078, + "grad_norm": 0.09745091944932938, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 174680 + }, + { + "epoch": 0.6753026859024911, + "grad_norm": 0.10645421594381332, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 174690 + }, + { + "epoch": 0.6753413431058743, + "grad_norm": 0.10345783829689026, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 174700 + }, + { + "epoch": 0.6753800003092576, + "grad_norm": 0.10724533349275589, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 174710 + }, + { + "epoch": 0.6754186575126409, + "grad_norm": 0.11676304042339325, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 174720 + }, + { + "epoch": 0.6754573147160242, + "grad_norm": 0.09646974503993988, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 174730 + }, + { + "epoch": 0.6754959719194075, + "grad_norm": 0.10687783360481262, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 174740 + }, + { + "epoch": 0.6755346291227907, + "grad_norm": 0.10278775542974472, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 174750 + }, + { + "epoch": 0.675573286326174, + "grad_norm": 0.10462471842765808, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 174760 + }, + { + "epoch": 0.6756119435295573, + "grad_norm": 0.11497380584478378, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 174770 + }, + { + "epoch": 0.6756506007329406, + "grad_norm": 0.11669895797967911, + "learning_rate": 0.002, + "loss": 2.346, + "step": 174780 + }, + { + "epoch": 0.6756892579363238, + "grad_norm": 0.094562828540802, + "learning_rate": 0.002, + "loss": 2.377, + "step": 174790 + }, + { + "epoch": 0.6757279151397071, + "grad_norm": 0.1032317727804184, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 174800 + }, + { + "epoch": 0.6757665723430905, + "grad_norm": 0.12714195251464844, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 174810 + }, + { + "epoch": 0.6758052295464737, + "grad_norm": 0.10985013097524643, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 174820 + }, + { + "epoch": 0.675843886749857, + "grad_norm": 0.10180466622114182, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 174830 + }, + { + "epoch": 0.6758825439532402, + "grad_norm": 0.1006990447640419, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 174840 + }, + { + "epoch": 0.6759212011566236, + "grad_norm": 0.10727022588253021, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 174850 + }, + { + "epoch": 0.6759598583600068, + "grad_norm": 0.12462523579597473, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 174860 + }, + { + "epoch": 0.6759985155633901, + "grad_norm": 0.09988413751125336, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 174870 + }, + { + "epoch": 0.6760371727667733, + "grad_norm": 0.10924515873193741, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 174880 + }, + { + "epoch": 0.6760758299701567, + "grad_norm": 0.10243270546197891, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 174890 + }, + { + "epoch": 0.67611448717354, + "grad_norm": 0.10781146585941315, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 174900 + }, + { + "epoch": 0.6761531443769232, + "grad_norm": 0.10424323379993439, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 174910 + }, + { + "epoch": 0.6761918015803065, + "grad_norm": 0.10621855407953262, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 174920 + }, + { + "epoch": 0.6762304587836897, + "grad_norm": 0.09816805273294449, + "learning_rate": 0.002, + "loss": 2.354, + "step": 174930 + }, + { + "epoch": 0.6762691159870731, + "grad_norm": 0.13953115046024323, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 174940 + }, + { + "epoch": 0.6763077731904563, + "grad_norm": 0.14118227362632751, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 174950 + }, + { + "epoch": 0.6763464303938396, + "grad_norm": 0.10070734471082687, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 174960 + }, + { + "epoch": 0.6763850875972228, + "grad_norm": 0.11258470267057419, + "learning_rate": 0.002, + "loss": 2.346, + "step": 174970 + }, + { + "epoch": 0.6764237448006062, + "grad_norm": 0.10367239266633987, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 174980 + }, + { + "epoch": 0.6764624020039894, + "grad_norm": 0.09598944336175919, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 174990 + }, + { + "epoch": 0.6765010592073727, + "grad_norm": 0.09504383057355881, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 175000 + }, + { + "epoch": 0.6765397164107559, + "grad_norm": 0.13171350955963135, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 175010 + }, + { + "epoch": 0.6765783736141393, + "grad_norm": 0.09265443682670593, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 175020 + }, + { + "epoch": 0.6766170308175226, + "grad_norm": 0.11585959047079086, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 175030 + }, + { + "epoch": 0.6766556880209058, + "grad_norm": 0.10820352286100388, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 175040 + }, + { + "epoch": 0.6766943452242891, + "grad_norm": 0.09318529069423676, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 175050 + }, + { + "epoch": 0.6767330024276724, + "grad_norm": 0.09986617416143417, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 175060 + }, + { + "epoch": 0.6767716596310557, + "grad_norm": 0.08849131315946579, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 175070 + }, + { + "epoch": 0.6768103168344389, + "grad_norm": 0.12096413224935532, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 175080 + }, + { + "epoch": 0.6768489740378222, + "grad_norm": 0.09429614245891571, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 175090 + }, + { + "epoch": 0.6768876312412054, + "grad_norm": 0.11105930060148239, + "learning_rate": 0.002, + "loss": 2.337, + "step": 175100 + }, + { + "epoch": 0.6769262884445888, + "grad_norm": 0.10252535343170166, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 175110 + }, + { + "epoch": 0.676964945647972, + "grad_norm": 0.10260120779275894, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 175120 + }, + { + "epoch": 0.6770036028513553, + "grad_norm": 0.11789651960134506, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 175130 + }, + { + "epoch": 0.6770422600547386, + "grad_norm": 0.11125901341438293, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 175140 + }, + { + "epoch": 0.6770809172581219, + "grad_norm": 0.10644970089197159, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 175150 + }, + { + "epoch": 0.6771195744615052, + "grad_norm": 0.12210649251937866, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 175160 + }, + { + "epoch": 0.6771582316648884, + "grad_norm": 0.12430441379547119, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 175170 + }, + { + "epoch": 0.6771968888682717, + "grad_norm": 0.11322494596242905, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 175180 + }, + { + "epoch": 0.677235546071655, + "grad_norm": 0.11685037612915039, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 175190 + }, + { + "epoch": 0.6772742032750383, + "grad_norm": 0.09667099267244339, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 175200 + }, + { + "epoch": 0.6773128604784215, + "grad_norm": 0.09514366090297699, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 175210 + }, + { + "epoch": 0.6773515176818048, + "grad_norm": 0.09986500442028046, + "learning_rate": 0.002, + "loss": 2.339, + "step": 175220 + }, + { + "epoch": 0.6773901748851882, + "grad_norm": 0.09903539717197418, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 175230 + }, + { + "epoch": 0.6774288320885714, + "grad_norm": 0.11586198955774307, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 175240 + }, + { + "epoch": 0.6774674892919547, + "grad_norm": 0.12105626612901688, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 175250 + }, + { + "epoch": 0.6775061464953379, + "grad_norm": 0.10017690062522888, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 175260 + }, + { + "epoch": 0.6775448036987212, + "grad_norm": 0.10963975638151169, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 175270 + }, + { + "epoch": 0.6775834609021045, + "grad_norm": 0.12276821583509445, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 175280 + }, + { + "epoch": 0.6776221181054878, + "grad_norm": 0.11214709281921387, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 175290 + }, + { + "epoch": 0.677660775308871, + "grad_norm": 0.11168606579303741, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 175300 + }, + { + "epoch": 0.6776994325122543, + "grad_norm": 0.10621609538793564, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 175310 + }, + { + "epoch": 0.6777380897156376, + "grad_norm": 0.12428240478038788, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 175320 + }, + { + "epoch": 0.6777767469190209, + "grad_norm": 0.1008407399058342, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 175330 + }, + { + "epoch": 0.6778154041224042, + "grad_norm": 0.10509267449378967, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 175340 + }, + { + "epoch": 0.6778540613257874, + "grad_norm": 0.11543798446655273, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 175350 + }, + { + "epoch": 0.6778927185291708, + "grad_norm": 0.10814554244279861, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 175360 + }, + { + "epoch": 0.677931375732554, + "grad_norm": 0.09828812628984451, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 175370 + }, + { + "epoch": 0.6779700329359373, + "grad_norm": 0.09091383218765259, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 175380 + }, + { + "epoch": 0.6780086901393205, + "grad_norm": 0.10134921967983246, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 175390 + }, + { + "epoch": 0.6780473473427039, + "grad_norm": 0.11470029503107071, + "learning_rate": 0.002, + "loss": 2.34, + "step": 175400 + }, + { + "epoch": 0.6780860045460871, + "grad_norm": 0.0986943319439888, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 175410 + }, + { + "epoch": 0.6781246617494704, + "grad_norm": 0.11578582972288132, + "learning_rate": 0.002, + "loss": 2.341, + "step": 175420 + }, + { + "epoch": 0.6781633189528536, + "grad_norm": 0.11289183795452118, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 175430 + }, + { + "epoch": 0.678201976156237, + "grad_norm": 0.10435447096824646, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 175440 + }, + { + "epoch": 0.6782406333596203, + "grad_norm": 0.10721666365861893, + "learning_rate": 0.002, + "loss": 2.339, + "step": 175450 + }, + { + "epoch": 0.6782792905630035, + "grad_norm": 0.10489135980606079, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 175460 + }, + { + "epoch": 0.6783179477663868, + "grad_norm": 0.08868864178657532, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 175470 + }, + { + "epoch": 0.67835660496977, + "grad_norm": 0.11689954251050949, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 175480 + }, + { + "epoch": 0.6783952621731534, + "grad_norm": 0.11360076814889908, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 175490 + }, + { + "epoch": 0.6784339193765366, + "grad_norm": 0.10669480264186859, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 175500 + }, + { + "epoch": 0.6784725765799199, + "grad_norm": 0.11861662566661835, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 175510 + }, + { + "epoch": 0.6785112337833031, + "grad_norm": 0.10718594491481781, + "learning_rate": 0.002, + "loss": 2.35, + "step": 175520 + }, + { + "epoch": 0.6785498909866865, + "grad_norm": 0.0939483791589737, + "learning_rate": 0.002, + "loss": 2.352, + "step": 175530 + }, + { + "epoch": 0.6785885481900698, + "grad_norm": 0.12440059334039688, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 175540 + }, + { + "epoch": 0.678627205393453, + "grad_norm": 0.09499397873878479, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 175550 + }, + { + "epoch": 0.6786658625968363, + "grad_norm": 0.11958306282758713, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 175560 + }, + { + "epoch": 0.6787045198002196, + "grad_norm": 0.11625002324581146, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 175570 + }, + { + "epoch": 0.6787431770036029, + "grad_norm": 0.10071554034948349, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 175580 + }, + { + "epoch": 0.6787818342069861, + "grad_norm": 0.11125433444976807, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 175590 + }, + { + "epoch": 0.6788204914103694, + "grad_norm": 0.12130042165517807, + "learning_rate": 0.002, + "loss": 2.335, + "step": 175600 + }, + { + "epoch": 0.6788591486137527, + "grad_norm": 0.10506013035774231, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 175610 + }, + { + "epoch": 0.678897805817136, + "grad_norm": 0.11317040771245956, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 175620 + }, + { + "epoch": 0.6789364630205192, + "grad_norm": 0.09023216366767883, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 175630 + }, + { + "epoch": 0.6789751202239025, + "grad_norm": 0.09898082911968231, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 175640 + }, + { + "epoch": 0.6790137774272857, + "grad_norm": 0.09514901787042618, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 175650 + }, + { + "epoch": 0.6790524346306691, + "grad_norm": 0.10889595001935959, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 175660 + }, + { + "epoch": 0.6790910918340524, + "grad_norm": 0.09953156113624573, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 175670 + }, + { + "epoch": 0.6791297490374356, + "grad_norm": 0.09541979432106018, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 175680 + }, + { + "epoch": 0.6791684062408189, + "grad_norm": 0.09662420302629471, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 175690 + }, + { + "epoch": 0.6792070634442022, + "grad_norm": 0.1041659340262413, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 175700 + }, + { + "epoch": 0.6792457206475855, + "grad_norm": 0.09401014447212219, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 175710 + }, + { + "epoch": 0.6792843778509687, + "grad_norm": 0.11179909110069275, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 175720 + }, + { + "epoch": 0.679323035054352, + "grad_norm": 0.11274517327547073, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 175730 + }, + { + "epoch": 0.6793616922577354, + "grad_norm": 0.10251244157552719, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 175740 + }, + { + "epoch": 0.6794003494611186, + "grad_norm": 0.11762916296720505, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 175750 + }, + { + "epoch": 0.6794390066645019, + "grad_norm": 0.10066024959087372, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 175760 + }, + { + "epoch": 0.6794776638678851, + "grad_norm": 0.10446237772703171, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 175770 + }, + { + "epoch": 0.6795163210712685, + "grad_norm": 0.10105162113904953, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 175780 + }, + { + "epoch": 0.6795549782746517, + "grad_norm": 0.11937082558870316, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 175790 + }, + { + "epoch": 0.679593635478035, + "grad_norm": 0.1018209308385849, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 175800 + }, + { + "epoch": 0.6796322926814182, + "grad_norm": 0.10392194241285324, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 175810 + }, + { + "epoch": 0.6796709498848015, + "grad_norm": 0.09500529617071152, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 175820 + }, + { + "epoch": 0.6797096070881848, + "grad_norm": 0.10185185074806213, + "learning_rate": 0.002, + "loss": 2.338, + "step": 175830 + }, + { + "epoch": 0.6797482642915681, + "grad_norm": 0.10656815022230148, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 175840 + }, + { + "epoch": 0.6797869214949513, + "grad_norm": 0.1034289002418518, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 175850 + }, + { + "epoch": 0.6798255786983346, + "grad_norm": 0.10846489667892456, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 175860 + }, + { + "epoch": 0.679864235901718, + "grad_norm": 0.09887100011110306, + "learning_rate": 0.002, + "loss": 2.348, + "step": 175870 + }, + { + "epoch": 0.6799028931051012, + "grad_norm": 0.11460306495428085, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 175880 + }, + { + "epoch": 0.6799415503084845, + "grad_norm": 0.12321964651346207, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 175890 + }, + { + "epoch": 0.6799802075118677, + "grad_norm": 0.13607902824878693, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 175900 + }, + { + "epoch": 0.6800188647152511, + "grad_norm": 0.09586074203252792, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 175910 + }, + { + "epoch": 0.6800575219186343, + "grad_norm": 0.10172473639249802, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 175920 + }, + { + "epoch": 0.6800961791220176, + "grad_norm": 0.1413239687681198, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 175930 + }, + { + "epoch": 0.6801348363254008, + "grad_norm": 0.11910225450992584, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 175940 + }, + { + "epoch": 0.6801734935287842, + "grad_norm": 0.09417007863521576, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 175950 + }, + { + "epoch": 0.6802121507321675, + "grad_norm": 0.08883190155029297, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 175960 + }, + { + "epoch": 0.6802508079355507, + "grad_norm": 0.0998324379324913, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 175970 + }, + { + "epoch": 0.680289465138934, + "grad_norm": 0.10993354022502899, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 175980 + }, + { + "epoch": 0.6803281223423173, + "grad_norm": 0.09957070648670197, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 175990 + }, + { + "epoch": 0.6803667795457006, + "grad_norm": 0.09717411547899246, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 176000 + }, + { + "epoch": 0.6804054367490838, + "grad_norm": 0.11479727178812027, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 176010 + }, + { + "epoch": 0.6804440939524671, + "grad_norm": 0.11380119621753693, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 176020 + }, + { + "epoch": 0.6804827511558503, + "grad_norm": 0.09961897879838943, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 176030 + }, + { + "epoch": 0.6805214083592337, + "grad_norm": 0.10932581126689911, + "learning_rate": 0.002, + "loss": 2.343, + "step": 176040 + }, + { + "epoch": 0.680560065562617, + "grad_norm": 0.09730560332536697, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 176050 + }, + { + "epoch": 0.6805987227660002, + "grad_norm": 0.1072082445025444, + "learning_rate": 0.002, + "loss": 2.354, + "step": 176060 + }, + { + "epoch": 0.6806373799693834, + "grad_norm": 0.10722892731428146, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 176070 + }, + { + "epoch": 0.6806760371727668, + "grad_norm": 0.11320706456899643, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 176080 + }, + { + "epoch": 0.6807146943761501, + "grad_norm": 0.11437489092350006, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 176090 + }, + { + "epoch": 0.6807533515795333, + "grad_norm": 0.08987102657556534, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 176100 + }, + { + "epoch": 0.6807920087829166, + "grad_norm": 0.12513278424739838, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 176110 + }, + { + "epoch": 0.6808306659862999, + "grad_norm": 0.10193182528018951, + "learning_rate": 0.002, + "loss": 2.351, + "step": 176120 + }, + { + "epoch": 0.6808693231896832, + "grad_norm": 0.09458781033754349, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 176130 + }, + { + "epoch": 0.6809079803930664, + "grad_norm": 0.09140831977128983, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 176140 + }, + { + "epoch": 0.6809466375964497, + "grad_norm": 0.09771974384784698, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 176150 + }, + { + "epoch": 0.680985294799833, + "grad_norm": 0.09318406134843826, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 176160 + }, + { + "epoch": 0.6810239520032163, + "grad_norm": 0.15654128789901733, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 176170 + }, + { + "epoch": 0.6810626092065996, + "grad_norm": 0.09128965437412262, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 176180 + }, + { + "epoch": 0.6811012664099828, + "grad_norm": 0.11416096240282059, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 176190 + }, + { + "epoch": 0.6811399236133661, + "grad_norm": 0.09656243771314621, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 176200 + }, + { + "epoch": 0.6811785808167494, + "grad_norm": 0.10527592897415161, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 176210 + }, + { + "epoch": 0.6812172380201327, + "grad_norm": 0.11535614728927612, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 176220 + }, + { + "epoch": 0.6812558952235159, + "grad_norm": 0.0986952930688858, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 176230 + }, + { + "epoch": 0.6812945524268992, + "grad_norm": 0.10911183804273605, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 176240 + }, + { + "epoch": 0.6813332096302825, + "grad_norm": 0.10323513299226761, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 176250 + }, + { + "epoch": 0.6813718668336658, + "grad_norm": 0.13658122718334198, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 176260 + }, + { + "epoch": 0.681410524037049, + "grad_norm": 0.11002452671527863, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 176270 + }, + { + "epoch": 0.6814491812404323, + "grad_norm": 0.10110387951135635, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 176280 + }, + { + "epoch": 0.6814878384438157, + "grad_norm": 0.11138048022985458, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 176290 + }, + { + "epoch": 0.6815264956471989, + "grad_norm": 0.17693695425987244, + "learning_rate": 0.002, + "loss": 2.35, + "step": 176300 + }, + { + "epoch": 0.6815651528505822, + "grad_norm": 0.11095073819160461, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 176310 + }, + { + "epoch": 0.6816038100539654, + "grad_norm": 1.074438452720642, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 176320 + }, + { + "epoch": 0.6816424672573488, + "grad_norm": 0.10732623934745789, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 176330 + }, + { + "epoch": 0.681681124460732, + "grad_norm": 0.1113322302699089, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 176340 + }, + { + "epoch": 0.6817197816641153, + "grad_norm": 0.11108031868934631, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 176350 + }, + { + "epoch": 0.6817584388674985, + "grad_norm": 0.11360624432563782, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 176360 + }, + { + "epoch": 0.6817970960708819, + "grad_norm": 0.10377056896686554, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 176370 + }, + { + "epoch": 0.6818357532742652, + "grad_norm": 0.15662111341953278, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 176380 + }, + { + "epoch": 0.6818744104776484, + "grad_norm": 0.1019727885723114, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 176390 + }, + { + "epoch": 0.6819130676810317, + "grad_norm": 0.10392680764198303, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 176400 + }, + { + "epoch": 0.6819517248844149, + "grad_norm": 0.13927051424980164, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 176410 + }, + { + "epoch": 0.6819903820877983, + "grad_norm": 0.10128787159919739, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 176420 + }, + { + "epoch": 0.6820290392911815, + "grad_norm": 0.1053689643740654, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 176430 + }, + { + "epoch": 0.6820676964945648, + "grad_norm": 0.10823885351419449, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 176440 + }, + { + "epoch": 0.682106353697948, + "grad_norm": 0.11570017784833908, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 176450 + }, + { + "epoch": 0.6821450109013314, + "grad_norm": 0.11610689759254456, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 176460 + }, + { + "epoch": 0.6821836681047146, + "grad_norm": 0.11022695899009705, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 176470 + }, + { + "epoch": 0.6822223253080979, + "grad_norm": 0.1143764927983284, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 176480 + }, + { + "epoch": 0.6822609825114812, + "grad_norm": 0.10477987676858902, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 176490 + }, + { + "epoch": 0.6822996397148645, + "grad_norm": 0.11330216377973557, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 176500 + }, + { + "epoch": 0.6823382969182478, + "grad_norm": 0.09663552045822144, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 176510 + }, + { + "epoch": 0.682376954121631, + "grad_norm": 0.1135273277759552, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 176520 + }, + { + "epoch": 0.6824156113250143, + "grad_norm": 0.10124694555997849, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 176530 + }, + { + "epoch": 0.6824542685283976, + "grad_norm": 0.1054057776927948, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 176540 + }, + { + "epoch": 0.6824929257317809, + "grad_norm": 0.12605276703834534, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 176550 + }, + { + "epoch": 0.6825315829351641, + "grad_norm": 0.10874178260564804, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 176560 + }, + { + "epoch": 0.6825702401385474, + "grad_norm": 0.10447292774915695, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 176570 + }, + { + "epoch": 0.6826088973419306, + "grad_norm": 0.11027207970619202, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 176580 + }, + { + "epoch": 0.682647554545314, + "grad_norm": 0.10599247366189957, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 176590 + }, + { + "epoch": 0.6826862117486973, + "grad_norm": 0.09568461775779724, + "learning_rate": 0.002, + "loss": 2.338, + "step": 176600 + }, + { + "epoch": 0.6827248689520805, + "grad_norm": 0.11142941564321518, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 176610 + }, + { + "epoch": 0.6827635261554638, + "grad_norm": 0.09197314828634262, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 176620 + }, + { + "epoch": 0.6828021833588471, + "grad_norm": 0.09502045065164566, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 176630 + }, + { + "epoch": 0.6828408405622304, + "grad_norm": 0.1063617616891861, + "learning_rate": 0.002, + "loss": 2.351, + "step": 176640 + }, + { + "epoch": 0.6828794977656136, + "grad_norm": 0.11514590680599213, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 176650 + }, + { + "epoch": 0.6829181549689969, + "grad_norm": 0.0956290066242218, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 176660 + }, + { + "epoch": 0.6829568121723802, + "grad_norm": 0.10817734152078629, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 176670 + }, + { + "epoch": 0.6829954693757635, + "grad_norm": 0.11415576934814453, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 176680 + }, + { + "epoch": 0.6830341265791467, + "grad_norm": 0.08466695249080658, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 176690 + }, + { + "epoch": 0.68307278378253, + "grad_norm": 0.11186059564352036, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 176700 + }, + { + "epoch": 0.6831114409859134, + "grad_norm": 0.08967024832963943, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 176710 + }, + { + "epoch": 0.6831500981892966, + "grad_norm": 0.12295638769865036, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 176720 + }, + { + "epoch": 0.6831887553926799, + "grad_norm": 0.09820482134819031, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 176730 + }, + { + "epoch": 0.6832274125960631, + "grad_norm": 0.10533328354358673, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 176740 + }, + { + "epoch": 0.6832660697994464, + "grad_norm": 0.10046335309743881, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 176750 + }, + { + "epoch": 0.6833047270028297, + "grad_norm": 0.14013177156448364, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 176760 + }, + { + "epoch": 0.683343384206213, + "grad_norm": 0.10282900184392929, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 176770 + }, + { + "epoch": 0.6833820414095962, + "grad_norm": 0.11350609362125397, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 176780 + }, + { + "epoch": 0.6834206986129795, + "grad_norm": 0.0963103398680687, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 176790 + }, + { + "epoch": 0.6834593558163629, + "grad_norm": 0.12058515101671219, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 176800 + }, + { + "epoch": 0.6834980130197461, + "grad_norm": 0.12234441190958023, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 176810 + }, + { + "epoch": 0.6835366702231294, + "grad_norm": 0.0914405956864357, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 176820 + }, + { + "epoch": 0.6835753274265126, + "grad_norm": 0.09852568060159683, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 176830 + }, + { + "epoch": 0.683613984629896, + "grad_norm": 0.1033724918961525, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 176840 + }, + { + "epoch": 0.6836526418332792, + "grad_norm": 0.10538680106401443, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 176850 + }, + { + "epoch": 0.6836912990366625, + "grad_norm": 0.08989083766937256, + "learning_rate": 0.002, + "loss": 2.347, + "step": 176860 + }, + { + "epoch": 0.6837299562400457, + "grad_norm": 0.09663797169923782, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 176870 + }, + { + "epoch": 0.6837686134434291, + "grad_norm": 0.10179861634969711, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 176880 + }, + { + "epoch": 0.6838072706468123, + "grad_norm": 0.09698754549026489, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 176890 + }, + { + "epoch": 0.6838459278501956, + "grad_norm": 0.11067965626716614, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 176900 + }, + { + "epoch": 0.6838845850535789, + "grad_norm": 0.10127593576908112, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 176910 + }, + { + "epoch": 0.6839232422569622, + "grad_norm": 0.10865731537342072, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 176920 + }, + { + "epoch": 0.6839618994603455, + "grad_norm": 0.12426352500915527, + "learning_rate": 0.002, + "loss": 2.338, + "step": 176930 + }, + { + "epoch": 0.6840005566637287, + "grad_norm": 0.09453696757555008, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 176940 + }, + { + "epoch": 0.684039213867112, + "grad_norm": 0.10218081623315811, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 176950 + }, + { + "epoch": 0.6840778710704952, + "grad_norm": 0.10880590230226517, + "learning_rate": 0.002, + "loss": 2.324, + "step": 176960 + }, + { + "epoch": 0.6841165282738786, + "grad_norm": 0.1122904047369957, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 176970 + }, + { + "epoch": 0.6841551854772618, + "grad_norm": 0.11233898252248764, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 176980 + }, + { + "epoch": 0.6841938426806451, + "grad_norm": 0.12309885770082474, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 176990 + }, + { + "epoch": 0.6842324998840283, + "grad_norm": 0.11137855798006058, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 177000 + }, + { + "epoch": 0.6842711570874117, + "grad_norm": 0.10196123272180557, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 177010 + }, + { + "epoch": 0.684309814290795, + "grad_norm": 0.1031576544046402, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 177020 + }, + { + "epoch": 0.6843484714941782, + "grad_norm": 0.10486750304698944, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 177030 + }, + { + "epoch": 0.6843871286975615, + "grad_norm": 0.10431206971406937, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 177040 + }, + { + "epoch": 0.6844257859009448, + "grad_norm": 0.10427211970090866, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 177050 + }, + { + "epoch": 0.6844644431043281, + "grad_norm": 0.10033883899450302, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 177060 + }, + { + "epoch": 0.6845031003077113, + "grad_norm": 0.11335708945989609, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 177070 + }, + { + "epoch": 0.6845417575110946, + "grad_norm": 0.10321256518363953, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 177080 + }, + { + "epoch": 0.684580414714478, + "grad_norm": 0.11628992855548859, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 177090 + }, + { + "epoch": 0.6846190719178612, + "grad_norm": 0.10992211103439331, + "learning_rate": 0.002, + "loss": 2.347, + "step": 177100 + }, + { + "epoch": 0.6846577291212445, + "grad_norm": 0.0979648232460022, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 177110 + }, + { + "epoch": 0.6846963863246277, + "grad_norm": 0.11584831774234772, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 177120 + }, + { + "epoch": 0.684735043528011, + "grad_norm": 0.09908290207386017, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 177130 + }, + { + "epoch": 0.6847737007313943, + "grad_norm": 0.1265435367822647, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 177140 + }, + { + "epoch": 0.6848123579347776, + "grad_norm": 0.1148119643330574, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 177150 + }, + { + "epoch": 0.6848510151381608, + "grad_norm": 0.09815677255392075, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 177160 + }, + { + "epoch": 0.6848896723415441, + "grad_norm": 0.09669837355613708, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 177170 + }, + { + "epoch": 0.6849283295449274, + "grad_norm": 0.11474636197090149, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 177180 + }, + { + "epoch": 0.6849669867483107, + "grad_norm": 0.09963209182024002, + "learning_rate": 0.002, + "loss": 2.347, + "step": 177190 + }, + { + "epoch": 0.6850056439516939, + "grad_norm": 0.1148679256439209, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 177200 + }, + { + "epoch": 0.6850443011550772, + "grad_norm": 0.09433385729789734, + "learning_rate": 0.002, + "loss": 2.335, + "step": 177210 + }, + { + "epoch": 0.6850829583584606, + "grad_norm": 0.13668915629386902, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 177220 + }, + { + "epoch": 0.6851216155618438, + "grad_norm": 0.10129014402627945, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 177230 + }, + { + "epoch": 0.6851602727652271, + "grad_norm": 0.11268419027328491, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 177240 + }, + { + "epoch": 0.6851989299686103, + "grad_norm": 0.11311889439821243, + "learning_rate": 0.002, + "loss": 2.342, + "step": 177250 + }, + { + "epoch": 0.6852375871719937, + "grad_norm": 0.09963889420032501, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 177260 + }, + { + "epoch": 0.6852762443753769, + "grad_norm": 0.10477506369352341, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 177270 + }, + { + "epoch": 0.6853149015787602, + "grad_norm": 0.10674279183149338, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 177280 + }, + { + "epoch": 0.6853535587821434, + "grad_norm": 0.11757767200469971, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 177290 + }, + { + "epoch": 0.6853922159855268, + "grad_norm": 0.11279809474945068, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 177300 + }, + { + "epoch": 0.68543087318891, + "grad_norm": 0.09075847268104553, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 177310 + }, + { + "epoch": 0.6854695303922933, + "grad_norm": 0.11798780411481857, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 177320 + }, + { + "epoch": 0.6855081875956766, + "grad_norm": 0.1009967178106308, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 177330 + }, + { + "epoch": 0.6855468447990598, + "grad_norm": 0.0941571444272995, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 177340 + }, + { + "epoch": 0.6855855020024432, + "grad_norm": 0.09473618865013123, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 177350 + }, + { + "epoch": 0.6856241592058264, + "grad_norm": 0.10038759559392929, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 177360 + }, + { + "epoch": 0.6856628164092097, + "grad_norm": 0.1028994470834732, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 177370 + }, + { + "epoch": 0.6857014736125929, + "grad_norm": 0.09498606622219086, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 177380 + }, + { + "epoch": 0.6857401308159763, + "grad_norm": 0.0928439348936081, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 177390 + }, + { + "epoch": 0.6857787880193595, + "grad_norm": 0.09801946580410004, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 177400 + }, + { + "epoch": 0.6858174452227428, + "grad_norm": 0.10568686574697495, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 177410 + }, + { + "epoch": 0.685856102426126, + "grad_norm": 0.10233966261148453, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 177420 + }, + { + "epoch": 0.6858947596295094, + "grad_norm": 0.142459899187088, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 177430 + }, + { + "epoch": 0.6859334168328927, + "grad_norm": 0.1762257218360901, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 177440 + }, + { + "epoch": 0.6859720740362759, + "grad_norm": 0.09656956791877747, + "learning_rate": 0.002, + "loss": 2.339, + "step": 177450 + }, + { + "epoch": 0.6860107312396592, + "grad_norm": 0.10861390084028244, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 177460 + }, + { + "epoch": 0.6860493884430425, + "grad_norm": 0.10502377152442932, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 177470 + }, + { + "epoch": 0.6860880456464258, + "grad_norm": 0.09659518301486969, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 177480 + }, + { + "epoch": 0.686126702849809, + "grad_norm": 0.11638438701629639, + "learning_rate": 0.002, + "loss": 2.342, + "step": 177490 + }, + { + "epoch": 0.6861653600531923, + "grad_norm": 0.1016044095158577, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 177500 + }, + { + "epoch": 0.6862040172565755, + "grad_norm": 0.10398321598768234, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 177510 + }, + { + "epoch": 0.6862426744599589, + "grad_norm": 0.11729230731725693, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 177520 + }, + { + "epoch": 0.6862813316633422, + "grad_norm": 0.10508260130882263, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 177530 + }, + { + "epoch": 0.6863199888667254, + "grad_norm": 0.12430533766746521, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 177540 + }, + { + "epoch": 0.6863586460701087, + "grad_norm": 0.10194669663906097, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 177550 + }, + { + "epoch": 0.686397303273492, + "grad_norm": 0.10797617584466934, + "learning_rate": 0.002, + "loss": 2.336, + "step": 177560 + }, + { + "epoch": 0.6864359604768753, + "grad_norm": 0.11274404078722, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 177570 + }, + { + "epoch": 0.6864746176802585, + "grad_norm": 0.09633027017116547, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 177580 + }, + { + "epoch": 0.6865132748836418, + "grad_norm": 0.11169687658548355, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 177590 + }, + { + "epoch": 0.6865519320870251, + "grad_norm": 0.09493947774171829, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 177600 + }, + { + "epoch": 0.6865905892904084, + "grad_norm": 0.1339365690946579, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 177610 + }, + { + "epoch": 0.6866292464937916, + "grad_norm": 0.09708207100629807, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 177620 + }, + { + "epoch": 0.6866679036971749, + "grad_norm": 0.12348034232854843, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 177630 + }, + { + "epoch": 0.6867065609005583, + "grad_norm": 0.10543914884328842, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 177640 + }, + { + "epoch": 0.6867452181039415, + "grad_norm": 0.09651929885149002, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 177650 + }, + { + "epoch": 0.6867838753073248, + "grad_norm": 0.125656396150589, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 177660 + }, + { + "epoch": 0.686822532510708, + "grad_norm": 0.11345012485980988, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 177670 + }, + { + "epoch": 0.6868611897140913, + "grad_norm": 0.0984870120882988, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 177680 + }, + { + "epoch": 0.6868998469174746, + "grad_norm": 0.11127850413322449, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 177690 + }, + { + "epoch": 0.6869385041208579, + "grad_norm": 0.1167495921254158, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 177700 + }, + { + "epoch": 0.6869771613242411, + "grad_norm": 0.10851190239191055, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 177710 + }, + { + "epoch": 0.6870158185276244, + "grad_norm": 0.09147991240024567, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 177720 + }, + { + "epoch": 0.6870544757310078, + "grad_norm": 0.10341312736272812, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 177730 + }, + { + "epoch": 0.687093132934391, + "grad_norm": 0.09881774336099625, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 177740 + }, + { + "epoch": 0.6871317901377743, + "grad_norm": 0.11159349232912064, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 177750 + }, + { + "epoch": 0.6871704473411575, + "grad_norm": 0.09615936875343323, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 177760 + }, + { + "epoch": 0.6872091045445409, + "grad_norm": 0.1058683916926384, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 177770 + }, + { + "epoch": 0.6872477617479241, + "grad_norm": 0.1070360466837883, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 177780 + }, + { + "epoch": 0.6872864189513074, + "grad_norm": 0.10910263657569885, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 177790 + }, + { + "epoch": 0.6873250761546906, + "grad_norm": 0.10926370322704315, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 177800 + }, + { + "epoch": 0.687363733358074, + "grad_norm": 0.09721733629703522, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 177810 + }, + { + "epoch": 0.6874023905614572, + "grad_norm": 0.09801986813545227, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 177820 + }, + { + "epoch": 0.6874410477648405, + "grad_norm": 0.1177133172750473, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 177830 + }, + { + "epoch": 0.6874797049682237, + "grad_norm": 0.09813974052667618, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 177840 + }, + { + "epoch": 0.6875183621716071, + "grad_norm": 0.11016160249710083, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 177850 + }, + { + "epoch": 0.6875570193749904, + "grad_norm": 0.11785726249217987, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 177860 + }, + { + "epoch": 0.6875956765783736, + "grad_norm": 0.10382388532161713, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 177870 + }, + { + "epoch": 0.6876343337817569, + "grad_norm": 0.10329584032297134, + "learning_rate": 0.002, + "loss": 2.338, + "step": 177880 + }, + { + "epoch": 0.6876729909851401, + "grad_norm": 0.12139497697353363, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 177890 + }, + { + "epoch": 0.6877116481885235, + "grad_norm": 0.12352616339921951, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 177900 + }, + { + "epoch": 0.6877503053919067, + "grad_norm": 0.0942232683300972, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 177910 + }, + { + "epoch": 0.68778896259529, + "grad_norm": 0.10492647439241409, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 177920 + }, + { + "epoch": 0.6878276197986732, + "grad_norm": 0.11326676607131958, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 177930 + }, + { + "epoch": 0.6878662770020566, + "grad_norm": 0.10235963016748428, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 177940 + }, + { + "epoch": 0.6879049342054399, + "grad_norm": 0.1606038212776184, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 177950 + }, + { + "epoch": 0.6879435914088231, + "grad_norm": 0.09358200430870056, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 177960 + }, + { + "epoch": 0.6879822486122064, + "grad_norm": 0.096051886677742, + "learning_rate": 0.002, + "loss": 2.3643, + "step": 177970 + }, + { + "epoch": 0.6880209058155897, + "grad_norm": 0.10265396535396576, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 177980 + }, + { + "epoch": 0.688059563018973, + "grad_norm": 0.11899688094854355, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 177990 + }, + { + "epoch": 0.6880982202223562, + "grad_norm": 0.10440745949745178, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 178000 + }, + { + "epoch": 0.6881368774257395, + "grad_norm": 0.11681189388036728, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 178010 + }, + { + "epoch": 0.6881755346291228, + "grad_norm": 0.0977269634604454, + "learning_rate": 0.002, + "loss": 2.344, + "step": 178020 + }, + { + "epoch": 0.6882141918325061, + "grad_norm": 0.09904341399669647, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 178030 + }, + { + "epoch": 0.6882528490358893, + "grad_norm": 0.12160401046276093, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 178040 + }, + { + "epoch": 0.6882915062392726, + "grad_norm": 0.11060669273138046, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 178050 + }, + { + "epoch": 0.6883301634426559, + "grad_norm": 0.10494236648082733, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 178060 + }, + { + "epoch": 0.6883688206460392, + "grad_norm": 0.11479184031486511, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 178070 + }, + { + "epoch": 0.6884074778494225, + "grad_norm": 0.10022313892841339, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 178080 + }, + { + "epoch": 0.6884461350528057, + "grad_norm": 0.10921555757522583, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 178090 + }, + { + "epoch": 0.688484792256189, + "grad_norm": 0.14623263478279114, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 178100 + }, + { + "epoch": 0.6885234494595723, + "grad_norm": 0.10130447149276733, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 178110 + }, + { + "epoch": 0.6885621066629556, + "grad_norm": 0.10313340276479721, + "learning_rate": 0.002, + "loss": 2.318, + "step": 178120 + }, + { + "epoch": 0.6886007638663388, + "grad_norm": 0.10620785504579544, + "learning_rate": 0.002, + "loss": 2.355, + "step": 178130 + }, + { + "epoch": 0.6886394210697221, + "grad_norm": 0.11439868062734604, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 178140 + }, + { + "epoch": 0.6886780782731055, + "grad_norm": 0.1117575466632843, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 178150 + }, + { + "epoch": 0.6887167354764887, + "grad_norm": 0.1372932493686676, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 178160 + }, + { + "epoch": 0.688755392679872, + "grad_norm": 0.11603402346372604, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 178170 + }, + { + "epoch": 0.6887940498832552, + "grad_norm": 0.09763218462467194, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 178180 + }, + { + "epoch": 0.6888327070866386, + "grad_norm": 0.09390419721603394, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 178190 + }, + { + "epoch": 0.6888713642900218, + "grad_norm": 0.09340616315603256, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 178200 + }, + { + "epoch": 0.6889100214934051, + "grad_norm": 0.11203593015670776, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 178210 + }, + { + "epoch": 0.6889486786967883, + "grad_norm": 0.12373124063014984, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 178220 + }, + { + "epoch": 0.6889873359001717, + "grad_norm": 0.09314067661762238, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 178230 + }, + { + "epoch": 0.689025993103555, + "grad_norm": 0.10530916601419449, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 178240 + }, + { + "epoch": 0.6890646503069382, + "grad_norm": 0.12179020792245865, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 178250 + }, + { + "epoch": 0.6891033075103215, + "grad_norm": 0.09535452723503113, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 178260 + }, + { + "epoch": 0.6891419647137047, + "grad_norm": 0.09405100345611572, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 178270 + }, + { + "epoch": 0.6891806219170881, + "grad_norm": 0.12436164915561676, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 178280 + }, + { + "epoch": 0.6892192791204713, + "grad_norm": 0.12237463146448135, + "learning_rate": 0.002, + "loss": 2.342, + "step": 178290 + }, + { + "epoch": 0.6892579363238546, + "grad_norm": 0.10916969180107117, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 178300 + }, + { + "epoch": 0.6892965935272378, + "grad_norm": 0.11725562810897827, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 178310 + }, + { + "epoch": 0.6893352507306212, + "grad_norm": 0.09735278785228729, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 178320 + }, + { + "epoch": 0.6893739079340044, + "grad_norm": 0.104241743683815, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 178330 + }, + { + "epoch": 0.6894125651373877, + "grad_norm": 0.11518935114145279, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 178340 + }, + { + "epoch": 0.6894512223407709, + "grad_norm": 0.09700454771518707, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 178350 + }, + { + "epoch": 0.6894898795441543, + "grad_norm": 0.1426231861114502, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 178360 + }, + { + "epoch": 0.6895285367475376, + "grad_norm": 0.11000684648752213, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 178370 + }, + { + "epoch": 0.6895671939509208, + "grad_norm": 0.12923002243041992, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 178380 + }, + { + "epoch": 0.6896058511543041, + "grad_norm": 0.11459232121706009, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 178390 + }, + { + "epoch": 0.6896445083576874, + "grad_norm": 0.10787385702133179, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 178400 + }, + { + "epoch": 0.6896831655610707, + "grad_norm": 0.10136546939611435, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 178410 + }, + { + "epoch": 0.6897218227644539, + "grad_norm": 0.12130826711654663, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 178420 + }, + { + "epoch": 0.6897604799678372, + "grad_norm": 0.11021111905574799, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 178430 + }, + { + "epoch": 0.6897991371712204, + "grad_norm": 0.11486952006816864, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 178440 + }, + { + "epoch": 0.6898377943746038, + "grad_norm": 0.11372072994709015, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 178450 + }, + { + "epoch": 0.689876451577987, + "grad_norm": 0.11879274249076843, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 178460 + }, + { + "epoch": 0.6899151087813703, + "grad_norm": 0.09907825291156769, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 178470 + }, + { + "epoch": 0.6899537659847536, + "grad_norm": 0.11506243795156479, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 178480 + }, + { + "epoch": 0.6899924231881369, + "grad_norm": 0.09706560522317886, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 178490 + }, + { + "epoch": 0.6900310803915202, + "grad_norm": 0.09016738831996918, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 178500 + }, + { + "epoch": 0.6900697375949034, + "grad_norm": 0.14885951578617096, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 178510 + }, + { + "epoch": 0.6901083947982867, + "grad_norm": 0.1034003421664238, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 178520 + }, + { + "epoch": 0.69014705200167, + "grad_norm": 0.09764369577169418, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 178530 + }, + { + "epoch": 0.6901857092050533, + "grad_norm": 0.09645101428031921, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 178540 + }, + { + "epoch": 0.6902243664084365, + "grad_norm": 0.11894281208515167, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 178550 + }, + { + "epoch": 0.6902630236118198, + "grad_norm": 0.11207878589630127, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 178560 + }, + { + "epoch": 0.6903016808152032, + "grad_norm": 0.09871028363704681, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 178570 + }, + { + "epoch": 0.6903403380185864, + "grad_norm": 0.1066136583685875, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 178580 + }, + { + "epoch": 0.6903789952219697, + "grad_norm": 0.12492348998785019, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 178590 + }, + { + "epoch": 0.6904176524253529, + "grad_norm": 0.11499127000570297, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 178600 + }, + { + "epoch": 0.6904563096287362, + "grad_norm": 0.11852894723415375, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 178610 + }, + { + "epoch": 0.6904949668321195, + "grad_norm": 0.13737890124320984, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 178620 + }, + { + "epoch": 0.6905336240355028, + "grad_norm": 0.10862556099891663, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 178630 + }, + { + "epoch": 0.690572281238886, + "grad_norm": 0.1020117849111557, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 178640 + }, + { + "epoch": 0.6906109384422693, + "grad_norm": 0.12550663948059082, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 178650 + }, + { + "epoch": 0.6906495956456526, + "grad_norm": 0.09471151977777481, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 178660 + }, + { + "epoch": 0.6906882528490359, + "grad_norm": 0.09872092306613922, + "learning_rate": 0.002, + "loss": 2.333, + "step": 178670 + }, + { + "epoch": 0.6907269100524192, + "grad_norm": 0.10408961027860641, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 178680 + }, + { + "epoch": 0.6907655672558024, + "grad_norm": 0.1041586622595787, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 178690 + }, + { + "epoch": 0.6908042244591858, + "grad_norm": 0.11138633638620377, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 178700 + }, + { + "epoch": 0.690842881662569, + "grad_norm": 0.09857725352048874, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 178710 + }, + { + "epoch": 0.6908815388659523, + "grad_norm": 0.10884217172861099, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 178720 + }, + { + "epoch": 0.6909201960693355, + "grad_norm": 0.1148877963423729, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 178730 + }, + { + "epoch": 0.6909588532727189, + "grad_norm": 0.10008355230093002, + "learning_rate": 0.002, + "loss": 2.334, + "step": 178740 + }, + { + "epoch": 0.6909975104761021, + "grad_norm": 0.09689811617136002, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 178750 + }, + { + "epoch": 0.6910361676794854, + "grad_norm": 0.11208871006965637, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 178760 + }, + { + "epoch": 0.6910748248828686, + "grad_norm": 0.11783494800329208, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 178770 + }, + { + "epoch": 0.691113482086252, + "grad_norm": 0.10670506209135056, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 178780 + }, + { + "epoch": 0.6911521392896353, + "grad_norm": 0.10943397879600525, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 178790 + }, + { + "epoch": 0.6911907964930185, + "grad_norm": 0.10499881953001022, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 178800 + }, + { + "epoch": 0.6912294536964018, + "grad_norm": 0.10083287954330444, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 178810 + }, + { + "epoch": 0.691268110899785, + "grad_norm": 0.10064470022916794, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 178820 + }, + { + "epoch": 0.6913067681031684, + "grad_norm": 0.09685681015253067, + "learning_rate": 0.002, + "loss": 2.341, + "step": 178830 + }, + { + "epoch": 0.6913454253065516, + "grad_norm": 0.12001106888055801, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 178840 + }, + { + "epoch": 0.6913840825099349, + "grad_norm": 0.09279981255531311, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 178850 + }, + { + "epoch": 0.6914227397133181, + "grad_norm": 0.09641458094120026, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 178860 + }, + { + "epoch": 0.6914613969167015, + "grad_norm": 0.10775553435087204, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 178870 + }, + { + "epoch": 0.6915000541200848, + "grad_norm": 0.09094482660293579, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 178880 + }, + { + "epoch": 0.691538711323468, + "grad_norm": 0.11673973500728607, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 178890 + }, + { + "epoch": 0.6915773685268513, + "grad_norm": 0.12628406286239624, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 178900 + }, + { + "epoch": 0.6916160257302346, + "grad_norm": 0.11211872845888138, + "learning_rate": 0.002, + "loss": 2.345, + "step": 178910 + }, + { + "epoch": 0.6916546829336179, + "grad_norm": 0.0927414521574974, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 178920 + }, + { + "epoch": 0.6916933401370011, + "grad_norm": 0.1253931224346161, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 178930 + }, + { + "epoch": 0.6917319973403844, + "grad_norm": 0.09514501690864563, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 178940 + }, + { + "epoch": 0.6917706545437677, + "grad_norm": 0.12133972346782684, + "learning_rate": 0.002, + "loss": 2.36, + "step": 178950 + }, + { + "epoch": 0.691809311747151, + "grad_norm": 0.10072533041238785, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 178960 + }, + { + "epoch": 0.6918479689505342, + "grad_norm": 0.1324368715286255, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 178970 + }, + { + "epoch": 0.6918866261539175, + "grad_norm": 0.10226622968912125, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 178980 + }, + { + "epoch": 0.6919252833573007, + "grad_norm": 0.09472037851810455, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 178990 + }, + { + "epoch": 0.6919639405606841, + "grad_norm": 0.10809981822967529, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 179000 + }, + { + "epoch": 0.6920025977640674, + "grad_norm": 0.097027488052845, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 179010 + }, + { + "epoch": 0.6920412549674506, + "grad_norm": 0.12362071126699448, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 179020 + }, + { + "epoch": 0.6920799121708339, + "grad_norm": 0.0996069461107254, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 179030 + }, + { + "epoch": 0.6921185693742172, + "grad_norm": 0.0949220284819603, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 179040 + }, + { + "epoch": 0.6921572265776005, + "grad_norm": 0.13637186586856842, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 179050 + }, + { + "epoch": 0.6921958837809837, + "grad_norm": 0.10567469894886017, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 179060 + }, + { + "epoch": 0.692234540984367, + "grad_norm": 0.10000057518482208, + "learning_rate": 0.002, + "loss": 2.348, + "step": 179070 + }, + { + "epoch": 0.6922731981877503, + "grad_norm": 0.13319118320941925, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 179080 + }, + { + "epoch": 0.6923118553911336, + "grad_norm": 0.1000233143568039, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 179090 + }, + { + "epoch": 0.6923505125945169, + "grad_norm": 0.1102403849363327, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 179100 + }, + { + "epoch": 0.6923891697979001, + "grad_norm": 0.10236970335245132, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 179110 + }, + { + "epoch": 0.6924278270012835, + "grad_norm": 0.10387825965881348, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 179120 + }, + { + "epoch": 0.6924664842046667, + "grad_norm": 0.09711598604917526, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 179130 + }, + { + "epoch": 0.69250514140805, + "grad_norm": 0.10135672986507416, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 179140 + }, + { + "epoch": 0.6925437986114332, + "grad_norm": 0.10131373256444931, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 179150 + }, + { + "epoch": 0.6925824558148165, + "grad_norm": 0.10842801630496979, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 179160 + }, + { + "epoch": 0.6926211130181998, + "grad_norm": 0.13098689913749695, + "learning_rate": 0.002, + "loss": 2.3679, + "step": 179170 + }, + { + "epoch": 0.6926597702215831, + "grad_norm": 0.09229902923107147, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 179180 + }, + { + "epoch": 0.6926984274249663, + "grad_norm": 0.09704066067934036, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 179190 + }, + { + "epoch": 0.6927370846283496, + "grad_norm": 0.10814200341701508, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 179200 + }, + { + "epoch": 0.692775741831733, + "grad_norm": 0.11031364649534225, + "learning_rate": 0.002, + "loss": 2.358, + "step": 179210 + }, + { + "epoch": 0.6928143990351162, + "grad_norm": 0.10160470753908157, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 179220 + }, + { + "epoch": 0.6928530562384995, + "grad_norm": 0.1103799045085907, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 179230 + }, + { + "epoch": 0.6928917134418827, + "grad_norm": 0.11409421265125275, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 179240 + }, + { + "epoch": 0.6929303706452661, + "grad_norm": 0.12143899500370026, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 179250 + }, + { + "epoch": 0.6929690278486493, + "grad_norm": 0.10524741560220718, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 179260 + }, + { + "epoch": 0.6930076850520326, + "grad_norm": 0.10471028834581375, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 179270 + }, + { + "epoch": 0.6930463422554158, + "grad_norm": 0.10988292843103409, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 179280 + }, + { + "epoch": 0.6930849994587992, + "grad_norm": 0.0952146053314209, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 179290 + }, + { + "epoch": 0.6931236566621825, + "grad_norm": 0.11299467831850052, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 179300 + }, + { + "epoch": 0.6931623138655657, + "grad_norm": 0.09987322986125946, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 179310 + }, + { + "epoch": 0.693200971068949, + "grad_norm": 0.09584300220012665, + "learning_rate": 0.002, + "loss": 2.339, + "step": 179320 + }, + { + "epoch": 0.6932396282723323, + "grad_norm": 0.1155521422624588, + "learning_rate": 0.002, + "loss": 2.35, + "step": 179330 + }, + { + "epoch": 0.6932782854757156, + "grad_norm": 0.10797114670276642, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 179340 + }, + { + "epoch": 0.6933169426790988, + "grad_norm": 0.12133597582578659, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 179350 + }, + { + "epoch": 0.6933555998824821, + "grad_norm": 0.09763203561306, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 179360 + }, + { + "epoch": 0.6933942570858653, + "grad_norm": 0.10273556411266327, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 179370 + }, + { + "epoch": 0.6934329142892487, + "grad_norm": 0.13102386891841888, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 179380 + }, + { + "epoch": 0.6934715714926319, + "grad_norm": 0.10223275423049927, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 179390 + }, + { + "epoch": 0.6935102286960152, + "grad_norm": 0.09479638189077377, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 179400 + }, + { + "epoch": 0.6935488858993984, + "grad_norm": 0.10030090063810349, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 179410 + }, + { + "epoch": 0.6935875431027818, + "grad_norm": 0.10234583169221878, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 179420 + }, + { + "epoch": 0.6936262003061651, + "grad_norm": 0.12980692088603973, + "learning_rate": 0.002, + "loss": 2.343, + "step": 179430 + }, + { + "epoch": 0.6936648575095483, + "grad_norm": 0.08630619943141937, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 179440 + }, + { + "epoch": 0.6937035147129316, + "grad_norm": 0.12653999030590057, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 179450 + }, + { + "epoch": 0.6937421719163149, + "grad_norm": 0.0999857485294342, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 179460 + }, + { + "epoch": 0.6937808291196982, + "grad_norm": 0.09825851023197174, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 179470 + }, + { + "epoch": 0.6938194863230814, + "grad_norm": 0.1375643014907837, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 179480 + }, + { + "epoch": 0.6938581435264647, + "grad_norm": 0.11085304617881775, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 179490 + }, + { + "epoch": 0.693896800729848, + "grad_norm": 0.09401252865791321, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 179500 + }, + { + "epoch": 0.6939354579332313, + "grad_norm": 0.10243132710456848, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 179510 + }, + { + "epoch": 0.6939741151366146, + "grad_norm": 0.10913652926683426, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 179520 + }, + { + "epoch": 0.6940127723399978, + "grad_norm": 0.14689086377620697, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 179530 + }, + { + "epoch": 0.6940514295433811, + "grad_norm": 0.1280713528394699, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 179540 + }, + { + "epoch": 0.6940900867467644, + "grad_norm": 0.10417623072862625, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 179550 + }, + { + "epoch": 0.6941287439501477, + "grad_norm": 0.09852035343647003, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 179560 + }, + { + "epoch": 0.6941674011535309, + "grad_norm": 0.11538185924291611, + "learning_rate": 0.002, + "loss": 2.316, + "step": 179570 + }, + { + "epoch": 0.6942060583569142, + "grad_norm": 0.11690109223127365, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 179580 + }, + { + "epoch": 0.6942447155602975, + "grad_norm": 0.10348938405513763, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 179590 + }, + { + "epoch": 0.6942833727636808, + "grad_norm": 0.1061256155371666, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 179600 + }, + { + "epoch": 0.694322029967064, + "grad_norm": 0.09781806915998459, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 179610 + }, + { + "epoch": 0.6943606871704473, + "grad_norm": 0.09417387843132019, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 179620 + }, + { + "epoch": 0.6943993443738307, + "grad_norm": 0.09854403138160706, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 179630 + }, + { + "epoch": 0.6944380015772139, + "grad_norm": 0.10567111521959305, + "learning_rate": 0.002, + "loss": 2.34, + "step": 179640 + }, + { + "epoch": 0.6944766587805972, + "grad_norm": 0.10450496524572372, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 179650 + }, + { + "epoch": 0.6945153159839804, + "grad_norm": 0.08845242857933044, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 179660 + }, + { + "epoch": 0.6945539731873638, + "grad_norm": 0.09574896097183228, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 179670 + }, + { + "epoch": 0.694592630390747, + "grad_norm": 0.09121603518724442, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 179680 + }, + { + "epoch": 0.6946312875941303, + "grad_norm": 0.10315220803022385, + "learning_rate": 0.002, + "loss": 2.347, + "step": 179690 + }, + { + "epoch": 0.6946699447975135, + "grad_norm": 0.10953135043382645, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 179700 + }, + { + "epoch": 0.6947086020008969, + "grad_norm": 0.10534404963254929, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 179710 + }, + { + "epoch": 0.6947472592042802, + "grad_norm": 0.10050157457590103, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 179720 + }, + { + "epoch": 0.6947859164076634, + "grad_norm": 0.0944216325879097, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 179730 + }, + { + "epoch": 0.6948245736110467, + "grad_norm": 0.13836099207401276, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 179740 + }, + { + "epoch": 0.6948632308144299, + "grad_norm": 0.10789356380701065, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 179750 + }, + { + "epoch": 0.6949018880178133, + "grad_norm": 0.11295302212238312, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 179760 + }, + { + "epoch": 0.6949405452211965, + "grad_norm": 0.11282958835363388, + "learning_rate": 0.002, + "loss": 2.355, + "step": 179770 + }, + { + "epoch": 0.6949792024245798, + "grad_norm": 0.09731011837720871, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 179780 + }, + { + "epoch": 0.695017859627963, + "grad_norm": 0.10323044657707214, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 179790 + }, + { + "epoch": 0.6950565168313464, + "grad_norm": 0.1089772880077362, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 179800 + }, + { + "epoch": 0.6950951740347296, + "grad_norm": 0.11274880915880203, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 179810 + }, + { + "epoch": 0.6951338312381129, + "grad_norm": 0.1387518048286438, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 179820 + }, + { + "epoch": 0.6951724884414962, + "grad_norm": 0.10620445013046265, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 179830 + }, + { + "epoch": 0.6952111456448795, + "grad_norm": 0.10442688316106796, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 179840 + }, + { + "epoch": 0.6952498028482628, + "grad_norm": 0.09788144379854202, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 179850 + }, + { + "epoch": 0.695288460051646, + "grad_norm": 0.09841413050889969, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 179860 + }, + { + "epoch": 0.6953271172550293, + "grad_norm": 0.12703649699687958, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 179870 + }, + { + "epoch": 0.6953657744584126, + "grad_norm": 0.1050918772816658, + "learning_rate": 0.002, + "loss": 2.331, + "step": 179880 + }, + { + "epoch": 0.6954044316617959, + "grad_norm": 0.10114361345767975, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 179890 + }, + { + "epoch": 0.6954430888651791, + "grad_norm": 0.10773694515228271, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 179900 + }, + { + "epoch": 0.6954817460685624, + "grad_norm": 0.10803357511758804, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 179910 + }, + { + "epoch": 0.6955204032719456, + "grad_norm": 0.0990663468837738, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 179920 + }, + { + "epoch": 0.695559060475329, + "grad_norm": 0.08901944756507874, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 179930 + }, + { + "epoch": 0.6955977176787123, + "grad_norm": 0.11902262270450592, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 179940 + }, + { + "epoch": 0.6956363748820955, + "grad_norm": 0.0996413454413414, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 179950 + }, + { + "epoch": 0.6956750320854788, + "grad_norm": 0.11540281772613525, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 179960 + }, + { + "epoch": 0.6957136892888621, + "grad_norm": 0.11333900690078735, + "learning_rate": 0.002, + "loss": 2.328, + "step": 179970 + }, + { + "epoch": 0.6957523464922454, + "grad_norm": 0.12207765132188797, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 179980 + }, + { + "epoch": 0.6957910036956286, + "grad_norm": 0.10058306157588959, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 179990 + }, + { + "epoch": 0.6958296608990119, + "grad_norm": 0.10925924777984619, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 180000 + }, + { + "epoch": 0.6958683181023952, + "grad_norm": 0.0989830419421196, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 180010 + }, + { + "epoch": 0.6959069753057785, + "grad_norm": 0.1015496775507927, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 180020 + }, + { + "epoch": 0.6959456325091617, + "grad_norm": 0.2994261085987091, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 180030 + }, + { + "epoch": 0.695984289712545, + "grad_norm": 0.11924055218696594, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 180040 + }, + { + "epoch": 0.6960229469159284, + "grad_norm": 0.10244481265544891, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 180050 + }, + { + "epoch": 0.6960616041193116, + "grad_norm": 0.09961318224668503, + "learning_rate": 0.002, + "loss": 2.335, + "step": 180060 + }, + { + "epoch": 0.6961002613226949, + "grad_norm": 0.11061107367277145, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 180070 + }, + { + "epoch": 0.6961389185260781, + "grad_norm": 0.10321884602308273, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 180080 + }, + { + "epoch": 0.6961775757294614, + "grad_norm": 0.11215357482433319, + "learning_rate": 0.002, + "loss": 2.342, + "step": 180090 + }, + { + "epoch": 0.6962162329328447, + "grad_norm": 0.11976996064186096, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 180100 + }, + { + "epoch": 0.696254890136228, + "grad_norm": 0.10721524804830551, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 180110 + }, + { + "epoch": 0.6962935473396112, + "grad_norm": 0.1116723120212555, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 180120 + }, + { + "epoch": 0.6963322045429945, + "grad_norm": 0.22823664546012878, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 180130 + }, + { + "epoch": 0.6963708617463779, + "grad_norm": 0.10076408833265305, + "learning_rate": 0.002, + "loss": 2.345, + "step": 180140 + }, + { + "epoch": 0.6964095189497611, + "grad_norm": 0.09941043704748154, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 180150 + }, + { + "epoch": 0.6964481761531444, + "grad_norm": 0.13223543763160706, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 180160 + }, + { + "epoch": 0.6964868333565276, + "grad_norm": 0.10886907577514648, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 180170 + }, + { + "epoch": 0.696525490559911, + "grad_norm": 0.1024252250790596, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 180180 + }, + { + "epoch": 0.6965641477632942, + "grad_norm": 0.10686971247196198, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 180190 + }, + { + "epoch": 0.6966028049666775, + "grad_norm": 0.10215549170970917, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 180200 + }, + { + "epoch": 0.6966414621700607, + "grad_norm": 0.10930994153022766, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 180210 + }, + { + "epoch": 0.6966801193734441, + "grad_norm": 0.09673038125038147, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 180220 + }, + { + "epoch": 0.6967187765768273, + "grad_norm": 0.11928710341453552, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 180230 + }, + { + "epoch": 0.6967574337802106, + "grad_norm": 0.1123550608754158, + "learning_rate": 0.002, + "loss": 2.342, + "step": 180240 + }, + { + "epoch": 0.6967960909835939, + "grad_norm": 0.08652473986148834, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 180250 + }, + { + "epoch": 0.6968347481869772, + "grad_norm": 0.09731192886829376, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 180260 + }, + { + "epoch": 0.6968734053903605, + "grad_norm": 0.10636240988969803, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 180270 + }, + { + "epoch": 0.6969120625937437, + "grad_norm": 0.10812932252883911, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 180280 + }, + { + "epoch": 0.696950719797127, + "grad_norm": 0.10161454230546951, + "learning_rate": 0.002, + "loss": 2.3646, + "step": 180290 + }, + { + "epoch": 0.6969893770005102, + "grad_norm": 0.10729075968265533, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 180300 + }, + { + "epoch": 0.6970280342038936, + "grad_norm": 0.11432657390832901, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 180310 + }, + { + "epoch": 0.6970666914072768, + "grad_norm": 0.09904944151639938, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 180320 + }, + { + "epoch": 0.6971053486106601, + "grad_norm": 0.09000122547149658, + "learning_rate": 0.002, + "loss": 2.351, + "step": 180330 + }, + { + "epoch": 0.6971440058140433, + "grad_norm": 0.10641942173242569, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 180340 + }, + { + "epoch": 0.6971826630174267, + "grad_norm": 0.11361725628376007, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 180350 + }, + { + "epoch": 0.69722132022081, + "grad_norm": 0.11069035530090332, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 180360 + }, + { + "epoch": 0.6972599774241932, + "grad_norm": 0.10378896445035934, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 180370 + }, + { + "epoch": 0.6972986346275765, + "grad_norm": 0.1104269027709961, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 180380 + }, + { + "epoch": 0.6973372918309598, + "grad_norm": 0.09437578916549683, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 180390 + }, + { + "epoch": 0.6973759490343431, + "grad_norm": 0.11597743630409241, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 180400 + }, + { + "epoch": 0.6974146062377263, + "grad_norm": 0.10915557295084, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 180410 + }, + { + "epoch": 0.6974532634411096, + "grad_norm": 0.09487058222293854, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 180420 + }, + { + "epoch": 0.697491920644493, + "grad_norm": 0.11591614037752151, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 180430 + }, + { + "epoch": 0.6975305778478762, + "grad_norm": 0.12642371654510498, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 180440 + }, + { + "epoch": 0.6975692350512595, + "grad_norm": 0.10184739530086517, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 180450 + }, + { + "epoch": 0.6976078922546427, + "grad_norm": 0.09792407602071762, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 180460 + }, + { + "epoch": 0.697646549458026, + "grad_norm": 0.14495137333869934, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 180470 + }, + { + "epoch": 0.6976852066614093, + "grad_norm": 0.10353922843933105, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 180480 + }, + { + "epoch": 0.6977238638647926, + "grad_norm": 0.09864120930433273, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 180490 + }, + { + "epoch": 0.6977625210681758, + "grad_norm": 0.10658890753984451, + "learning_rate": 0.002, + "loss": 2.359, + "step": 180500 + }, + { + "epoch": 0.6978011782715591, + "grad_norm": 0.1569095402956009, + "learning_rate": 0.002, + "loss": 2.35, + "step": 180510 + }, + { + "epoch": 0.6978398354749424, + "grad_norm": 0.11538206785917282, + "learning_rate": 0.002, + "loss": 2.348, + "step": 180520 + }, + { + "epoch": 0.6978784926783257, + "grad_norm": 0.09712937474250793, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 180530 + }, + { + "epoch": 0.6979171498817089, + "grad_norm": 0.11353567987680435, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 180540 + }, + { + "epoch": 0.6979558070850922, + "grad_norm": 0.09400054067373276, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 180550 + }, + { + "epoch": 0.6979944642884756, + "grad_norm": 0.11250081658363342, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 180560 + }, + { + "epoch": 0.6980331214918588, + "grad_norm": 0.10020963102579117, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 180570 + }, + { + "epoch": 0.6980717786952421, + "grad_norm": 0.21803762018680573, + "learning_rate": 0.002, + "loss": 2.348, + "step": 180580 + }, + { + "epoch": 0.6981104358986253, + "grad_norm": 0.11189590394496918, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 180590 + }, + { + "epoch": 0.6981490931020087, + "grad_norm": 0.10142678022384644, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 180600 + }, + { + "epoch": 0.6981877503053919, + "grad_norm": 0.09760603308677673, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 180610 + }, + { + "epoch": 0.6982264075087752, + "grad_norm": 0.10050047934055328, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 180620 + }, + { + "epoch": 0.6982650647121584, + "grad_norm": 0.11918167769908905, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 180630 + }, + { + "epoch": 0.6983037219155418, + "grad_norm": 0.11384829878807068, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 180640 + }, + { + "epoch": 0.698342379118925, + "grad_norm": 0.10176576673984528, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 180650 + }, + { + "epoch": 0.6983810363223083, + "grad_norm": 0.10738183557987213, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 180660 + }, + { + "epoch": 0.6984196935256916, + "grad_norm": 0.10513759404420853, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 180670 + }, + { + "epoch": 0.6984583507290748, + "grad_norm": 0.09060550481081009, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 180680 + }, + { + "epoch": 0.6984970079324582, + "grad_norm": 0.09310439974069595, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 180690 + }, + { + "epoch": 0.6985356651358414, + "grad_norm": 0.10888612270355225, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 180700 + }, + { + "epoch": 0.6985743223392247, + "grad_norm": 0.116920605301857, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 180710 + }, + { + "epoch": 0.6986129795426079, + "grad_norm": 0.12050127238035202, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 180720 + }, + { + "epoch": 0.6986516367459913, + "grad_norm": 0.08785030990839005, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 180730 + }, + { + "epoch": 0.6986902939493745, + "grad_norm": 0.11219271272420883, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 180740 + }, + { + "epoch": 0.6987289511527578, + "grad_norm": 0.10325204581022263, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 180750 + }, + { + "epoch": 0.698767608356141, + "grad_norm": 0.10277462750673294, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 180760 + }, + { + "epoch": 0.6988062655595244, + "grad_norm": 0.11527806520462036, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 180770 + }, + { + "epoch": 0.6988449227629077, + "grad_norm": 0.10413837432861328, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 180780 + }, + { + "epoch": 0.6988835799662909, + "grad_norm": 0.13020369410514832, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 180790 + }, + { + "epoch": 0.6989222371696742, + "grad_norm": 0.11335482448339462, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 180800 + }, + { + "epoch": 0.6989608943730575, + "grad_norm": 0.11713720858097076, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 180810 + }, + { + "epoch": 0.6989995515764408, + "grad_norm": 0.095161572098732, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 180820 + }, + { + "epoch": 0.699038208779824, + "grad_norm": 0.10159959644079208, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 180830 + }, + { + "epoch": 0.6990768659832073, + "grad_norm": 0.10313326120376587, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 180840 + }, + { + "epoch": 0.6991155231865905, + "grad_norm": 0.10190504044294357, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 180850 + }, + { + "epoch": 0.6991541803899739, + "grad_norm": 0.11342480778694153, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 180860 + }, + { + "epoch": 0.6991928375933572, + "grad_norm": 0.10530469566583633, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 180870 + }, + { + "epoch": 0.6992314947967404, + "grad_norm": 0.10882703214883804, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 180880 + }, + { + "epoch": 0.6992701520001237, + "grad_norm": 0.10685546696186066, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 180890 + }, + { + "epoch": 0.699308809203507, + "grad_norm": 0.10505948960781097, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 180900 + }, + { + "epoch": 0.6993474664068903, + "grad_norm": 0.1125546246767044, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 180910 + }, + { + "epoch": 0.6993861236102735, + "grad_norm": 0.0941130518913269, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 180920 + }, + { + "epoch": 0.6994247808136568, + "grad_norm": 0.10644858330488205, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 180930 + }, + { + "epoch": 0.6994634380170401, + "grad_norm": 0.10426277667284012, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 180940 + }, + { + "epoch": 0.6995020952204234, + "grad_norm": 0.09964601695537567, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 180950 + }, + { + "epoch": 0.6995407524238066, + "grad_norm": 0.09501111507415771, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 180960 + }, + { + "epoch": 0.6995794096271899, + "grad_norm": 0.1172516793012619, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 180970 + }, + { + "epoch": 0.6996180668305733, + "grad_norm": 0.11361086368560791, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 180980 + }, + { + "epoch": 0.6996567240339565, + "grad_norm": 0.11219489574432373, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 180990 + }, + { + "epoch": 0.6996953812373398, + "grad_norm": 0.10707321017980576, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 181000 + }, + { + "epoch": 0.699734038440723, + "grad_norm": 0.09972088038921356, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 181010 + }, + { + "epoch": 0.6997726956441063, + "grad_norm": 0.13037872314453125, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 181020 + }, + { + "epoch": 0.6998113528474896, + "grad_norm": 0.0949321910738945, + "learning_rate": 0.002, + "loss": 2.324, + "step": 181030 + }, + { + "epoch": 0.6998500100508729, + "grad_norm": 0.09620226174592972, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 181040 + }, + { + "epoch": 0.6998886672542561, + "grad_norm": 0.11894714832305908, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 181050 + }, + { + "epoch": 0.6999273244576394, + "grad_norm": 0.10436682403087616, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 181060 + }, + { + "epoch": 0.6999659816610228, + "grad_norm": 0.08825159817934036, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 181070 + }, + { + "epoch": 0.700004638864406, + "grad_norm": 0.11684508621692657, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 181080 + }, + { + "epoch": 0.7000432960677893, + "grad_norm": 0.08650648593902588, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 181090 + }, + { + "epoch": 0.7000819532711725, + "grad_norm": 0.10749147087335587, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 181100 + }, + { + "epoch": 0.7001206104745559, + "grad_norm": 0.10667852312326431, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 181110 + }, + { + "epoch": 0.7001592676779391, + "grad_norm": 0.1255321353673935, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 181120 + }, + { + "epoch": 0.7001979248813224, + "grad_norm": 0.09984520822763443, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 181130 + }, + { + "epoch": 0.7002365820847056, + "grad_norm": 0.12421584129333496, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 181140 + }, + { + "epoch": 0.700275239288089, + "grad_norm": 0.11375346034765244, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 181150 + }, + { + "epoch": 0.7003138964914722, + "grad_norm": 0.09819741547107697, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 181160 + }, + { + "epoch": 0.7003525536948555, + "grad_norm": 0.10152056068181992, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 181170 + }, + { + "epoch": 0.7003912108982387, + "grad_norm": 0.100888691842556, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 181180 + }, + { + "epoch": 0.7004298681016221, + "grad_norm": 0.1108546033501625, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 181190 + }, + { + "epoch": 0.7004685253050054, + "grad_norm": 0.10671539604663849, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 181200 + }, + { + "epoch": 0.7005071825083886, + "grad_norm": 0.09765645116567612, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 181210 + }, + { + "epoch": 0.7005458397117719, + "grad_norm": 0.11254343390464783, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 181220 + }, + { + "epoch": 0.7005844969151551, + "grad_norm": 0.1040448248386383, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 181230 + }, + { + "epoch": 0.7006231541185385, + "grad_norm": 0.11068741232156754, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 181240 + }, + { + "epoch": 0.7006618113219217, + "grad_norm": 0.12011968344449997, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 181250 + }, + { + "epoch": 0.700700468525305, + "grad_norm": 0.09599734842777252, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 181260 + }, + { + "epoch": 0.7007391257286882, + "grad_norm": 0.11694858968257904, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 181270 + }, + { + "epoch": 0.7007777829320716, + "grad_norm": 0.09414356201887131, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 181280 + }, + { + "epoch": 0.7008164401354549, + "grad_norm": 0.12005293369293213, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 181290 + }, + { + "epoch": 0.7008550973388381, + "grad_norm": 0.10294341295957565, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 181300 + }, + { + "epoch": 0.7008937545422214, + "grad_norm": 0.10715653002262115, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 181310 + }, + { + "epoch": 0.7009324117456047, + "grad_norm": 0.11320353299379349, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 181320 + }, + { + "epoch": 0.700971068948988, + "grad_norm": 0.13430100679397583, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 181330 + }, + { + "epoch": 0.7010097261523712, + "grad_norm": 0.125883087515831, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 181340 + }, + { + "epoch": 0.7010483833557545, + "grad_norm": 0.10331200808286667, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 181350 + }, + { + "epoch": 0.7010870405591378, + "grad_norm": 0.11117926239967346, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 181360 + }, + { + "epoch": 0.7011256977625211, + "grad_norm": 0.1090509444475174, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 181370 + }, + { + "epoch": 0.7011643549659043, + "grad_norm": 0.12386742979288101, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 181380 + }, + { + "epoch": 0.7012030121692876, + "grad_norm": 0.10557752847671509, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 181390 + }, + { + "epoch": 0.7012416693726709, + "grad_norm": 0.10935811698436737, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 181400 + }, + { + "epoch": 0.7012803265760542, + "grad_norm": 0.1020449548959732, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 181410 + }, + { + "epoch": 0.7013189837794375, + "grad_norm": 0.09770620614290237, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 181420 + }, + { + "epoch": 0.7013576409828207, + "grad_norm": 0.13287243247032166, + "learning_rate": 0.002, + "loss": 2.349, + "step": 181430 + }, + { + "epoch": 0.701396298186204, + "grad_norm": 0.10809620469808578, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 181440 + }, + { + "epoch": 0.7014349553895873, + "grad_norm": 0.09670908004045486, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 181450 + }, + { + "epoch": 0.7014736125929706, + "grad_norm": 0.10117173194885254, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 181460 + }, + { + "epoch": 0.7015122697963538, + "grad_norm": 0.10399699956178665, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 181470 + }, + { + "epoch": 0.7015509269997371, + "grad_norm": 0.09978353977203369, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 181480 + }, + { + "epoch": 0.7015895842031205, + "grad_norm": 0.10881523787975311, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 181490 + }, + { + "epoch": 0.7016282414065037, + "grad_norm": 0.10354939848184586, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 181500 + }, + { + "epoch": 0.701666898609887, + "grad_norm": 0.11343428492546082, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 181510 + }, + { + "epoch": 0.7017055558132702, + "grad_norm": 0.10757460445165634, + "learning_rate": 0.002, + "loss": 2.3102, + "step": 181520 + }, + { + "epoch": 0.7017442130166536, + "grad_norm": 0.11501331627368927, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 181530 + }, + { + "epoch": 0.7017828702200368, + "grad_norm": 0.09403284639120102, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 181540 + }, + { + "epoch": 0.7018215274234201, + "grad_norm": 0.11164639890193939, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 181550 + }, + { + "epoch": 0.7018601846268033, + "grad_norm": 0.09477049857378006, + "learning_rate": 0.002, + "loss": 2.338, + "step": 181560 + }, + { + "epoch": 0.7018988418301866, + "grad_norm": 0.1289137452840805, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 181570 + }, + { + "epoch": 0.70193749903357, + "grad_norm": 0.1001296266913414, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 181580 + }, + { + "epoch": 0.7019761562369532, + "grad_norm": 0.10789035260677338, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 181590 + }, + { + "epoch": 0.7020148134403364, + "grad_norm": 0.11146128922700882, + "learning_rate": 0.002, + "loss": 2.351, + "step": 181600 + }, + { + "epoch": 0.7020534706437197, + "grad_norm": 0.11408261954784393, + "learning_rate": 0.002, + "loss": 2.347, + "step": 181610 + }, + { + "epoch": 0.7020921278471031, + "grad_norm": 0.13551244139671326, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 181620 + }, + { + "epoch": 0.7021307850504863, + "grad_norm": 0.0898752361536026, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 181630 + }, + { + "epoch": 0.7021694422538696, + "grad_norm": 0.09439538419246674, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 181640 + }, + { + "epoch": 0.7022080994572528, + "grad_norm": 0.11113714426755905, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 181650 + }, + { + "epoch": 0.7022467566606362, + "grad_norm": 0.09541790932416916, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 181660 + }, + { + "epoch": 0.7022854138640194, + "grad_norm": 0.10134763270616531, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 181670 + }, + { + "epoch": 0.7023240710674027, + "grad_norm": 0.1164916381239891, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 181680 + }, + { + "epoch": 0.7023627282707859, + "grad_norm": 0.12147685140371323, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 181690 + }, + { + "epoch": 0.7024013854741693, + "grad_norm": 0.11087074130773544, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 181700 + }, + { + "epoch": 0.7024400426775526, + "grad_norm": 0.09606213122606277, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 181710 + }, + { + "epoch": 0.7024786998809358, + "grad_norm": 0.11690132319927216, + "learning_rate": 0.002, + "loss": 2.341, + "step": 181720 + }, + { + "epoch": 0.7025173570843191, + "grad_norm": 0.11286009103059769, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 181730 + }, + { + "epoch": 0.7025560142877024, + "grad_norm": 0.10286867618560791, + "learning_rate": 0.002, + "loss": 2.342, + "step": 181740 + }, + { + "epoch": 0.7025946714910857, + "grad_norm": 0.11863042414188385, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 181750 + }, + { + "epoch": 0.7026333286944689, + "grad_norm": 0.10015347599983215, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 181760 + }, + { + "epoch": 0.7026719858978522, + "grad_norm": 0.11568449437618256, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 181770 + }, + { + "epoch": 0.7027106431012354, + "grad_norm": 0.10303530097007751, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 181780 + }, + { + "epoch": 0.7027493003046188, + "grad_norm": 0.09969471395015717, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 181790 + }, + { + "epoch": 0.702787957508002, + "grad_norm": 0.09912727028131485, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 181800 + }, + { + "epoch": 0.7028266147113853, + "grad_norm": 0.10416741669178009, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 181810 + }, + { + "epoch": 0.7028652719147686, + "grad_norm": 0.10566399991512299, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 181820 + }, + { + "epoch": 0.7029039291181519, + "grad_norm": 0.10490906238555908, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 181830 + }, + { + "epoch": 0.7029425863215352, + "grad_norm": 0.08886059373617172, + "learning_rate": 0.002, + "loss": 2.347, + "step": 181840 + }, + { + "epoch": 0.7029812435249184, + "grad_norm": 0.09844937920570374, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 181850 + }, + { + "epoch": 0.7030199007283017, + "grad_norm": 0.12183602154254913, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 181860 + }, + { + "epoch": 0.703058557931685, + "grad_norm": 0.10586797446012497, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 181870 + }, + { + "epoch": 0.7030972151350683, + "grad_norm": 0.10901835560798645, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 181880 + }, + { + "epoch": 0.7031358723384515, + "grad_norm": 0.09768293797969818, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 181890 + }, + { + "epoch": 0.7031745295418348, + "grad_norm": 0.10768637806177139, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 181900 + }, + { + "epoch": 0.7032131867452182, + "grad_norm": 0.13276785612106323, + "learning_rate": 0.002, + "loss": 2.3644, + "step": 181910 + }, + { + "epoch": 0.7032518439486014, + "grad_norm": 0.11340388655662537, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 181920 + }, + { + "epoch": 0.7032905011519847, + "grad_norm": 0.11737782508134842, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 181930 + }, + { + "epoch": 0.7033291583553679, + "grad_norm": 0.11821569502353668, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 181940 + }, + { + "epoch": 0.7033678155587512, + "grad_norm": 0.09863719344139099, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 181950 + }, + { + "epoch": 0.7034064727621345, + "grad_norm": 0.10446591675281525, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 181960 + }, + { + "epoch": 0.7034451299655178, + "grad_norm": 0.11456377059221268, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 181970 + }, + { + "epoch": 0.703483787168901, + "grad_norm": 0.11112993210554123, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 181980 + }, + { + "epoch": 0.7035224443722843, + "grad_norm": 0.10735056549310684, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 181990 + }, + { + "epoch": 0.7035611015756676, + "grad_norm": 0.0971650630235672, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 182000 + }, + { + "epoch": 0.7035997587790509, + "grad_norm": 0.10977531224489212, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 182010 + }, + { + "epoch": 0.7036384159824342, + "grad_norm": 0.09874717891216278, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 182020 + }, + { + "epoch": 0.7036770731858174, + "grad_norm": 0.11938779801130295, + "learning_rate": 0.002, + "loss": 2.34, + "step": 182030 + }, + { + "epoch": 0.7037157303892008, + "grad_norm": 0.09552514553070068, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 182040 + }, + { + "epoch": 0.703754387592584, + "grad_norm": 0.1094004213809967, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 182050 + }, + { + "epoch": 0.7037930447959673, + "grad_norm": 0.11232999712228775, + "learning_rate": 0.002, + "loss": 2.345, + "step": 182060 + }, + { + "epoch": 0.7038317019993505, + "grad_norm": 0.1030832976102829, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 182070 + }, + { + "epoch": 0.7038703592027339, + "grad_norm": 0.09489600360393524, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 182080 + }, + { + "epoch": 0.7039090164061171, + "grad_norm": 0.10776722431182861, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 182090 + }, + { + "epoch": 0.7039476736095004, + "grad_norm": 0.11068689823150635, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 182100 + }, + { + "epoch": 0.7039863308128836, + "grad_norm": 0.10417316108942032, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 182110 + }, + { + "epoch": 0.704024988016267, + "grad_norm": 0.10432534664869308, + "learning_rate": 0.002, + "loss": 2.328, + "step": 182120 + }, + { + "epoch": 0.7040636452196503, + "grad_norm": 0.11190181225538254, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 182130 + }, + { + "epoch": 0.7041023024230335, + "grad_norm": 0.10109800100326538, + "learning_rate": 0.002, + "loss": 2.339, + "step": 182140 + }, + { + "epoch": 0.7041409596264168, + "grad_norm": 0.0988345518708229, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 182150 + }, + { + "epoch": 0.7041796168298, + "grad_norm": 0.11042675375938416, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 182160 + }, + { + "epoch": 0.7042182740331834, + "grad_norm": 0.10132566094398499, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 182170 + }, + { + "epoch": 0.7042569312365666, + "grad_norm": 0.10684694349765778, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 182180 + }, + { + "epoch": 0.7042955884399499, + "grad_norm": 0.1098591685295105, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 182190 + }, + { + "epoch": 0.7043342456433331, + "grad_norm": 0.09078842401504517, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 182200 + }, + { + "epoch": 0.7043729028467165, + "grad_norm": 0.09298959374427795, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 182210 + }, + { + "epoch": 0.7044115600500997, + "grad_norm": 0.12122335284948349, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 182220 + }, + { + "epoch": 0.704450217253483, + "grad_norm": 0.11889586597681046, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 182230 + }, + { + "epoch": 0.7044888744568663, + "grad_norm": 0.13856007158756256, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 182240 + }, + { + "epoch": 0.7045275316602496, + "grad_norm": 0.09847620874643326, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 182250 + }, + { + "epoch": 0.7045661888636329, + "grad_norm": 0.10424961149692535, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 182260 + }, + { + "epoch": 0.7046048460670161, + "grad_norm": 0.09979801625013351, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 182270 + }, + { + "epoch": 0.7046435032703994, + "grad_norm": 0.09866438060998917, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 182280 + }, + { + "epoch": 0.7046821604737827, + "grad_norm": 0.09678113460540771, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 182290 + }, + { + "epoch": 0.704720817677166, + "grad_norm": 0.12021298706531525, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 182300 + }, + { + "epoch": 0.7047594748805492, + "grad_norm": 0.12732869386672974, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 182310 + }, + { + "epoch": 0.7047981320839325, + "grad_norm": 0.10386567562818527, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 182320 + }, + { + "epoch": 0.7048367892873157, + "grad_norm": 0.09794675558805466, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 182330 + }, + { + "epoch": 0.7048754464906991, + "grad_norm": 0.09235762059688568, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 182340 + }, + { + "epoch": 0.7049141036940824, + "grad_norm": 0.11255183070898056, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 182350 + }, + { + "epoch": 0.7049527608974656, + "grad_norm": 0.09853653609752655, + "learning_rate": 0.002, + "loss": 2.33, + "step": 182360 + }, + { + "epoch": 0.7049914181008489, + "grad_norm": 0.09108425676822662, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 182370 + }, + { + "epoch": 0.7050300753042322, + "grad_norm": 0.10841450095176697, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 182380 + }, + { + "epoch": 0.7050687325076155, + "grad_norm": 0.13346466422080994, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 182390 + }, + { + "epoch": 0.7051073897109987, + "grad_norm": 0.1046786680817604, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 182400 + }, + { + "epoch": 0.705146046914382, + "grad_norm": 0.0977960154414177, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 182410 + }, + { + "epoch": 0.7051847041177653, + "grad_norm": 0.09477897733449936, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 182420 + }, + { + "epoch": 0.7052233613211486, + "grad_norm": 0.10036720335483551, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 182430 + }, + { + "epoch": 0.7052620185245319, + "grad_norm": 0.09965860843658447, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 182440 + }, + { + "epoch": 0.7053006757279151, + "grad_norm": 0.09179232269525528, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 182450 + }, + { + "epoch": 0.7053393329312985, + "grad_norm": 0.11638399958610535, + "learning_rate": 0.002, + "loss": 2.342, + "step": 182460 + }, + { + "epoch": 0.7053779901346817, + "grad_norm": 0.10395177453756332, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 182470 + }, + { + "epoch": 0.705416647338065, + "grad_norm": 0.10976772755384445, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 182480 + }, + { + "epoch": 0.7054553045414482, + "grad_norm": 0.10409149527549744, + "learning_rate": 0.002, + "loss": 2.34, + "step": 182490 + }, + { + "epoch": 0.7054939617448315, + "grad_norm": 0.09813567996025085, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 182500 + }, + { + "epoch": 0.7055326189482148, + "grad_norm": 0.1130143404006958, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 182510 + }, + { + "epoch": 0.7055712761515981, + "grad_norm": 0.12172242999076843, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 182520 + }, + { + "epoch": 0.7056099333549813, + "grad_norm": 0.11206705868244171, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 182530 + }, + { + "epoch": 0.7056485905583646, + "grad_norm": 0.09689457714557648, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 182540 + }, + { + "epoch": 0.705687247761748, + "grad_norm": 0.10289297997951508, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 182550 + }, + { + "epoch": 0.7057259049651312, + "grad_norm": 0.1013554036617279, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 182560 + }, + { + "epoch": 0.7057645621685145, + "grad_norm": 0.11464133858680725, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 182570 + }, + { + "epoch": 0.7058032193718977, + "grad_norm": 0.10789859294891357, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 182580 + }, + { + "epoch": 0.7058418765752811, + "grad_norm": 0.10919328778982162, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 182590 + }, + { + "epoch": 0.7058805337786643, + "grad_norm": 0.09283306449651718, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 182600 + }, + { + "epoch": 0.7059191909820476, + "grad_norm": 0.10101396590471268, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 182610 + }, + { + "epoch": 0.7059578481854308, + "grad_norm": 0.11432278901338577, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 182620 + }, + { + "epoch": 0.7059965053888142, + "grad_norm": 0.09779641777276993, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 182630 + }, + { + "epoch": 0.7060351625921975, + "grad_norm": 0.10827631503343582, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 182640 + }, + { + "epoch": 0.7060738197955807, + "grad_norm": 0.11150870472192764, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 182650 + }, + { + "epoch": 0.706112476998964, + "grad_norm": 0.08931907266378403, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 182660 + }, + { + "epoch": 0.7061511342023473, + "grad_norm": 0.14123156666755676, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 182670 + }, + { + "epoch": 0.7061897914057306, + "grad_norm": 0.10059284418821335, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 182680 + }, + { + "epoch": 0.7062284486091138, + "grad_norm": 0.10053300112485886, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 182690 + }, + { + "epoch": 0.7062671058124971, + "grad_norm": 0.11859703063964844, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 182700 + }, + { + "epoch": 0.7063057630158803, + "grad_norm": 0.11177325248718262, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 182710 + }, + { + "epoch": 0.7063444202192637, + "grad_norm": 0.10214826464653015, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 182720 + }, + { + "epoch": 0.7063830774226469, + "grad_norm": 0.09897692501544952, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 182730 + }, + { + "epoch": 0.7064217346260302, + "grad_norm": 0.10023060441017151, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 182740 + }, + { + "epoch": 0.7064603918294134, + "grad_norm": 0.10941066592931747, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 182750 + }, + { + "epoch": 0.7064990490327968, + "grad_norm": 0.10759110748767853, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 182760 + }, + { + "epoch": 0.7065377062361801, + "grad_norm": 0.09983283281326294, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 182770 + }, + { + "epoch": 0.7065763634395633, + "grad_norm": 0.09219998121261597, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 182780 + }, + { + "epoch": 0.7066150206429466, + "grad_norm": 0.10839740186929703, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 182790 + }, + { + "epoch": 0.7066536778463299, + "grad_norm": 0.10706480592489243, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 182800 + }, + { + "epoch": 0.7066923350497132, + "grad_norm": 0.1283886581659317, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 182810 + }, + { + "epoch": 0.7067309922530964, + "grad_norm": 0.09891193360090256, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 182820 + }, + { + "epoch": 0.7067696494564797, + "grad_norm": 0.10873159766197205, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 182830 + }, + { + "epoch": 0.706808306659863, + "grad_norm": 0.09865976125001907, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 182840 + }, + { + "epoch": 0.7068469638632463, + "grad_norm": 0.1111924797296524, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 182850 + }, + { + "epoch": 0.7068856210666296, + "grad_norm": 0.09795933216810226, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 182860 + }, + { + "epoch": 0.7069242782700128, + "grad_norm": 0.10794724524021149, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 182870 + }, + { + "epoch": 0.7069629354733961, + "grad_norm": 0.11744233220815659, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 182880 + }, + { + "epoch": 0.7070015926767794, + "grad_norm": 0.0983758494257927, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 182890 + }, + { + "epoch": 0.7070402498801627, + "grad_norm": 0.1338445246219635, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 182900 + }, + { + "epoch": 0.7070789070835459, + "grad_norm": 0.10137440264225006, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 182910 + }, + { + "epoch": 0.7071175642869292, + "grad_norm": 0.10454361140727997, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 182920 + }, + { + "epoch": 0.7071562214903125, + "grad_norm": 0.10040812939405441, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 182930 + }, + { + "epoch": 0.7071948786936958, + "grad_norm": 0.12701022624969482, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 182940 + }, + { + "epoch": 0.707233535897079, + "grad_norm": 0.0989936962723732, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 182950 + }, + { + "epoch": 0.7072721931004623, + "grad_norm": 0.10258396714925766, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 182960 + }, + { + "epoch": 0.7073108503038457, + "grad_norm": 0.10500797629356384, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 182970 + }, + { + "epoch": 0.7073495075072289, + "grad_norm": 0.10724397003650665, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 182980 + }, + { + "epoch": 0.7073881647106122, + "grad_norm": 0.11122574657201767, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 182990 + }, + { + "epoch": 0.7074268219139954, + "grad_norm": 0.11353199928998947, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 183000 + }, + { + "epoch": 0.7074654791173788, + "grad_norm": 0.09575947374105453, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 183010 + }, + { + "epoch": 0.707504136320762, + "grad_norm": 0.10303820669651031, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 183020 + }, + { + "epoch": 0.7075427935241453, + "grad_norm": 0.12164933234453201, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 183030 + }, + { + "epoch": 0.7075814507275285, + "grad_norm": 0.10083574801683426, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 183040 + }, + { + "epoch": 0.7076201079309119, + "grad_norm": 0.10467392951250076, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 183050 + }, + { + "epoch": 0.7076587651342952, + "grad_norm": 0.10307256877422333, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 183060 + }, + { + "epoch": 0.7076974223376784, + "grad_norm": 0.0985746756196022, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 183070 + }, + { + "epoch": 0.7077360795410617, + "grad_norm": 0.10553467273712158, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 183080 + }, + { + "epoch": 0.7077747367444449, + "grad_norm": 0.12108207494020462, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 183090 + }, + { + "epoch": 0.7078133939478283, + "grad_norm": 0.10229145735502243, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 183100 + }, + { + "epoch": 0.7078520511512115, + "grad_norm": 0.10687445104122162, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 183110 + }, + { + "epoch": 0.7078907083545948, + "grad_norm": 0.1538669615983963, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 183120 + }, + { + "epoch": 0.707929365557978, + "grad_norm": 0.09935051947832108, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 183130 + }, + { + "epoch": 0.7079680227613614, + "grad_norm": 0.1021176427602768, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 183140 + }, + { + "epoch": 0.7080066799647446, + "grad_norm": 0.09273340553045273, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 183150 + }, + { + "epoch": 0.7080453371681279, + "grad_norm": 0.10077358782291412, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 183160 + }, + { + "epoch": 0.7080839943715111, + "grad_norm": 0.10942483693361282, + "learning_rate": 0.002, + "loss": 2.344, + "step": 183170 + }, + { + "epoch": 0.7081226515748945, + "grad_norm": 0.10991012305021286, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 183180 + }, + { + "epoch": 0.7081613087782778, + "grad_norm": 0.0981731116771698, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 183190 + }, + { + "epoch": 0.708199965981661, + "grad_norm": 0.09662474691867828, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 183200 + }, + { + "epoch": 0.7082386231850443, + "grad_norm": 0.10109969228506088, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 183210 + }, + { + "epoch": 0.7082772803884276, + "grad_norm": 0.10239209979772568, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 183220 + }, + { + "epoch": 0.7083159375918109, + "grad_norm": 0.11773066222667694, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 183230 + }, + { + "epoch": 0.7083545947951941, + "grad_norm": 0.1125943660736084, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 183240 + }, + { + "epoch": 0.7083932519985774, + "grad_norm": 0.12445010244846344, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 183250 + }, + { + "epoch": 0.7084319092019606, + "grad_norm": 0.10312087833881378, + "learning_rate": 0.002, + "loss": 2.338, + "step": 183260 + }, + { + "epoch": 0.708470566405344, + "grad_norm": 0.10788668692111969, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 183270 + }, + { + "epoch": 0.7085092236087273, + "grad_norm": 0.11200711131095886, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 183280 + }, + { + "epoch": 0.7085478808121105, + "grad_norm": 0.12259982526302338, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 183290 + }, + { + "epoch": 0.7085865380154938, + "grad_norm": 0.12555958330631256, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 183300 + }, + { + "epoch": 0.7086251952188771, + "grad_norm": 0.12538139522075653, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 183310 + }, + { + "epoch": 0.7086638524222604, + "grad_norm": 0.12616588175296783, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 183320 + }, + { + "epoch": 0.7087025096256436, + "grad_norm": 0.1130484938621521, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 183330 + }, + { + "epoch": 0.7087411668290269, + "grad_norm": 0.1064705178141594, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 183340 + }, + { + "epoch": 0.7087798240324102, + "grad_norm": 0.10898733139038086, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 183350 + }, + { + "epoch": 0.7088184812357935, + "grad_norm": 0.10790088027715683, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 183360 + }, + { + "epoch": 0.7088571384391767, + "grad_norm": 0.11947724968194962, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 183370 + }, + { + "epoch": 0.70889579564256, + "grad_norm": 0.10596276074647903, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 183380 + }, + { + "epoch": 0.7089344528459434, + "grad_norm": 0.09440622478723526, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 183390 + }, + { + "epoch": 0.7089731100493266, + "grad_norm": 0.11583665013313293, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 183400 + }, + { + "epoch": 0.7090117672527099, + "grad_norm": 0.10019931942224503, + "learning_rate": 0.002, + "loss": 2.34, + "step": 183410 + }, + { + "epoch": 0.7090504244560931, + "grad_norm": 0.12555435299873352, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 183420 + }, + { + "epoch": 0.7090890816594764, + "grad_norm": 0.12102832645177841, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 183430 + }, + { + "epoch": 0.7091277388628597, + "grad_norm": 0.09992365539073944, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 183440 + }, + { + "epoch": 0.709166396066243, + "grad_norm": 0.10638532042503357, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 183450 + }, + { + "epoch": 0.7092050532696262, + "grad_norm": 0.10480239987373352, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 183460 + }, + { + "epoch": 0.7092437104730095, + "grad_norm": 0.11703069508075714, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 183470 + }, + { + "epoch": 0.7092823676763929, + "grad_norm": 0.09900394082069397, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 183480 + }, + { + "epoch": 0.7093210248797761, + "grad_norm": 0.09972035139799118, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 183490 + }, + { + "epoch": 0.7093596820831594, + "grad_norm": 0.09546131640672684, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 183500 + }, + { + "epoch": 0.7093983392865426, + "grad_norm": 0.1132010668516159, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 183510 + }, + { + "epoch": 0.709436996489926, + "grad_norm": 0.10164470970630646, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 183520 + }, + { + "epoch": 0.7094756536933092, + "grad_norm": 0.1110953763127327, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 183530 + }, + { + "epoch": 0.7095143108966925, + "grad_norm": 0.08955393731594086, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 183540 + }, + { + "epoch": 0.7095529681000757, + "grad_norm": 0.11253860592842102, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 183550 + }, + { + "epoch": 0.7095916253034591, + "grad_norm": 0.11447730660438538, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 183560 + }, + { + "epoch": 0.7096302825068423, + "grad_norm": 0.09639697521924973, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 183570 + }, + { + "epoch": 0.7096689397102256, + "grad_norm": 0.11987259984016418, + "learning_rate": 0.002, + "loss": 2.318, + "step": 183580 + }, + { + "epoch": 0.7097075969136089, + "grad_norm": 0.10117160528898239, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 183590 + }, + { + "epoch": 0.7097462541169922, + "grad_norm": 0.11126291751861572, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 183600 + }, + { + "epoch": 0.7097849113203755, + "grad_norm": 0.10469876229763031, + "learning_rate": 0.002, + "loss": 2.33, + "step": 183610 + }, + { + "epoch": 0.7098235685237587, + "grad_norm": 0.09360670298337936, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 183620 + }, + { + "epoch": 0.709862225727142, + "grad_norm": 0.11895138025283813, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 183630 + }, + { + "epoch": 0.7099008829305252, + "grad_norm": 0.0990893617272377, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 183640 + }, + { + "epoch": 0.7099395401339086, + "grad_norm": 0.09940275549888611, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 183650 + }, + { + "epoch": 0.7099781973372918, + "grad_norm": 0.11033112555742264, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 183660 + }, + { + "epoch": 0.7100168545406751, + "grad_norm": 0.102576345205307, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 183670 + }, + { + "epoch": 0.7100555117440583, + "grad_norm": 0.10215635597705841, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 183680 + }, + { + "epoch": 0.7100941689474417, + "grad_norm": 0.10245046019554138, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 183690 + }, + { + "epoch": 0.710132826150825, + "grad_norm": 0.11830546706914902, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 183700 + }, + { + "epoch": 0.7101714833542082, + "grad_norm": 0.0982842668890953, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 183710 + }, + { + "epoch": 0.7102101405575915, + "grad_norm": 0.1330679953098297, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 183720 + }, + { + "epoch": 0.7102487977609748, + "grad_norm": 0.105428546667099, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 183730 + }, + { + "epoch": 0.7102874549643581, + "grad_norm": 0.0875903069972992, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 183740 + }, + { + "epoch": 0.7103261121677413, + "grad_norm": 0.09843344241380692, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 183750 + }, + { + "epoch": 0.7103647693711246, + "grad_norm": 0.12091364711523056, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 183760 + }, + { + "epoch": 0.710403426574508, + "grad_norm": 0.10571946203708649, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 183770 + }, + { + "epoch": 0.7104420837778912, + "grad_norm": 0.09138306975364685, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 183780 + }, + { + "epoch": 0.7104807409812745, + "grad_norm": 0.10767629742622375, + "learning_rate": 0.002, + "loss": 2.332, + "step": 183790 + }, + { + "epoch": 0.7105193981846577, + "grad_norm": 0.10088904201984406, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 183800 + }, + { + "epoch": 0.710558055388041, + "grad_norm": 0.11030571907758713, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 183810 + }, + { + "epoch": 0.7105967125914243, + "grad_norm": 0.11730167269706726, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 183820 + }, + { + "epoch": 0.7106353697948076, + "grad_norm": 0.10828235000371933, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 183830 + }, + { + "epoch": 0.7106740269981908, + "grad_norm": 0.09817315638065338, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 183840 + }, + { + "epoch": 0.7107126842015741, + "grad_norm": 0.1154472678899765, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 183850 + }, + { + "epoch": 0.7107513414049574, + "grad_norm": 0.11157126724720001, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 183860 + }, + { + "epoch": 0.7107899986083407, + "grad_norm": 0.10947154462337494, + "learning_rate": 0.002, + "loss": 2.335, + "step": 183870 + }, + { + "epoch": 0.7108286558117239, + "grad_norm": 0.09724964201450348, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 183880 + }, + { + "epoch": 0.7108673130151072, + "grad_norm": 0.0903862863779068, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 183890 + }, + { + "epoch": 0.7109059702184906, + "grad_norm": 0.104709193110466, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 183900 + }, + { + "epoch": 0.7109446274218738, + "grad_norm": 0.0999518409371376, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 183910 + }, + { + "epoch": 0.7109832846252571, + "grad_norm": 0.11664184182882309, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 183920 + }, + { + "epoch": 0.7110219418286403, + "grad_norm": 0.11321470886468887, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 183930 + }, + { + "epoch": 0.7110605990320237, + "grad_norm": 0.09373850375413895, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 183940 + }, + { + "epoch": 0.7110992562354069, + "grad_norm": 0.11596566438674927, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 183950 + }, + { + "epoch": 0.7111379134387902, + "grad_norm": 0.10600019246339798, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 183960 + }, + { + "epoch": 0.7111765706421734, + "grad_norm": 0.09841473400592804, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 183970 + }, + { + "epoch": 0.7112152278455567, + "grad_norm": 0.10479195415973663, + "learning_rate": 0.002, + "loss": 2.329, + "step": 183980 + }, + { + "epoch": 0.71125388504894, + "grad_norm": 0.09592556208372116, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 183990 + }, + { + "epoch": 0.7112925422523233, + "grad_norm": 0.12123008072376251, + "learning_rate": 0.002, + "loss": 2.336, + "step": 184000 + }, + { + "epoch": 0.7113311994557066, + "grad_norm": 0.09279517084360123, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 184010 + }, + { + "epoch": 0.7113698566590898, + "grad_norm": 0.09922918677330017, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 184020 + }, + { + "epoch": 0.7114085138624732, + "grad_norm": 0.11793326586484909, + "learning_rate": 0.002, + "loss": 2.333, + "step": 184030 + }, + { + "epoch": 0.7114471710658564, + "grad_norm": 0.10107146948575974, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 184040 + }, + { + "epoch": 0.7114858282692397, + "grad_norm": 0.1058441773056984, + "learning_rate": 0.002, + "loss": 2.343, + "step": 184050 + }, + { + "epoch": 0.7115244854726229, + "grad_norm": 0.11791115254163742, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 184060 + }, + { + "epoch": 0.7115631426760063, + "grad_norm": 0.09689020365476608, + "learning_rate": 0.002, + "loss": 2.346, + "step": 184070 + }, + { + "epoch": 0.7116017998793895, + "grad_norm": 0.09788362681865692, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 184080 + }, + { + "epoch": 0.7116404570827728, + "grad_norm": 0.10828198492527008, + "learning_rate": 0.002, + "loss": 2.343, + "step": 184090 + }, + { + "epoch": 0.711679114286156, + "grad_norm": 0.10432813316583633, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 184100 + }, + { + "epoch": 0.7117177714895394, + "grad_norm": 0.09556274116039276, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 184110 + }, + { + "epoch": 0.7117564286929227, + "grad_norm": 0.09298153221607208, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 184120 + }, + { + "epoch": 0.7117950858963059, + "grad_norm": 0.21416905522346497, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 184130 + }, + { + "epoch": 0.7118337430996892, + "grad_norm": 0.10230828821659088, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 184140 + }, + { + "epoch": 0.7118724003030725, + "grad_norm": 0.10380011051893234, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 184150 + }, + { + "epoch": 0.7119110575064558, + "grad_norm": 0.10810267925262451, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 184160 + }, + { + "epoch": 0.711949714709839, + "grad_norm": 0.09327961504459381, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 184170 + }, + { + "epoch": 0.7119883719132223, + "grad_norm": 0.11230014264583588, + "learning_rate": 0.002, + "loss": 2.341, + "step": 184180 + }, + { + "epoch": 0.7120270291166055, + "grad_norm": 0.10888976603746414, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 184190 + }, + { + "epoch": 0.7120656863199889, + "grad_norm": 0.10669904947280884, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 184200 + }, + { + "epoch": 0.7121043435233722, + "grad_norm": 0.10787302255630493, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 184210 + }, + { + "epoch": 0.7121430007267554, + "grad_norm": 0.10550106316804886, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 184220 + }, + { + "epoch": 0.7121816579301387, + "grad_norm": 0.09902001172304153, + "learning_rate": 0.002, + "loss": 2.34, + "step": 184230 + }, + { + "epoch": 0.712220315133522, + "grad_norm": 0.09852997213602066, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 184240 + }, + { + "epoch": 0.7122589723369053, + "grad_norm": 0.11110862344503403, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 184250 + }, + { + "epoch": 0.7122976295402885, + "grad_norm": 0.1000390499830246, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 184260 + }, + { + "epoch": 0.7123362867436718, + "grad_norm": 0.0991111770272255, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 184270 + }, + { + "epoch": 0.7123749439470551, + "grad_norm": 0.11547648161649704, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 184280 + }, + { + "epoch": 0.7124136011504384, + "grad_norm": 0.1573108732700348, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 184290 + }, + { + "epoch": 0.7124522583538216, + "grad_norm": 0.10133729130029678, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 184300 + }, + { + "epoch": 0.7124909155572049, + "grad_norm": 0.10251521319150925, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 184310 + }, + { + "epoch": 0.7125295727605883, + "grad_norm": 0.08018740266561508, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 184320 + }, + { + "epoch": 0.7125682299639715, + "grad_norm": 0.12203532457351685, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 184330 + }, + { + "epoch": 0.7126068871673548, + "grad_norm": 0.23121006786823273, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 184340 + }, + { + "epoch": 0.712645544370738, + "grad_norm": 0.11483977735042572, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 184350 + }, + { + "epoch": 0.7126842015741213, + "grad_norm": 0.11053165048360825, + "learning_rate": 0.002, + "loss": 2.345, + "step": 184360 + }, + { + "epoch": 0.7127228587775046, + "grad_norm": 0.14932291209697723, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 184370 + }, + { + "epoch": 0.7127615159808879, + "grad_norm": 0.09915328025817871, + "learning_rate": 0.002, + "loss": 2.3129, + "step": 184380 + }, + { + "epoch": 0.7128001731842711, + "grad_norm": 0.10788097977638245, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 184390 + }, + { + "epoch": 0.7128388303876544, + "grad_norm": 0.11338665336370468, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 184400 + }, + { + "epoch": 0.7128774875910378, + "grad_norm": 0.11638886481523514, + "learning_rate": 0.002, + "loss": 2.35, + "step": 184410 + }, + { + "epoch": 0.712916144794421, + "grad_norm": 0.10881996154785156, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 184420 + }, + { + "epoch": 0.7129548019978043, + "grad_norm": 0.1067526638507843, + "learning_rate": 0.002, + "loss": 2.333, + "step": 184430 + }, + { + "epoch": 0.7129934592011875, + "grad_norm": 0.09515440464019775, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 184440 + }, + { + "epoch": 0.7130321164045709, + "grad_norm": 0.09346569329500198, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 184450 + }, + { + "epoch": 0.7130707736079541, + "grad_norm": 0.10614271461963654, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 184460 + }, + { + "epoch": 0.7131094308113374, + "grad_norm": 0.10147388279438019, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 184470 + }, + { + "epoch": 0.7131480880147206, + "grad_norm": 0.1059550866484642, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 184480 + }, + { + "epoch": 0.713186745218104, + "grad_norm": 0.11526099592447281, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 184490 + }, + { + "epoch": 0.7132254024214872, + "grad_norm": 0.10168834030628204, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 184500 + }, + { + "epoch": 0.7132640596248705, + "grad_norm": 0.10906929522752762, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 184510 + }, + { + "epoch": 0.7133027168282537, + "grad_norm": 0.10011370480060577, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 184520 + }, + { + "epoch": 0.7133413740316371, + "grad_norm": 0.1059103012084961, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 184530 + }, + { + "epoch": 0.7133800312350204, + "grad_norm": 0.09475652873516083, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 184540 + }, + { + "epoch": 0.7134186884384036, + "grad_norm": 0.10671989619731903, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 184550 + }, + { + "epoch": 0.7134573456417869, + "grad_norm": 0.12009799480438232, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 184560 + }, + { + "epoch": 0.7134960028451701, + "grad_norm": 0.0910855382680893, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 184570 + }, + { + "epoch": 0.7135346600485535, + "grad_norm": 0.11664850264787674, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 184580 + }, + { + "epoch": 0.7135733172519367, + "grad_norm": 0.10083947330713272, + "learning_rate": 0.002, + "loss": 2.337, + "step": 184590 + }, + { + "epoch": 0.71361197445532, + "grad_norm": 0.11706222593784332, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 184600 + }, + { + "epoch": 0.7136506316587032, + "grad_norm": 0.11299701035022736, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 184610 + }, + { + "epoch": 0.7136892888620866, + "grad_norm": 0.09368452429771423, + "learning_rate": 0.002, + "loss": 2.348, + "step": 184620 + }, + { + "epoch": 0.7137279460654699, + "grad_norm": 0.11548180878162384, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 184630 + }, + { + "epoch": 0.7137666032688531, + "grad_norm": 0.09980639070272446, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 184640 + }, + { + "epoch": 0.7138052604722364, + "grad_norm": 0.1116255670785904, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 184650 + }, + { + "epoch": 0.7138439176756197, + "grad_norm": 0.08737179636955261, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 184660 + }, + { + "epoch": 0.713882574879003, + "grad_norm": 0.12267066538333893, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 184670 + }, + { + "epoch": 0.7139212320823862, + "grad_norm": 0.10829506069421768, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 184680 + }, + { + "epoch": 0.7139598892857695, + "grad_norm": 0.09659047424793243, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 184690 + }, + { + "epoch": 0.7139985464891528, + "grad_norm": 0.09008140116930008, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 184700 + }, + { + "epoch": 0.7140372036925361, + "grad_norm": 0.14132778346538544, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 184710 + }, + { + "epoch": 0.7140758608959193, + "grad_norm": 0.10938479751348495, + "learning_rate": 0.002, + "loss": 2.349, + "step": 184720 + }, + { + "epoch": 0.7141145180993026, + "grad_norm": 0.10237931460142136, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 184730 + }, + { + "epoch": 0.7141531753026858, + "grad_norm": 0.09910931438207626, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 184740 + }, + { + "epoch": 0.7141918325060692, + "grad_norm": 0.09198702871799469, + "learning_rate": 0.002, + "loss": 2.341, + "step": 184750 + }, + { + "epoch": 0.7142304897094525, + "grad_norm": 0.11978321522474289, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 184760 + }, + { + "epoch": 0.7142691469128357, + "grad_norm": 0.09299996495246887, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 184770 + }, + { + "epoch": 0.714307804116219, + "grad_norm": 0.09913870692253113, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 184780 + }, + { + "epoch": 0.7143464613196023, + "grad_norm": 0.11584877967834473, + "learning_rate": 0.002, + "loss": 2.3653, + "step": 184790 + }, + { + "epoch": 0.7143851185229856, + "grad_norm": 0.12831422686576843, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 184800 + }, + { + "epoch": 0.7144237757263688, + "grad_norm": 0.10814099758863449, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 184810 + }, + { + "epoch": 0.7144624329297521, + "grad_norm": 0.10544437915086746, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 184820 + }, + { + "epoch": 0.7145010901331355, + "grad_norm": 0.09678446501493454, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 184830 + }, + { + "epoch": 0.7145397473365187, + "grad_norm": 0.10445859283208847, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 184840 + }, + { + "epoch": 0.714578404539902, + "grad_norm": 0.10959655791521072, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 184850 + }, + { + "epoch": 0.7146170617432852, + "grad_norm": 0.09721376746892929, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 184860 + }, + { + "epoch": 0.7146557189466686, + "grad_norm": 0.10366586595773697, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 184870 + }, + { + "epoch": 0.7146943761500518, + "grad_norm": 0.10293979197740555, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 184880 + }, + { + "epoch": 0.7147330333534351, + "grad_norm": 0.12619462609291077, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 184890 + }, + { + "epoch": 0.7147716905568183, + "grad_norm": 0.08829128742218018, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 184900 + }, + { + "epoch": 0.7148103477602016, + "grad_norm": 0.10005389899015427, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 184910 + }, + { + "epoch": 0.7148490049635849, + "grad_norm": 0.14041490852832794, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 184920 + }, + { + "epoch": 0.7148876621669682, + "grad_norm": 0.10810524225234985, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 184930 + }, + { + "epoch": 0.7149263193703514, + "grad_norm": 0.0931611880660057, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 184940 + }, + { + "epoch": 0.7149649765737347, + "grad_norm": 0.11990271508693695, + "learning_rate": 0.002, + "loss": 2.328, + "step": 184950 + }, + { + "epoch": 0.7150036337771181, + "grad_norm": 0.10736245661973953, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 184960 + }, + { + "epoch": 0.7150422909805013, + "grad_norm": 0.10061245411634445, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 184970 + }, + { + "epoch": 0.7150809481838846, + "grad_norm": 0.10759148746728897, + "learning_rate": 0.002, + "loss": 2.332, + "step": 184980 + }, + { + "epoch": 0.7151196053872678, + "grad_norm": 0.11481016129255295, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 184990 + }, + { + "epoch": 0.7151582625906512, + "grad_norm": 0.09845632314682007, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 185000 + }, + { + "epoch": 0.7151969197940344, + "grad_norm": 0.11303407698869705, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 185010 + }, + { + "epoch": 0.7152355769974177, + "grad_norm": 0.09801331162452698, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 185020 + }, + { + "epoch": 0.7152742342008009, + "grad_norm": 0.10794106125831604, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 185030 + }, + { + "epoch": 0.7153128914041843, + "grad_norm": 0.10937917232513428, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 185040 + }, + { + "epoch": 0.7153515486075676, + "grad_norm": 0.1305137276649475, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 185050 + }, + { + "epoch": 0.7153902058109508, + "grad_norm": 0.11963261663913727, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 185060 + }, + { + "epoch": 0.7154288630143341, + "grad_norm": 0.4527278244495392, + "learning_rate": 0.002, + "loss": 2.371, + "step": 185070 + }, + { + "epoch": 0.7154675202177174, + "grad_norm": 0.107776939868927, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 185080 + }, + { + "epoch": 0.7155061774211007, + "grad_norm": 0.1005530133843422, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 185090 + }, + { + "epoch": 0.7155448346244839, + "grad_norm": 0.11894986033439636, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 185100 + }, + { + "epoch": 0.7155834918278672, + "grad_norm": 0.09806102514266968, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 185110 + }, + { + "epoch": 0.7156221490312504, + "grad_norm": 0.09273909777402878, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 185120 + }, + { + "epoch": 0.7156608062346338, + "grad_norm": 0.14330172538757324, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 185130 + }, + { + "epoch": 0.715699463438017, + "grad_norm": 0.0993657335639, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 185140 + }, + { + "epoch": 0.7157381206414003, + "grad_norm": 0.08665373176336288, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 185150 + }, + { + "epoch": 0.7157767778447836, + "grad_norm": 0.10238996893167496, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 185160 + }, + { + "epoch": 0.7158154350481669, + "grad_norm": 0.11059210449457169, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 185170 + }, + { + "epoch": 0.7158540922515502, + "grad_norm": 0.1170814260840416, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 185180 + }, + { + "epoch": 0.7158927494549334, + "grad_norm": 0.0992651954293251, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 185190 + }, + { + "epoch": 0.7159314066583167, + "grad_norm": 0.10449914634227753, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 185200 + }, + { + "epoch": 0.7159700638617, + "grad_norm": 0.10232613235712051, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 185210 + }, + { + "epoch": 0.7160087210650833, + "grad_norm": 0.09571287781000137, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 185220 + }, + { + "epoch": 0.7160473782684665, + "grad_norm": 0.0938621312379837, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 185230 + }, + { + "epoch": 0.7160860354718498, + "grad_norm": 0.10125449299812317, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 185240 + }, + { + "epoch": 0.7161246926752332, + "grad_norm": 0.12445887178182602, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 185250 + }, + { + "epoch": 0.7161633498786164, + "grad_norm": 0.11819303780794144, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 185260 + }, + { + "epoch": 0.7162020070819997, + "grad_norm": 0.11657149344682693, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 185270 + }, + { + "epoch": 0.7162406642853829, + "grad_norm": 0.09366388618946075, + "learning_rate": 0.002, + "loss": 2.338, + "step": 185280 + }, + { + "epoch": 0.7162793214887662, + "grad_norm": 0.0998849868774414, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 185290 + }, + { + "epoch": 0.7163179786921495, + "grad_norm": 0.1090717613697052, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 185300 + }, + { + "epoch": 0.7163566358955328, + "grad_norm": 0.10590784251689911, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 185310 + }, + { + "epoch": 0.716395293098916, + "grad_norm": 0.10902975499629974, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 185320 + }, + { + "epoch": 0.7164339503022993, + "grad_norm": 0.11425049602985382, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 185330 + }, + { + "epoch": 0.7164726075056826, + "grad_norm": 0.104640431702137, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 185340 + }, + { + "epoch": 0.7165112647090659, + "grad_norm": 0.10949306190013885, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 185350 + }, + { + "epoch": 0.7165499219124492, + "grad_norm": 0.09471194446086884, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 185360 + }, + { + "epoch": 0.7165885791158324, + "grad_norm": 0.10628344863653183, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 185370 + }, + { + "epoch": 0.7166272363192158, + "grad_norm": 0.10102435946464539, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 185380 + }, + { + "epoch": 0.716665893522599, + "grad_norm": 0.09791155159473419, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 185390 + }, + { + "epoch": 0.7167045507259823, + "grad_norm": 0.0973888710141182, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 185400 + }, + { + "epoch": 0.7167432079293655, + "grad_norm": 0.1219591274857521, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 185410 + }, + { + "epoch": 0.7167818651327489, + "grad_norm": 0.09689916670322418, + "learning_rate": 0.002, + "loss": 2.326, + "step": 185420 + }, + { + "epoch": 0.7168205223361321, + "grad_norm": 0.10497520864009857, + "learning_rate": 0.002, + "loss": 2.325, + "step": 185430 + }, + { + "epoch": 0.7168591795395154, + "grad_norm": 0.10271918773651123, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 185440 + }, + { + "epoch": 0.7168978367428986, + "grad_norm": 0.11452703922986984, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 185450 + }, + { + "epoch": 0.716936493946282, + "grad_norm": 0.10196271538734436, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 185460 + }, + { + "epoch": 0.7169751511496653, + "grad_norm": 0.1002076268196106, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 185470 + }, + { + "epoch": 0.7170138083530485, + "grad_norm": 0.11561955511569977, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 185480 + }, + { + "epoch": 0.7170524655564318, + "grad_norm": 0.09349963814020157, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 185490 + }, + { + "epoch": 0.717091122759815, + "grad_norm": 0.11077411472797394, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 185500 + }, + { + "epoch": 0.7171297799631984, + "grad_norm": 0.10149725526571274, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 185510 + }, + { + "epoch": 0.7171684371665816, + "grad_norm": 0.09096281230449677, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 185520 + }, + { + "epoch": 0.7172070943699649, + "grad_norm": 0.1042969599366188, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 185530 + }, + { + "epoch": 0.7172457515733481, + "grad_norm": 0.13028311729431152, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 185540 + }, + { + "epoch": 0.7172844087767315, + "grad_norm": 0.10507809370756149, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 185550 + }, + { + "epoch": 0.7173230659801147, + "grad_norm": 0.09843814373016357, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 185560 + }, + { + "epoch": 0.717361723183498, + "grad_norm": 0.0921286940574646, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 185570 + }, + { + "epoch": 0.7174003803868813, + "grad_norm": 0.09608946740627289, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 185580 + }, + { + "epoch": 0.7174390375902646, + "grad_norm": 0.12892523407936096, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 185590 + }, + { + "epoch": 0.7174776947936479, + "grad_norm": 0.09652720391750336, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 185600 + }, + { + "epoch": 0.7175163519970311, + "grad_norm": 0.09985087811946869, + "learning_rate": 0.002, + "loss": 2.3603, + "step": 185610 + }, + { + "epoch": 0.7175550092004144, + "grad_norm": 0.12451973557472229, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 185620 + }, + { + "epoch": 0.7175936664037977, + "grad_norm": 0.09739361703395844, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 185630 + }, + { + "epoch": 0.717632323607181, + "grad_norm": 0.10199489444494247, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 185640 + }, + { + "epoch": 0.7176709808105642, + "grad_norm": 0.10813910514116287, + "learning_rate": 0.002, + "loss": 2.347, + "step": 185650 + }, + { + "epoch": 0.7177096380139475, + "grad_norm": 0.11378483474254608, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 185660 + }, + { + "epoch": 0.7177482952173307, + "grad_norm": 0.09850470721721649, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 185670 + }, + { + "epoch": 0.7177869524207141, + "grad_norm": 0.10502240061759949, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 185680 + }, + { + "epoch": 0.7178256096240974, + "grad_norm": 0.12115588039159775, + "learning_rate": 0.002, + "loss": 2.337, + "step": 185690 + }, + { + "epoch": 0.7178642668274806, + "grad_norm": 0.11206604540348053, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 185700 + }, + { + "epoch": 0.7179029240308639, + "grad_norm": 0.09093813598155975, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 185710 + }, + { + "epoch": 0.7179415812342472, + "grad_norm": 0.10363386571407318, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 185720 + }, + { + "epoch": 0.7179802384376305, + "grad_norm": 0.10542728006839752, + "learning_rate": 0.002, + "loss": 2.349, + "step": 185730 + }, + { + "epoch": 0.7180188956410137, + "grad_norm": 0.10961460322141647, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 185740 + }, + { + "epoch": 0.718057552844397, + "grad_norm": 0.14820969104766846, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 185750 + }, + { + "epoch": 0.7180962100477803, + "grad_norm": 0.1017719954252243, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 185760 + }, + { + "epoch": 0.7181348672511636, + "grad_norm": 0.10929939150810242, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 185770 + }, + { + "epoch": 0.7181735244545469, + "grad_norm": 0.10564550757408142, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 185780 + }, + { + "epoch": 0.7182121816579301, + "grad_norm": 0.08694026619195938, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 185790 + }, + { + "epoch": 0.7182508388613135, + "grad_norm": 0.10610771179199219, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 185800 + }, + { + "epoch": 0.7182894960646967, + "grad_norm": 0.10886865109205246, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 185810 + }, + { + "epoch": 0.71832815326808, + "grad_norm": 0.11758004128932953, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 185820 + }, + { + "epoch": 0.7183668104714632, + "grad_norm": 0.10590004920959473, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 185830 + }, + { + "epoch": 0.7184054676748465, + "grad_norm": 0.0975596159696579, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 185840 + }, + { + "epoch": 0.7184441248782298, + "grad_norm": 0.09157725423574448, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 185850 + }, + { + "epoch": 0.7184827820816131, + "grad_norm": 0.126144677400589, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 185860 + }, + { + "epoch": 0.7185214392849963, + "grad_norm": 0.10521572083234787, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 185870 + }, + { + "epoch": 0.7185600964883796, + "grad_norm": 0.10988739877939224, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 185880 + }, + { + "epoch": 0.718598753691763, + "grad_norm": 0.10514355450868607, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 185890 + }, + { + "epoch": 0.7186374108951462, + "grad_norm": 0.11994270235300064, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 185900 + }, + { + "epoch": 0.7186760680985295, + "grad_norm": 0.11317486315965652, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 185910 + }, + { + "epoch": 0.7187147253019127, + "grad_norm": 0.11995750665664673, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 185920 + }, + { + "epoch": 0.7187533825052961, + "grad_norm": 0.09820850938558578, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 185930 + }, + { + "epoch": 0.7187920397086793, + "grad_norm": 0.10075823217630386, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 185940 + }, + { + "epoch": 0.7188306969120626, + "grad_norm": 0.10743725299835205, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 185950 + }, + { + "epoch": 0.7188693541154458, + "grad_norm": 0.10513687133789062, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 185960 + }, + { + "epoch": 0.7189080113188292, + "grad_norm": 0.138917937874794, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 185970 + }, + { + "epoch": 0.7189466685222125, + "grad_norm": 0.11696998029947281, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 185980 + }, + { + "epoch": 0.7189853257255957, + "grad_norm": 0.09162256121635437, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 185990 + }, + { + "epoch": 0.719023982928979, + "grad_norm": 0.11939941346645355, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 186000 + }, + { + "epoch": 0.7190626401323623, + "grad_norm": 0.0931636318564415, + "learning_rate": 0.002, + "loss": 2.349, + "step": 186010 + }, + { + "epoch": 0.7191012973357456, + "grad_norm": 0.11141252517700195, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 186020 + }, + { + "epoch": 0.7191399545391288, + "grad_norm": 0.10563148558139801, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 186030 + }, + { + "epoch": 0.7191786117425121, + "grad_norm": 0.1159094050526619, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 186040 + }, + { + "epoch": 0.7192172689458953, + "grad_norm": 0.11379499733448029, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 186050 + }, + { + "epoch": 0.7192559261492787, + "grad_norm": 0.0985812321305275, + "learning_rate": 0.002, + "loss": 2.349, + "step": 186060 + }, + { + "epoch": 0.7192945833526619, + "grad_norm": 0.12417516112327576, + "learning_rate": 0.002, + "loss": 2.323, + "step": 186070 + }, + { + "epoch": 0.7193332405560452, + "grad_norm": 0.10407594591379166, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 186080 + }, + { + "epoch": 0.7193718977594284, + "grad_norm": 0.09413602948188782, + "learning_rate": 0.002, + "loss": 2.337, + "step": 186090 + }, + { + "epoch": 0.7194105549628118, + "grad_norm": 0.10107819736003876, + "learning_rate": 0.002, + "loss": 2.34, + "step": 186100 + }, + { + "epoch": 0.7194492121661951, + "grad_norm": 0.11445695906877518, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 186110 + }, + { + "epoch": 0.7194878693695783, + "grad_norm": 0.09609300643205643, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 186120 + }, + { + "epoch": 0.7195265265729616, + "grad_norm": 0.10091094672679901, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 186130 + }, + { + "epoch": 0.7195651837763449, + "grad_norm": 0.09473997354507446, + "learning_rate": 0.002, + "loss": 2.341, + "step": 186140 + }, + { + "epoch": 0.7196038409797282, + "grad_norm": 0.1108432486653328, + "learning_rate": 0.002, + "loss": 2.347, + "step": 186150 + }, + { + "epoch": 0.7196424981831114, + "grad_norm": 0.10067521035671234, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 186160 + }, + { + "epoch": 0.7196811553864947, + "grad_norm": 0.09316106885671616, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 186170 + }, + { + "epoch": 0.719719812589878, + "grad_norm": 0.10489480942487717, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 186180 + }, + { + "epoch": 0.7197584697932613, + "grad_norm": 0.08913571387529373, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 186190 + }, + { + "epoch": 0.7197971269966446, + "grad_norm": 0.10339523106813431, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 186200 + }, + { + "epoch": 0.7198357842000278, + "grad_norm": 0.11942090839147568, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 186210 + }, + { + "epoch": 0.7198744414034111, + "grad_norm": 0.10357536375522614, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 186220 + }, + { + "epoch": 0.7199130986067944, + "grad_norm": 0.09975355863571167, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 186230 + }, + { + "epoch": 0.7199517558101777, + "grad_norm": 0.09567605704069138, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 186240 + }, + { + "epoch": 0.7199904130135609, + "grad_norm": 0.09296274930238724, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 186250 + }, + { + "epoch": 0.7200290702169442, + "grad_norm": 0.0937802791595459, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 186260 + }, + { + "epoch": 0.7200677274203275, + "grad_norm": 0.08700099587440491, + "learning_rate": 0.002, + "loss": 2.346, + "step": 186270 + }, + { + "epoch": 0.7201063846237108, + "grad_norm": 0.10534818470478058, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 186280 + }, + { + "epoch": 0.720145041827094, + "grad_norm": 0.09784476459026337, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 186290 + }, + { + "epoch": 0.7201836990304773, + "grad_norm": 0.14407220482826233, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 186300 + }, + { + "epoch": 0.7202223562338607, + "grad_norm": 0.10064949095249176, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 186310 + }, + { + "epoch": 0.7202610134372439, + "grad_norm": 0.10450860857963562, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 186320 + }, + { + "epoch": 0.7202996706406272, + "grad_norm": 0.10671655088663101, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 186330 + }, + { + "epoch": 0.7203383278440104, + "grad_norm": 0.10465175658464432, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 186340 + }, + { + "epoch": 0.7203769850473938, + "grad_norm": 0.10027614235877991, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 186350 + }, + { + "epoch": 0.720415642250777, + "grad_norm": 0.11972982436418533, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 186360 + }, + { + "epoch": 0.7204542994541603, + "grad_norm": 0.10320338606834412, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 186370 + }, + { + "epoch": 0.7204929566575435, + "grad_norm": 0.10482943803071976, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 186380 + }, + { + "epoch": 0.7205316138609269, + "grad_norm": 0.10837873071432114, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 186390 + }, + { + "epoch": 0.7205702710643102, + "grad_norm": 0.15739254653453827, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 186400 + }, + { + "epoch": 0.7206089282676934, + "grad_norm": 0.11209744960069656, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 186410 + }, + { + "epoch": 0.7206475854710767, + "grad_norm": 0.08895806223154068, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 186420 + }, + { + "epoch": 0.7206862426744599, + "grad_norm": 0.09714076668024063, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 186430 + }, + { + "epoch": 0.7207248998778433, + "grad_norm": 0.10694416612386703, + "learning_rate": 0.002, + "loss": 2.336, + "step": 186440 + }, + { + "epoch": 0.7207635570812265, + "grad_norm": 0.103749580681324, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 186450 + }, + { + "epoch": 0.7208022142846098, + "grad_norm": 0.11582319438457489, + "learning_rate": 0.002, + "loss": 2.346, + "step": 186460 + }, + { + "epoch": 0.720840871487993, + "grad_norm": 0.09324432909488678, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 186470 + }, + { + "epoch": 0.7208795286913764, + "grad_norm": 0.1003182902932167, + "learning_rate": 0.002, + "loss": 2.352, + "step": 186480 + }, + { + "epoch": 0.7209181858947596, + "grad_norm": 0.1040387973189354, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 186490 + }, + { + "epoch": 0.7209568430981429, + "grad_norm": 0.10195592045783997, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 186500 + }, + { + "epoch": 0.7209955003015261, + "grad_norm": 0.09478186070919037, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 186510 + }, + { + "epoch": 0.7210341575049095, + "grad_norm": 0.1099415048956871, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 186520 + }, + { + "epoch": 0.7210728147082928, + "grad_norm": 0.09218371659517288, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 186530 + }, + { + "epoch": 0.721111471911676, + "grad_norm": 0.09654217213392258, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 186540 + }, + { + "epoch": 0.7211501291150593, + "grad_norm": 0.10185280442237854, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 186550 + }, + { + "epoch": 0.7211887863184426, + "grad_norm": 0.09472206234931946, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 186560 + }, + { + "epoch": 0.7212274435218259, + "grad_norm": 0.11460986733436584, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 186570 + }, + { + "epoch": 0.7212661007252091, + "grad_norm": 0.09128446131944656, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 186580 + }, + { + "epoch": 0.7213047579285924, + "grad_norm": 0.10730069130659103, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 186590 + }, + { + "epoch": 0.7213434151319756, + "grad_norm": 0.10237306356430054, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 186600 + }, + { + "epoch": 0.721382072335359, + "grad_norm": 0.09643815457820892, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 186610 + }, + { + "epoch": 0.7214207295387423, + "grad_norm": 0.11439349502325058, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 186620 + }, + { + "epoch": 0.7214593867421255, + "grad_norm": 0.1082044318318367, + "learning_rate": 0.002, + "loss": 2.343, + "step": 186630 + }, + { + "epoch": 0.7214980439455088, + "grad_norm": 0.10950101912021637, + "learning_rate": 0.002, + "loss": 2.35, + "step": 186640 + }, + { + "epoch": 0.7215367011488921, + "grad_norm": 0.10193609446287155, + "learning_rate": 0.002, + "loss": 2.342, + "step": 186650 + }, + { + "epoch": 0.7215753583522754, + "grad_norm": 0.11126836389303207, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 186660 + }, + { + "epoch": 0.7216140155556586, + "grad_norm": 0.12311969697475433, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 186670 + }, + { + "epoch": 0.7216526727590419, + "grad_norm": 0.10100258886814117, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 186680 + }, + { + "epoch": 0.7216913299624252, + "grad_norm": 0.10313405841588974, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 186690 + }, + { + "epoch": 0.7217299871658085, + "grad_norm": 0.10727465897798538, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 186700 + }, + { + "epoch": 0.7217686443691917, + "grad_norm": 0.09539248794317245, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 186710 + }, + { + "epoch": 0.721807301572575, + "grad_norm": 0.11575020104646683, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 186720 + }, + { + "epoch": 0.7218459587759584, + "grad_norm": 0.10815134644508362, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 186730 + }, + { + "epoch": 0.7218846159793416, + "grad_norm": 0.11231502145528793, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 186740 + }, + { + "epoch": 0.7219232731827249, + "grad_norm": 0.12856830656528473, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 186750 + }, + { + "epoch": 0.7219619303861081, + "grad_norm": 0.10370416939258575, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 186760 + }, + { + "epoch": 0.7220005875894914, + "grad_norm": 0.11301801353693008, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 186770 + }, + { + "epoch": 0.7220392447928747, + "grad_norm": 0.10336112976074219, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 186780 + }, + { + "epoch": 0.722077901996258, + "grad_norm": 0.09517168253660202, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 186790 + }, + { + "epoch": 0.7221165591996412, + "grad_norm": 0.09284123033285141, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 186800 + }, + { + "epoch": 0.7221552164030245, + "grad_norm": 0.10720662027597427, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 186810 + }, + { + "epoch": 0.7221938736064079, + "grad_norm": 0.09715081751346588, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 186820 + }, + { + "epoch": 0.7222325308097911, + "grad_norm": 0.09416729211807251, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 186830 + }, + { + "epoch": 0.7222711880131744, + "grad_norm": 0.12686994671821594, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 186840 + }, + { + "epoch": 0.7223098452165576, + "grad_norm": 0.08926520496606827, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 186850 + }, + { + "epoch": 0.722348502419941, + "grad_norm": 0.11334452778100967, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 186860 + }, + { + "epoch": 0.7223871596233242, + "grad_norm": 0.11346606910228729, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 186870 + }, + { + "epoch": 0.7224258168267075, + "grad_norm": 0.11881070584058762, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 186880 + }, + { + "epoch": 0.7224644740300907, + "grad_norm": 0.11092095822095871, + "learning_rate": 0.002, + "loss": 2.333, + "step": 186890 + }, + { + "epoch": 0.7225031312334741, + "grad_norm": 0.09544173628091812, + "learning_rate": 0.002, + "loss": 2.348, + "step": 186900 + }, + { + "epoch": 0.7225417884368573, + "grad_norm": 0.12076514959335327, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 186910 + }, + { + "epoch": 0.7225804456402406, + "grad_norm": 0.10290089249610901, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 186920 + }, + { + "epoch": 0.7226191028436239, + "grad_norm": 0.12027185410261154, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 186930 + }, + { + "epoch": 0.7226577600470072, + "grad_norm": 0.09199009835720062, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 186940 + }, + { + "epoch": 0.7226964172503905, + "grad_norm": 0.10237187892198563, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 186950 + }, + { + "epoch": 0.7227350744537737, + "grad_norm": 0.09242359548807144, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 186960 + }, + { + "epoch": 0.722773731657157, + "grad_norm": 0.09228871017694473, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 186970 + }, + { + "epoch": 0.7228123888605402, + "grad_norm": 0.10070665925741196, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 186980 + }, + { + "epoch": 0.7228510460639236, + "grad_norm": 0.12258857488632202, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 186990 + }, + { + "epoch": 0.7228897032673068, + "grad_norm": 0.11036261171102524, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 187000 + }, + { + "epoch": 0.7229283604706901, + "grad_norm": 0.10454872995615005, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 187010 + }, + { + "epoch": 0.7229670176740733, + "grad_norm": 0.1144997626543045, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 187020 + }, + { + "epoch": 0.7230056748774567, + "grad_norm": 0.11421671509742737, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 187030 + }, + { + "epoch": 0.72304433208084, + "grad_norm": 0.11393462866544724, + "learning_rate": 0.002, + "loss": 2.341, + "step": 187040 + }, + { + "epoch": 0.7230829892842232, + "grad_norm": 0.0887039452791214, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 187050 + }, + { + "epoch": 0.7231216464876065, + "grad_norm": 0.13317154347896576, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 187060 + }, + { + "epoch": 0.7231603036909898, + "grad_norm": 0.09806982427835464, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 187070 + }, + { + "epoch": 0.7231989608943731, + "grad_norm": 0.1163717657327652, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 187080 + }, + { + "epoch": 0.7232376180977563, + "grad_norm": 0.101486437022686, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 187090 + }, + { + "epoch": 0.7232762753011396, + "grad_norm": 0.12469431012868881, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 187100 + }, + { + "epoch": 0.723314932504523, + "grad_norm": 0.10116591304540634, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 187110 + }, + { + "epoch": 0.7233535897079062, + "grad_norm": 0.10730016231536865, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 187120 + }, + { + "epoch": 0.7233922469112894, + "grad_norm": 0.10383022576570511, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 187130 + }, + { + "epoch": 0.7234309041146727, + "grad_norm": 0.09397551417350769, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 187140 + }, + { + "epoch": 0.723469561318056, + "grad_norm": 0.10737968981266022, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 187150 + }, + { + "epoch": 0.7235082185214393, + "grad_norm": 0.10347533971071243, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 187160 + }, + { + "epoch": 0.7235468757248226, + "grad_norm": 0.13405457139015198, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 187170 + }, + { + "epoch": 0.7235855329282058, + "grad_norm": 0.09673825651407242, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 187180 + }, + { + "epoch": 0.7236241901315891, + "grad_norm": 0.12408943474292755, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 187190 + }, + { + "epoch": 0.7236628473349724, + "grad_norm": 0.12734411656856537, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 187200 + }, + { + "epoch": 0.7237015045383557, + "grad_norm": 0.09845370799303055, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 187210 + }, + { + "epoch": 0.7237401617417389, + "grad_norm": 0.1021999716758728, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 187220 + }, + { + "epoch": 0.7237788189451222, + "grad_norm": 0.10909496247768402, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 187230 + }, + { + "epoch": 0.7238174761485056, + "grad_norm": 0.10415089130401611, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 187240 + }, + { + "epoch": 0.7238561333518888, + "grad_norm": 0.10948038846254349, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 187250 + }, + { + "epoch": 0.7238947905552721, + "grad_norm": 0.10909941792488098, + "learning_rate": 0.002, + "loss": 2.333, + "step": 187260 + }, + { + "epoch": 0.7239334477586553, + "grad_norm": 0.10298583656549454, + "learning_rate": 0.002, + "loss": 2.346, + "step": 187270 + }, + { + "epoch": 0.7239721049620387, + "grad_norm": 0.09963478147983551, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 187280 + }, + { + "epoch": 0.7240107621654219, + "grad_norm": 0.09870238602161407, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 187290 + }, + { + "epoch": 0.7240494193688052, + "grad_norm": 0.11888501793146133, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 187300 + }, + { + "epoch": 0.7240880765721884, + "grad_norm": 0.10366848856210709, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 187310 + }, + { + "epoch": 0.7241267337755717, + "grad_norm": 0.10529571026563644, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 187320 + }, + { + "epoch": 0.724165390978955, + "grad_norm": 0.145135298371315, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 187330 + }, + { + "epoch": 0.7242040481823383, + "grad_norm": 0.09621170163154602, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 187340 + }, + { + "epoch": 0.7242427053857216, + "grad_norm": 0.10496283322572708, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 187350 + }, + { + "epoch": 0.7242813625891048, + "grad_norm": 0.0986151173710823, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 187360 + }, + { + "epoch": 0.7243200197924882, + "grad_norm": 0.09692153334617615, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 187370 + }, + { + "epoch": 0.7243586769958714, + "grad_norm": 0.11592217534780502, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 187380 + }, + { + "epoch": 0.7243973341992547, + "grad_norm": 0.09770040214061737, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 187390 + }, + { + "epoch": 0.7244359914026379, + "grad_norm": 0.0982050821185112, + "learning_rate": 0.002, + "loss": 2.344, + "step": 187400 + }, + { + "epoch": 0.7244746486060213, + "grad_norm": 0.0895521268248558, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 187410 + }, + { + "epoch": 0.7245133058094045, + "grad_norm": 0.11532026529312134, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 187420 + }, + { + "epoch": 0.7245519630127878, + "grad_norm": 0.10454227775335312, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 187430 + }, + { + "epoch": 0.724590620216171, + "grad_norm": 0.10045175999403, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 187440 + }, + { + "epoch": 0.7246292774195544, + "grad_norm": 0.09500908106565475, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 187450 + }, + { + "epoch": 0.7246679346229377, + "grad_norm": 0.10398609936237335, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 187460 + }, + { + "epoch": 0.7247065918263209, + "grad_norm": 0.10558445006608963, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 187470 + }, + { + "epoch": 0.7247452490297042, + "grad_norm": 0.10564364492893219, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 187480 + }, + { + "epoch": 0.7247839062330875, + "grad_norm": 0.10846003144979477, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 187490 + }, + { + "epoch": 0.7248225634364708, + "grad_norm": 0.13258378207683563, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 187500 + }, + { + "epoch": 0.724861220639854, + "grad_norm": 0.11121944338083267, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 187510 + }, + { + "epoch": 0.7248998778432373, + "grad_norm": 0.09945479035377502, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 187520 + }, + { + "epoch": 0.7249385350466205, + "grad_norm": 0.11722530424594879, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 187530 + }, + { + "epoch": 0.7249771922500039, + "grad_norm": 0.10056839138269424, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 187540 + }, + { + "epoch": 0.7250158494533872, + "grad_norm": 0.09518096596002579, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 187550 + }, + { + "epoch": 0.7250545066567704, + "grad_norm": 0.10595232993364334, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 187560 + }, + { + "epoch": 0.7250931638601537, + "grad_norm": 0.09073364734649658, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 187570 + }, + { + "epoch": 0.725131821063537, + "grad_norm": 0.11355112493038177, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 187580 + }, + { + "epoch": 0.7251704782669203, + "grad_norm": 0.09228280931711197, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 187590 + }, + { + "epoch": 0.7252091354703035, + "grad_norm": 0.11262501776218414, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 187600 + }, + { + "epoch": 0.7252477926736868, + "grad_norm": 0.10293961316347122, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 187610 + }, + { + "epoch": 0.7252864498770701, + "grad_norm": 0.12528426945209503, + "learning_rate": 0.002, + "loss": 2.326, + "step": 187620 + }, + { + "epoch": 0.7253251070804534, + "grad_norm": 0.13686411082744598, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 187630 + }, + { + "epoch": 0.7253637642838366, + "grad_norm": 0.09556593745946884, + "learning_rate": 0.002, + "loss": 2.338, + "step": 187640 + }, + { + "epoch": 0.7254024214872199, + "grad_norm": 0.08956517279148102, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 187650 + }, + { + "epoch": 0.7254410786906033, + "grad_norm": 0.09875398874282837, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 187660 + }, + { + "epoch": 0.7254797358939865, + "grad_norm": 0.10570773482322693, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 187670 + }, + { + "epoch": 0.7255183930973698, + "grad_norm": 0.10656732320785522, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 187680 + }, + { + "epoch": 0.725557050300753, + "grad_norm": 0.0977579653263092, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 187690 + }, + { + "epoch": 0.7255957075041363, + "grad_norm": 0.1082698181271553, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 187700 + }, + { + "epoch": 0.7256343647075196, + "grad_norm": 0.10016846656799316, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 187710 + }, + { + "epoch": 0.7256730219109029, + "grad_norm": 0.09752330929040909, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 187720 + }, + { + "epoch": 0.7257116791142861, + "grad_norm": 0.09331176429986954, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 187730 + }, + { + "epoch": 0.7257503363176694, + "grad_norm": 0.10335078835487366, + "learning_rate": 0.002, + "loss": 2.342, + "step": 187740 + }, + { + "epoch": 0.7257889935210527, + "grad_norm": 0.13783538341522217, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 187750 + }, + { + "epoch": 0.725827650724436, + "grad_norm": 0.10869824141263962, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 187760 + }, + { + "epoch": 0.7258663079278193, + "grad_norm": 0.1278955191373825, + "learning_rate": 0.002, + "loss": 2.338, + "step": 187770 + }, + { + "epoch": 0.7259049651312025, + "grad_norm": 0.1017908975481987, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 187780 + }, + { + "epoch": 0.7259436223345859, + "grad_norm": 0.09236116707324982, + "learning_rate": 0.002, + "loss": 2.325, + "step": 187790 + }, + { + "epoch": 0.7259822795379691, + "grad_norm": 0.11204037815332413, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 187800 + }, + { + "epoch": 0.7260209367413524, + "grad_norm": 0.09957336634397507, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 187810 + }, + { + "epoch": 0.7260595939447356, + "grad_norm": 0.45305535197257996, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 187820 + }, + { + "epoch": 0.726098251148119, + "grad_norm": 0.10775720328092575, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 187830 + }, + { + "epoch": 0.7261369083515022, + "grad_norm": 0.09511881321668625, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 187840 + }, + { + "epoch": 0.7261755655548855, + "grad_norm": 0.1379610151052475, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 187850 + }, + { + "epoch": 0.7262142227582687, + "grad_norm": 0.10464418679475784, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 187860 + }, + { + "epoch": 0.7262528799616521, + "grad_norm": 0.09923885762691498, + "learning_rate": 0.002, + "loss": 2.331, + "step": 187870 + }, + { + "epoch": 0.7262915371650354, + "grad_norm": 0.10207533836364746, + "learning_rate": 0.002, + "loss": 2.344, + "step": 187880 + }, + { + "epoch": 0.7263301943684186, + "grad_norm": 0.09654777497053146, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 187890 + }, + { + "epoch": 0.7263688515718019, + "grad_norm": 0.09469664841890335, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 187900 + }, + { + "epoch": 0.7264075087751851, + "grad_norm": 0.08925016224384308, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 187910 + }, + { + "epoch": 0.7264461659785685, + "grad_norm": 0.10459795594215393, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 187920 + }, + { + "epoch": 0.7264848231819517, + "grad_norm": 0.08447825908660889, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 187930 + }, + { + "epoch": 0.726523480385335, + "grad_norm": 0.11422178894281387, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 187940 + }, + { + "epoch": 0.7265621375887182, + "grad_norm": 0.11474387347698212, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 187950 + }, + { + "epoch": 0.7266007947921016, + "grad_norm": 0.09868604689836502, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 187960 + }, + { + "epoch": 0.7266394519954849, + "grad_norm": 0.11762841045856476, + "learning_rate": 0.002, + "loss": 2.353, + "step": 187970 + }, + { + "epoch": 0.7266781091988681, + "grad_norm": 0.10311324149370193, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 187980 + }, + { + "epoch": 0.7267167664022514, + "grad_norm": 0.10104495286941528, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 187990 + }, + { + "epoch": 0.7267554236056347, + "grad_norm": 0.09831292927265167, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 188000 + }, + { + "epoch": 0.726794080809018, + "grad_norm": 0.11889403313398361, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 188010 + }, + { + "epoch": 0.7268327380124012, + "grad_norm": 0.21537570655345917, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 188020 + }, + { + "epoch": 0.7268713952157845, + "grad_norm": 0.10098782926797867, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 188030 + }, + { + "epoch": 0.7269100524191678, + "grad_norm": 0.1068992167711258, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 188040 + }, + { + "epoch": 0.7269487096225511, + "grad_norm": 0.1335403174161911, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 188050 + }, + { + "epoch": 0.7269873668259343, + "grad_norm": 0.11194917559623718, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 188060 + }, + { + "epoch": 0.7270260240293176, + "grad_norm": 0.13839438557624817, + "learning_rate": 0.002, + "loss": 2.318, + "step": 188070 + }, + { + "epoch": 0.7270646812327008, + "grad_norm": 0.10070030391216278, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 188080 + }, + { + "epoch": 0.7271033384360842, + "grad_norm": 0.09300320595502853, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 188090 + }, + { + "epoch": 0.7271419956394675, + "grad_norm": 0.1040036752820015, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 188100 + }, + { + "epoch": 0.7271806528428507, + "grad_norm": 0.13322311639785767, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 188110 + }, + { + "epoch": 0.727219310046234, + "grad_norm": 0.10786911100149155, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 188120 + }, + { + "epoch": 0.7272579672496173, + "grad_norm": 0.11439685523509979, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 188130 + }, + { + "epoch": 0.7272966244530006, + "grad_norm": 0.11760654300451279, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 188140 + }, + { + "epoch": 0.7273352816563838, + "grad_norm": 0.10219842195510864, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 188150 + }, + { + "epoch": 0.7273739388597671, + "grad_norm": 0.12190880626440048, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 188160 + }, + { + "epoch": 0.7274125960631505, + "grad_norm": 0.10149639844894409, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 188170 + }, + { + "epoch": 0.7274512532665337, + "grad_norm": 0.10100218653678894, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 188180 + }, + { + "epoch": 0.727489910469917, + "grad_norm": 0.1045944094657898, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 188190 + }, + { + "epoch": 0.7275285676733002, + "grad_norm": 0.09726139903068542, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 188200 + }, + { + "epoch": 0.7275672248766836, + "grad_norm": 0.1029781699180603, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 188210 + }, + { + "epoch": 0.7276058820800668, + "grad_norm": 0.1261553019285202, + "learning_rate": 0.002, + "loss": 2.339, + "step": 188220 + }, + { + "epoch": 0.7276445392834501, + "grad_norm": 0.11521897464990616, + "learning_rate": 0.002, + "loss": 2.333, + "step": 188230 + }, + { + "epoch": 0.7276831964868333, + "grad_norm": 0.08909417688846588, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 188240 + }, + { + "epoch": 0.7277218536902166, + "grad_norm": 0.0991474986076355, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 188250 + }, + { + "epoch": 0.7277605108935999, + "grad_norm": 0.11477166414260864, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 188260 + }, + { + "epoch": 0.7277991680969832, + "grad_norm": 0.15573208034038544, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 188270 + }, + { + "epoch": 0.7278378253003664, + "grad_norm": 0.11599601060152054, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 188280 + }, + { + "epoch": 0.7278764825037497, + "grad_norm": 0.11741472035646439, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 188290 + }, + { + "epoch": 0.7279151397071331, + "grad_norm": 0.1139383614063263, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 188300 + }, + { + "epoch": 0.7279537969105163, + "grad_norm": 0.09910546988248825, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 188310 + }, + { + "epoch": 0.7279924541138996, + "grad_norm": 0.09799962490797043, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 188320 + }, + { + "epoch": 0.7280311113172828, + "grad_norm": 0.09024277329444885, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 188330 + }, + { + "epoch": 0.7280697685206662, + "grad_norm": 0.10700663924217224, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 188340 + }, + { + "epoch": 0.7281084257240494, + "grad_norm": 0.09063316136598587, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 188350 + }, + { + "epoch": 0.7281470829274327, + "grad_norm": 0.10456562787294388, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 188360 + }, + { + "epoch": 0.7281857401308159, + "grad_norm": 0.1041400134563446, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 188370 + }, + { + "epoch": 0.7282243973341993, + "grad_norm": 0.11024066060781479, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 188380 + }, + { + "epoch": 0.7282630545375826, + "grad_norm": 0.09569244086742401, + "learning_rate": 0.002, + "loss": 2.333, + "step": 188390 + }, + { + "epoch": 0.7283017117409658, + "grad_norm": 0.11875152587890625, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 188400 + }, + { + "epoch": 0.7283403689443491, + "grad_norm": 0.12720754742622375, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 188410 + }, + { + "epoch": 0.7283790261477324, + "grad_norm": 0.11065363883972168, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 188420 + }, + { + "epoch": 0.7284176833511157, + "grad_norm": 0.10257042944431305, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 188430 + }, + { + "epoch": 0.7284563405544989, + "grad_norm": 0.09267813712358475, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 188440 + }, + { + "epoch": 0.7284949977578822, + "grad_norm": 0.11144296079874039, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 188450 + }, + { + "epoch": 0.7285336549612654, + "grad_norm": 0.10019582509994507, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 188460 + }, + { + "epoch": 0.7285723121646488, + "grad_norm": 0.10946372896432877, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 188470 + }, + { + "epoch": 0.728610969368032, + "grad_norm": 0.1142217218875885, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 188480 + }, + { + "epoch": 0.7286496265714153, + "grad_norm": 0.09371957927942276, + "learning_rate": 0.002, + "loss": 2.3615, + "step": 188490 + }, + { + "epoch": 0.7286882837747986, + "grad_norm": 0.12522457540035248, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 188500 + }, + { + "epoch": 0.7287269409781819, + "grad_norm": 0.11884385347366333, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 188510 + }, + { + "epoch": 0.7287655981815652, + "grad_norm": 0.0966656431555748, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 188520 + }, + { + "epoch": 0.7288042553849484, + "grad_norm": 0.10101541131734848, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 188530 + }, + { + "epoch": 0.7288429125883317, + "grad_norm": 0.09151628613471985, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 188540 + }, + { + "epoch": 0.728881569791715, + "grad_norm": 0.10385442525148392, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 188550 + }, + { + "epoch": 0.7289202269950983, + "grad_norm": 0.10391923040151596, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 188560 + }, + { + "epoch": 0.7289588841984815, + "grad_norm": 0.10009454935789108, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 188570 + }, + { + "epoch": 0.7289975414018648, + "grad_norm": 0.1031494289636612, + "learning_rate": 0.002, + "loss": 2.355, + "step": 188580 + }, + { + "epoch": 0.7290361986052482, + "grad_norm": 0.12123165279626846, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 188590 + }, + { + "epoch": 0.7290748558086314, + "grad_norm": 0.09856395423412323, + "learning_rate": 0.002, + "loss": 2.355, + "step": 188600 + }, + { + "epoch": 0.7291135130120147, + "grad_norm": 0.1068253144621849, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 188610 + }, + { + "epoch": 0.7291521702153979, + "grad_norm": 0.14589570462703705, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 188620 + }, + { + "epoch": 0.7291908274187812, + "grad_norm": 0.09643730521202087, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 188630 + }, + { + "epoch": 0.7292294846221645, + "grad_norm": 0.12230101972818375, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 188640 + }, + { + "epoch": 0.7292681418255478, + "grad_norm": 0.096110999584198, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 188650 + }, + { + "epoch": 0.729306799028931, + "grad_norm": 0.10471905767917633, + "learning_rate": 0.002, + "loss": 2.341, + "step": 188660 + }, + { + "epoch": 0.7293454562323143, + "grad_norm": 0.11107119917869568, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 188670 + }, + { + "epoch": 0.7293841134356976, + "grad_norm": 0.10437849909067154, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 188680 + }, + { + "epoch": 0.7294227706390809, + "grad_norm": 0.09436061233282089, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 188690 + }, + { + "epoch": 0.7294614278424641, + "grad_norm": 0.09973449259996414, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 188700 + }, + { + "epoch": 0.7295000850458474, + "grad_norm": 0.12083771079778671, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 188710 + }, + { + "epoch": 0.7295387422492308, + "grad_norm": 0.10498762875795364, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 188720 + }, + { + "epoch": 0.729577399452614, + "grad_norm": 0.10566692054271698, + "learning_rate": 0.002, + "loss": 2.333, + "step": 188730 + }, + { + "epoch": 0.7296160566559973, + "grad_norm": 0.097226083278656, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 188740 + }, + { + "epoch": 0.7296547138593805, + "grad_norm": 0.12635381519794464, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 188750 + }, + { + "epoch": 0.7296933710627639, + "grad_norm": 0.09882877767086029, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 188760 + }, + { + "epoch": 0.7297320282661471, + "grad_norm": 0.1020134687423706, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 188770 + }, + { + "epoch": 0.7297706854695304, + "grad_norm": 0.09664612263441086, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 188780 + }, + { + "epoch": 0.7298093426729136, + "grad_norm": 0.11408518254756927, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 188790 + }, + { + "epoch": 0.729847999876297, + "grad_norm": 0.1196506917476654, + "learning_rate": 0.002, + "loss": 2.332, + "step": 188800 + }, + { + "epoch": 0.7298866570796803, + "grad_norm": 0.10588350147008896, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 188810 + }, + { + "epoch": 0.7299253142830635, + "grad_norm": 0.10277391225099564, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 188820 + }, + { + "epoch": 0.7299639714864468, + "grad_norm": 0.09160996973514557, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 188830 + }, + { + "epoch": 0.73000262868983, + "grad_norm": 0.10029343515634537, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 188840 + }, + { + "epoch": 0.7300412858932134, + "grad_norm": 0.09637118130922318, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 188850 + }, + { + "epoch": 0.7300799430965966, + "grad_norm": 0.12911668419837952, + "learning_rate": 0.002, + "loss": 2.312, + "step": 188860 + }, + { + "epoch": 0.7301186002999799, + "grad_norm": 0.0958036333322525, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 188870 + }, + { + "epoch": 0.7301572575033631, + "grad_norm": 0.10825416445732117, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 188880 + }, + { + "epoch": 0.7301959147067465, + "grad_norm": 0.1088646650314331, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 188890 + }, + { + "epoch": 0.7302345719101297, + "grad_norm": 0.10222936421632767, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 188900 + }, + { + "epoch": 0.730273229113513, + "grad_norm": 0.10391924530267715, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 188910 + }, + { + "epoch": 0.7303118863168963, + "grad_norm": 0.0968007892370224, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 188920 + }, + { + "epoch": 0.7303505435202796, + "grad_norm": 0.4386405944824219, + "learning_rate": 0.002, + "loss": 2.347, + "step": 188930 + }, + { + "epoch": 0.7303892007236629, + "grad_norm": 0.12861132621765137, + "learning_rate": 0.002, + "loss": 2.354, + "step": 188940 + }, + { + "epoch": 0.7304278579270461, + "grad_norm": 0.10842836648225784, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 188950 + }, + { + "epoch": 0.7304665151304294, + "grad_norm": 0.09640350192785263, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 188960 + }, + { + "epoch": 0.7305051723338127, + "grad_norm": 0.10740409791469574, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 188970 + }, + { + "epoch": 0.730543829537196, + "grad_norm": 0.09772370010614395, + "learning_rate": 0.002, + "loss": 2.337, + "step": 188980 + }, + { + "epoch": 0.7305824867405792, + "grad_norm": 0.09943395853042603, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 188990 + }, + { + "epoch": 0.7306211439439625, + "grad_norm": 0.10708174854516983, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 189000 + }, + { + "epoch": 0.7306598011473457, + "grad_norm": 0.10401121526956558, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 189010 + }, + { + "epoch": 0.7306984583507291, + "grad_norm": 0.09652461111545563, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 189020 + }, + { + "epoch": 0.7307371155541124, + "grad_norm": 0.10575590282678604, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 189030 + }, + { + "epoch": 0.7307757727574956, + "grad_norm": 0.08972377330064774, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 189040 + }, + { + "epoch": 0.7308144299608789, + "grad_norm": 0.11111640930175781, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 189050 + }, + { + "epoch": 0.7308530871642622, + "grad_norm": 0.11301799863576889, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 189060 + }, + { + "epoch": 0.7308917443676455, + "grad_norm": 0.09813731163740158, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 189070 + }, + { + "epoch": 0.7309304015710287, + "grad_norm": 0.11718396842479706, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 189080 + }, + { + "epoch": 0.730969058774412, + "grad_norm": 0.10873469710350037, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 189090 + }, + { + "epoch": 0.7310077159777953, + "grad_norm": 0.0982939749956131, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 189100 + }, + { + "epoch": 0.7310463731811786, + "grad_norm": 0.09334085136651993, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 189110 + }, + { + "epoch": 0.7310850303845619, + "grad_norm": 0.10179479420185089, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 189120 + }, + { + "epoch": 0.7311236875879451, + "grad_norm": 0.0983622670173645, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 189130 + }, + { + "epoch": 0.7311623447913285, + "grad_norm": 0.1025642529129982, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 189140 + }, + { + "epoch": 0.7312010019947117, + "grad_norm": 0.10435685515403748, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 189150 + }, + { + "epoch": 0.731239659198095, + "grad_norm": 0.11407685279846191, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 189160 + }, + { + "epoch": 0.7312783164014782, + "grad_norm": 0.3479335606098175, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 189170 + }, + { + "epoch": 0.7313169736048615, + "grad_norm": 0.09884588420391083, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 189180 + }, + { + "epoch": 0.7313556308082448, + "grad_norm": 0.10729101300239563, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 189190 + }, + { + "epoch": 0.7313942880116281, + "grad_norm": 0.12231425940990448, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 189200 + }, + { + "epoch": 0.7314329452150113, + "grad_norm": 0.09892288595438004, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 189210 + }, + { + "epoch": 0.7314716024183946, + "grad_norm": 0.12404607981443405, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 189220 + }, + { + "epoch": 0.731510259621778, + "grad_norm": 0.09801378846168518, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 189230 + }, + { + "epoch": 0.7315489168251612, + "grad_norm": 0.10780314356088638, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 189240 + }, + { + "epoch": 0.7315875740285445, + "grad_norm": 0.1150708794593811, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 189250 + }, + { + "epoch": 0.7316262312319277, + "grad_norm": 0.100004181265831, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 189260 + }, + { + "epoch": 0.7316648884353111, + "grad_norm": 0.10322702676057816, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 189270 + }, + { + "epoch": 0.7317035456386943, + "grad_norm": 0.09638111293315887, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 189280 + }, + { + "epoch": 0.7317422028420776, + "grad_norm": 0.10771961510181427, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 189290 + }, + { + "epoch": 0.7317808600454608, + "grad_norm": 0.09627382457256317, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 189300 + }, + { + "epoch": 0.7318195172488442, + "grad_norm": 0.12090234458446503, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 189310 + }, + { + "epoch": 0.7318581744522275, + "grad_norm": 0.09437627345323563, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 189320 + }, + { + "epoch": 0.7318968316556107, + "grad_norm": 0.11243630945682526, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 189330 + }, + { + "epoch": 0.731935488858994, + "grad_norm": 0.11899004131555557, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 189340 + }, + { + "epoch": 0.7319741460623773, + "grad_norm": 0.10806789994239807, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 189350 + }, + { + "epoch": 0.7320128032657606, + "grad_norm": 0.11363356560468674, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 189360 + }, + { + "epoch": 0.7320514604691438, + "grad_norm": 0.1054205521941185, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 189370 + }, + { + "epoch": 0.7320901176725271, + "grad_norm": 0.12295114248991013, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 189380 + }, + { + "epoch": 0.7321287748759103, + "grad_norm": 0.0959784984588623, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 189390 + }, + { + "epoch": 0.7321674320792937, + "grad_norm": 0.09075594693422318, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 189400 + }, + { + "epoch": 0.7322060892826769, + "grad_norm": 0.10640782117843628, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 189410 + }, + { + "epoch": 0.7322447464860602, + "grad_norm": 0.11442548781633377, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 189420 + }, + { + "epoch": 0.7322834036894434, + "grad_norm": 0.0960836112499237, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 189430 + }, + { + "epoch": 0.7323220608928268, + "grad_norm": 0.1013663038611412, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 189440 + }, + { + "epoch": 0.7323607180962101, + "grad_norm": 0.1008801981806755, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 189450 + }, + { + "epoch": 0.7323993752995933, + "grad_norm": 0.10865923017263412, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 189460 + }, + { + "epoch": 0.7324380325029766, + "grad_norm": 0.10891637206077576, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 189470 + }, + { + "epoch": 0.7324766897063599, + "grad_norm": 0.12851367890834808, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 189480 + }, + { + "epoch": 0.7325153469097432, + "grad_norm": 0.09730476886034012, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 189490 + }, + { + "epoch": 0.7325540041131264, + "grad_norm": 0.11946532875299454, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 189500 + }, + { + "epoch": 0.7325926613165097, + "grad_norm": 0.10447601974010468, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 189510 + }, + { + "epoch": 0.732631318519893, + "grad_norm": 0.10364936292171478, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 189520 + }, + { + "epoch": 0.7326699757232763, + "grad_norm": 0.10739912837743759, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 189530 + }, + { + "epoch": 0.7327086329266596, + "grad_norm": 0.10711197555065155, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 189540 + }, + { + "epoch": 0.7327472901300428, + "grad_norm": 0.11326030641794205, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 189550 + }, + { + "epoch": 0.7327859473334261, + "grad_norm": 0.11343449354171753, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 189560 + }, + { + "epoch": 0.7328246045368094, + "grad_norm": 0.10510224103927612, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 189570 + }, + { + "epoch": 0.7328632617401927, + "grad_norm": 0.10465297102928162, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 189580 + }, + { + "epoch": 0.7329019189435759, + "grad_norm": 0.112489253282547, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 189590 + }, + { + "epoch": 0.7329405761469592, + "grad_norm": 0.09400103986263275, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 189600 + }, + { + "epoch": 0.7329792333503425, + "grad_norm": 0.10640868544578552, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 189610 + }, + { + "epoch": 0.7330178905537258, + "grad_norm": 0.11750825494527817, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 189620 + }, + { + "epoch": 0.733056547757109, + "grad_norm": 0.09786515682935715, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 189630 + }, + { + "epoch": 0.7330952049604923, + "grad_norm": 0.11627877503633499, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 189640 + }, + { + "epoch": 0.7331338621638757, + "grad_norm": 0.1656639128923416, + "learning_rate": 0.002, + "loss": 2.352, + "step": 189650 + }, + { + "epoch": 0.7331725193672589, + "grad_norm": 0.10381890088319778, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 189660 + }, + { + "epoch": 0.7332111765706422, + "grad_norm": 0.11755353212356567, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 189670 + }, + { + "epoch": 0.7332498337740254, + "grad_norm": 0.09677081555128098, + "learning_rate": 0.002, + "loss": 2.359, + "step": 189680 + }, + { + "epoch": 0.7332884909774088, + "grad_norm": 0.09234776347875595, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 189690 + }, + { + "epoch": 0.733327148180792, + "grad_norm": 0.10879413783550262, + "learning_rate": 0.002, + "loss": 2.343, + "step": 189700 + }, + { + "epoch": 0.7333658053841753, + "grad_norm": 0.10317614674568176, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 189710 + }, + { + "epoch": 0.7334044625875585, + "grad_norm": 0.09623871743679047, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 189720 + }, + { + "epoch": 0.7334431197909418, + "grad_norm": 0.10473791509866714, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 189730 + }, + { + "epoch": 0.7334817769943252, + "grad_norm": 0.09343335032463074, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 189740 + }, + { + "epoch": 0.7335204341977084, + "grad_norm": 0.1115652471780777, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 189750 + }, + { + "epoch": 0.7335590914010917, + "grad_norm": 0.14288023114204407, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 189760 + }, + { + "epoch": 0.7335977486044749, + "grad_norm": 0.1096411868929863, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 189770 + }, + { + "epoch": 0.7336364058078583, + "grad_norm": 0.1012328639626503, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 189780 + }, + { + "epoch": 0.7336750630112415, + "grad_norm": 0.09999390691518784, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 189790 + }, + { + "epoch": 0.7337137202146248, + "grad_norm": 0.11927925050258636, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 189800 + }, + { + "epoch": 0.733752377418008, + "grad_norm": 0.11045157164335251, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 189810 + }, + { + "epoch": 0.7337910346213914, + "grad_norm": 0.10713623464107513, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 189820 + }, + { + "epoch": 0.7338296918247746, + "grad_norm": 0.0943923145532608, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 189830 + }, + { + "epoch": 0.7338683490281579, + "grad_norm": 0.09983383119106293, + "learning_rate": 0.002, + "loss": 2.334, + "step": 189840 + }, + { + "epoch": 0.7339070062315411, + "grad_norm": 0.11707711219787598, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 189850 + }, + { + "epoch": 0.7339456634349245, + "grad_norm": 0.12364879250526428, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 189860 + }, + { + "epoch": 0.7339843206383078, + "grad_norm": 0.10379421710968018, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 189870 + }, + { + "epoch": 0.734022977841691, + "grad_norm": 0.0928567424416542, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 189880 + }, + { + "epoch": 0.7340616350450743, + "grad_norm": 0.1087023988366127, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 189890 + }, + { + "epoch": 0.7341002922484576, + "grad_norm": 0.0999482050538063, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 189900 + }, + { + "epoch": 0.7341389494518409, + "grad_norm": 0.0944395586848259, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 189910 + }, + { + "epoch": 0.7341776066552241, + "grad_norm": 0.09572398662567139, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 189920 + }, + { + "epoch": 0.7342162638586074, + "grad_norm": 0.09270896762609482, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 189930 + }, + { + "epoch": 0.7342549210619906, + "grad_norm": 0.09984765946865082, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 189940 + }, + { + "epoch": 0.734293578265374, + "grad_norm": 0.10290472954511642, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 189950 + }, + { + "epoch": 0.7343322354687573, + "grad_norm": 0.1093931496143341, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 189960 + }, + { + "epoch": 0.7343708926721405, + "grad_norm": 0.11136391758918762, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 189970 + }, + { + "epoch": 0.7344095498755238, + "grad_norm": 0.09859353303909302, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 189980 + }, + { + "epoch": 0.7344482070789071, + "grad_norm": 0.10707998275756836, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 189990 + }, + { + "epoch": 0.7344868642822904, + "grad_norm": 0.09826818108558655, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 190000 + }, + { + "epoch": 0.7345255214856736, + "grad_norm": 0.1676829308271408, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 190010 + }, + { + "epoch": 0.7345641786890569, + "grad_norm": 0.1222255527973175, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 190020 + }, + { + "epoch": 0.7346028358924402, + "grad_norm": 0.08716561645269394, + "learning_rate": 0.002, + "loss": 2.339, + "step": 190030 + }, + { + "epoch": 0.7346414930958235, + "grad_norm": 0.10340414196252823, + "learning_rate": 0.002, + "loss": 2.351, + "step": 190040 + }, + { + "epoch": 0.7346801502992067, + "grad_norm": 0.1073414757847786, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 190050 + }, + { + "epoch": 0.73471880750259, + "grad_norm": 0.11026755720376968, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 190060 + }, + { + "epoch": 0.7347574647059734, + "grad_norm": 0.12019442766904831, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 190070 + }, + { + "epoch": 0.7347961219093566, + "grad_norm": 0.10721049457788467, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 190080 + }, + { + "epoch": 0.7348347791127399, + "grad_norm": 0.10024671256542206, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 190090 + }, + { + "epoch": 0.7348734363161231, + "grad_norm": 0.09239985793828964, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 190100 + }, + { + "epoch": 0.7349120935195064, + "grad_norm": 0.09348446875810623, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 190110 + }, + { + "epoch": 0.7349507507228897, + "grad_norm": 0.11527527123689651, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 190120 + }, + { + "epoch": 0.734989407926273, + "grad_norm": 0.10688439756631851, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 190130 + }, + { + "epoch": 0.7350280651296562, + "grad_norm": 0.10173143446445465, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 190140 + }, + { + "epoch": 0.7350667223330395, + "grad_norm": 0.09939683228731155, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 190150 + }, + { + "epoch": 0.7351053795364229, + "grad_norm": 0.10313131660223007, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 190160 + }, + { + "epoch": 0.7351440367398061, + "grad_norm": 0.09282536804676056, + "learning_rate": 0.002, + "loss": 2.3142, + "step": 190170 + }, + { + "epoch": 0.7351826939431894, + "grad_norm": 0.09631568938493729, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 190180 + }, + { + "epoch": 0.7352213511465726, + "grad_norm": 0.09160216152667999, + "learning_rate": 0.002, + "loss": 2.3125, + "step": 190190 + }, + { + "epoch": 0.735260008349956, + "grad_norm": 0.10035625845193863, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 190200 + }, + { + "epoch": 0.7352986655533392, + "grad_norm": 0.10442013293504715, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 190210 + }, + { + "epoch": 0.7353373227567225, + "grad_norm": 0.10213469713926315, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 190220 + }, + { + "epoch": 0.7353759799601057, + "grad_norm": 0.11790081113576889, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 190230 + }, + { + "epoch": 0.7354146371634891, + "grad_norm": 0.10916659981012344, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 190240 + }, + { + "epoch": 0.7354532943668723, + "grad_norm": 0.11207661777734756, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 190250 + }, + { + "epoch": 0.7354919515702556, + "grad_norm": 0.10120797902345657, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 190260 + }, + { + "epoch": 0.7355306087736388, + "grad_norm": 0.11318664997816086, + "learning_rate": 0.002, + "loss": 2.341, + "step": 190270 + }, + { + "epoch": 0.7355692659770222, + "grad_norm": 0.09600941836833954, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 190280 + }, + { + "epoch": 0.7356079231804055, + "grad_norm": 0.1251683384180069, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 190290 + }, + { + "epoch": 0.7356465803837887, + "grad_norm": 0.09991093724966049, + "learning_rate": 0.002, + "loss": 2.344, + "step": 190300 + }, + { + "epoch": 0.735685237587172, + "grad_norm": 0.0951126366853714, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 190310 + }, + { + "epoch": 0.7357238947905552, + "grad_norm": 0.11469744145870209, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 190320 + }, + { + "epoch": 0.7357625519939386, + "grad_norm": 0.10002505034208298, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 190330 + }, + { + "epoch": 0.7358012091973218, + "grad_norm": 0.10277873277664185, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 190340 + }, + { + "epoch": 0.7358398664007051, + "grad_norm": 0.11609012633562088, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 190350 + }, + { + "epoch": 0.7358785236040883, + "grad_norm": 0.2405710369348526, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 190360 + }, + { + "epoch": 0.7359171808074717, + "grad_norm": 0.1104801669716835, + "learning_rate": 0.002, + "loss": 2.344, + "step": 190370 + }, + { + "epoch": 0.735955838010855, + "grad_norm": 0.11027061939239502, + "learning_rate": 0.002, + "loss": 2.355, + "step": 190380 + }, + { + "epoch": 0.7359944952142382, + "grad_norm": 0.09764520823955536, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 190390 + }, + { + "epoch": 0.7360331524176215, + "grad_norm": 0.10245082527399063, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 190400 + }, + { + "epoch": 0.7360718096210048, + "grad_norm": 0.09403353184461594, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 190410 + }, + { + "epoch": 0.7361104668243881, + "grad_norm": 0.11879895627498627, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 190420 + }, + { + "epoch": 0.7361491240277713, + "grad_norm": 0.12122533470392227, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 190430 + }, + { + "epoch": 0.7361877812311546, + "grad_norm": 0.1289253979921341, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 190440 + }, + { + "epoch": 0.7362264384345379, + "grad_norm": 0.12489252537488937, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 190450 + }, + { + "epoch": 0.7362650956379212, + "grad_norm": 0.11175944656133652, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 190460 + }, + { + "epoch": 0.7363037528413044, + "grad_norm": 0.09459386765956879, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 190470 + }, + { + "epoch": 0.7363424100446877, + "grad_norm": 0.1060258150100708, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 190480 + }, + { + "epoch": 0.736381067248071, + "grad_norm": 0.10711876302957535, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 190490 + }, + { + "epoch": 0.7364197244514543, + "grad_norm": 0.10872458666563034, + "learning_rate": 0.002, + "loss": 2.352, + "step": 190500 + }, + { + "epoch": 0.7364583816548376, + "grad_norm": 0.10507549345493317, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 190510 + }, + { + "epoch": 0.7364970388582208, + "grad_norm": 0.10001684725284576, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 190520 + }, + { + "epoch": 0.7365356960616041, + "grad_norm": 0.13420626521110535, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 190530 + }, + { + "epoch": 0.7365743532649874, + "grad_norm": 0.11179409176111221, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 190540 + }, + { + "epoch": 0.7366130104683707, + "grad_norm": 0.2960387170314789, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 190550 + }, + { + "epoch": 0.7366516676717539, + "grad_norm": 0.09838057309389114, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 190560 + }, + { + "epoch": 0.7366903248751372, + "grad_norm": 0.11178871989250183, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 190570 + }, + { + "epoch": 0.7367289820785206, + "grad_norm": 0.10592333227396011, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 190580 + }, + { + "epoch": 0.7367676392819038, + "grad_norm": 0.09014479070901871, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 190590 + }, + { + "epoch": 0.7368062964852871, + "grad_norm": 0.10933726280927658, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 190600 + }, + { + "epoch": 0.7368449536886703, + "grad_norm": 0.09692629426717758, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 190610 + }, + { + "epoch": 0.7368836108920537, + "grad_norm": 0.11744661629199982, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 190620 + }, + { + "epoch": 0.7369222680954369, + "grad_norm": 0.10347847640514374, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 190630 + }, + { + "epoch": 0.7369609252988202, + "grad_norm": 0.11454054713249207, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 190640 + }, + { + "epoch": 0.7369995825022034, + "grad_norm": 0.11765061318874359, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 190650 + }, + { + "epoch": 0.7370382397055867, + "grad_norm": 0.10797842592000961, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 190660 + }, + { + "epoch": 0.73707689690897, + "grad_norm": 0.10087337344884872, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 190670 + }, + { + "epoch": 0.7371155541123533, + "grad_norm": 0.12900900840759277, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 190680 + }, + { + "epoch": 0.7371542113157366, + "grad_norm": 0.11788572371006012, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 190690 + }, + { + "epoch": 0.7371928685191198, + "grad_norm": 0.09947472810745239, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 190700 + }, + { + "epoch": 0.7372315257225032, + "grad_norm": 0.13462482392787933, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 190710 + }, + { + "epoch": 0.7372701829258864, + "grad_norm": 0.0978013277053833, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 190720 + }, + { + "epoch": 0.7373088401292697, + "grad_norm": 0.11396888643503189, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 190730 + }, + { + "epoch": 0.7373474973326529, + "grad_norm": 0.11444772034883499, + "learning_rate": 0.002, + "loss": 2.342, + "step": 190740 + }, + { + "epoch": 0.7373861545360363, + "grad_norm": 0.09565328806638718, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 190750 + }, + { + "epoch": 0.7374248117394195, + "grad_norm": 0.09409163147211075, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 190760 + }, + { + "epoch": 0.7374634689428028, + "grad_norm": 0.09765222668647766, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 190770 + }, + { + "epoch": 0.737502126146186, + "grad_norm": 0.11007392406463623, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 190780 + }, + { + "epoch": 0.7375407833495694, + "grad_norm": 0.10825943946838379, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 190790 + }, + { + "epoch": 0.7375794405529527, + "grad_norm": 0.255696177482605, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 190800 + }, + { + "epoch": 0.7376180977563359, + "grad_norm": 0.10646931082010269, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 190810 + }, + { + "epoch": 0.7376567549597192, + "grad_norm": 0.09007599949836731, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 190820 + }, + { + "epoch": 0.7376954121631025, + "grad_norm": 0.10881149023771286, + "learning_rate": 0.002, + "loss": 2.352, + "step": 190830 + }, + { + "epoch": 0.7377340693664858, + "grad_norm": 0.08729346096515656, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 190840 + }, + { + "epoch": 0.737772726569869, + "grad_norm": 0.10455001890659332, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 190850 + }, + { + "epoch": 0.7378113837732523, + "grad_norm": 0.11487884074449539, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 190860 + }, + { + "epoch": 0.7378500409766355, + "grad_norm": 0.10313017666339874, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 190870 + }, + { + "epoch": 0.7378886981800189, + "grad_norm": 0.10160215198993683, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 190880 + }, + { + "epoch": 0.7379273553834022, + "grad_norm": 0.0965922549366951, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 190890 + }, + { + "epoch": 0.7379660125867854, + "grad_norm": 0.10829756408929825, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 190900 + }, + { + "epoch": 0.7380046697901687, + "grad_norm": 0.10519444942474365, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 190910 + }, + { + "epoch": 0.738043326993552, + "grad_norm": 0.1222805604338646, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 190920 + }, + { + "epoch": 0.7380819841969353, + "grad_norm": 0.11346898972988129, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 190930 + }, + { + "epoch": 0.7381206414003185, + "grad_norm": 0.10387032479047775, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 190940 + }, + { + "epoch": 0.7381592986037018, + "grad_norm": 0.10021762549877167, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 190950 + }, + { + "epoch": 0.7381979558070851, + "grad_norm": 0.10510313510894775, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 190960 + }, + { + "epoch": 0.7382366130104684, + "grad_norm": 0.09277694672346115, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 190970 + }, + { + "epoch": 0.7382752702138516, + "grad_norm": 0.11668671667575836, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 190980 + }, + { + "epoch": 0.7383139274172349, + "grad_norm": 0.11169299483299255, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 190990 + }, + { + "epoch": 0.7383525846206183, + "grad_norm": 0.09932884573936462, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 191000 + }, + { + "epoch": 0.7383912418240015, + "grad_norm": 0.09709254652261734, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 191010 + }, + { + "epoch": 0.7384298990273848, + "grad_norm": 0.08417420834302902, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 191020 + }, + { + "epoch": 0.738468556230768, + "grad_norm": 0.10885326564311981, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 191030 + }, + { + "epoch": 0.7385072134341513, + "grad_norm": 0.09797409176826477, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 191040 + }, + { + "epoch": 0.7385458706375346, + "grad_norm": 0.09689269214868546, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 191050 + }, + { + "epoch": 0.7385845278409179, + "grad_norm": 0.11540959030389786, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 191060 + }, + { + "epoch": 0.7386231850443011, + "grad_norm": 0.10804573446512222, + "learning_rate": 0.002, + "loss": 2.348, + "step": 191070 + }, + { + "epoch": 0.7386618422476844, + "grad_norm": 0.11262696981430054, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 191080 + }, + { + "epoch": 0.7387004994510677, + "grad_norm": 0.09762831032276154, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 191090 + }, + { + "epoch": 0.738739156654451, + "grad_norm": 0.10504738986492157, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 191100 + }, + { + "epoch": 0.7387778138578343, + "grad_norm": 0.0954015776515007, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 191110 + }, + { + "epoch": 0.7388164710612175, + "grad_norm": 0.10296234488487244, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 191120 + }, + { + "epoch": 0.7388551282646009, + "grad_norm": 0.12148667871952057, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 191130 + }, + { + "epoch": 0.7388937854679841, + "grad_norm": 0.09568410366773605, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 191140 + }, + { + "epoch": 0.7389324426713674, + "grad_norm": 0.09313368797302246, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 191150 + }, + { + "epoch": 0.7389710998747506, + "grad_norm": 0.09592664986848831, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 191160 + }, + { + "epoch": 0.739009757078134, + "grad_norm": 0.0917651429772377, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 191170 + }, + { + "epoch": 0.7390484142815172, + "grad_norm": 0.09378854930400848, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 191180 + }, + { + "epoch": 0.7390870714849005, + "grad_norm": 0.1044289618730545, + "learning_rate": 0.002, + "loss": 2.334, + "step": 191190 + }, + { + "epoch": 0.7391257286882837, + "grad_norm": 0.10949849337339401, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 191200 + }, + { + "epoch": 0.7391643858916671, + "grad_norm": 0.12043566256761551, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 191210 + }, + { + "epoch": 0.7392030430950504, + "grad_norm": 0.09803483635187149, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 191220 + }, + { + "epoch": 0.7392417002984336, + "grad_norm": 0.09884842485189438, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 191230 + }, + { + "epoch": 0.7392803575018169, + "grad_norm": 0.10830123722553253, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 191240 + }, + { + "epoch": 0.7393190147052001, + "grad_norm": 0.10305802524089813, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 191250 + }, + { + "epoch": 0.7393576719085835, + "grad_norm": 0.09698376804590225, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 191260 + }, + { + "epoch": 0.7393963291119667, + "grad_norm": 0.11778568476438522, + "learning_rate": 0.002, + "loss": 2.34, + "step": 191270 + }, + { + "epoch": 0.73943498631535, + "grad_norm": 0.10067816823720932, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 191280 + }, + { + "epoch": 0.7394736435187332, + "grad_norm": 0.0829603299498558, + "learning_rate": 0.002, + "loss": 2.334, + "step": 191290 + }, + { + "epoch": 0.7395123007221166, + "grad_norm": 0.1029440388083458, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 191300 + }, + { + "epoch": 0.7395509579254999, + "grad_norm": 0.1000521332025528, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 191310 + }, + { + "epoch": 0.7395896151288831, + "grad_norm": 0.09940711408853531, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 191320 + }, + { + "epoch": 0.7396282723322664, + "grad_norm": 0.10901033133268356, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 191330 + }, + { + "epoch": 0.7396669295356497, + "grad_norm": 0.10165905952453613, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 191340 + }, + { + "epoch": 0.739705586739033, + "grad_norm": 0.09632701426744461, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 191350 + }, + { + "epoch": 0.7397442439424162, + "grad_norm": 0.0995948389172554, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 191360 + }, + { + "epoch": 0.7397829011457995, + "grad_norm": 0.11200392991304398, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 191370 + }, + { + "epoch": 0.7398215583491828, + "grad_norm": 0.10719288140535355, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 191380 + }, + { + "epoch": 0.7398602155525661, + "grad_norm": 0.11133550107479095, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 191390 + }, + { + "epoch": 0.7398988727559493, + "grad_norm": 0.10408426076173782, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 191400 + }, + { + "epoch": 0.7399375299593326, + "grad_norm": 0.09851880371570587, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 191410 + }, + { + "epoch": 0.7399761871627158, + "grad_norm": 0.11316296458244324, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 191420 + }, + { + "epoch": 0.7400148443660992, + "grad_norm": 0.11002680659294128, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 191430 + }, + { + "epoch": 0.7400535015694825, + "grad_norm": 0.11452383548021317, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 191440 + }, + { + "epoch": 0.7400921587728657, + "grad_norm": 0.10038605332374573, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 191450 + }, + { + "epoch": 0.740130815976249, + "grad_norm": 0.10871243476867676, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 191460 + }, + { + "epoch": 0.7401694731796323, + "grad_norm": 0.10097894817590714, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 191470 + }, + { + "epoch": 0.7402081303830156, + "grad_norm": 0.09067437052726746, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 191480 + }, + { + "epoch": 0.7402467875863988, + "grad_norm": 0.10413364320993423, + "learning_rate": 0.002, + "loss": 2.346, + "step": 191490 + }, + { + "epoch": 0.7402854447897821, + "grad_norm": 0.09894074499607086, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 191500 + }, + { + "epoch": 0.7403241019931655, + "grad_norm": 0.10216416418552399, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 191510 + }, + { + "epoch": 0.7403627591965487, + "grad_norm": 0.1033996120095253, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 191520 + }, + { + "epoch": 0.740401416399932, + "grad_norm": 0.09722767025232315, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 191530 + }, + { + "epoch": 0.7404400736033152, + "grad_norm": 0.11290927231311798, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 191540 + }, + { + "epoch": 0.7404787308066986, + "grad_norm": 0.09460388869047165, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 191550 + }, + { + "epoch": 0.7405173880100818, + "grad_norm": 0.11431378871202469, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 191560 + }, + { + "epoch": 0.7405560452134651, + "grad_norm": 0.10955523699522018, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 191570 + }, + { + "epoch": 0.7405947024168483, + "grad_norm": 0.1201447993516922, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 191580 + }, + { + "epoch": 0.7406333596202316, + "grad_norm": 0.10241043567657471, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 191590 + }, + { + "epoch": 0.7406720168236149, + "grad_norm": 0.10200318694114685, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 191600 + }, + { + "epoch": 0.7407106740269982, + "grad_norm": 0.09642831981182098, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 191610 + }, + { + "epoch": 0.7407493312303814, + "grad_norm": 0.09920842945575714, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 191620 + }, + { + "epoch": 0.7407879884337647, + "grad_norm": 0.13008153438568115, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 191630 + }, + { + "epoch": 0.7408266456371481, + "grad_norm": 0.10366988182067871, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 191640 + }, + { + "epoch": 0.7408653028405313, + "grad_norm": 0.1006745919585228, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 191650 + }, + { + "epoch": 0.7409039600439146, + "grad_norm": 0.10098261386156082, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 191660 + }, + { + "epoch": 0.7409426172472978, + "grad_norm": 0.09893878549337387, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 191670 + }, + { + "epoch": 0.7409812744506812, + "grad_norm": 0.10766490548849106, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 191680 + }, + { + "epoch": 0.7410199316540644, + "grad_norm": 0.10770270973443985, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 191690 + }, + { + "epoch": 0.7410585888574477, + "grad_norm": 0.10260528326034546, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 191700 + }, + { + "epoch": 0.7410972460608309, + "grad_norm": 0.10132882744073868, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 191710 + }, + { + "epoch": 0.7411359032642143, + "grad_norm": 0.11207374930381775, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 191720 + }, + { + "epoch": 0.7411745604675976, + "grad_norm": 0.11558990180492401, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 191730 + }, + { + "epoch": 0.7412132176709808, + "grad_norm": 0.09880539029836655, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 191740 + }, + { + "epoch": 0.7412518748743641, + "grad_norm": 0.09084846824407578, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 191750 + }, + { + "epoch": 0.7412905320777474, + "grad_norm": 0.10979164391756058, + "learning_rate": 0.002, + "loss": 2.348, + "step": 191760 + }, + { + "epoch": 0.7413291892811307, + "grad_norm": 0.10676354914903641, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 191770 + }, + { + "epoch": 0.7413678464845139, + "grad_norm": 0.12359759211540222, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 191780 + }, + { + "epoch": 0.7414065036878972, + "grad_norm": 0.08710779994726181, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 191790 + }, + { + "epoch": 0.7414451608912804, + "grad_norm": 0.09951417148113251, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 191800 + }, + { + "epoch": 0.7414838180946638, + "grad_norm": 0.09909050911664963, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 191810 + }, + { + "epoch": 0.741522475298047, + "grad_norm": 0.09301384538412094, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 191820 + }, + { + "epoch": 0.7415611325014303, + "grad_norm": 0.10332369804382324, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 191830 + }, + { + "epoch": 0.7415997897048136, + "grad_norm": 0.11144154518842697, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 191840 + }, + { + "epoch": 0.7416384469081969, + "grad_norm": 0.11473096907138824, + "learning_rate": 0.002, + "loss": 2.33, + "step": 191850 + }, + { + "epoch": 0.7416771041115802, + "grad_norm": 0.09674740582704544, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 191860 + }, + { + "epoch": 0.7417157613149634, + "grad_norm": 0.10228494554758072, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 191870 + }, + { + "epoch": 0.7417544185183467, + "grad_norm": 0.08817991614341736, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 191880 + }, + { + "epoch": 0.74179307572173, + "grad_norm": 0.10097159445285797, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 191890 + }, + { + "epoch": 0.7418317329251133, + "grad_norm": 0.09437946230173111, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 191900 + }, + { + "epoch": 0.7418703901284965, + "grad_norm": 0.38441652059555054, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 191910 + }, + { + "epoch": 0.7419090473318798, + "grad_norm": 0.112345851957798, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 191920 + }, + { + "epoch": 0.7419477045352632, + "grad_norm": 0.09058287739753723, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 191930 + }, + { + "epoch": 0.7419863617386464, + "grad_norm": 0.11062506586313248, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 191940 + }, + { + "epoch": 0.7420250189420297, + "grad_norm": 0.10590583086013794, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 191950 + }, + { + "epoch": 0.7420636761454129, + "grad_norm": 0.11020754277706146, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 191960 + }, + { + "epoch": 0.7421023333487962, + "grad_norm": 0.09555596858263016, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 191970 + }, + { + "epoch": 0.7421409905521795, + "grad_norm": 0.08776164799928665, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 191980 + }, + { + "epoch": 0.7421796477555628, + "grad_norm": 0.10907012969255447, + "learning_rate": 0.002, + "loss": 2.334, + "step": 191990 + }, + { + "epoch": 0.742218304958946, + "grad_norm": 0.11223597079515457, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 192000 + }, + { + "epoch": 0.7422569621623293, + "grad_norm": 0.10648026317358017, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 192010 + }, + { + "epoch": 0.7422956193657126, + "grad_norm": 0.09623145312070847, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 192020 + }, + { + "epoch": 0.7423342765690959, + "grad_norm": 0.09438787400722504, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 192030 + }, + { + "epoch": 0.7423729337724791, + "grad_norm": 0.1130327582359314, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 192040 + }, + { + "epoch": 0.7424115909758624, + "grad_norm": 0.10181952267885208, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 192050 + }, + { + "epoch": 0.7424502481792458, + "grad_norm": 0.09615079313516617, + "learning_rate": 0.002, + "loss": 2.341, + "step": 192060 + }, + { + "epoch": 0.742488905382629, + "grad_norm": 0.122190922498703, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 192070 + }, + { + "epoch": 0.7425275625860123, + "grad_norm": 0.09345017373561859, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 192080 + }, + { + "epoch": 0.7425662197893955, + "grad_norm": 0.11099596321582794, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 192090 + }, + { + "epoch": 0.7426048769927789, + "grad_norm": 0.10582207888364792, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 192100 + }, + { + "epoch": 0.7426435341961621, + "grad_norm": 0.1213928833603859, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 192110 + }, + { + "epoch": 0.7426821913995454, + "grad_norm": 0.11554766446352005, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 192120 + }, + { + "epoch": 0.7427208486029286, + "grad_norm": 0.10213926434516907, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 192130 + }, + { + "epoch": 0.7427595058063119, + "grad_norm": 0.09706184267997742, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 192140 + }, + { + "epoch": 0.7427981630096953, + "grad_norm": 0.10466006398200989, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 192150 + }, + { + "epoch": 0.7428368202130785, + "grad_norm": 0.1048019751906395, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 192160 + }, + { + "epoch": 0.7428754774164618, + "grad_norm": 0.10146649181842804, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 192170 + }, + { + "epoch": 0.742914134619845, + "grad_norm": 0.09698516130447388, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 192180 + }, + { + "epoch": 0.7429527918232284, + "grad_norm": 0.10559068620204926, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 192190 + }, + { + "epoch": 0.7429914490266116, + "grad_norm": 0.10536423325538635, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 192200 + }, + { + "epoch": 0.7430301062299949, + "grad_norm": 0.09452679008245468, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 192210 + }, + { + "epoch": 0.7430687634333781, + "grad_norm": 0.10646776854991913, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 192220 + }, + { + "epoch": 0.7431074206367615, + "grad_norm": 0.10824066400527954, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 192230 + }, + { + "epoch": 0.7431460778401447, + "grad_norm": 0.11754116415977478, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 192240 + }, + { + "epoch": 0.743184735043528, + "grad_norm": 0.11462371796369553, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 192250 + }, + { + "epoch": 0.7432233922469113, + "grad_norm": 0.10152816027402878, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 192260 + }, + { + "epoch": 0.7432620494502946, + "grad_norm": 0.1248365193605423, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 192270 + }, + { + "epoch": 0.7433007066536779, + "grad_norm": 0.10560888797044754, + "learning_rate": 0.002, + "loss": 2.335, + "step": 192280 + }, + { + "epoch": 0.7433393638570611, + "grad_norm": 0.10040289908647537, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 192290 + }, + { + "epoch": 0.7433780210604444, + "grad_norm": 0.1166427955031395, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 192300 + }, + { + "epoch": 0.7434166782638277, + "grad_norm": 0.11556585133075714, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 192310 + }, + { + "epoch": 0.743455335467211, + "grad_norm": 0.09876317530870438, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 192320 + }, + { + "epoch": 0.7434939926705942, + "grad_norm": 0.11803120374679565, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 192330 + }, + { + "epoch": 0.7435326498739775, + "grad_norm": 0.10722057521343231, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 192340 + }, + { + "epoch": 0.7435713070773607, + "grad_norm": 0.10615711659193039, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 192350 + }, + { + "epoch": 0.7436099642807441, + "grad_norm": 0.09986952692270279, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 192360 + }, + { + "epoch": 0.7436486214841274, + "grad_norm": 0.17439299821853638, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 192370 + }, + { + "epoch": 0.7436872786875106, + "grad_norm": 0.11447092890739441, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 192380 + }, + { + "epoch": 0.7437259358908939, + "grad_norm": 0.10483738780021667, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 192390 + }, + { + "epoch": 0.7437645930942772, + "grad_norm": 0.10391459614038467, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 192400 + }, + { + "epoch": 0.7438032502976605, + "grad_norm": 0.10389243811368942, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 192410 + }, + { + "epoch": 0.7438419075010437, + "grad_norm": 0.10303031653165817, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 192420 + }, + { + "epoch": 0.743880564704427, + "grad_norm": 0.10993155837059021, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 192430 + }, + { + "epoch": 0.7439192219078103, + "grad_norm": 0.11139877885580063, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 192440 + }, + { + "epoch": 0.7439578791111936, + "grad_norm": 0.11654575914144516, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 192450 + }, + { + "epoch": 0.7439965363145769, + "grad_norm": 0.11384663730859756, + "learning_rate": 0.002, + "loss": 2.347, + "step": 192460 + }, + { + "epoch": 0.7440351935179601, + "grad_norm": 0.10386136919260025, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 192470 + }, + { + "epoch": 0.7440738507213435, + "grad_norm": 0.11686296761035919, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 192480 + }, + { + "epoch": 0.7441125079247267, + "grad_norm": 0.08788301795721054, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 192490 + }, + { + "epoch": 0.74415116512811, + "grad_norm": 0.13229641318321228, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 192500 + }, + { + "epoch": 0.7441898223314932, + "grad_norm": 0.11530508100986481, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 192510 + }, + { + "epoch": 0.7442284795348765, + "grad_norm": 0.10118814557790756, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 192520 + }, + { + "epoch": 0.7442671367382598, + "grad_norm": 0.10761450976133347, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 192530 + }, + { + "epoch": 0.7443057939416431, + "grad_norm": 0.09663943201303482, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 192540 + }, + { + "epoch": 0.7443444511450263, + "grad_norm": 0.14217162132263184, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 192550 + }, + { + "epoch": 0.7443831083484096, + "grad_norm": 0.1184287816286087, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 192560 + }, + { + "epoch": 0.744421765551793, + "grad_norm": 0.1200072392821312, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 192570 + }, + { + "epoch": 0.7444604227551762, + "grad_norm": 0.10290877521038055, + "learning_rate": 0.002, + "loss": 2.344, + "step": 192580 + }, + { + "epoch": 0.7444990799585595, + "grad_norm": 0.11370282620191574, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 192590 + }, + { + "epoch": 0.7445377371619427, + "grad_norm": 0.10389462858438492, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 192600 + }, + { + "epoch": 0.7445763943653261, + "grad_norm": 0.10813149809837341, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 192610 + }, + { + "epoch": 0.7446150515687093, + "grad_norm": 0.11805222928524017, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 192620 + }, + { + "epoch": 0.7446537087720926, + "grad_norm": 0.10988860577344894, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 192630 + }, + { + "epoch": 0.7446923659754758, + "grad_norm": 0.12260608375072479, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 192640 + }, + { + "epoch": 0.7447310231788592, + "grad_norm": 0.12242330610752106, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 192650 + }, + { + "epoch": 0.7447696803822424, + "grad_norm": 0.09700918197631836, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 192660 + }, + { + "epoch": 0.7448083375856257, + "grad_norm": 0.12920258939266205, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 192670 + }, + { + "epoch": 0.744846994789009, + "grad_norm": 0.10415103286504745, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 192680 + }, + { + "epoch": 0.7448856519923923, + "grad_norm": 0.10821574181318283, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 192690 + }, + { + "epoch": 0.7449243091957756, + "grad_norm": 0.11690433323383331, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 192700 + }, + { + "epoch": 0.7449629663991588, + "grad_norm": 0.11213494837284088, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 192710 + }, + { + "epoch": 0.7450016236025421, + "grad_norm": 0.10390239208936691, + "learning_rate": 0.002, + "loss": 2.347, + "step": 192720 + }, + { + "epoch": 0.7450402808059253, + "grad_norm": 0.10177457332611084, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 192730 + }, + { + "epoch": 0.7450789380093087, + "grad_norm": 0.10255581140518188, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 192740 + }, + { + "epoch": 0.7451175952126919, + "grad_norm": 0.09474656730890274, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 192750 + }, + { + "epoch": 0.7451562524160752, + "grad_norm": 0.12057298421859741, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 192760 + }, + { + "epoch": 0.7451949096194584, + "grad_norm": 0.1214093342423439, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 192770 + }, + { + "epoch": 0.7452335668228418, + "grad_norm": 0.10260576009750366, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 192780 + }, + { + "epoch": 0.7452722240262251, + "grad_norm": 0.11322708427906036, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 192790 + }, + { + "epoch": 0.7453108812296083, + "grad_norm": 0.11825566738843918, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 192800 + }, + { + "epoch": 0.7453495384329916, + "grad_norm": 0.09943481534719467, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 192810 + }, + { + "epoch": 0.7453881956363749, + "grad_norm": 0.09891356527805328, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 192820 + }, + { + "epoch": 0.7454268528397582, + "grad_norm": 0.10330082476139069, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 192830 + }, + { + "epoch": 0.7454655100431414, + "grad_norm": 0.09493868798017502, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 192840 + }, + { + "epoch": 0.7455041672465247, + "grad_norm": 0.12381740659475327, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 192850 + }, + { + "epoch": 0.745542824449908, + "grad_norm": 0.10590598732233047, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 192860 + }, + { + "epoch": 0.7455814816532913, + "grad_norm": 0.09802375733852386, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 192870 + }, + { + "epoch": 0.7456201388566746, + "grad_norm": 0.10863767564296722, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 192880 + }, + { + "epoch": 0.7456587960600578, + "grad_norm": 0.13834026455879211, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 192890 + }, + { + "epoch": 0.7456974532634411, + "grad_norm": 0.10244042426347733, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 192900 + }, + { + "epoch": 0.7457361104668244, + "grad_norm": 0.09882020950317383, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 192910 + }, + { + "epoch": 0.7457747676702077, + "grad_norm": 0.10287920385599136, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 192920 + }, + { + "epoch": 0.7458134248735909, + "grad_norm": 0.12767957150936127, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 192930 + }, + { + "epoch": 0.7458520820769742, + "grad_norm": 0.10538261383771896, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 192940 + }, + { + "epoch": 0.7458907392803575, + "grad_norm": 0.11202721297740936, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 192950 + }, + { + "epoch": 0.7459293964837408, + "grad_norm": 0.12798555195331573, + "learning_rate": 0.002, + "loss": 2.331, + "step": 192960 + }, + { + "epoch": 0.745968053687124, + "grad_norm": 0.09712910652160645, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 192970 + }, + { + "epoch": 0.7460067108905073, + "grad_norm": 0.0980161651968956, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 192980 + }, + { + "epoch": 0.7460453680938907, + "grad_norm": 0.11752115935087204, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 192990 + }, + { + "epoch": 0.7460840252972739, + "grad_norm": 0.10274143517017365, + "learning_rate": 0.002, + "loss": 2.343, + "step": 193000 + }, + { + "epoch": 0.7461226825006572, + "grad_norm": 0.10485357791185379, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 193010 + }, + { + "epoch": 0.7461613397040404, + "grad_norm": 0.09393350034952164, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 193020 + }, + { + "epoch": 0.7461999969074238, + "grad_norm": 0.10289428383111954, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 193030 + }, + { + "epoch": 0.746238654110807, + "grad_norm": 0.10881324112415314, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 193040 + }, + { + "epoch": 0.7462773113141903, + "grad_norm": 0.11512856185436249, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 193050 + }, + { + "epoch": 0.7463159685175735, + "grad_norm": 0.12571898102760315, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 193060 + }, + { + "epoch": 0.7463546257209568, + "grad_norm": 0.08950827270746231, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 193070 + }, + { + "epoch": 0.7463932829243402, + "grad_norm": 0.09514682739973068, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 193080 + }, + { + "epoch": 0.7464319401277234, + "grad_norm": 0.10203292220830917, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 193090 + }, + { + "epoch": 0.7464705973311067, + "grad_norm": 0.11616890877485275, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 193100 + }, + { + "epoch": 0.7465092545344899, + "grad_norm": 0.1304377019405365, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 193110 + }, + { + "epoch": 0.7465479117378733, + "grad_norm": 0.11245515197515488, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 193120 + }, + { + "epoch": 0.7465865689412565, + "grad_norm": 0.10679817199707031, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 193130 + }, + { + "epoch": 0.7466252261446398, + "grad_norm": 0.20088240504264832, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 193140 + }, + { + "epoch": 0.746663883348023, + "grad_norm": 0.10575690120458603, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 193150 + }, + { + "epoch": 0.7467025405514064, + "grad_norm": 0.11533773690462112, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 193160 + }, + { + "epoch": 0.7467411977547896, + "grad_norm": 0.09326361119747162, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 193170 + }, + { + "epoch": 0.7467798549581729, + "grad_norm": 0.09732891619205475, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 193180 + }, + { + "epoch": 0.7468185121615561, + "grad_norm": 0.11433549970388412, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 193190 + }, + { + "epoch": 0.7468571693649395, + "grad_norm": 0.12189090996980667, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 193200 + }, + { + "epoch": 0.7468958265683228, + "grad_norm": 0.1078023612499237, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 193210 + }, + { + "epoch": 0.746934483771706, + "grad_norm": 0.10564499348402023, + "learning_rate": 0.002, + "loss": 2.352, + "step": 193220 + }, + { + "epoch": 0.7469731409750893, + "grad_norm": 0.1329101026058197, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 193230 + }, + { + "epoch": 0.7470117981784726, + "grad_norm": 0.10954732447862625, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 193240 + }, + { + "epoch": 0.7470504553818559, + "grad_norm": 0.11114715039730072, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 193250 + }, + { + "epoch": 0.7470891125852391, + "grad_norm": 0.10692232847213745, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 193260 + }, + { + "epoch": 0.7471277697886224, + "grad_norm": 0.10856243222951889, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 193270 + }, + { + "epoch": 0.7471664269920056, + "grad_norm": 0.1030806303024292, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 193280 + }, + { + "epoch": 0.747205084195389, + "grad_norm": 0.10333049297332764, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 193290 + }, + { + "epoch": 0.7472437413987723, + "grad_norm": 0.10423696041107178, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 193300 + }, + { + "epoch": 0.7472823986021555, + "grad_norm": 0.09971468150615692, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 193310 + }, + { + "epoch": 0.7473210558055388, + "grad_norm": 0.09328296780586243, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 193320 + }, + { + "epoch": 0.7473597130089221, + "grad_norm": 0.09091471135616302, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 193330 + }, + { + "epoch": 0.7473983702123054, + "grad_norm": 0.09623685479164124, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 193340 + }, + { + "epoch": 0.7474370274156886, + "grad_norm": 0.11122284084558487, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 193350 + }, + { + "epoch": 0.7474756846190719, + "grad_norm": 0.1184777170419693, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 193360 + }, + { + "epoch": 0.7475143418224552, + "grad_norm": 0.0959344357252121, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 193370 + }, + { + "epoch": 0.7475529990258385, + "grad_norm": 0.11243680119514465, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 193380 + }, + { + "epoch": 0.7475916562292217, + "grad_norm": 0.10417129844427109, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 193390 + }, + { + "epoch": 0.747630313432605, + "grad_norm": 0.14244239032268524, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 193400 + }, + { + "epoch": 0.7476689706359884, + "grad_norm": 0.10252571105957031, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 193410 + }, + { + "epoch": 0.7477076278393716, + "grad_norm": 0.10040729492902756, + "learning_rate": 0.002, + "loss": 2.349, + "step": 193420 + }, + { + "epoch": 0.7477462850427549, + "grad_norm": 0.10703305900096893, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 193430 + }, + { + "epoch": 0.7477849422461381, + "grad_norm": 0.10389512032270432, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 193440 + }, + { + "epoch": 0.7478235994495214, + "grad_norm": 0.10328540951013565, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 193450 + }, + { + "epoch": 0.7478622566529047, + "grad_norm": 0.09823735803365707, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 193460 + }, + { + "epoch": 0.747900913856288, + "grad_norm": 0.1279013454914093, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 193470 + }, + { + "epoch": 0.7479395710596712, + "grad_norm": 0.10907725989818573, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 193480 + }, + { + "epoch": 0.7479782282630545, + "grad_norm": 0.09639472514390945, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 193490 + }, + { + "epoch": 0.7480168854664379, + "grad_norm": 0.10604395717382431, + "learning_rate": 0.002, + "loss": 2.361, + "step": 193500 + }, + { + "epoch": 0.7480555426698211, + "grad_norm": 0.1024792343378067, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 193510 + }, + { + "epoch": 0.7480941998732044, + "grad_norm": 0.1102176234126091, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 193520 + }, + { + "epoch": 0.7481328570765876, + "grad_norm": 0.10424952208995819, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 193530 + }, + { + "epoch": 0.748171514279971, + "grad_norm": 0.11627110838890076, + "learning_rate": 0.002, + "loss": 2.325, + "step": 193540 + }, + { + "epoch": 0.7482101714833542, + "grad_norm": 0.10998349636793137, + "learning_rate": 0.002, + "loss": 2.343, + "step": 193550 + }, + { + "epoch": 0.7482488286867375, + "grad_norm": 0.11724622547626495, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 193560 + }, + { + "epoch": 0.7482874858901207, + "grad_norm": 0.09914011508226395, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 193570 + }, + { + "epoch": 0.7483261430935041, + "grad_norm": 0.10280796885490417, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 193580 + }, + { + "epoch": 0.7483648002968873, + "grad_norm": 0.11854589730501175, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 193590 + }, + { + "epoch": 0.7484034575002706, + "grad_norm": 0.12063703685998917, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 193600 + }, + { + "epoch": 0.7484421147036538, + "grad_norm": 0.10675029456615448, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 193610 + }, + { + "epoch": 0.7484807719070372, + "grad_norm": 0.08650118857622147, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 193620 + }, + { + "epoch": 0.7485194291104205, + "grad_norm": 0.10270006954669952, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 193630 + }, + { + "epoch": 0.7485580863138037, + "grad_norm": 0.08983159065246582, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 193640 + }, + { + "epoch": 0.748596743517187, + "grad_norm": 0.10007721185684204, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 193650 + }, + { + "epoch": 0.7486354007205702, + "grad_norm": 0.12380728870630264, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 193660 + }, + { + "epoch": 0.7486740579239536, + "grad_norm": 0.15238088369369507, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 193670 + }, + { + "epoch": 0.7487127151273368, + "grad_norm": 0.1207793727517128, + "learning_rate": 0.002, + "loss": 2.343, + "step": 193680 + }, + { + "epoch": 0.7487513723307201, + "grad_norm": 0.1177428811788559, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 193690 + }, + { + "epoch": 0.7487900295341033, + "grad_norm": 0.1081174910068512, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 193700 + }, + { + "epoch": 0.7488286867374867, + "grad_norm": 0.10161493718624115, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 193710 + }, + { + "epoch": 0.74886734394087, + "grad_norm": 0.1058894693851471, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 193720 + }, + { + "epoch": 0.7489060011442532, + "grad_norm": 0.10565893352031708, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 193730 + }, + { + "epoch": 0.7489446583476365, + "grad_norm": 0.10452398657798767, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 193740 + }, + { + "epoch": 0.7489833155510198, + "grad_norm": 0.10174624621868134, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 193750 + }, + { + "epoch": 0.7490219727544031, + "grad_norm": 0.09607743471860886, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 193760 + }, + { + "epoch": 0.7490606299577863, + "grad_norm": 0.12087777256965637, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 193770 + }, + { + "epoch": 0.7490992871611696, + "grad_norm": 0.10284367203712463, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 193780 + }, + { + "epoch": 0.7491379443645529, + "grad_norm": 0.11069675534963608, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 193790 + }, + { + "epoch": 0.7491766015679362, + "grad_norm": 0.1019555851817131, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 193800 + }, + { + "epoch": 0.7492152587713194, + "grad_norm": 0.10605629533529282, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 193810 + }, + { + "epoch": 0.7492539159747027, + "grad_norm": 0.09732295572757721, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 193820 + }, + { + "epoch": 0.749292573178086, + "grad_norm": 0.09522780776023865, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 193830 + }, + { + "epoch": 0.7493312303814693, + "grad_norm": 0.14510689675807953, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 193840 + }, + { + "epoch": 0.7493698875848526, + "grad_norm": 0.10406015813350677, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 193850 + }, + { + "epoch": 0.7494085447882358, + "grad_norm": 0.09954486787319183, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 193860 + }, + { + "epoch": 0.7494472019916191, + "grad_norm": 0.10951186716556549, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 193870 + }, + { + "epoch": 0.7494858591950024, + "grad_norm": 0.09955327212810516, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 193880 + }, + { + "epoch": 0.7495245163983857, + "grad_norm": 0.0938873291015625, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 193890 + }, + { + "epoch": 0.7495631736017689, + "grad_norm": 0.11965670436620712, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 193900 + }, + { + "epoch": 0.7496018308051522, + "grad_norm": 0.11990394443273544, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 193910 + }, + { + "epoch": 0.7496404880085356, + "grad_norm": 0.09880927950143814, + "learning_rate": 0.002, + "loss": 2.338, + "step": 193920 + }, + { + "epoch": 0.7496791452119188, + "grad_norm": 0.11151470243930817, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 193930 + }, + { + "epoch": 0.7497178024153021, + "grad_norm": 0.10723958164453506, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 193940 + }, + { + "epoch": 0.7497564596186853, + "grad_norm": 0.11936169117689133, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 193950 + }, + { + "epoch": 0.7497951168220687, + "grad_norm": 0.09781043231487274, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 193960 + }, + { + "epoch": 0.7498337740254519, + "grad_norm": 0.08709392696619034, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 193970 + }, + { + "epoch": 0.7498724312288352, + "grad_norm": 0.10388471931219101, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 193980 + }, + { + "epoch": 0.7499110884322184, + "grad_norm": 0.18327024579048157, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 193990 + }, + { + "epoch": 0.7499497456356017, + "grad_norm": 0.11415718495845795, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 194000 + }, + { + "epoch": 0.749988402838985, + "grad_norm": 0.1244000792503357, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 194010 + }, + { + "epoch": 0.7500270600423683, + "grad_norm": 0.10619892179965973, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 194020 + }, + { + "epoch": 0.7500657172457516, + "grad_norm": 0.11513705551624298, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 194030 + }, + { + "epoch": 0.7501043744491348, + "grad_norm": 0.09795688092708588, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 194040 + }, + { + "epoch": 0.7501430316525182, + "grad_norm": 0.10906984657049179, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 194050 + }, + { + "epoch": 0.7501816888559014, + "grad_norm": 0.10620440542697906, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 194060 + }, + { + "epoch": 0.7502203460592847, + "grad_norm": 0.10489355772733688, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 194070 + }, + { + "epoch": 0.7502590032626679, + "grad_norm": 0.09925409406423569, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 194080 + }, + { + "epoch": 0.7502976604660513, + "grad_norm": 0.11829238384962082, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 194090 + }, + { + "epoch": 0.7503363176694345, + "grad_norm": 0.10316840559244156, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 194100 + }, + { + "epoch": 0.7503749748728178, + "grad_norm": 0.09597914665937424, + "learning_rate": 0.002, + "loss": 2.339, + "step": 194110 + }, + { + "epoch": 0.750413632076201, + "grad_norm": 0.13279080390930176, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 194120 + }, + { + "epoch": 0.7504522892795844, + "grad_norm": 0.11907597631216049, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 194130 + }, + { + "epoch": 0.7504909464829677, + "grad_norm": 0.1021730899810791, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 194140 + }, + { + "epoch": 0.7505296036863509, + "grad_norm": 0.11217319220304489, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 194150 + }, + { + "epoch": 0.7505682608897342, + "grad_norm": 0.09784113615751266, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 194160 + }, + { + "epoch": 0.7506069180931175, + "grad_norm": 0.11889325827360153, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 194170 + }, + { + "epoch": 0.7506455752965008, + "grad_norm": 0.08993558585643768, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 194180 + }, + { + "epoch": 0.750684232499884, + "grad_norm": 0.09891916066408157, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 194190 + }, + { + "epoch": 0.7507228897032673, + "grad_norm": 0.09812989085912704, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 194200 + }, + { + "epoch": 0.7507615469066505, + "grad_norm": 0.12857575714588165, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 194210 + }, + { + "epoch": 0.7508002041100339, + "grad_norm": 0.10943218320608139, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 194220 + }, + { + "epoch": 0.7508388613134171, + "grad_norm": 0.10419569164514542, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 194230 + }, + { + "epoch": 0.7508775185168004, + "grad_norm": 0.0994962528347969, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 194240 + }, + { + "epoch": 0.7509161757201837, + "grad_norm": 0.10264037549495697, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 194250 + }, + { + "epoch": 0.750954832923567, + "grad_norm": 0.1005292534828186, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 194260 + }, + { + "epoch": 0.7509934901269503, + "grad_norm": 0.10990706831216812, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 194270 + }, + { + "epoch": 0.7510321473303335, + "grad_norm": 0.12250801920890808, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 194280 + }, + { + "epoch": 0.7510708045337168, + "grad_norm": 0.10842498391866684, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 194290 + }, + { + "epoch": 0.7511094617371001, + "grad_norm": 0.11050568521022797, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 194300 + }, + { + "epoch": 0.7511481189404834, + "grad_norm": 0.11261072754859924, + "learning_rate": 0.002, + "loss": 2.3098, + "step": 194310 + }, + { + "epoch": 0.7511867761438666, + "grad_norm": 0.10377780348062515, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 194320 + }, + { + "epoch": 0.7512254333472499, + "grad_norm": 0.09198221564292908, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 194330 + }, + { + "epoch": 0.7512640905506333, + "grad_norm": 0.09806704521179199, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 194340 + }, + { + "epoch": 0.7513027477540165, + "grad_norm": 0.13210226595401764, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 194350 + }, + { + "epoch": 0.7513414049573998, + "grad_norm": 0.11560340225696564, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 194360 + }, + { + "epoch": 0.751380062160783, + "grad_norm": 0.10489094257354736, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 194370 + }, + { + "epoch": 0.7514187193641663, + "grad_norm": 0.10843883454799652, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 194380 + }, + { + "epoch": 0.7514573765675496, + "grad_norm": 0.10712260752916336, + "learning_rate": 0.002, + "loss": 2.346, + "step": 194390 + }, + { + "epoch": 0.7514960337709329, + "grad_norm": 0.0975417047739029, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 194400 + }, + { + "epoch": 0.7515346909743161, + "grad_norm": 0.0889492928981781, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 194410 + }, + { + "epoch": 0.7515733481776994, + "grad_norm": 0.1247059628367424, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 194420 + }, + { + "epoch": 0.7516120053810827, + "grad_norm": 0.11323944479227066, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 194430 + }, + { + "epoch": 0.751650662584466, + "grad_norm": 0.09631729871034622, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 194440 + }, + { + "epoch": 0.7516893197878493, + "grad_norm": 0.09453834593296051, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 194450 + }, + { + "epoch": 0.7517279769912325, + "grad_norm": 0.1044001653790474, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 194460 + }, + { + "epoch": 0.7517666341946159, + "grad_norm": 0.10327418148517609, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 194470 + }, + { + "epoch": 0.7518052913979991, + "grad_norm": 0.10132991522550583, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 194480 + }, + { + "epoch": 0.7518439486013824, + "grad_norm": 0.1620233654975891, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 194490 + }, + { + "epoch": 0.7518826058047656, + "grad_norm": 0.10502295196056366, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 194500 + }, + { + "epoch": 0.751921263008149, + "grad_norm": 0.10931259393692017, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 194510 + }, + { + "epoch": 0.7519599202115322, + "grad_norm": 0.11826196312904358, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 194520 + }, + { + "epoch": 0.7519985774149155, + "grad_norm": 0.11824844777584076, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 194530 + }, + { + "epoch": 0.7520372346182987, + "grad_norm": 0.10922195017337799, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 194540 + }, + { + "epoch": 0.7520758918216821, + "grad_norm": 0.13291779160499573, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 194550 + }, + { + "epoch": 0.7521145490250654, + "grad_norm": 0.10089297592639923, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 194560 + }, + { + "epoch": 0.7521532062284486, + "grad_norm": 0.11099760979413986, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 194570 + }, + { + "epoch": 0.7521918634318319, + "grad_norm": 0.10743943601846695, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 194580 + }, + { + "epoch": 0.7522305206352151, + "grad_norm": 0.11014439910650253, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 194590 + }, + { + "epoch": 0.7522691778385985, + "grad_norm": 0.09815085679292679, + "learning_rate": 0.002, + "loss": 2.331, + "step": 194600 + }, + { + "epoch": 0.7523078350419817, + "grad_norm": 0.10134302824735641, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 194610 + }, + { + "epoch": 0.752346492245365, + "grad_norm": 0.11710251122713089, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 194620 + }, + { + "epoch": 0.7523851494487482, + "grad_norm": 0.10482754558324814, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 194630 + }, + { + "epoch": 0.7524238066521316, + "grad_norm": 0.10847599804401398, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 194640 + }, + { + "epoch": 0.7524624638555149, + "grad_norm": 0.08916240185499191, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 194650 + }, + { + "epoch": 0.7525011210588981, + "grad_norm": 0.11111991107463837, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 194660 + }, + { + "epoch": 0.7525397782622814, + "grad_norm": 0.12952084839344025, + "learning_rate": 0.002, + "loss": 2.342, + "step": 194670 + }, + { + "epoch": 0.7525784354656647, + "grad_norm": 0.10298001766204834, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 194680 + }, + { + "epoch": 0.752617092669048, + "grad_norm": 0.11402563750743866, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 194690 + }, + { + "epoch": 0.7526557498724312, + "grad_norm": 0.13321805000305176, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 194700 + }, + { + "epoch": 0.7526944070758145, + "grad_norm": 0.10599828511476517, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 194710 + }, + { + "epoch": 0.7527330642791978, + "grad_norm": 0.10890563577413559, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 194720 + }, + { + "epoch": 0.7527717214825811, + "grad_norm": 0.10188310593366623, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 194730 + }, + { + "epoch": 0.7528103786859643, + "grad_norm": 0.10961693525314331, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 194740 + }, + { + "epoch": 0.7528490358893476, + "grad_norm": 0.08141937106847763, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 194750 + }, + { + "epoch": 0.7528876930927308, + "grad_norm": 0.11148255318403244, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 194760 + }, + { + "epoch": 0.7529263502961142, + "grad_norm": 0.0978025570511818, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 194770 + }, + { + "epoch": 0.7529650074994975, + "grad_norm": 0.09731242060661316, + "learning_rate": 0.002, + "loss": 2.348, + "step": 194780 + }, + { + "epoch": 0.7530036647028807, + "grad_norm": 0.09884215146303177, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 194790 + }, + { + "epoch": 0.753042321906264, + "grad_norm": 0.09421688318252563, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 194800 + }, + { + "epoch": 0.7530809791096473, + "grad_norm": 0.11129975318908691, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 194810 + }, + { + "epoch": 0.7531196363130306, + "grad_norm": 0.11319313943386078, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 194820 + }, + { + "epoch": 0.7531582935164138, + "grad_norm": 0.12444295734167099, + "learning_rate": 0.002, + "loss": 2.342, + "step": 194830 + }, + { + "epoch": 0.7531969507197971, + "grad_norm": 0.09347015619277954, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 194840 + }, + { + "epoch": 0.7532356079231805, + "grad_norm": 0.10135524719953537, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 194850 + }, + { + "epoch": 0.7532742651265637, + "grad_norm": 0.10133438557386398, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 194860 + }, + { + "epoch": 0.753312922329947, + "grad_norm": 0.10349283367395401, + "learning_rate": 0.002, + "loss": 2.34, + "step": 194870 + }, + { + "epoch": 0.7533515795333302, + "grad_norm": 0.0932273268699646, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 194880 + }, + { + "epoch": 0.7533902367367136, + "grad_norm": 0.10109259188175201, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 194890 + }, + { + "epoch": 0.7534288939400968, + "grad_norm": 0.11513467133045197, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 194900 + }, + { + "epoch": 0.7534675511434801, + "grad_norm": 0.09564170241355896, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 194910 + }, + { + "epoch": 0.7535062083468633, + "grad_norm": 0.08503452688455582, + "learning_rate": 0.002, + "loss": 2.334, + "step": 194920 + }, + { + "epoch": 0.7535448655502466, + "grad_norm": 0.11041513830423355, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 194930 + }, + { + "epoch": 0.7535835227536299, + "grad_norm": 0.11314481496810913, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 194940 + }, + { + "epoch": 0.7536221799570132, + "grad_norm": 0.13088074326515198, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 194950 + }, + { + "epoch": 0.7536608371603964, + "grad_norm": 0.09050699323415756, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 194960 + }, + { + "epoch": 0.7536994943637797, + "grad_norm": 0.11696422845125198, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 194970 + }, + { + "epoch": 0.7537381515671631, + "grad_norm": 0.1043548732995987, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 194980 + }, + { + "epoch": 0.7537768087705463, + "grad_norm": 0.10563669353723526, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 194990 + }, + { + "epoch": 0.7538154659739296, + "grad_norm": 0.13230913877487183, + "learning_rate": 0.002, + "loss": 2.326, + "step": 195000 + }, + { + "epoch": 0.7538541231773128, + "grad_norm": 0.0942579135298729, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 195010 + }, + { + "epoch": 0.7538927803806962, + "grad_norm": 0.10348600894212723, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 195020 + }, + { + "epoch": 0.7539314375840794, + "grad_norm": 0.09573236107826233, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 195030 + }, + { + "epoch": 0.7539700947874627, + "grad_norm": 0.10151621699333191, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 195040 + }, + { + "epoch": 0.7540087519908459, + "grad_norm": 0.10730200260877609, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 195050 + }, + { + "epoch": 0.7540474091942293, + "grad_norm": 0.09535379707813263, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 195060 + }, + { + "epoch": 0.7540860663976126, + "grad_norm": 0.1367979794740677, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 195070 + }, + { + "epoch": 0.7541247236009958, + "grad_norm": 0.1024353876709938, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 195080 + }, + { + "epoch": 0.7541633808043791, + "grad_norm": 0.11906842142343521, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 195090 + }, + { + "epoch": 0.7542020380077624, + "grad_norm": 0.10515178740024567, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 195100 + }, + { + "epoch": 0.7542406952111457, + "grad_norm": 0.23441074788570404, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 195110 + }, + { + "epoch": 0.7542793524145289, + "grad_norm": 0.11064566671848297, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 195120 + }, + { + "epoch": 0.7543180096179122, + "grad_norm": 0.09228164702653885, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 195130 + }, + { + "epoch": 0.7543566668212954, + "grad_norm": 0.10810859501361847, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 195140 + }, + { + "epoch": 0.7543953240246788, + "grad_norm": 0.10493254661560059, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 195150 + }, + { + "epoch": 0.754433981228062, + "grad_norm": 0.11579349637031555, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 195160 + }, + { + "epoch": 0.7544726384314453, + "grad_norm": 0.08747828751802444, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 195170 + }, + { + "epoch": 0.7545112956348285, + "grad_norm": 0.11919780820608139, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 195180 + }, + { + "epoch": 0.7545499528382119, + "grad_norm": 0.10446533560752869, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 195190 + }, + { + "epoch": 0.7545886100415952, + "grad_norm": 0.09462517499923706, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 195200 + }, + { + "epoch": 0.7546272672449784, + "grad_norm": 0.12835873663425446, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 195210 + }, + { + "epoch": 0.7546659244483617, + "grad_norm": 0.11104355752468109, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 195220 + }, + { + "epoch": 0.754704581651745, + "grad_norm": 0.10860821604728699, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 195230 + }, + { + "epoch": 0.7547432388551283, + "grad_norm": 0.10522500425577164, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 195240 + }, + { + "epoch": 0.7547818960585115, + "grad_norm": 0.0963943675160408, + "learning_rate": 0.002, + "loss": 2.337, + "step": 195250 + }, + { + "epoch": 0.7548205532618948, + "grad_norm": 0.10599056631326675, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 195260 + }, + { + "epoch": 0.7548592104652782, + "grad_norm": 0.0898580327630043, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 195270 + }, + { + "epoch": 0.7548978676686614, + "grad_norm": 0.1415392905473709, + "learning_rate": 0.002, + "loss": 2.3617, + "step": 195280 + }, + { + "epoch": 0.7549365248720447, + "grad_norm": 0.10526993870735168, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 195290 + }, + { + "epoch": 0.7549751820754279, + "grad_norm": 0.10201417654752731, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 195300 + }, + { + "epoch": 0.7550138392788112, + "grad_norm": 0.10746268182992935, + "learning_rate": 0.002, + "loss": 2.334, + "step": 195310 + }, + { + "epoch": 0.7550524964821945, + "grad_norm": 0.09328329563140869, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 195320 + }, + { + "epoch": 0.7550911536855778, + "grad_norm": 0.1050257459282875, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 195330 + }, + { + "epoch": 0.755129810888961, + "grad_norm": 0.11118963360786438, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 195340 + }, + { + "epoch": 0.7551684680923443, + "grad_norm": 0.11248045414686203, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 195350 + }, + { + "epoch": 0.7552071252957276, + "grad_norm": 0.10005900263786316, + "learning_rate": 0.002, + "loss": 2.35, + "step": 195360 + }, + { + "epoch": 0.7552457824991109, + "grad_norm": 0.10226564854383469, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 195370 + }, + { + "epoch": 0.7552844397024941, + "grad_norm": 0.1185409426689148, + "learning_rate": 0.002, + "loss": 2.354, + "step": 195380 + }, + { + "epoch": 0.7553230969058774, + "grad_norm": 0.10499685257673264, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 195390 + }, + { + "epoch": 0.7553617541092608, + "grad_norm": 0.09735177457332611, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 195400 + }, + { + "epoch": 0.755400411312644, + "grad_norm": 0.09485459327697754, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 195410 + }, + { + "epoch": 0.7554390685160273, + "grad_norm": 0.10462170094251633, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 195420 + }, + { + "epoch": 0.7554777257194105, + "grad_norm": 0.10008876770734787, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 195430 + }, + { + "epoch": 0.7555163829227939, + "grad_norm": 0.27592208981513977, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 195440 + }, + { + "epoch": 0.7555550401261771, + "grad_norm": 0.1063537746667862, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 195450 + }, + { + "epoch": 0.7555936973295604, + "grad_norm": 0.11505308747291565, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 195460 + }, + { + "epoch": 0.7556323545329436, + "grad_norm": 0.11546620726585388, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 195470 + }, + { + "epoch": 0.7556710117363269, + "grad_norm": 0.10170117765665054, + "learning_rate": 0.002, + "loss": 2.318, + "step": 195480 + }, + { + "epoch": 0.7557096689397103, + "grad_norm": 0.09899432212114334, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 195490 + }, + { + "epoch": 0.7557483261430935, + "grad_norm": 0.10428054630756378, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 195500 + }, + { + "epoch": 0.7557869833464768, + "grad_norm": 0.13404028117656708, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 195510 + }, + { + "epoch": 0.75582564054986, + "grad_norm": 0.11861197650432587, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 195520 + }, + { + "epoch": 0.7558642977532434, + "grad_norm": 0.09653604030609131, + "learning_rate": 0.002, + "loss": 2.35, + "step": 195530 + }, + { + "epoch": 0.7559029549566266, + "grad_norm": 0.10192802548408508, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 195540 + }, + { + "epoch": 0.7559416121600099, + "grad_norm": 0.09712671488523483, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 195550 + }, + { + "epoch": 0.7559802693633931, + "grad_norm": 0.13238525390625, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 195560 + }, + { + "epoch": 0.7560189265667765, + "grad_norm": 0.10193489491939545, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 195570 + }, + { + "epoch": 0.7560575837701597, + "grad_norm": 0.1082800030708313, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 195580 + }, + { + "epoch": 0.756096240973543, + "grad_norm": 0.11322148889303207, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 195590 + }, + { + "epoch": 0.7561348981769263, + "grad_norm": 0.10525218397378922, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 195600 + }, + { + "epoch": 0.7561735553803096, + "grad_norm": 0.10929600894451141, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 195610 + }, + { + "epoch": 0.7562122125836929, + "grad_norm": 0.10341084003448486, + "learning_rate": 0.002, + "loss": 2.339, + "step": 195620 + }, + { + "epoch": 0.7562508697870761, + "grad_norm": 0.09806688874959946, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 195630 + }, + { + "epoch": 0.7562895269904594, + "grad_norm": 0.12035530060529709, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 195640 + }, + { + "epoch": 0.7563281841938427, + "grad_norm": 0.11518548429012299, + "learning_rate": 0.002, + "loss": 2.349, + "step": 195650 + }, + { + "epoch": 0.756366841397226, + "grad_norm": 0.10069727897644043, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 195660 + }, + { + "epoch": 0.7564054986006092, + "grad_norm": 0.11515224725008011, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 195670 + }, + { + "epoch": 0.7564441558039925, + "grad_norm": 0.10596141964197159, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 195680 + }, + { + "epoch": 0.7564828130073757, + "grad_norm": 0.10848668217658997, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 195690 + }, + { + "epoch": 0.7565214702107591, + "grad_norm": 0.10180913656949997, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 195700 + }, + { + "epoch": 0.7565601274141424, + "grad_norm": 0.0984707847237587, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 195710 + }, + { + "epoch": 0.7565987846175256, + "grad_norm": 0.10491156578063965, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 195720 + }, + { + "epoch": 0.7566374418209089, + "grad_norm": 0.10026510804891586, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 195730 + }, + { + "epoch": 0.7566760990242922, + "grad_norm": 0.10417931526899338, + "learning_rate": 0.002, + "loss": 2.357, + "step": 195740 + }, + { + "epoch": 0.7567147562276755, + "grad_norm": 0.08421199023723602, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 195750 + }, + { + "epoch": 0.7567534134310587, + "grad_norm": 0.11094094067811966, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 195760 + }, + { + "epoch": 0.756792070634442, + "grad_norm": 0.09546635299921036, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 195770 + }, + { + "epoch": 0.7568307278378253, + "grad_norm": 0.13321033120155334, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 195780 + }, + { + "epoch": 0.7568693850412086, + "grad_norm": 0.11739350110292435, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 195790 + }, + { + "epoch": 0.7569080422445918, + "grad_norm": 0.11135275661945343, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 195800 + }, + { + "epoch": 0.7569466994479751, + "grad_norm": 0.10918545722961426, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 195810 + }, + { + "epoch": 0.7569853566513585, + "grad_norm": 0.09732171148061752, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 195820 + }, + { + "epoch": 0.7570240138547417, + "grad_norm": 0.10396996885538101, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 195830 + }, + { + "epoch": 0.757062671058125, + "grad_norm": 0.09502806514501572, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 195840 + }, + { + "epoch": 0.7571013282615082, + "grad_norm": 0.09656667709350586, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 195850 + }, + { + "epoch": 0.7571399854648915, + "grad_norm": 0.10245371609926224, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 195860 + }, + { + "epoch": 0.7571786426682748, + "grad_norm": 0.1009119525551796, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 195870 + }, + { + "epoch": 0.7572172998716581, + "grad_norm": 0.10753369331359863, + "learning_rate": 0.002, + "loss": 2.333, + "step": 195880 + }, + { + "epoch": 0.7572559570750413, + "grad_norm": 0.11064845323562622, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 195890 + }, + { + "epoch": 0.7572946142784246, + "grad_norm": 0.1408444494009018, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 195900 + }, + { + "epoch": 0.757333271481808, + "grad_norm": 0.10942525416612625, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 195910 + }, + { + "epoch": 0.7573719286851912, + "grad_norm": 0.08875524997711182, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 195920 + }, + { + "epoch": 0.7574105858885745, + "grad_norm": 0.1011844277381897, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 195930 + }, + { + "epoch": 0.7574492430919577, + "grad_norm": 0.1290528029203415, + "learning_rate": 0.002, + "loss": 2.346, + "step": 195940 + }, + { + "epoch": 0.7574879002953411, + "grad_norm": 0.09972189366817474, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 195950 + }, + { + "epoch": 0.7575265574987243, + "grad_norm": 0.09701741486787796, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 195960 + }, + { + "epoch": 0.7575652147021076, + "grad_norm": 0.10647466033697128, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 195970 + }, + { + "epoch": 0.7576038719054908, + "grad_norm": 0.08993512392044067, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 195980 + }, + { + "epoch": 0.7576425291088742, + "grad_norm": 1.3735077381134033, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 195990 + }, + { + "epoch": 0.7576811863122574, + "grad_norm": 0.10575959831476212, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 196000 + }, + { + "epoch": 0.7577198435156407, + "grad_norm": 0.09548277407884598, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 196010 + }, + { + "epoch": 0.757758500719024, + "grad_norm": 0.09989648312330246, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 196020 + }, + { + "epoch": 0.7577971579224073, + "grad_norm": 0.10474807769060135, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 196030 + }, + { + "epoch": 0.7578358151257906, + "grad_norm": 0.09582081437110901, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 196040 + }, + { + "epoch": 0.7578744723291738, + "grad_norm": 0.10808078199625015, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 196050 + }, + { + "epoch": 0.7579131295325571, + "grad_norm": 0.11338485777378082, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 196060 + }, + { + "epoch": 0.7579517867359403, + "grad_norm": 0.11452560871839523, + "learning_rate": 0.002, + "loss": 2.357, + "step": 196070 + }, + { + "epoch": 0.7579904439393237, + "grad_norm": 0.10131363570690155, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 196080 + }, + { + "epoch": 0.7580291011427069, + "grad_norm": 0.09486867487430573, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 196090 + }, + { + "epoch": 0.7580677583460902, + "grad_norm": 0.1085309162735939, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 196100 + }, + { + "epoch": 0.7581064155494734, + "grad_norm": 0.09008552134037018, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 196110 + }, + { + "epoch": 0.7581450727528568, + "grad_norm": 0.0982944443821907, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 196120 + }, + { + "epoch": 0.7581837299562401, + "grad_norm": 0.11804826557636261, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 196130 + }, + { + "epoch": 0.7582223871596233, + "grad_norm": 0.09671904146671295, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 196140 + }, + { + "epoch": 0.7582610443630066, + "grad_norm": 0.11080244183540344, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 196150 + }, + { + "epoch": 0.7582997015663899, + "grad_norm": 0.12381796538829803, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 196160 + }, + { + "epoch": 0.7583383587697732, + "grad_norm": 0.11330335587263107, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 196170 + }, + { + "epoch": 0.7583770159731564, + "grad_norm": 0.10313785076141357, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 196180 + }, + { + "epoch": 0.7584156731765397, + "grad_norm": 0.09532544761896133, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 196190 + }, + { + "epoch": 0.758454330379923, + "grad_norm": 0.10902924090623856, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 196200 + }, + { + "epoch": 0.7584929875833063, + "grad_norm": 0.09729165583848953, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 196210 + }, + { + "epoch": 0.7585316447866896, + "grad_norm": 0.12448497116565704, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 196220 + }, + { + "epoch": 0.7585703019900728, + "grad_norm": 0.09325090795755386, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 196230 + }, + { + "epoch": 0.7586089591934561, + "grad_norm": 0.1104232668876648, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 196240 + }, + { + "epoch": 0.7586476163968394, + "grad_norm": 0.10128456354141235, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 196250 + }, + { + "epoch": 0.7586862736002227, + "grad_norm": 0.10114264488220215, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 196260 + }, + { + "epoch": 0.7587249308036059, + "grad_norm": 0.1039217859506607, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 196270 + }, + { + "epoch": 0.7587635880069892, + "grad_norm": 0.09960024803876877, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 196280 + }, + { + "epoch": 0.7588022452103725, + "grad_norm": 0.109873928129673, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 196290 + }, + { + "epoch": 0.7588409024137558, + "grad_norm": 0.09769205749034882, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 196300 + }, + { + "epoch": 0.758879559617139, + "grad_norm": 0.1883150339126587, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 196310 + }, + { + "epoch": 0.7589182168205223, + "grad_norm": 0.1155470684170723, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 196320 + }, + { + "epoch": 0.7589568740239057, + "grad_norm": 0.09333515912294388, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 196330 + }, + { + "epoch": 0.7589955312272889, + "grad_norm": 0.10450160503387451, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 196340 + }, + { + "epoch": 0.7590341884306722, + "grad_norm": 0.13585828244686127, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 196350 + }, + { + "epoch": 0.7590728456340554, + "grad_norm": 0.10029277950525284, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 196360 + }, + { + "epoch": 0.7591115028374388, + "grad_norm": 0.09286853671073914, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 196370 + }, + { + "epoch": 0.759150160040822, + "grad_norm": 0.12459070980548859, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 196380 + }, + { + "epoch": 0.7591888172442053, + "grad_norm": 0.11947477608919144, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 196390 + }, + { + "epoch": 0.7592274744475885, + "grad_norm": 0.1244572103023529, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 196400 + }, + { + "epoch": 0.7592661316509718, + "grad_norm": 0.11129307746887207, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 196410 + }, + { + "epoch": 0.7593047888543552, + "grad_norm": 0.10143723338842392, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 196420 + }, + { + "epoch": 0.7593434460577384, + "grad_norm": 0.11217688024044037, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 196430 + }, + { + "epoch": 0.7593821032611217, + "grad_norm": 0.10073287785053253, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 196440 + }, + { + "epoch": 0.7594207604645049, + "grad_norm": 0.1112406775355339, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 196450 + }, + { + "epoch": 0.7594594176678883, + "grad_norm": 0.11295268684625626, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 196460 + }, + { + "epoch": 0.7594980748712715, + "grad_norm": 0.10510469228029251, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 196470 + }, + { + "epoch": 0.7595367320746548, + "grad_norm": 0.10133729130029678, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 196480 + }, + { + "epoch": 0.759575389278038, + "grad_norm": 0.10159589350223541, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 196490 + }, + { + "epoch": 0.7596140464814214, + "grad_norm": 0.1075550764799118, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 196500 + }, + { + "epoch": 0.7596527036848046, + "grad_norm": 0.11190401762723923, + "learning_rate": 0.002, + "loss": 2.345, + "step": 196510 + }, + { + "epoch": 0.7596913608881879, + "grad_norm": 0.10415247827768326, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 196520 + }, + { + "epoch": 0.7597300180915711, + "grad_norm": 0.10985806584358215, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 196530 + }, + { + "epoch": 0.7597686752949545, + "grad_norm": 0.09883502870798111, + "learning_rate": 0.002, + "loss": 2.347, + "step": 196540 + }, + { + "epoch": 0.7598073324983378, + "grad_norm": 0.09660974889993668, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 196550 + }, + { + "epoch": 0.759845989701721, + "grad_norm": 0.11280705779790878, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 196560 + }, + { + "epoch": 0.7598846469051043, + "grad_norm": 0.10233590006828308, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 196570 + }, + { + "epoch": 0.7599233041084876, + "grad_norm": 0.09991362690925598, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 196580 + }, + { + "epoch": 0.7599619613118709, + "grad_norm": 0.09371718019247055, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 196590 + }, + { + "epoch": 0.7600006185152541, + "grad_norm": 0.10569316893815994, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 196600 + }, + { + "epoch": 0.7600392757186374, + "grad_norm": 0.1058524027466774, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 196610 + }, + { + "epoch": 0.7600779329220206, + "grad_norm": 0.10566135495901108, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 196620 + }, + { + "epoch": 0.760116590125404, + "grad_norm": 0.09536425024271011, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 196630 + }, + { + "epoch": 0.7601552473287873, + "grad_norm": 0.10409044474363327, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 196640 + }, + { + "epoch": 0.7601939045321705, + "grad_norm": 0.09831932932138443, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 196650 + }, + { + "epoch": 0.7602325617355538, + "grad_norm": 0.10347718745470047, + "learning_rate": 0.002, + "loss": 2.329, + "step": 196660 + }, + { + "epoch": 0.7602712189389371, + "grad_norm": 0.1087682694196701, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 196670 + }, + { + "epoch": 0.7603098761423204, + "grad_norm": 0.09737783670425415, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 196680 + }, + { + "epoch": 0.7603485333457036, + "grad_norm": 0.09639918804168701, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 196690 + }, + { + "epoch": 0.7603871905490869, + "grad_norm": 0.13335011899471283, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 196700 + }, + { + "epoch": 0.7604258477524702, + "grad_norm": 0.09955698251724243, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 196710 + }, + { + "epoch": 0.7604645049558535, + "grad_norm": 0.09763280302286148, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 196720 + }, + { + "epoch": 0.7605031621592367, + "grad_norm": 0.09992306679487228, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 196730 + }, + { + "epoch": 0.76054181936262, + "grad_norm": 0.14371579885482788, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 196740 + }, + { + "epoch": 0.7605804765660034, + "grad_norm": 0.12052550166845322, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 196750 + }, + { + "epoch": 0.7606191337693866, + "grad_norm": 0.11212249100208282, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 196760 + }, + { + "epoch": 0.7606577909727699, + "grad_norm": 0.09525085240602493, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 196770 + }, + { + "epoch": 0.7606964481761531, + "grad_norm": 0.11087365448474884, + "learning_rate": 0.002, + "loss": 2.34, + "step": 196780 + }, + { + "epoch": 0.7607351053795364, + "grad_norm": 0.09551934152841568, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 196790 + }, + { + "epoch": 0.7607737625829197, + "grad_norm": 0.11185195297002792, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 196800 + }, + { + "epoch": 0.760812419786303, + "grad_norm": 0.13765457272529602, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 196810 + }, + { + "epoch": 0.7608510769896862, + "grad_norm": 0.10338455438613892, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 196820 + }, + { + "epoch": 0.7608897341930695, + "grad_norm": 0.10170518606901169, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 196830 + }, + { + "epoch": 0.7609283913964529, + "grad_norm": 0.10933112353086472, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 196840 + }, + { + "epoch": 0.7609670485998361, + "grad_norm": 0.1284535825252533, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 196850 + }, + { + "epoch": 0.7610057058032194, + "grad_norm": 0.08637114614248276, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 196860 + }, + { + "epoch": 0.7610443630066026, + "grad_norm": 0.09608016163110733, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 196870 + }, + { + "epoch": 0.761083020209986, + "grad_norm": 0.09457556158304214, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 196880 + }, + { + "epoch": 0.7611216774133692, + "grad_norm": 0.09683346748352051, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 196890 + }, + { + "epoch": 0.7611603346167525, + "grad_norm": 0.09376256912946701, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 196900 + }, + { + "epoch": 0.7611989918201357, + "grad_norm": 0.12256647646427155, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 196910 + }, + { + "epoch": 0.7612376490235191, + "grad_norm": 0.1191987469792366, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 196920 + }, + { + "epoch": 0.7612763062269023, + "grad_norm": 0.09384308755397797, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 196930 + }, + { + "epoch": 0.7613149634302856, + "grad_norm": 0.09424065798521042, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 196940 + }, + { + "epoch": 0.7613536206336688, + "grad_norm": 0.09528655558824539, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 196950 + }, + { + "epoch": 0.7613922778370522, + "grad_norm": 0.11627255380153656, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 196960 + }, + { + "epoch": 0.7614309350404355, + "grad_norm": 0.11006615310907364, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 196970 + }, + { + "epoch": 0.7614695922438187, + "grad_norm": 0.10915512591600418, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 196980 + }, + { + "epoch": 0.761508249447202, + "grad_norm": 0.10492531210184097, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 196990 + }, + { + "epoch": 0.7615469066505852, + "grad_norm": 0.12003140896558762, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 197000 + }, + { + "epoch": 0.7615855638539686, + "grad_norm": 0.09126259386539459, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 197010 + }, + { + "epoch": 0.7616242210573518, + "grad_norm": 0.09396257996559143, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 197020 + }, + { + "epoch": 0.7616628782607351, + "grad_norm": 0.09638381004333496, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 197030 + }, + { + "epoch": 0.7617015354641183, + "grad_norm": 0.12181457877159119, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 197040 + }, + { + "epoch": 0.7617401926675017, + "grad_norm": 0.0980510339140892, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 197050 + }, + { + "epoch": 0.761778849870885, + "grad_norm": 0.1004633978009224, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 197060 + }, + { + "epoch": 0.7618175070742682, + "grad_norm": 0.1116318479180336, + "learning_rate": 0.002, + "loss": 2.344, + "step": 197070 + }, + { + "epoch": 0.7618561642776515, + "grad_norm": 0.11675788462162018, + "learning_rate": 0.002, + "loss": 2.3684, + "step": 197080 + }, + { + "epoch": 0.7618948214810348, + "grad_norm": 0.11029686778783798, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 197090 + }, + { + "epoch": 0.7619334786844181, + "grad_norm": 0.43609610199928284, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 197100 + }, + { + "epoch": 0.7619721358878013, + "grad_norm": 0.10058358311653137, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 197110 + }, + { + "epoch": 0.7620107930911846, + "grad_norm": 0.10088063031435013, + "learning_rate": 0.002, + "loss": 2.342, + "step": 197120 + }, + { + "epoch": 0.7620494502945679, + "grad_norm": 0.09839877486228943, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 197130 + }, + { + "epoch": 0.7620881074979512, + "grad_norm": 0.11465706676244736, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 197140 + }, + { + "epoch": 0.7621267647013344, + "grad_norm": 0.10121915489435196, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 197150 + }, + { + "epoch": 0.7621654219047177, + "grad_norm": 0.10331589728593826, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 197160 + }, + { + "epoch": 0.762204079108101, + "grad_norm": 0.10166185349225998, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 197170 + }, + { + "epoch": 0.7622427363114843, + "grad_norm": 0.11593326181173325, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 197180 + }, + { + "epoch": 0.7622813935148676, + "grad_norm": 0.09611688554286957, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 197190 + }, + { + "epoch": 0.7623200507182508, + "grad_norm": 0.11437992751598358, + "learning_rate": 0.002, + "loss": 2.348, + "step": 197200 + }, + { + "epoch": 0.7623587079216341, + "grad_norm": 0.1314302235841751, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 197210 + }, + { + "epoch": 0.7623973651250174, + "grad_norm": 0.09535528719425201, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 197220 + }, + { + "epoch": 0.7624360223284007, + "grad_norm": 0.1051691323518753, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 197230 + }, + { + "epoch": 0.7624746795317839, + "grad_norm": 0.11101983487606049, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 197240 + }, + { + "epoch": 0.7625133367351672, + "grad_norm": 0.10246936231851578, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 197250 + }, + { + "epoch": 0.7625519939385506, + "grad_norm": 0.10206905007362366, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 197260 + }, + { + "epoch": 0.7625906511419338, + "grad_norm": 0.12120827287435532, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 197270 + }, + { + "epoch": 0.7626293083453171, + "grad_norm": 0.10170960426330566, + "learning_rate": 0.002, + "loss": 2.362, + "step": 197280 + }, + { + "epoch": 0.7626679655487003, + "grad_norm": 0.09327639639377594, + "learning_rate": 0.002, + "loss": 2.341, + "step": 197290 + }, + { + "epoch": 0.7627066227520837, + "grad_norm": 0.10556710511445999, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 197300 + }, + { + "epoch": 0.7627452799554669, + "grad_norm": 0.11077040433883667, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 197310 + }, + { + "epoch": 0.7627839371588502, + "grad_norm": 0.10688727349042892, + "learning_rate": 0.002, + "loss": 2.341, + "step": 197320 + }, + { + "epoch": 0.7628225943622334, + "grad_norm": 0.10626237094402313, + "learning_rate": 0.002, + "loss": 2.352, + "step": 197330 + }, + { + "epoch": 0.7628612515656167, + "grad_norm": 0.09951891005039215, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 197340 + }, + { + "epoch": 0.762899908769, + "grad_norm": 0.10448072850704193, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 197350 + }, + { + "epoch": 0.7629385659723833, + "grad_norm": 0.113190196454525, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 197360 + }, + { + "epoch": 0.7629772231757666, + "grad_norm": 0.09222755581140518, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 197370 + }, + { + "epoch": 0.7630158803791498, + "grad_norm": 0.11096397042274475, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 197380 + }, + { + "epoch": 0.7630545375825332, + "grad_norm": 0.09983041882514954, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 197390 + }, + { + "epoch": 0.7630931947859164, + "grad_norm": 0.0992233157157898, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 197400 + }, + { + "epoch": 0.7631318519892997, + "grad_norm": 0.10024307668209076, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 197410 + }, + { + "epoch": 0.7631705091926829, + "grad_norm": 0.1278943419456482, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 197420 + }, + { + "epoch": 0.7632091663960663, + "grad_norm": 0.09778820723295212, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 197430 + }, + { + "epoch": 0.7632478235994495, + "grad_norm": 0.12126130610704422, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 197440 + }, + { + "epoch": 0.7632864808028328, + "grad_norm": 0.1095130667090416, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 197450 + }, + { + "epoch": 0.763325138006216, + "grad_norm": 0.11171168088912964, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 197460 + }, + { + "epoch": 0.7633637952095994, + "grad_norm": 0.11522161960601807, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 197470 + }, + { + "epoch": 0.7634024524129827, + "grad_norm": 0.09989353269338608, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 197480 + }, + { + "epoch": 0.7634411096163659, + "grad_norm": 0.16353625059127808, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 197490 + }, + { + "epoch": 0.7634797668197492, + "grad_norm": 0.10433971881866455, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 197500 + }, + { + "epoch": 0.7635184240231325, + "grad_norm": 0.09696116298437119, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 197510 + }, + { + "epoch": 0.7635570812265158, + "grad_norm": 0.09195984154939651, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 197520 + }, + { + "epoch": 0.763595738429899, + "grad_norm": 0.09522832185029984, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 197530 + }, + { + "epoch": 0.7636343956332823, + "grad_norm": 0.10282301157712936, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 197540 + }, + { + "epoch": 0.7636730528366655, + "grad_norm": 0.0923994779586792, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 197550 + }, + { + "epoch": 0.7637117100400489, + "grad_norm": 0.10165669769048691, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 197560 + }, + { + "epoch": 0.7637503672434321, + "grad_norm": 0.11128440499305725, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 197570 + }, + { + "epoch": 0.7637890244468154, + "grad_norm": 0.10605587065219879, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 197580 + }, + { + "epoch": 0.7638276816501987, + "grad_norm": 0.11360124498605728, + "learning_rate": 0.002, + "loss": 2.345, + "step": 197590 + }, + { + "epoch": 0.763866338853582, + "grad_norm": 0.09772372990846634, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 197600 + }, + { + "epoch": 0.7639049960569653, + "grad_norm": 0.11086378991603851, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 197610 + }, + { + "epoch": 0.7639436532603485, + "grad_norm": 0.11184732615947723, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 197620 + }, + { + "epoch": 0.7639823104637318, + "grad_norm": 0.11088281869888306, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 197630 + }, + { + "epoch": 0.7640209676671151, + "grad_norm": 0.10725362598896027, + "learning_rate": 0.002, + "loss": 2.347, + "step": 197640 + }, + { + "epoch": 0.7640596248704984, + "grad_norm": 0.09956399351358414, + "learning_rate": 0.002, + "loss": 2.34, + "step": 197650 + }, + { + "epoch": 0.7640982820738816, + "grad_norm": 0.09949301183223724, + "learning_rate": 0.002, + "loss": 2.32, + "step": 197660 + }, + { + "epoch": 0.7641369392772649, + "grad_norm": 0.0930347889661789, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 197670 + }, + { + "epoch": 0.7641755964806483, + "grad_norm": 0.10376568883657455, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 197680 + }, + { + "epoch": 0.7642142536840315, + "grad_norm": 0.10890960693359375, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 197690 + }, + { + "epoch": 0.7642529108874148, + "grad_norm": 0.15354023873806, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 197700 + }, + { + "epoch": 0.764291568090798, + "grad_norm": 0.11070028692483902, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 197710 + }, + { + "epoch": 0.7643302252941813, + "grad_norm": 0.10421784222126007, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 197720 + }, + { + "epoch": 0.7643688824975646, + "grad_norm": 0.09906643629074097, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 197730 + }, + { + "epoch": 0.7644075397009479, + "grad_norm": 0.12415380030870438, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 197740 + }, + { + "epoch": 0.7644461969043311, + "grad_norm": 0.09926281869411469, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 197750 + }, + { + "epoch": 0.7644848541077144, + "grad_norm": 0.10176531225442886, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 197760 + }, + { + "epoch": 0.7645235113110977, + "grad_norm": 0.09986831992864609, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 197770 + }, + { + "epoch": 0.764562168514481, + "grad_norm": 0.09915333241224289, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 197780 + }, + { + "epoch": 0.7646008257178643, + "grad_norm": 0.10770957171916962, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 197790 + }, + { + "epoch": 0.7646394829212475, + "grad_norm": 0.09436184167861938, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 197800 + }, + { + "epoch": 0.7646781401246309, + "grad_norm": 0.10471966117620468, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 197810 + }, + { + "epoch": 0.7647167973280141, + "grad_norm": 0.09943972527980804, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 197820 + }, + { + "epoch": 0.7647554545313974, + "grad_norm": 0.11482524871826172, + "learning_rate": 0.002, + "loss": 2.339, + "step": 197830 + }, + { + "epoch": 0.7647941117347806, + "grad_norm": 0.11444584280252457, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 197840 + }, + { + "epoch": 0.764832768938164, + "grad_norm": 0.10195798426866531, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 197850 + }, + { + "epoch": 0.7648714261415472, + "grad_norm": 0.09855667501688004, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 197860 + }, + { + "epoch": 0.7649100833449305, + "grad_norm": 0.11103340983390808, + "learning_rate": 0.002, + "loss": 2.328, + "step": 197870 + }, + { + "epoch": 0.7649487405483137, + "grad_norm": 0.11886495351791382, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 197880 + }, + { + "epoch": 0.764987397751697, + "grad_norm": 0.11483073979616165, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 197890 + }, + { + "epoch": 0.7650260549550804, + "grad_norm": 0.107142373919487, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 197900 + }, + { + "epoch": 0.7650647121584636, + "grad_norm": 0.10728046298027039, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 197910 + }, + { + "epoch": 0.7651033693618469, + "grad_norm": 0.11045504361391068, + "learning_rate": 0.002, + "loss": 2.342, + "step": 197920 + }, + { + "epoch": 0.7651420265652301, + "grad_norm": 0.12309353798627853, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 197930 + }, + { + "epoch": 0.7651806837686135, + "grad_norm": 0.09460339695215225, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 197940 + }, + { + "epoch": 0.7652193409719967, + "grad_norm": 0.09481628239154816, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 197950 + }, + { + "epoch": 0.76525799817538, + "grad_norm": 0.11327333003282547, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 197960 + }, + { + "epoch": 0.7652966553787632, + "grad_norm": 0.10542219132184982, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 197970 + }, + { + "epoch": 0.7653353125821466, + "grad_norm": 0.11292482167482376, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 197980 + }, + { + "epoch": 0.7653739697855299, + "grad_norm": 0.11517397314310074, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 197990 + }, + { + "epoch": 0.7654126269889131, + "grad_norm": 0.10499373078346252, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 198000 + }, + { + "epoch": 0.7654512841922964, + "grad_norm": 0.1152128055691719, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 198010 + }, + { + "epoch": 0.7654899413956797, + "grad_norm": 0.09470925480127335, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 198020 + }, + { + "epoch": 0.765528598599063, + "grad_norm": 0.09626875072717667, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 198030 + }, + { + "epoch": 0.7655672558024462, + "grad_norm": 0.1008152961730957, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 198040 + }, + { + "epoch": 0.7656059130058295, + "grad_norm": 0.09875728189945221, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 198050 + }, + { + "epoch": 0.7656445702092128, + "grad_norm": 0.10646268725395203, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 198060 + }, + { + "epoch": 0.7656832274125961, + "grad_norm": 0.10834681242704391, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 198070 + }, + { + "epoch": 0.7657218846159793, + "grad_norm": 0.12466521561145782, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 198080 + }, + { + "epoch": 0.7657605418193626, + "grad_norm": 0.08603180944919586, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 198090 + }, + { + "epoch": 0.7657991990227458, + "grad_norm": 0.1612296998500824, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 198100 + }, + { + "epoch": 0.7658378562261292, + "grad_norm": 0.1047164797782898, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 198110 + }, + { + "epoch": 0.7658765134295125, + "grad_norm": 0.09899793565273285, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 198120 + }, + { + "epoch": 0.7659151706328957, + "grad_norm": 0.1119118481874466, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 198130 + }, + { + "epoch": 0.765953827836279, + "grad_norm": 0.11324197798967361, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 198140 + }, + { + "epoch": 0.7659924850396623, + "grad_norm": 0.10404713451862335, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 198150 + }, + { + "epoch": 0.7660311422430456, + "grad_norm": 0.1023205816745758, + "learning_rate": 0.002, + "loss": 2.345, + "step": 198160 + }, + { + "epoch": 0.7660697994464288, + "grad_norm": 0.10559005290269852, + "learning_rate": 0.002, + "loss": 2.344, + "step": 198170 + }, + { + "epoch": 0.7661084566498121, + "grad_norm": 0.0908447802066803, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 198180 + }, + { + "epoch": 0.7661471138531954, + "grad_norm": 0.09740599989891052, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 198190 + }, + { + "epoch": 0.7661857710565787, + "grad_norm": 0.11079949140548706, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 198200 + }, + { + "epoch": 0.766224428259962, + "grad_norm": 0.10781296342611313, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 198210 + }, + { + "epoch": 0.7662630854633452, + "grad_norm": 0.09478621929883957, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 198220 + }, + { + "epoch": 0.7663017426667286, + "grad_norm": 0.1038641482591629, + "learning_rate": 0.002, + "loss": 2.338, + "step": 198230 + }, + { + "epoch": 0.7663403998701118, + "grad_norm": 0.10898188501596451, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 198240 + }, + { + "epoch": 0.7663790570734951, + "grad_norm": 0.09819826483726501, + "learning_rate": 0.002, + "loss": 2.336, + "step": 198250 + }, + { + "epoch": 0.7664177142768783, + "grad_norm": 0.11344137042760849, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 198260 + }, + { + "epoch": 0.7664563714802616, + "grad_norm": 0.11274783313274384, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 198270 + }, + { + "epoch": 0.7664950286836449, + "grad_norm": 0.1241864413022995, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 198280 + }, + { + "epoch": 0.7665336858870282, + "grad_norm": 0.0982760488986969, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 198290 + }, + { + "epoch": 0.7665723430904114, + "grad_norm": 0.09791640192270279, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 198300 + }, + { + "epoch": 0.7666110002937947, + "grad_norm": 0.09974829107522964, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 198310 + }, + { + "epoch": 0.7666496574971781, + "grad_norm": 0.10934463888406754, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 198320 + }, + { + "epoch": 0.7666883147005613, + "grad_norm": 0.09941709786653519, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 198330 + }, + { + "epoch": 0.7667269719039446, + "grad_norm": 0.11662741750478745, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 198340 + }, + { + "epoch": 0.7667656291073278, + "grad_norm": 0.0965876653790474, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 198350 + }, + { + "epoch": 0.7668042863107112, + "grad_norm": 0.10685340315103531, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 198360 + }, + { + "epoch": 0.7668429435140944, + "grad_norm": 0.08948315680027008, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 198370 + }, + { + "epoch": 0.7668816007174777, + "grad_norm": 0.19111230969429016, + "learning_rate": 0.002, + "loss": 2.339, + "step": 198380 + }, + { + "epoch": 0.7669202579208609, + "grad_norm": 0.09882853180170059, + "learning_rate": 0.002, + "loss": 2.341, + "step": 198390 + }, + { + "epoch": 0.7669589151242443, + "grad_norm": 0.09126832336187363, + "learning_rate": 0.002, + "loss": 2.347, + "step": 198400 + }, + { + "epoch": 0.7669975723276276, + "grad_norm": 0.09862853586673737, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 198410 + }, + { + "epoch": 0.7670362295310108, + "grad_norm": 0.10504437983036041, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 198420 + }, + { + "epoch": 0.7670748867343941, + "grad_norm": 0.11240246891975403, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 198430 + }, + { + "epoch": 0.7671135439377774, + "grad_norm": 0.1019548624753952, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 198440 + }, + { + "epoch": 0.7671522011411607, + "grad_norm": 0.11174459755420685, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 198450 + }, + { + "epoch": 0.7671908583445439, + "grad_norm": 0.1078474372625351, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 198460 + }, + { + "epoch": 0.7672295155479272, + "grad_norm": 0.11592960357666016, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 198470 + }, + { + "epoch": 0.7672681727513104, + "grad_norm": 0.12025527656078339, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 198480 + }, + { + "epoch": 0.7673068299546938, + "grad_norm": 0.11120238900184631, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 198490 + }, + { + "epoch": 0.767345487158077, + "grad_norm": 0.11129934340715408, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 198500 + }, + { + "epoch": 0.7673841443614603, + "grad_norm": 0.10082795470952988, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 198510 + }, + { + "epoch": 0.7674228015648435, + "grad_norm": 0.08847419917583466, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 198520 + }, + { + "epoch": 0.7674614587682269, + "grad_norm": 0.15819251537322998, + "learning_rate": 0.002, + "loss": 2.336, + "step": 198530 + }, + { + "epoch": 0.7675001159716102, + "grad_norm": 0.09503842145204544, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 198540 + }, + { + "epoch": 0.7675387731749934, + "grad_norm": 0.107664093375206, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 198550 + }, + { + "epoch": 0.7675774303783767, + "grad_norm": 0.10473312437534332, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 198560 + }, + { + "epoch": 0.76761608758176, + "grad_norm": 0.08977878093719482, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 198570 + }, + { + "epoch": 0.7676547447851433, + "grad_norm": 0.09275360405445099, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 198580 + }, + { + "epoch": 0.7676934019885265, + "grad_norm": 0.10299757122993469, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 198590 + }, + { + "epoch": 0.7677320591919098, + "grad_norm": 0.09542661905288696, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 198600 + }, + { + "epoch": 0.7677707163952932, + "grad_norm": 0.09400220960378647, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 198610 + }, + { + "epoch": 0.7678093735986764, + "grad_norm": 0.1107887402176857, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 198620 + }, + { + "epoch": 0.7678480308020597, + "grad_norm": 0.1019064262509346, + "learning_rate": 0.002, + "loss": 2.324, + "step": 198630 + }, + { + "epoch": 0.7678866880054429, + "grad_norm": 0.11500236392021179, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 198640 + }, + { + "epoch": 0.7679253452088262, + "grad_norm": 0.09175709635019302, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 198650 + }, + { + "epoch": 0.7679640024122095, + "grad_norm": 0.108940489590168, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 198660 + }, + { + "epoch": 0.7680026596155928, + "grad_norm": 0.09465809166431427, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 198670 + }, + { + "epoch": 0.768041316818976, + "grad_norm": 0.12499912083148956, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 198680 + }, + { + "epoch": 0.7680799740223593, + "grad_norm": 0.09948600083589554, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 198690 + }, + { + "epoch": 0.7681186312257426, + "grad_norm": 0.09964347630739212, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 198700 + }, + { + "epoch": 0.7681572884291259, + "grad_norm": 0.11331164091825485, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 198710 + }, + { + "epoch": 0.7681959456325091, + "grad_norm": 0.11339457333087921, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 198720 + }, + { + "epoch": 0.7682346028358924, + "grad_norm": 0.10204224288463593, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 198730 + }, + { + "epoch": 0.7682732600392758, + "grad_norm": 0.1111808642745018, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 198740 + }, + { + "epoch": 0.768311917242659, + "grad_norm": 0.12683844566345215, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 198750 + }, + { + "epoch": 0.7683505744460423, + "grad_norm": 0.11128699779510498, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 198760 + }, + { + "epoch": 0.7683892316494255, + "grad_norm": 0.09829942882061005, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 198770 + }, + { + "epoch": 0.7684278888528089, + "grad_norm": 0.1252112090587616, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 198780 + }, + { + "epoch": 0.7684665460561921, + "grad_norm": 0.11201326549053192, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 198790 + }, + { + "epoch": 0.7685052032595754, + "grad_norm": 0.1164776012301445, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 198800 + }, + { + "epoch": 0.7685438604629586, + "grad_norm": 0.10831308364868164, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 198810 + }, + { + "epoch": 0.7685825176663419, + "grad_norm": 0.09699341654777527, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 198820 + }, + { + "epoch": 0.7686211748697253, + "grad_norm": 0.10668003559112549, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 198830 + }, + { + "epoch": 0.7686598320731085, + "grad_norm": 0.10341284424066544, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 198840 + }, + { + "epoch": 0.7686984892764918, + "grad_norm": 0.10115321725606918, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 198850 + }, + { + "epoch": 0.768737146479875, + "grad_norm": 0.0972030982375145, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 198860 + }, + { + "epoch": 0.7687758036832584, + "grad_norm": 0.09830368310213089, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 198870 + }, + { + "epoch": 0.7688144608866416, + "grad_norm": 0.12544550001621246, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 198880 + }, + { + "epoch": 0.7688531180900249, + "grad_norm": 0.08823167532682419, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 198890 + }, + { + "epoch": 0.7688917752934081, + "grad_norm": 0.09672213345766068, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 198900 + }, + { + "epoch": 0.7689304324967915, + "grad_norm": 0.10546525567770004, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 198910 + }, + { + "epoch": 0.7689690897001747, + "grad_norm": 0.10554137825965881, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 198920 + }, + { + "epoch": 0.769007746903558, + "grad_norm": 0.09676895290613174, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 198930 + }, + { + "epoch": 0.7690464041069413, + "grad_norm": 0.11075710505247116, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 198940 + }, + { + "epoch": 0.7690850613103246, + "grad_norm": 0.10539600253105164, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 198950 + }, + { + "epoch": 0.7691237185137079, + "grad_norm": 0.10727138817310333, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 198960 + }, + { + "epoch": 0.7691623757170911, + "grad_norm": 0.0944301187992096, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 198970 + }, + { + "epoch": 0.7692010329204744, + "grad_norm": 0.10199394077062607, + "learning_rate": 0.002, + "loss": 2.344, + "step": 198980 + }, + { + "epoch": 0.7692396901238577, + "grad_norm": 0.11412839591503143, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 198990 + }, + { + "epoch": 0.769278347327241, + "grad_norm": 0.10684604942798615, + "learning_rate": 0.002, + "loss": 2.345, + "step": 199000 + }, + { + "epoch": 0.7693170045306242, + "grad_norm": 0.10144107788801193, + "learning_rate": 0.002, + "loss": 2.331, + "step": 199010 + }, + { + "epoch": 0.7693556617340075, + "grad_norm": 0.10650350153446198, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 199020 + }, + { + "epoch": 0.7693943189373907, + "grad_norm": 0.10620681196451187, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 199030 + }, + { + "epoch": 0.7694329761407741, + "grad_norm": 0.0976841002702713, + "learning_rate": 0.002, + "loss": 2.332, + "step": 199040 + }, + { + "epoch": 0.7694716333441574, + "grad_norm": 0.09857263416051865, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 199050 + }, + { + "epoch": 0.7695102905475406, + "grad_norm": 0.10668320953845978, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 199060 + }, + { + "epoch": 0.7695489477509239, + "grad_norm": 0.11263915151357651, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 199070 + }, + { + "epoch": 0.7695876049543072, + "grad_norm": 0.1733059585094452, + "learning_rate": 0.002, + "loss": 2.337, + "step": 199080 + }, + { + "epoch": 0.7696262621576905, + "grad_norm": 0.10049034655094147, + "learning_rate": 0.002, + "loss": 2.352, + "step": 199090 + }, + { + "epoch": 0.7696649193610737, + "grad_norm": 0.10561171174049377, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 199100 + }, + { + "epoch": 0.769703576564457, + "grad_norm": 0.10926441848278046, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 199110 + }, + { + "epoch": 0.7697422337678403, + "grad_norm": 0.11219510436058044, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 199120 + }, + { + "epoch": 0.7697808909712236, + "grad_norm": 0.10429269075393677, + "learning_rate": 0.002, + "loss": 2.344, + "step": 199130 + }, + { + "epoch": 0.7698195481746068, + "grad_norm": 0.1282796859741211, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 199140 + }, + { + "epoch": 0.7698582053779901, + "grad_norm": 0.10086347907781601, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 199150 + }, + { + "epoch": 0.7698968625813735, + "grad_norm": 0.10646980255842209, + "learning_rate": 0.002, + "loss": 2.34, + "step": 199160 + }, + { + "epoch": 0.7699355197847567, + "grad_norm": 0.10037669539451599, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 199170 + }, + { + "epoch": 0.76997417698814, + "grad_norm": 0.10189975053071976, + "learning_rate": 0.002, + "loss": 2.326, + "step": 199180 + }, + { + "epoch": 0.7700128341915232, + "grad_norm": 0.11893989145755768, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 199190 + }, + { + "epoch": 0.7700514913949065, + "grad_norm": 0.08906824141740799, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 199200 + }, + { + "epoch": 0.7700901485982898, + "grad_norm": 0.09848733246326447, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 199210 + }, + { + "epoch": 0.7701288058016731, + "grad_norm": 0.09648095816373825, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 199220 + }, + { + "epoch": 0.7701674630050563, + "grad_norm": 0.10332004725933075, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 199230 + }, + { + "epoch": 0.7702061202084396, + "grad_norm": 0.09868767857551575, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 199240 + }, + { + "epoch": 0.770244777411823, + "grad_norm": 0.09203759580850601, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 199250 + }, + { + "epoch": 0.7702834346152062, + "grad_norm": 0.09470377117395401, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 199260 + }, + { + "epoch": 0.7703220918185895, + "grad_norm": 0.09742645174264908, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 199270 + }, + { + "epoch": 0.7703607490219727, + "grad_norm": 0.12007997184991837, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 199280 + }, + { + "epoch": 0.7703994062253561, + "grad_norm": 0.09929712861776352, + "learning_rate": 0.002, + "loss": 2.347, + "step": 199290 + }, + { + "epoch": 0.7704380634287393, + "grad_norm": 0.13558223843574524, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 199300 + }, + { + "epoch": 0.7704767206321226, + "grad_norm": 0.09942631423473358, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 199310 + }, + { + "epoch": 0.7705153778355058, + "grad_norm": 0.09960619360208511, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 199320 + }, + { + "epoch": 0.7705540350388892, + "grad_norm": 0.10436469316482544, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 199330 + }, + { + "epoch": 0.7705926922422724, + "grad_norm": 0.11296504735946655, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 199340 + }, + { + "epoch": 0.7706313494456557, + "grad_norm": 0.11591428518295288, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 199350 + }, + { + "epoch": 0.770670006649039, + "grad_norm": 0.10151247680187225, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 199360 + }, + { + "epoch": 0.7707086638524223, + "grad_norm": 0.13121308386325836, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 199370 + }, + { + "epoch": 0.7707473210558056, + "grad_norm": 0.1014532521367073, + "learning_rate": 0.002, + "loss": 2.339, + "step": 199380 + }, + { + "epoch": 0.7707859782591888, + "grad_norm": 0.09893794357776642, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 199390 + }, + { + "epoch": 0.7708246354625721, + "grad_norm": 0.11550657451152802, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 199400 + }, + { + "epoch": 0.7708632926659553, + "grad_norm": 0.09744752943515778, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 199410 + }, + { + "epoch": 0.7709019498693387, + "grad_norm": 0.09887513518333435, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 199420 + }, + { + "epoch": 0.7709406070727219, + "grad_norm": 0.13161727786064148, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 199430 + }, + { + "epoch": 0.7709792642761052, + "grad_norm": 0.10517606884241104, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 199440 + }, + { + "epoch": 0.7710179214794884, + "grad_norm": 0.10197417438030243, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 199450 + }, + { + "epoch": 0.7710565786828718, + "grad_norm": 0.0997900515794754, + "learning_rate": 0.002, + "loss": 2.337, + "step": 199460 + }, + { + "epoch": 0.7710952358862551, + "grad_norm": 0.12040169537067413, + "learning_rate": 0.002, + "loss": 2.346, + "step": 199470 + }, + { + "epoch": 0.7711338930896383, + "grad_norm": 0.10656525194644928, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 199480 + }, + { + "epoch": 0.7711725502930216, + "grad_norm": 0.09704674035310745, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 199490 + }, + { + "epoch": 0.7712112074964049, + "grad_norm": 0.10840927809476852, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 199500 + }, + { + "epoch": 0.7712498646997882, + "grad_norm": 0.10759053379297256, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 199510 + }, + { + "epoch": 0.7712885219031714, + "grad_norm": 0.09853320568799973, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 199520 + }, + { + "epoch": 0.7713271791065547, + "grad_norm": 0.10677963495254517, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 199530 + }, + { + "epoch": 0.771365836309938, + "grad_norm": 0.09678921103477478, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 199540 + }, + { + "epoch": 0.7714044935133213, + "grad_norm": 0.09936831146478653, + "learning_rate": 0.002, + "loss": 2.324, + "step": 199550 + }, + { + "epoch": 0.7714431507167046, + "grad_norm": 0.09832432866096497, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 199560 + }, + { + "epoch": 0.7714818079200878, + "grad_norm": 0.105009064078331, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 199570 + }, + { + "epoch": 0.771520465123471, + "grad_norm": 0.09333699196577072, + "learning_rate": 0.002, + "loss": 2.332, + "step": 199580 + }, + { + "epoch": 0.7715591223268544, + "grad_norm": 0.10455092787742615, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 199590 + }, + { + "epoch": 0.7715977795302377, + "grad_norm": 0.15664206445217133, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 199600 + }, + { + "epoch": 0.7716364367336209, + "grad_norm": 0.11696160584688187, + "learning_rate": 0.002, + "loss": 2.335, + "step": 199610 + }, + { + "epoch": 0.7716750939370042, + "grad_norm": 0.10086527466773987, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 199620 + }, + { + "epoch": 0.7717137511403875, + "grad_norm": 0.09339655190706253, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 199630 + }, + { + "epoch": 0.7717524083437708, + "grad_norm": 0.09983646124601364, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 199640 + }, + { + "epoch": 0.771791065547154, + "grad_norm": 0.14009705185890198, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 199650 + }, + { + "epoch": 0.7718297227505373, + "grad_norm": 0.13227851688861847, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 199660 + }, + { + "epoch": 0.7718683799539207, + "grad_norm": 0.14454631507396698, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 199670 + }, + { + "epoch": 0.7719070371573039, + "grad_norm": 0.1293637454509735, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 199680 + }, + { + "epoch": 0.7719456943606872, + "grad_norm": 0.1015193909406662, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 199690 + }, + { + "epoch": 0.7719843515640704, + "grad_norm": 0.11463967710733414, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 199700 + }, + { + "epoch": 0.7720230087674538, + "grad_norm": 0.0929514467716217, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 199710 + }, + { + "epoch": 0.772061665970837, + "grad_norm": 0.10017342865467072, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 199720 + }, + { + "epoch": 0.7721003231742203, + "grad_norm": 0.10128919035196304, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 199730 + }, + { + "epoch": 0.7721389803776035, + "grad_norm": 1.573693037033081, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 199740 + }, + { + "epoch": 0.7721776375809868, + "grad_norm": 0.12044832110404968, + "learning_rate": 0.002, + "loss": 2.3715, + "step": 199750 + }, + { + "epoch": 0.7722162947843701, + "grad_norm": 0.10077551752328873, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 199760 + }, + { + "epoch": 0.7722549519877534, + "grad_norm": 0.10538480430841446, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 199770 + }, + { + "epoch": 0.7722936091911367, + "grad_norm": 0.10388049483299255, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 199780 + }, + { + "epoch": 0.7723322663945199, + "grad_norm": 0.10569577664136887, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 199790 + }, + { + "epoch": 0.7723709235979033, + "grad_norm": 0.09584219008684158, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 199800 + }, + { + "epoch": 0.7724095808012865, + "grad_norm": 0.11350366473197937, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 199810 + }, + { + "epoch": 0.7724482380046698, + "grad_norm": 0.09763793647289276, + "learning_rate": 0.002, + "loss": 2.333, + "step": 199820 + }, + { + "epoch": 0.772486895208053, + "grad_norm": 0.0916409119963646, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 199830 + }, + { + "epoch": 0.7725255524114364, + "grad_norm": 0.11362338066101074, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 199840 + }, + { + "epoch": 0.7725642096148196, + "grad_norm": 0.10839736461639404, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 199850 + }, + { + "epoch": 0.7726028668182029, + "grad_norm": 0.1145472452044487, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 199860 + }, + { + "epoch": 0.7726415240215861, + "grad_norm": 0.1063324436545372, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 199870 + }, + { + "epoch": 0.7726801812249695, + "grad_norm": 0.10044882446527481, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 199880 + }, + { + "epoch": 0.7727188384283528, + "grad_norm": 0.09982284158468246, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 199890 + }, + { + "epoch": 0.772757495631736, + "grad_norm": 0.0994446650147438, + "learning_rate": 0.002, + "loss": 2.346, + "step": 199900 + }, + { + "epoch": 0.7727961528351193, + "grad_norm": 0.09719647467136383, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 199910 + }, + { + "epoch": 0.7728348100385026, + "grad_norm": 0.11520253121852875, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 199920 + }, + { + "epoch": 0.7728734672418859, + "grad_norm": 0.12013207376003265, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 199930 + }, + { + "epoch": 0.7729121244452691, + "grad_norm": 0.09739863872528076, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 199940 + }, + { + "epoch": 0.7729507816486524, + "grad_norm": 0.09742973744869232, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 199950 + }, + { + "epoch": 0.7729894388520356, + "grad_norm": 0.08914405852556229, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 199960 + }, + { + "epoch": 0.773028096055419, + "grad_norm": 0.11501894891262054, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 199970 + }, + { + "epoch": 0.7730667532588023, + "grad_norm": 0.10237744450569153, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 199980 + }, + { + "epoch": 0.7731054104621855, + "grad_norm": 0.0975678488612175, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 199990 + }, + { + "epoch": 0.7731440676655688, + "grad_norm": 0.1083686575293541, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 200000 + }, + { + "epoch": 0.7731827248689521, + "grad_norm": 0.11673914641141891, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 200010 + }, + { + "epoch": 0.7732213820723354, + "grad_norm": 0.11466530710458755, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 200020 + }, + { + "epoch": 0.7732600392757186, + "grad_norm": 0.08803026378154755, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 200030 + }, + { + "epoch": 0.7732986964791019, + "grad_norm": 0.10837738960981369, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 200040 + }, + { + "epoch": 0.7733373536824852, + "grad_norm": 0.0982581079006195, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 200050 + }, + { + "epoch": 0.7733760108858685, + "grad_norm": 0.11603618413209915, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 200060 + }, + { + "epoch": 0.7734146680892517, + "grad_norm": 0.10336446762084961, + "learning_rate": 0.002, + "loss": 2.343, + "step": 200070 + }, + { + "epoch": 0.773453325292635, + "grad_norm": 0.11303336173295975, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 200080 + }, + { + "epoch": 0.7734919824960184, + "grad_norm": 0.12811653316020966, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 200090 + }, + { + "epoch": 0.7735306396994016, + "grad_norm": 0.0947103500366211, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 200100 + }, + { + "epoch": 0.7735692969027849, + "grad_norm": 0.09696483612060547, + "learning_rate": 0.002, + "loss": 2.352, + "step": 200110 + }, + { + "epoch": 0.7736079541061681, + "grad_norm": 0.09755218029022217, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 200120 + }, + { + "epoch": 0.7736466113095514, + "grad_norm": 0.12071071565151215, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 200130 + }, + { + "epoch": 0.7736852685129347, + "grad_norm": 0.10463309288024902, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 200140 + }, + { + "epoch": 0.773723925716318, + "grad_norm": 0.09501010179519653, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 200150 + }, + { + "epoch": 0.7737625829197012, + "grad_norm": 0.13405849039554596, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 200160 + }, + { + "epoch": 0.7738012401230845, + "grad_norm": 0.08840669691562653, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 200170 + }, + { + "epoch": 0.7738398973264679, + "grad_norm": 0.1252623349428177, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 200180 + }, + { + "epoch": 0.7738785545298511, + "grad_norm": 0.11263350397348404, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 200190 + }, + { + "epoch": 0.7739172117332344, + "grad_norm": 0.09044620394706726, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 200200 + }, + { + "epoch": 0.7739558689366176, + "grad_norm": 0.11641410738229752, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 200210 + }, + { + "epoch": 0.773994526140001, + "grad_norm": 0.1021437793970108, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 200220 + }, + { + "epoch": 0.7740331833433842, + "grad_norm": 0.10622060298919678, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 200230 + }, + { + "epoch": 0.7740718405467675, + "grad_norm": 0.13026146590709686, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 200240 + }, + { + "epoch": 0.7741104977501507, + "grad_norm": 0.4658176004886627, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 200250 + }, + { + "epoch": 0.7741491549535341, + "grad_norm": 0.09170550853013992, + "learning_rate": 0.002, + "loss": 2.346, + "step": 200260 + }, + { + "epoch": 0.7741878121569173, + "grad_norm": 0.10174133628606796, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 200270 + }, + { + "epoch": 0.7742264693603006, + "grad_norm": 0.09880983084440231, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 200280 + }, + { + "epoch": 0.7742651265636838, + "grad_norm": 0.10881564766168594, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 200290 + }, + { + "epoch": 0.7743037837670672, + "grad_norm": 0.10786505788564682, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 200300 + }, + { + "epoch": 0.7743424409704505, + "grad_norm": 0.10389269143342972, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 200310 + }, + { + "epoch": 0.7743810981738337, + "grad_norm": 0.12156102806329727, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 200320 + }, + { + "epoch": 0.774419755377217, + "grad_norm": 0.10176001489162445, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 200330 + }, + { + "epoch": 0.7744584125806002, + "grad_norm": 0.11010489612817764, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 200340 + }, + { + "epoch": 0.7744970697839836, + "grad_norm": 0.10778948664665222, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 200350 + }, + { + "epoch": 0.7745357269873668, + "grad_norm": 0.10530728101730347, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 200360 + }, + { + "epoch": 0.7745743841907501, + "grad_norm": 0.10748562216758728, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 200370 + }, + { + "epoch": 0.7746130413941333, + "grad_norm": 0.1270465850830078, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 200380 + }, + { + "epoch": 0.7746516985975167, + "grad_norm": 0.11276346445083618, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 200390 + }, + { + "epoch": 0.7746903558009, + "grad_norm": 0.09427843242883682, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 200400 + }, + { + "epoch": 0.7747290130042832, + "grad_norm": 0.10547053068876266, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 200410 + }, + { + "epoch": 0.7747676702076665, + "grad_norm": 0.0937475636601448, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 200420 + }, + { + "epoch": 0.7748063274110498, + "grad_norm": 0.10085562616586685, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 200430 + }, + { + "epoch": 0.7748449846144331, + "grad_norm": 0.10139386355876923, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 200440 + }, + { + "epoch": 0.7748836418178163, + "grad_norm": 0.11379513144493103, + "learning_rate": 0.002, + "loss": 2.344, + "step": 200450 + }, + { + "epoch": 0.7749222990211996, + "grad_norm": 0.1242862194776535, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 200460 + }, + { + "epoch": 0.7749609562245829, + "grad_norm": 0.10932227224111557, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 200470 + }, + { + "epoch": 0.7749996134279662, + "grad_norm": 0.10406124591827393, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 200480 + }, + { + "epoch": 0.7750382706313494, + "grad_norm": 0.10119037330150604, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 200490 + }, + { + "epoch": 0.7750769278347327, + "grad_norm": 0.10155569016933441, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 200500 + }, + { + "epoch": 0.775115585038116, + "grad_norm": 0.09935098141431808, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 200510 + }, + { + "epoch": 0.7751542422414993, + "grad_norm": 0.11348678171634674, + "learning_rate": 0.002, + "loss": 2.34, + "step": 200520 + }, + { + "epoch": 0.7751928994448826, + "grad_norm": 0.09598572552204132, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 200530 + }, + { + "epoch": 0.7752315566482658, + "grad_norm": 0.11006144434213638, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 200540 + }, + { + "epoch": 0.7752702138516491, + "grad_norm": 0.10205881297588348, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 200550 + }, + { + "epoch": 0.7753088710550324, + "grad_norm": 0.11025692522525787, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 200560 + }, + { + "epoch": 0.7753475282584157, + "grad_norm": 0.10488265752792358, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 200570 + }, + { + "epoch": 0.7753861854617989, + "grad_norm": 0.09900173544883728, + "learning_rate": 0.002, + "loss": 2.34, + "step": 200580 + }, + { + "epoch": 0.7754248426651822, + "grad_norm": 0.09940632432699203, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 200590 + }, + { + "epoch": 0.7754634998685656, + "grad_norm": 0.10207568109035492, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 200600 + }, + { + "epoch": 0.7755021570719488, + "grad_norm": 0.09659580886363983, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 200610 + }, + { + "epoch": 0.7755408142753321, + "grad_norm": 0.13341741263866425, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 200620 + }, + { + "epoch": 0.7755794714787153, + "grad_norm": 0.12346010655164719, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 200630 + }, + { + "epoch": 0.7756181286820987, + "grad_norm": 0.09464599192142487, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 200640 + }, + { + "epoch": 0.7756567858854819, + "grad_norm": 0.10147619992494583, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 200650 + }, + { + "epoch": 0.7756954430888652, + "grad_norm": 0.09558901935815811, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 200660 + }, + { + "epoch": 0.7757341002922484, + "grad_norm": 0.11636707186698914, + "learning_rate": 0.002, + "loss": 2.338, + "step": 200670 + }, + { + "epoch": 0.7757727574956317, + "grad_norm": 0.1300460696220398, + "learning_rate": 0.002, + "loss": 2.34, + "step": 200680 + }, + { + "epoch": 0.775811414699015, + "grad_norm": 0.09909453988075256, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 200690 + }, + { + "epoch": 0.7758500719023983, + "grad_norm": 0.09746900200843811, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 200700 + }, + { + "epoch": 0.7758887291057815, + "grad_norm": 0.09796787053346634, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 200710 + }, + { + "epoch": 0.7759273863091648, + "grad_norm": 0.11972364038228989, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 200720 + }, + { + "epoch": 0.7759660435125482, + "grad_norm": 0.09397576004266739, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 200730 + }, + { + "epoch": 0.7760047007159314, + "grad_norm": 0.09508085995912552, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 200740 + }, + { + "epoch": 0.7760433579193147, + "grad_norm": 0.09870732575654984, + "learning_rate": 0.002, + "loss": 2.339, + "step": 200750 + }, + { + "epoch": 0.7760820151226979, + "grad_norm": 0.10102210938930511, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 200760 + }, + { + "epoch": 0.7761206723260813, + "grad_norm": 0.10540328174829483, + "learning_rate": 0.002, + "loss": 2.341, + "step": 200770 + }, + { + "epoch": 0.7761593295294645, + "grad_norm": 0.09322625398635864, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 200780 + }, + { + "epoch": 0.7761979867328478, + "grad_norm": 0.10350510478019714, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 200790 + }, + { + "epoch": 0.776236643936231, + "grad_norm": 0.10563094168901443, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 200800 + }, + { + "epoch": 0.7762753011396144, + "grad_norm": 0.10559462755918503, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 200810 + }, + { + "epoch": 0.7763139583429977, + "grad_norm": 0.0866955816745758, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 200820 + }, + { + "epoch": 0.7763526155463809, + "grad_norm": 0.10731297731399536, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 200830 + }, + { + "epoch": 0.7763912727497642, + "grad_norm": 0.09620799124240875, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 200840 + }, + { + "epoch": 0.7764299299531475, + "grad_norm": 0.1093180775642395, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 200850 + }, + { + "epoch": 0.7764685871565308, + "grad_norm": 0.10197804123163223, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 200860 + }, + { + "epoch": 0.776507244359914, + "grad_norm": 0.12022387236356735, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 200870 + }, + { + "epoch": 0.7765459015632973, + "grad_norm": 0.08632820844650269, + "learning_rate": 0.002, + "loss": 2.347, + "step": 200880 + }, + { + "epoch": 0.7765845587666805, + "grad_norm": 0.11173965036869049, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 200890 + }, + { + "epoch": 0.7766232159700639, + "grad_norm": 0.10657903552055359, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 200900 + }, + { + "epoch": 0.7766618731734471, + "grad_norm": 0.0989794209599495, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 200910 + }, + { + "epoch": 0.7767005303768304, + "grad_norm": 0.10527005046606064, + "learning_rate": 0.002, + "loss": 2.349, + "step": 200920 + }, + { + "epoch": 0.7767391875802137, + "grad_norm": 0.12138840556144714, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 200930 + }, + { + "epoch": 0.776777844783597, + "grad_norm": 0.10083303600549698, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 200940 + }, + { + "epoch": 0.7768165019869803, + "grad_norm": 0.09858760237693787, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 200950 + }, + { + "epoch": 0.7768551591903635, + "grad_norm": 0.1061653196811676, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 200960 + }, + { + "epoch": 0.7768938163937468, + "grad_norm": 0.09524355828762054, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 200970 + }, + { + "epoch": 0.7769324735971301, + "grad_norm": 0.1071450337767601, + "learning_rate": 0.002, + "loss": 2.34, + "step": 200980 + }, + { + "epoch": 0.7769711308005134, + "grad_norm": 0.09246594458818436, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 200990 + }, + { + "epoch": 0.7770097880038966, + "grad_norm": 0.08741657435894012, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 201000 + }, + { + "epoch": 0.7770484452072799, + "grad_norm": 0.1069013699889183, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 201010 + }, + { + "epoch": 0.7770871024106633, + "grad_norm": 0.09173951297998428, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 201020 + }, + { + "epoch": 0.7771257596140465, + "grad_norm": 0.09130797535181046, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 201030 + }, + { + "epoch": 0.7771644168174298, + "grad_norm": 0.09466081857681274, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 201040 + }, + { + "epoch": 0.777203074020813, + "grad_norm": 0.11741433292627335, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 201050 + }, + { + "epoch": 0.7772417312241963, + "grad_norm": 0.09997253119945526, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 201060 + }, + { + "epoch": 0.7772803884275796, + "grad_norm": 0.0999874547123909, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 201070 + }, + { + "epoch": 0.7773190456309629, + "grad_norm": 0.11438187211751938, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 201080 + }, + { + "epoch": 0.7773577028343461, + "grad_norm": 0.19451777637004852, + "learning_rate": 0.002, + "loss": 2.335, + "step": 201090 + }, + { + "epoch": 0.7773963600377294, + "grad_norm": 0.10860533267259598, + "learning_rate": 0.002, + "loss": 2.336, + "step": 201100 + }, + { + "epoch": 0.7774350172411127, + "grad_norm": 0.1131187304854393, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 201110 + }, + { + "epoch": 0.777473674444496, + "grad_norm": 0.10684671252965927, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 201120 + }, + { + "epoch": 0.7775123316478793, + "grad_norm": 0.10805738717317581, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 201130 + }, + { + "epoch": 0.7775509888512625, + "grad_norm": 0.0960422232747078, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 201140 + }, + { + "epoch": 0.7775896460546459, + "grad_norm": 0.12220508605241776, + "learning_rate": 0.002, + "loss": 2.346, + "step": 201150 + }, + { + "epoch": 0.7776283032580291, + "grad_norm": 0.09800433367490768, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 201160 + }, + { + "epoch": 0.7776669604614124, + "grad_norm": 0.09436652064323425, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 201170 + }, + { + "epoch": 0.7777056176647956, + "grad_norm": 0.0989791750907898, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 201180 + }, + { + "epoch": 0.777744274868179, + "grad_norm": 0.10169640928506851, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 201190 + }, + { + "epoch": 0.7777829320715622, + "grad_norm": 0.11647674441337585, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 201200 + }, + { + "epoch": 0.7778215892749455, + "grad_norm": 0.12052612751722336, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 201210 + }, + { + "epoch": 0.7778602464783287, + "grad_norm": 0.11180232465267181, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 201220 + }, + { + "epoch": 0.777898903681712, + "grad_norm": 0.09033545106649399, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 201230 + }, + { + "epoch": 0.7779375608850954, + "grad_norm": 0.10001803934574127, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 201240 + }, + { + "epoch": 0.7779762180884786, + "grad_norm": 0.1214684322476387, + "learning_rate": 0.002, + "loss": 2.347, + "step": 201250 + }, + { + "epoch": 0.7780148752918619, + "grad_norm": 0.09808328002691269, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 201260 + }, + { + "epoch": 0.7780535324952451, + "grad_norm": 0.08842272311449051, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 201270 + }, + { + "epoch": 0.7780921896986285, + "grad_norm": 0.13397300243377686, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 201280 + }, + { + "epoch": 0.7781308469020117, + "grad_norm": 0.1140327900648117, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 201290 + }, + { + "epoch": 0.778169504105395, + "grad_norm": 0.09129363298416138, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 201300 + }, + { + "epoch": 0.7782081613087782, + "grad_norm": 0.10088945925235748, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 201310 + }, + { + "epoch": 0.7782468185121616, + "grad_norm": 0.09986527264118195, + "learning_rate": 0.002, + "loss": 2.334, + "step": 201320 + }, + { + "epoch": 0.7782854757155448, + "grad_norm": 0.10828401148319244, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 201330 + }, + { + "epoch": 0.7783241329189281, + "grad_norm": 0.10116782784461975, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 201340 + }, + { + "epoch": 0.7783627901223114, + "grad_norm": 0.09193418174982071, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 201350 + }, + { + "epoch": 0.7784014473256947, + "grad_norm": 0.09904490411281586, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 201360 + }, + { + "epoch": 0.778440104529078, + "grad_norm": 0.09490885585546494, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 201370 + }, + { + "epoch": 0.7784787617324612, + "grad_norm": 0.13544167578220367, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 201380 + }, + { + "epoch": 0.7785174189358445, + "grad_norm": 0.10728796571493149, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 201390 + }, + { + "epoch": 0.7785560761392278, + "grad_norm": 0.0904412791132927, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 201400 + }, + { + "epoch": 0.7785947333426111, + "grad_norm": 0.1182757243514061, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 201410 + }, + { + "epoch": 0.7786333905459943, + "grad_norm": 0.10843978822231293, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 201420 + }, + { + "epoch": 0.7786720477493776, + "grad_norm": 0.09948401898145676, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 201430 + }, + { + "epoch": 0.7787107049527608, + "grad_norm": 0.10118865966796875, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 201440 + }, + { + "epoch": 0.7787493621561442, + "grad_norm": 0.12530890107154846, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 201450 + }, + { + "epoch": 0.7787880193595275, + "grad_norm": 0.11728890240192413, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 201460 + }, + { + "epoch": 0.7788266765629107, + "grad_norm": 0.10972714424133301, + "learning_rate": 0.002, + "loss": 2.346, + "step": 201470 + }, + { + "epoch": 0.778865333766294, + "grad_norm": 0.10338211804628372, + "learning_rate": 0.002, + "loss": 2.344, + "step": 201480 + }, + { + "epoch": 0.7789039909696773, + "grad_norm": 0.11228205263614655, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 201490 + }, + { + "epoch": 0.7789426481730606, + "grad_norm": 0.11558589339256287, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 201500 + }, + { + "epoch": 0.7789813053764438, + "grad_norm": 0.10174691677093506, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 201510 + }, + { + "epoch": 0.7790199625798271, + "grad_norm": 0.10404979437589645, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 201520 + }, + { + "epoch": 0.7790586197832104, + "grad_norm": 0.09674325585365295, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 201530 + }, + { + "epoch": 0.7790972769865937, + "grad_norm": 0.10181877762079239, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 201540 + }, + { + "epoch": 0.779135934189977, + "grad_norm": 0.10503356903791428, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 201550 + }, + { + "epoch": 0.7791745913933602, + "grad_norm": 0.10027583688497543, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 201560 + }, + { + "epoch": 0.7792132485967436, + "grad_norm": 0.09758977591991425, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 201570 + }, + { + "epoch": 0.7792519058001268, + "grad_norm": 0.10359928011894226, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 201580 + }, + { + "epoch": 0.7792905630035101, + "grad_norm": 0.1090666875243187, + "learning_rate": 0.002, + "loss": 2.35, + "step": 201590 + }, + { + "epoch": 0.7793292202068933, + "grad_norm": 0.1132933497428894, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 201600 + }, + { + "epoch": 0.7793678774102766, + "grad_norm": 0.10195934027433395, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 201610 + }, + { + "epoch": 0.7794065346136599, + "grad_norm": 0.10770484060049057, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 201620 + }, + { + "epoch": 0.7794451918170432, + "grad_norm": 0.10276725143194199, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 201630 + }, + { + "epoch": 0.7794838490204264, + "grad_norm": 0.09507951885461807, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 201640 + }, + { + "epoch": 0.7795225062238097, + "grad_norm": 0.11635395884513855, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 201650 + }, + { + "epoch": 0.7795611634271931, + "grad_norm": 0.09882384538650513, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 201660 + }, + { + "epoch": 0.7795998206305763, + "grad_norm": 0.10508741438388824, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 201670 + }, + { + "epoch": 0.7796384778339596, + "grad_norm": 0.10111743956804276, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 201680 + }, + { + "epoch": 0.7796771350373428, + "grad_norm": 0.09746131300926208, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 201690 + }, + { + "epoch": 0.7797157922407262, + "grad_norm": 0.09846270829439163, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 201700 + }, + { + "epoch": 0.7797544494441094, + "grad_norm": 0.09761416912078857, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 201710 + }, + { + "epoch": 0.7797931066474927, + "grad_norm": 0.10397903621196747, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 201720 + }, + { + "epoch": 0.7798317638508759, + "grad_norm": 0.09711068868637085, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 201730 + }, + { + "epoch": 0.7798704210542593, + "grad_norm": 0.10659918934106827, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 201740 + }, + { + "epoch": 0.7799090782576426, + "grad_norm": 0.11457981914281845, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 201750 + }, + { + "epoch": 0.7799477354610258, + "grad_norm": 0.11186759173870087, + "learning_rate": 0.002, + "loss": 2.325, + "step": 201760 + }, + { + "epoch": 0.7799863926644091, + "grad_norm": 0.08898892253637314, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 201770 + }, + { + "epoch": 0.7800250498677924, + "grad_norm": 0.09332229942083359, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 201780 + }, + { + "epoch": 0.7800637070711757, + "grad_norm": 0.1085081398487091, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 201790 + }, + { + "epoch": 0.7801023642745589, + "grad_norm": 0.12998148798942566, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 201800 + }, + { + "epoch": 0.7801410214779422, + "grad_norm": 0.1137484461069107, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 201810 + }, + { + "epoch": 0.7801796786813254, + "grad_norm": 0.10778406262397766, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 201820 + }, + { + "epoch": 0.7802183358847088, + "grad_norm": 0.10134479403495789, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 201830 + }, + { + "epoch": 0.780256993088092, + "grad_norm": 0.11462592333555222, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 201840 + }, + { + "epoch": 0.7802956502914753, + "grad_norm": 0.11337092518806458, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 201850 + }, + { + "epoch": 0.7803343074948585, + "grad_norm": 0.09721145778894424, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 201860 + }, + { + "epoch": 0.7803729646982419, + "grad_norm": 0.11457212269306183, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 201870 + }, + { + "epoch": 0.7804116219016252, + "grad_norm": 0.10253756493330002, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 201880 + }, + { + "epoch": 0.7804502791050084, + "grad_norm": 0.11434777081012726, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 201890 + }, + { + "epoch": 0.7804889363083917, + "grad_norm": 0.12110897898674011, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 201900 + }, + { + "epoch": 0.780527593511775, + "grad_norm": 0.11207719892263412, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 201910 + }, + { + "epoch": 0.7805662507151583, + "grad_norm": 0.11025089770555496, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 201920 + }, + { + "epoch": 0.7806049079185415, + "grad_norm": 0.11021070182323456, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 201930 + }, + { + "epoch": 0.7806435651219248, + "grad_norm": 0.1151655986905098, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 201940 + }, + { + "epoch": 0.7806822223253082, + "grad_norm": 0.10133794695138931, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 201950 + }, + { + "epoch": 0.7807208795286914, + "grad_norm": 0.11351914703845978, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 201960 + }, + { + "epoch": 0.7807595367320747, + "grad_norm": 0.09121467918157578, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 201970 + }, + { + "epoch": 0.7807981939354579, + "grad_norm": 0.1189068853855133, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 201980 + }, + { + "epoch": 0.7808368511388412, + "grad_norm": 0.11156963557004929, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 201990 + }, + { + "epoch": 0.7808755083422245, + "grad_norm": 0.0933595597743988, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 202000 + }, + { + "epoch": 0.7809141655456078, + "grad_norm": 0.10935278981924057, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 202010 + }, + { + "epoch": 0.780952822748991, + "grad_norm": 0.09758707880973816, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 202020 + }, + { + "epoch": 0.7809914799523743, + "grad_norm": 0.11436577141284943, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 202030 + }, + { + "epoch": 0.7810301371557576, + "grad_norm": 0.10427207499742508, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 202040 + }, + { + "epoch": 0.7810687943591409, + "grad_norm": 0.09066711366176605, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 202050 + }, + { + "epoch": 0.7811074515625241, + "grad_norm": 0.11137081682682037, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 202060 + }, + { + "epoch": 0.7811461087659074, + "grad_norm": 0.10201691091060638, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 202070 + }, + { + "epoch": 0.7811847659692908, + "grad_norm": 0.1071251705288887, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 202080 + }, + { + "epoch": 0.781223423172674, + "grad_norm": 0.10883677750825882, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 202090 + }, + { + "epoch": 0.7812620803760573, + "grad_norm": 0.13566423952579498, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 202100 + }, + { + "epoch": 0.7813007375794405, + "grad_norm": 0.09948530793190002, + "learning_rate": 0.002, + "loss": 2.344, + "step": 202110 + }, + { + "epoch": 0.7813393947828239, + "grad_norm": 0.11490985006093979, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 202120 + }, + { + "epoch": 0.7813780519862071, + "grad_norm": 0.11161590367555618, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 202130 + }, + { + "epoch": 0.7814167091895904, + "grad_norm": 0.10396748036146164, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 202140 + }, + { + "epoch": 0.7814553663929736, + "grad_norm": 0.12490322440862656, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 202150 + }, + { + "epoch": 0.7814940235963569, + "grad_norm": 0.1008855327963829, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 202160 + }, + { + "epoch": 0.7815326807997403, + "grad_norm": 0.1047447994351387, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 202170 + }, + { + "epoch": 0.7815713380031235, + "grad_norm": 0.10873875766992569, + "learning_rate": 0.002, + "loss": 2.339, + "step": 202180 + }, + { + "epoch": 0.7816099952065068, + "grad_norm": 0.12432877719402313, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 202190 + }, + { + "epoch": 0.78164865240989, + "grad_norm": 0.11288367211818695, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 202200 + }, + { + "epoch": 0.7816873096132734, + "grad_norm": 0.1022430807352066, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 202210 + }, + { + "epoch": 0.7817259668166566, + "grad_norm": 0.11609577387571335, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 202220 + }, + { + "epoch": 0.7817646240200399, + "grad_norm": 0.10611993074417114, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 202230 + }, + { + "epoch": 0.7818032812234231, + "grad_norm": 0.10908140987157822, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 202240 + }, + { + "epoch": 0.7818419384268065, + "grad_norm": 0.09165285527706146, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 202250 + }, + { + "epoch": 0.7818805956301897, + "grad_norm": 0.10701560974121094, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 202260 + }, + { + "epoch": 0.781919252833573, + "grad_norm": 0.11059151589870453, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 202270 + }, + { + "epoch": 0.7819579100369562, + "grad_norm": 0.10945714265108109, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 202280 + }, + { + "epoch": 0.7819965672403396, + "grad_norm": 0.0986083596944809, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 202290 + }, + { + "epoch": 0.7820352244437229, + "grad_norm": 0.10011819005012512, + "learning_rate": 0.002, + "loss": 2.344, + "step": 202300 + }, + { + "epoch": 0.7820738816471061, + "grad_norm": 0.10819520801305771, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 202310 + }, + { + "epoch": 0.7821125388504894, + "grad_norm": 0.10630880296230316, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 202320 + }, + { + "epoch": 0.7821511960538727, + "grad_norm": 0.12038673460483551, + "learning_rate": 0.002, + "loss": 2.338, + "step": 202330 + }, + { + "epoch": 0.782189853257256, + "grad_norm": 0.11459699273109436, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 202340 + }, + { + "epoch": 0.7822285104606392, + "grad_norm": 0.10157457739114761, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 202350 + }, + { + "epoch": 0.7822671676640225, + "grad_norm": 0.09755659103393555, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 202360 + }, + { + "epoch": 0.7823058248674057, + "grad_norm": 0.09254967421293259, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 202370 + }, + { + "epoch": 0.7823444820707891, + "grad_norm": 0.09767594188451767, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 202380 + }, + { + "epoch": 0.7823831392741724, + "grad_norm": 0.14414367079734802, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 202390 + }, + { + "epoch": 0.7824217964775556, + "grad_norm": 0.10989902168512344, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 202400 + }, + { + "epoch": 0.7824604536809389, + "grad_norm": 0.10024165362119675, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 202410 + }, + { + "epoch": 0.7824991108843222, + "grad_norm": 0.10078973323106766, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 202420 + }, + { + "epoch": 0.7825377680877055, + "grad_norm": 0.10243651270866394, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 202430 + }, + { + "epoch": 0.7825764252910887, + "grad_norm": 0.12386278808116913, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 202440 + }, + { + "epoch": 0.782615082494472, + "grad_norm": 0.09828358143568039, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 202450 + }, + { + "epoch": 0.7826537396978553, + "grad_norm": 0.10247497260570526, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 202460 + }, + { + "epoch": 0.7826923969012386, + "grad_norm": 0.1063644215464592, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 202470 + }, + { + "epoch": 0.7827310541046218, + "grad_norm": 0.08840660005807877, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 202480 + }, + { + "epoch": 0.7827697113080051, + "grad_norm": 0.10195198655128479, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 202490 + }, + { + "epoch": 0.7828083685113885, + "grad_norm": 0.119685597717762, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 202500 + }, + { + "epoch": 0.7828470257147717, + "grad_norm": 0.10883565992116928, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 202510 + }, + { + "epoch": 0.782885682918155, + "grad_norm": 0.10767433792352676, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 202520 + }, + { + "epoch": 0.7829243401215382, + "grad_norm": 0.09536431729793549, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 202530 + }, + { + "epoch": 0.7829629973249215, + "grad_norm": 0.09645789861679077, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 202540 + }, + { + "epoch": 0.7830016545283048, + "grad_norm": 0.09781122207641602, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 202550 + }, + { + "epoch": 0.7830403117316881, + "grad_norm": 0.11750921607017517, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 202560 + }, + { + "epoch": 0.7830789689350713, + "grad_norm": 0.12031945586204529, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 202570 + }, + { + "epoch": 0.7831176261384546, + "grad_norm": 0.13411380350589752, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 202580 + }, + { + "epoch": 0.783156283341838, + "grad_norm": 0.09357387572526932, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 202590 + }, + { + "epoch": 0.7831949405452212, + "grad_norm": 0.09378042072057724, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 202600 + }, + { + "epoch": 0.7832335977486045, + "grad_norm": 0.11569520086050034, + "learning_rate": 0.002, + "loss": 2.343, + "step": 202610 + }, + { + "epoch": 0.7832722549519877, + "grad_norm": 0.09979037195444107, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 202620 + }, + { + "epoch": 0.7833109121553711, + "grad_norm": 0.0989861935377121, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 202630 + }, + { + "epoch": 0.7833495693587543, + "grad_norm": 0.12113470584154129, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 202640 + }, + { + "epoch": 0.7833882265621376, + "grad_norm": 0.09723356366157532, + "learning_rate": 0.002, + "loss": 2.349, + "step": 202650 + }, + { + "epoch": 0.7834268837655208, + "grad_norm": 0.1004517674446106, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 202660 + }, + { + "epoch": 0.7834655409689042, + "grad_norm": 0.10730737447738647, + "learning_rate": 0.002, + "loss": 2.346, + "step": 202670 + }, + { + "epoch": 0.7835041981722874, + "grad_norm": 0.10889331996440887, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 202680 + }, + { + "epoch": 0.7835428553756707, + "grad_norm": 0.12497900426387787, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 202690 + }, + { + "epoch": 0.783581512579054, + "grad_norm": 0.09737855941057205, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 202700 + }, + { + "epoch": 0.7836201697824373, + "grad_norm": 0.10107357054948807, + "learning_rate": 0.002, + "loss": 2.348, + "step": 202710 + }, + { + "epoch": 0.7836588269858206, + "grad_norm": 0.11377954483032227, + "learning_rate": 0.002, + "loss": 2.334, + "step": 202720 + }, + { + "epoch": 0.7836974841892038, + "grad_norm": 0.11078076809644699, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 202730 + }, + { + "epoch": 0.7837361413925871, + "grad_norm": 0.09624568372964859, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 202740 + }, + { + "epoch": 0.7837747985959703, + "grad_norm": 0.12164346873760223, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 202750 + }, + { + "epoch": 0.7838134557993537, + "grad_norm": 0.09853264689445496, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 202760 + }, + { + "epoch": 0.7838521130027369, + "grad_norm": 0.11185454577207565, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 202770 + }, + { + "epoch": 0.7838907702061202, + "grad_norm": 0.1107863038778305, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 202780 + }, + { + "epoch": 0.7839294274095034, + "grad_norm": 0.09015390276908875, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 202790 + }, + { + "epoch": 0.7839680846128868, + "grad_norm": 0.11195258796215057, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 202800 + }, + { + "epoch": 0.7840067418162701, + "grad_norm": 0.10910385102033615, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 202810 + }, + { + "epoch": 0.7840453990196533, + "grad_norm": 0.10820984840393066, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 202820 + }, + { + "epoch": 0.7840840562230366, + "grad_norm": 0.09601178765296936, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 202830 + }, + { + "epoch": 0.7841227134264199, + "grad_norm": 0.15023267269134521, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 202840 + }, + { + "epoch": 0.7841613706298032, + "grad_norm": 0.10086756944656372, + "learning_rate": 0.002, + "loss": 2.331, + "step": 202850 + }, + { + "epoch": 0.7842000278331864, + "grad_norm": 0.09821043908596039, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 202860 + }, + { + "epoch": 0.7842386850365697, + "grad_norm": 0.13208691775798798, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 202870 + }, + { + "epoch": 0.784277342239953, + "grad_norm": 0.10310870409011841, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 202880 + }, + { + "epoch": 0.7843159994433363, + "grad_norm": 0.0981186106801033, + "learning_rate": 0.002, + "loss": 2.327, + "step": 202890 + }, + { + "epoch": 0.7843546566467196, + "grad_norm": 0.12439095228910446, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 202900 + }, + { + "epoch": 0.7843933138501028, + "grad_norm": 0.10339083522558212, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 202910 + }, + { + "epoch": 0.784431971053486, + "grad_norm": 0.0957341343164444, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 202920 + }, + { + "epoch": 0.7844706282568694, + "grad_norm": 0.1126188188791275, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 202930 + }, + { + "epoch": 0.7845092854602527, + "grad_norm": 0.09723176062107086, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 202940 + }, + { + "epoch": 0.7845479426636359, + "grad_norm": 0.10407697409391403, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 202950 + }, + { + "epoch": 0.7845865998670192, + "grad_norm": 0.10393688082695007, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 202960 + }, + { + "epoch": 0.7846252570704025, + "grad_norm": 0.11379170417785645, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 202970 + }, + { + "epoch": 0.7846639142737858, + "grad_norm": 0.10667712986469269, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 202980 + }, + { + "epoch": 0.784702571477169, + "grad_norm": 0.09885899722576141, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 202990 + }, + { + "epoch": 0.7847412286805523, + "grad_norm": 0.11075273156166077, + "learning_rate": 0.002, + "loss": 2.345, + "step": 203000 + }, + { + "epoch": 0.7847798858839357, + "grad_norm": 0.11645198613405228, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 203010 + }, + { + "epoch": 0.7848185430873189, + "grad_norm": 0.09348627924919128, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 203020 + }, + { + "epoch": 0.7848572002907022, + "grad_norm": 0.11713188886642456, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 203030 + }, + { + "epoch": 0.7848958574940854, + "grad_norm": 0.10126195847988129, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 203040 + }, + { + "epoch": 0.7849345146974688, + "grad_norm": 0.1038556843996048, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 203050 + }, + { + "epoch": 0.784973171900852, + "grad_norm": 0.10732719302177429, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 203060 + }, + { + "epoch": 0.7850118291042353, + "grad_norm": 0.117129385471344, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 203070 + }, + { + "epoch": 0.7850504863076185, + "grad_norm": 0.11160501092672348, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 203080 + }, + { + "epoch": 0.7850891435110018, + "grad_norm": 0.1068291887640953, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 203090 + }, + { + "epoch": 0.7851278007143851, + "grad_norm": 0.12483925372362137, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 203100 + }, + { + "epoch": 0.7851664579177684, + "grad_norm": 0.0880567654967308, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 203110 + }, + { + "epoch": 0.7852051151211517, + "grad_norm": 0.09908737242221832, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 203120 + }, + { + "epoch": 0.7852437723245349, + "grad_norm": 0.09003641456365585, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 203130 + }, + { + "epoch": 0.7852824295279183, + "grad_norm": 0.1001494899392128, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 203140 + }, + { + "epoch": 0.7853210867313015, + "grad_norm": 0.11012919247150421, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 203150 + }, + { + "epoch": 0.7853597439346848, + "grad_norm": 0.09928996860980988, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 203160 + }, + { + "epoch": 0.785398401138068, + "grad_norm": 0.08990038931369781, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 203170 + }, + { + "epoch": 0.7854370583414514, + "grad_norm": 0.10726006329059601, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 203180 + }, + { + "epoch": 0.7854757155448346, + "grad_norm": 0.1108621284365654, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 203190 + }, + { + "epoch": 0.7855143727482179, + "grad_norm": 0.0969405546784401, + "learning_rate": 0.002, + "loss": 2.337, + "step": 203200 + }, + { + "epoch": 0.7855530299516011, + "grad_norm": 0.09812358021736145, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 203210 + }, + { + "epoch": 0.7855916871549845, + "grad_norm": 0.123712919652462, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 203220 + }, + { + "epoch": 0.7856303443583678, + "grad_norm": 0.12026149779558182, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 203230 + }, + { + "epoch": 0.785669001561751, + "grad_norm": 0.09507454186677933, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 203240 + }, + { + "epoch": 0.7857076587651343, + "grad_norm": 0.0988507866859436, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 203250 + }, + { + "epoch": 0.7857463159685176, + "grad_norm": 0.10693545639514923, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 203260 + }, + { + "epoch": 0.7857849731719009, + "grad_norm": 0.10580098628997803, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 203270 + }, + { + "epoch": 0.7858236303752841, + "grad_norm": 0.09059664607048035, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 203280 + }, + { + "epoch": 0.7858622875786674, + "grad_norm": 0.09549517184495926, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 203290 + }, + { + "epoch": 0.7859009447820506, + "grad_norm": 0.13090062141418457, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 203300 + }, + { + "epoch": 0.785939601985434, + "grad_norm": 0.10545851290225983, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 203310 + }, + { + "epoch": 0.7859782591888173, + "grad_norm": 0.10016728937625885, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 203320 + }, + { + "epoch": 0.7860169163922005, + "grad_norm": 0.09274963289499283, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 203330 + }, + { + "epoch": 0.7860555735955838, + "grad_norm": 0.11836186051368713, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 203340 + }, + { + "epoch": 0.7860942307989671, + "grad_norm": 0.10120458155870438, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 203350 + }, + { + "epoch": 0.7861328880023504, + "grad_norm": 0.09683175384998322, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 203360 + }, + { + "epoch": 0.7861715452057336, + "grad_norm": 0.09533454477787018, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 203370 + }, + { + "epoch": 0.7862102024091169, + "grad_norm": 0.11609603464603424, + "learning_rate": 0.002, + "loss": 2.339, + "step": 203380 + }, + { + "epoch": 0.7862488596125002, + "grad_norm": 0.10426200181245804, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 203390 + }, + { + "epoch": 0.7862875168158835, + "grad_norm": 0.11988259851932526, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 203400 + }, + { + "epoch": 0.7863261740192667, + "grad_norm": 0.11909215897321701, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 203410 + }, + { + "epoch": 0.78636483122265, + "grad_norm": 0.1037282645702362, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 203420 + }, + { + "epoch": 0.7864034884260334, + "grad_norm": 0.10475391149520874, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 203430 + }, + { + "epoch": 0.7864421456294166, + "grad_norm": 0.12629707157611847, + "learning_rate": 0.002, + "loss": 2.347, + "step": 203440 + }, + { + "epoch": 0.7864808028327999, + "grad_norm": 0.10719470679759979, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 203450 + }, + { + "epoch": 0.7865194600361831, + "grad_norm": 0.653010368347168, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 203460 + }, + { + "epoch": 0.7865581172395664, + "grad_norm": 0.11120946705341339, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 203470 + }, + { + "epoch": 0.7865967744429497, + "grad_norm": 0.10024162381887436, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 203480 + }, + { + "epoch": 0.786635431646333, + "grad_norm": 0.11460886150598526, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 203490 + }, + { + "epoch": 0.7866740888497162, + "grad_norm": 0.11218295991420746, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 203500 + }, + { + "epoch": 0.7867127460530995, + "grad_norm": 0.10510541498661041, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 203510 + }, + { + "epoch": 0.7867514032564829, + "grad_norm": 0.09944749623537064, + "learning_rate": 0.002, + "loss": 2.348, + "step": 203520 + }, + { + "epoch": 0.7867900604598661, + "grad_norm": 0.100336953997612, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 203530 + }, + { + "epoch": 0.7868287176632494, + "grad_norm": 0.10724803060293198, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 203540 + }, + { + "epoch": 0.7868673748666326, + "grad_norm": 0.10677690804004669, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 203550 + }, + { + "epoch": 0.786906032070016, + "grad_norm": 0.09538012742996216, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 203560 + }, + { + "epoch": 0.7869446892733992, + "grad_norm": 0.11513350158929825, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 203570 + }, + { + "epoch": 0.7869833464767825, + "grad_norm": 0.0956110879778862, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 203580 + }, + { + "epoch": 0.7870220036801657, + "grad_norm": 0.10411123931407928, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 203590 + }, + { + "epoch": 0.7870606608835491, + "grad_norm": 0.1055002361536026, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 203600 + }, + { + "epoch": 0.7870993180869323, + "grad_norm": 0.12185431271791458, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 203610 + }, + { + "epoch": 0.7871379752903156, + "grad_norm": 0.10589416325092316, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 203620 + }, + { + "epoch": 0.7871766324936988, + "grad_norm": 0.11253327876329422, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 203630 + }, + { + "epoch": 0.7872152896970821, + "grad_norm": 0.11665678024291992, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 203640 + }, + { + "epoch": 0.7872539469004655, + "grad_norm": 0.09713021665811539, + "learning_rate": 0.002, + "loss": 2.341, + "step": 203650 + }, + { + "epoch": 0.7872926041038487, + "grad_norm": 0.10635494440793991, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 203660 + }, + { + "epoch": 0.787331261307232, + "grad_norm": 0.14344890415668488, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 203670 + }, + { + "epoch": 0.7873699185106152, + "grad_norm": 0.10709847509860992, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 203680 + }, + { + "epoch": 0.7874085757139986, + "grad_norm": 0.10784828662872314, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 203690 + }, + { + "epoch": 0.7874472329173818, + "grad_norm": 0.29096582531929016, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 203700 + }, + { + "epoch": 0.7874858901207651, + "grad_norm": 0.10271017998456955, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 203710 + }, + { + "epoch": 0.7875245473241483, + "grad_norm": 0.10930080711841583, + "learning_rate": 0.002, + "loss": 2.3649, + "step": 203720 + }, + { + "epoch": 0.7875632045275317, + "grad_norm": 0.08935771882534027, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 203730 + }, + { + "epoch": 0.787601861730915, + "grad_norm": 0.10123781859874725, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 203740 + }, + { + "epoch": 0.7876405189342982, + "grad_norm": 0.11019125580787659, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 203750 + }, + { + "epoch": 0.7876791761376815, + "grad_norm": 0.12791241705417633, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 203760 + }, + { + "epoch": 0.7877178333410648, + "grad_norm": 0.0956585705280304, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 203770 + }, + { + "epoch": 0.7877564905444481, + "grad_norm": 0.10439072549343109, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 203780 + }, + { + "epoch": 0.7877951477478313, + "grad_norm": 0.10286793112754822, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 203790 + }, + { + "epoch": 0.7878338049512146, + "grad_norm": 0.11320628225803375, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 203800 + }, + { + "epoch": 0.7878724621545979, + "grad_norm": 0.10071314871311188, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 203810 + }, + { + "epoch": 0.7879111193579812, + "grad_norm": 0.1389821320772171, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 203820 + }, + { + "epoch": 0.7879497765613644, + "grad_norm": 0.13283565640449524, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 203830 + }, + { + "epoch": 0.7879884337647477, + "grad_norm": 0.10520830750465393, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 203840 + }, + { + "epoch": 0.788027090968131, + "grad_norm": 0.11765392869710922, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 203850 + }, + { + "epoch": 0.7880657481715143, + "grad_norm": 0.12679457664489746, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 203860 + }, + { + "epoch": 0.7881044053748976, + "grad_norm": 0.10883157700300217, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 203870 + }, + { + "epoch": 0.7881430625782808, + "grad_norm": 0.1118154376745224, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 203880 + }, + { + "epoch": 0.7881817197816641, + "grad_norm": 0.09302463382482529, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 203890 + }, + { + "epoch": 0.7882203769850474, + "grad_norm": 0.12338177114725113, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 203900 + }, + { + "epoch": 0.7882590341884307, + "grad_norm": 0.11272131651639938, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 203910 + }, + { + "epoch": 0.7882976913918139, + "grad_norm": 0.10373769700527191, + "learning_rate": 0.002, + "loss": 2.343, + "step": 203920 + }, + { + "epoch": 0.7883363485951972, + "grad_norm": 0.10257022827863693, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 203930 + }, + { + "epoch": 0.7883750057985806, + "grad_norm": 0.10867653042078018, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 203940 + }, + { + "epoch": 0.7884136630019638, + "grad_norm": 0.10311832278966904, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 203950 + }, + { + "epoch": 0.7884523202053471, + "grad_norm": 0.10829376429319382, + "learning_rate": 0.002, + "loss": 2.34, + "step": 203960 + }, + { + "epoch": 0.7884909774087303, + "grad_norm": 0.11858326941728592, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 203970 + }, + { + "epoch": 0.7885296346121137, + "grad_norm": 0.1111568734049797, + "learning_rate": 0.002, + "loss": 2.335, + "step": 203980 + }, + { + "epoch": 0.7885682918154969, + "grad_norm": 0.11772432923316956, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 203990 + }, + { + "epoch": 0.7886069490188802, + "grad_norm": 0.10340186208486557, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 204000 + }, + { + "epoch": 0.7886456062222634, + "grad_norm": 0.11137323826551437, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 204010 + }, + { + "epoch": 0.7886842634256467, + "grad_norm": 0.09535515308380127, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 204020 + }, + { + "epoch": 0.78872292062903, + "grad_norm": 0.10966328531503677, + "learning_rate": 0.002, + "loss": 2.331, + "step": 204030 + }, + { + "epoch": 0.7887615778324133, + "grad_norm": 0.10557172447443008, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 204040 + }, + { + "epoch": 0.7888002350357965, + "grad_norm": 0.09890732169151306, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 204050 + }, + { + "epoch": 0.7888388922391798, + "grad_norm": 0.13068756461143494, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 204060 + }, + { + "epoch": 0.7888775494425632, + "grad_norm": 0.11536012589931488, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 204070 + }, + { + "epoch": 0.7889162066459464, + "grad_norm": 0.09284942597150803, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 204080 + }, + { + "epoch": 0.7889548638493297, + "grad_norm": 0.09574364870786667, + "learning_rate": 0.002, + "loss": 2.333, + "step": 204090 + }, + { + "epoch": 0.7889935210527129, + "grad_norm": 0.10232340544462204, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 204100 + }, + { + "epoch": 0.7890321782560963, + "grad_norm": 0.12331674993038177, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 204110 + }, + { + "epoch": 0.7890708354594795, + "grad_norm": 0.10304983705282211, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 204120 + }, + { + "epoch": 0.7891094926628628, + "grad_norm": 0.11414653807878494, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 204130 + }, + { + "epoch": 0.789148149866246, + "grad_norm": 0.11107272654771805, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 204140 + }, + { + "epoch": 0.7891868070696294, + "grad_norm": 0.10653676092624664, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 204150 + }, + { + "epoch": 0.7892254642730127, + "grad_norm": 0.10755713284015656, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 204160 + }, + { + "epoch": 0.7892641214763959, + "grad_norm": 0.11351131647825241, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 204170 + }, + { + "epoch": 0.7893027786797792, + "grad_norm": 0.11735298484563828, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 204180 + }, + { + "epoch": 0.7893414358831625, + "grad_norm": 0.10693307220935822, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 204190 + }, + { + "epoch": 0.7893800930865458, + "grad_norm": 0.090082548558712, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 204200 + }, + { + "epoch": 0.789418750289929, + "grad_norm": 0.09632305055856705, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 204210 + }, + { + "epoch": 0.7894574074933123, + "grad_norm": 0.09143831580877304, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 204220 + }, + { + "epoch": 0.7894960646966955, + "grad_norm": 0.11290735006332397, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 204230 + }, + { + "epoch": 0.7895347219000789, + "grad_norm": 0.16610337793827057, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 204240 + }, + { + "epoch": 0.7895733791034621, + "grad_norm": 0.09449631720781326, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 204250 + }, + { + "epoch": 0.7896120363068454, + "grad_norm": 0.11649568378925323, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 204260 + }, + { + "epoch": 0.7896506935102287, + "grad_norm": 0.11000768840312958, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 204270 + }, + { + "epoch": 0.789689350713612, + "grad_norm": 0.09472779929637909, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 204280 + }, + { + "epoch": 0.7897280079169953, + "grad_norm": 0.1212298795580864, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 204290 + }, + { + "epoch": 0.7897666651203785, + "grad_norm": 0.10486337542533875, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 204300 + }, + { + "epoch": 0.7898053223237618, + "grad_norm": 0.12343774735927582, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 204310 + }, + { + "epoch": 0.7898439795271451, + "grad_norm": 0.10356009751558304, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 204320 + }, + { + "epoch": 0.7898826367305284, + "grad_norm": 0.11104033887386322, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 204330 + }, + { + "epoch": 0.7899212939339116, + "grad_norm": 0.10338528454303741, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 204340 + }, + { + "epoch": 0.7899599511372949, + "grad_norm": 0.11514335125684738, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 204350 + }, + { + "epoch": 0.7899986083406783, + "grad_norm": 0.10399248450994492, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 204360 + }, + { + "epoch": 0.7900372655440615, + "grad_norm": 0.08885734528303146, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 204370 + }, + { + "epoch": 0.7900759227474448, + "grad_norm": 0.11299405992031097, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 204380 + }, + { + "epoch": 0.790114579950828, + "grad_norm": 0.10836345702409744, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 204390 + }, + { + "epoch": 0.7901532371542113, + "grad_norm": 0.11612921208143234, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 204400 + }, + { + "epoch": 0.7901918943575946, + "grad_norm": 0.10149534791707993, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 204410 + }, + { + "epoch": 0.7902305515609779, + "grad_norm": 0.08733243495225906, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 204420 + }, + { + "epoch": 0.7902692087643611, + "grad_norm": 0.10776301473379135, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 204430 + }, + { + "epoch": 0.7903078659677444, + "grad_norm": 0.11737163364887238, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 204440 + }, + { + "epoch": 0.7903465231711277, + "grad_norm": 0.09715598076581955, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 204450 + }, + { + "epoch": 0.790385180374511, + "grad_norm": 0.11796846240758896, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 204460 + }, + { + "epoch": 0.7904238375778943, + "grad_norm": 0.09173443913459778, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 204470 + }, + { + "epoch": 0.7904624947812775, + "grad_norm": 0.09172165393829346, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 204480 + }, + { + "epoch": 0.7905011519846609, + "grad_norm": 0.13926003873348236, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 204490 + }, + { + "epoch": 0.7905398091880441, + "grad_norm": 0.11290960013866425, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 204500 + }, + { + "epoch": 0.7905784663914274, + "grad_norm": 0.1287475973367691, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 204510 + }, + { + "epoch": 0.7906171235948106, + "grad_norm": 0.12370341271162033, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 204520 + }, + { + "epoch": 0.790655780798194, + "grad_norm": 0.09490542113780975, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 204530 + }, + { + "epoch": 0.7906944380015772, + "grad_norm": 0.0927913635969162, + "learning_rate": 0.002, + "loss": 2.337, + "step": 204540 + }, + { + "epoch": 0.7907330952049605, + "grad_norm": 0.11680828034877777, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 204550 + }, + { + "epoch": 0.7907717524083437, + "grad_norm": 0.09189026057720184, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 204560 + }, + { + "epoch": 0.790810409611727, + "grad_norm": 0.10120948404073715, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 204570 + }, + { + "epoch": 0.7908490668151104, + "grad_norm": 0.12998700141906738, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 204580 + }, + { + "epoch": 0.7908877240184936, + "grad_norm": 0.09915055334568024, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 204590 + }, + { + "epoch": 0.7909263812218769, + "grad_norm": 0.10773010551929474, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 204600 + }, + { + "epoch": 0.7909650384252601, + "grad_norm": 0.11004921048879623, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 204610 + }, + { + "epoch": 0.7910036956286435, + "grad_norm": 0.1033116951584816, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 204620 + }, + { + "epoch": 0.7910423528320267, + "grad_norm": 0.10842995345592499, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 204630 + }, + { + "epoch": 0.79108101003541, + "grad_norm": 0.1112065464258194, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 204640 + }, + { + "epoch": 0.7911196672387932, + "grad_norm": 0.10177678614854813, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 204650 + }, + { + "epoch": 0.7911583244421766, + "grad_norm": 0.11695540696382523, + "learning_rate": 0.002, + "loss": 2.341, + "step": 204660 + }, + { + "epoch": 0.7911969816455598, + "grad_norm": 0.11951947212219238, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 204670 + }, + { + "epoch": 0.7912356388489431, + "grad_norm": 0.08816482871770859, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 204680 + }, + { + "epoch": 0.7912742960523264, + "grad_norm": 0.10402724891901016, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 204690 + }, + { + "epoch": 0.7913129532557097, + "grad_norm": 0.11488895863294601, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 204700 + }, + { + "epoch": 0.791351610459093, + "grad_norm": 0.10502402484416962, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 204710 + }, + { + "epoch": 0.7913902676624762, + "grad_norm": 0.08945094794034958, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 204720 + }, + { + "epoch": 0.7914289248658595, + "grad_norm": 0.09670726954936981, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 204730 + }, + { + "epoch": 0.7914675820692428, + "grad_norm": 0.10783292353153229, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 204740 + }, + { + "epoch": 0.7915062392726261, + "grad_norm": 0.10084598511457443, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 204750 + }, + { + "epoch": 0.7915448964760093, + "grad_norm": 0.10058614611625671, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 204760 + }, + { + "epoch": 0.7915835536793926, + "grad_norm": 0.10897661000490189, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 204770 + }, + { + "epoch": 0.7916222108827758, + "grad_norm": 0.11623561382293701, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 204780 + }, + { + "epoch": 0.7916608680861592, + "grad_norm": 0.10288413614034653, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 204790 + }, + { + "epoch": 0.7916995252895425, + "grad_norm": 0.10230749100446701, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 204800 + }, + { + "epoch": 0.7917381824929257, + "grad_norm": 0.11927718669176102, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 204810 + }, + { + "epoch": 0.791776839696309, + "grad_norm": 0.1255546510219574, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 204820 + }, + { + "epoch": 0.7918154968996923, + "grad_norm": 0.10689831525087357, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 204830 + }, + { + "epoch": 0.7918541541030756, + "grad_norm": 0.09706177562475204, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 204840 + }, + { + "epoch": 0.7918928113064588, + "grad_norm": 0.10006806999444962, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 204850 + }, + { + "epoch": 0.7919314685098421, + "grad_norm": 0.09978881478309631, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 204860 + }, + { + "epoch": 0.7919701257132254, + "grad_norm": 0.09379451721906662, + "learning_rate": 0.002, + "loss": 2.341, + "step": 204870 + }, + { + "epoch": 0.7920087829166087, + "grad_norm": 0.10791292041540146, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 204880 + }, + { + "epoch": 0.792047440119992, + "grad_norm": 0.1182226613163948, + "learning_rate": 0.002, + "loss": 2.349, + "step": 204890 + }, + { + "epoch": 0.7920860973233752, + "grad_norm": 0.09794960916042328, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 204900 + }, + { + "epoch": 0.7921247545267586, + "grad_norm": 0.10860683768987656, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 204910 + }, + { + "epoch": 0.7921634117301418, + "grad_norm": 0.10471311956644058, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 204920 + }, + { + "epoch": 0.7922020689335251, + "grad_norm": 0.1075335219502449, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 204930 + }, + { + "epoch": 0.7922407261369083, + "grad_norm": 0.1021302193403244, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 204940 + }, + { + "epoch": 0.7922793833402916, + "grad_norm": 0.10729537904262543, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 204950 + }, + { + "epoch": 0.7923180405436749, + "grad_norm": 0.10054760426282883, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 204960 + }, + { + "epoch": 0.7923566977470582, + "grad_norm": 0.10623308271169662, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 204970 + }, + { + "epoch": 0.7923953549504414, + "grad_norm": 0.1051439717411995, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 204980 + }, + { + "epoch": 0.7924340121538247, + "grad_norm": 0.09262803196907043, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 204990 + }, + { + "epoch": 0.7924726693572081, + "grad_norm": 0.10029026865959167, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 205000 + }, + { + "epoch": 0.7925113265605913, + "grad_norm": 0.1111687645316124, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 205010 + }, + { + "epoch": 0.7925499837639746, + "grad_norm": 0.09792038798332214, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 205020 + }, + { + "epoch": 0.7925886409673578, + "grad_norm": 0.12564076483249664, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 205030 + }, + { + "epoch": 0.7926272981707412, + "grad_norm": 0.12421346455812454, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 205040 + }, + { + "epoch": 0.7926659553741244, + "grad_norm": 0.11315683275461197, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 205050 + }, + { + "epoch": 0.7927046125775077, + "grad_norm": 0.11583825200796127, + "learning_rate": 0.002, + "loss": 2.343, + "step": 205060 + }, + { + "epoch": 0.7927432697808909, + "grad_norm": 0.09638478606939316, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 205070 + }, + { + "epoch": 0.7927819269842743, + "grad_norm": 0.10320941358804703, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 205080 + }, + { + "epoch": 0.7928205841876576, + "grad_norm": 0.1072765588760376, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 205090 + }, + { + "epoch": 0.7928592413910408, + "grad_norm": 0.09241588413715363, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 205100 + }, + { + "epoch": 0.792897898594424, + "grad_norm": 0.09480182081460953, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 205110 + }, + { + "epoch": 0.7929365557978074, + "grad_norm": 0.10750217735767365, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 205120 + }, + { + "epoch": 0.7929752130011907, + "grad_norm": 0.10552404820919037, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 205130 + }, + { + "epoch": 0.7930138702045739, + "grad_norm": 0.17118145525455475, + "learning_rate": 0.002, + "loss": 2.348, + "step": 205140 + }, + { + "epoch": 0.7930525274079572, + "grad_norm": 0.20626536011695862, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 205150 + }, + { + "epoch": 0.7930911846113404, + "grad_norm": 0.12056166678667068, + "learning_rate": 0.002, + "loss": 2.347, + "step": 205160 + }, + { + "epoch": 0.7931298418147238, + "grad_norm": 0.34020501375198364, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 205170 + }, + { + "epoch": 0.793168499018107, + "grad_norm": 0.10309938341379166, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 205180 + }, + { + "epoch": 0.7932071562214903, + "grad_norm": 0.10739479959011078, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 205190 + }, + { + "epoch": 0.7932458134248735, + "grad_norm": 0.11030566692352295, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 205200 + }, + { + "epoch": 0.7932844706282569, + "grad_norm": 0.1024133712053299, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 205210 + }, + { + "epoch": 0.7933231278316402, + "grad_norm": 0.09965585917234421, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 205220 + }, + { + "epoch": 0.7933617850350234, + "grad_norm": 0.10700208693742752, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 205230 + }, + { + "epoch": 0.7934004422384067, + "grad_norm": 0.11820707470178604, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 205240 + }, + { + "epoch": 0.79343909944179, + "grad_norm": 0.0988406091928482, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 205250 + }, + { + "epoch": 0.7934777566451733, + "grad_norm": 0.0951349139213562, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 205260 + }, + { + "epoch": 0.7935164138485565, + "grad_norm": 0.09339561313390732, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 205270 + }, + { + "epoch": 0.7935550710519398, + "grad_norm": 0.11482594907283783, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 205280 + }, + { + "epoch": 0.7935937282553231, + "grad_norm": 0.12239178270101547, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 205290 + }, + { + "epoch": 0.7936323854587064, + "grad_norm": 0.09025915712118149, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 205300 + }, + { + "epoch": 0.7936710426620897, + "grad_norm": 0.11617813259363174, + "learning_rate": 0.002, + "loss": 2.353, + "step": 205310 + }, + { + "epoch": 0.7937096998654729, + "grad_norm": 0.1109839677810669, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 205320 + }, + { + "epoch": 0.7937483570688562, + "grad_norm": 0.10583146661520004, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 205330 + }, + { + "epoch": 0.7937870142722395, + "grad_norm": 0.10988876223564148, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 205340 + }, + { + "epoch": 0.7938256714756228, + "grad_norm": 0.09644675254821777, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 205350 + }, + { + "epoch": 0.793864328679006, + "grad_norm": 0.10545558482408524, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 205360 + }, + { + "epoch": 0.7939029858823893, + "grad_norm": 0.09953219443559647, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 205370 + }, + { + "epoch": 0.7939416430857726, + "grad_norm": 0.12311132997274399, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 205380 + }, + { + "epoch": 0.7939803002891559, + "grad_norm": 0.10079408437013626, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 205390 + }, + { + "epoch": 0.7940189574925391, + "grad_norm": 0.10783235728740692, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 205400 + }, + { + "epoch": 0.7940576146959224, + "grad_norm": 0.10919714719057083, + "learning_rate": 0.002, + "loss": 2.34, + "step": 205410 + }, + { + "epoch": 0.7940962718993058, + "grad_norm": 0.11178895831108093, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 205420 + }, + { + "epoch": 0.794134929102689, + "grad_norm": 0.10486460477113724, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 205430 + }, + { + "epoch": 0.7941735863060723, + "grad_norm": 0.11997535824775696, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 205440 + }, + { + "epoch": 0.7942122435094555, + "grad_norm": 0.10198147594928741, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 205450 + }, + { + "epoch": 0.7942509007128389, + "grad_norm": 0.09956075251102448, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 205460 + }, + { + "epoch": 0.7942895579162221, + "grad_norm": 0.13137564063072205, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 205470 + }, + { + "epoch": 0.7943282151196054, + "grad_norm": 0.09812214970588684, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 205480 + }, + { + "epoch": 0.7943668723229886, + "grad_norm": 0.11089440435171127, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 205490 + }, + { + "epoch": 0.7944055295263719, + "grad_norm": 0.11397991329431534, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 205500 + }, + { + "epoch": 0.7944441867297553, + "grad_norm": 0.20118115842342377, + "learning_rate": 0.002, + "loss": 2.336, + "step": 205510 + }, + { + "epoch": 0.7944828439331385, + "grad_norm": 0.11045759916305542, + "learning_rate": 0.002, + "loss": 2.337, + "step": 205520 + }, + { + "epoch": 0.7945215011365218, + "grad_norm": 0.11609234660863876, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 205530 + }, + { + "epoch": 0.794560158339905, + "grad_norm": 0.10128235816955566, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 205540 + }, + { + "epoch": 0.7945988155432884, + "grad_norm": 0.12456586211919785, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 205550 + }, + { + "epoch": 0.7946374727466716, + "grad_norm": 0.12482677400112152, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 205560 + }, + { + "epoch": 0.7946761299500549, + "grad_norm": 0.09432988613843918, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 205570 + }, + { + "epoch": 0.7947147871534381, + "grad_norm": 0.1161578968167305, + "learning_rate": 0.002, + "loss": 2.3659, + "step": 205580 + }, + { + "epoch": 0.7947534443568215, + "grad_norm": 0.09937795251607895, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 205590 + }, + { + "epoch": 0.7947921015602047, + "grad_norm": 0.11066994071006775, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 205600 + }, + { + "epoch": 0.794830758763588, + "grad_norm": 0.12263431400060654, + "learning_rate": 0.002, + "loss": 2.343, + "step": 205610 + }, + { + "epoch": 0.7948694159669712, + "grad_norm": 0.11063994467258453, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 205620 + }, + { + "epoch": 0.7949080731703546, + "grad_norm": 0.11227592080831528, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 205630 + }, + { + "epoch": 0.7949467303737379, + "grad_norm": 0.11528632044792175, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 205640 + }, + { + "epoch": 0.7949853875771211, + "grad_norm": 0.12380699068307877, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 205650 + }, + { + "epoch": 0.7950240447805044, + "grad_norm": 0.10481059551239014, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 205660 + }, + { + "epoch": 0.7950627019838877, + "grad_norm": 0.11022672802209854, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 205670 + }, + { + "epoch": 0.795101359187271, + "grad_norm": 0.1075207069516182, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 205680 + }, + { + "epoch": 0.7951400163906542, + "grad_norm": 0.10176742076873779, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 205690 + }, + { + "epoch": 0.7951786735940375, + "grad_norm": 0.10005369782447815, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 205700 + }, + { + "epoch": 0.7952173307974207, + "grad_norm": 0.11575303226709366, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 205710 + }, + { + "epoch": 0.7952559880008041, + "grad_norm": 0.10827762633562088, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 205720 + }, + { + "epoch": 0.7952946452041874, + "grad_norm": 0.0917559415102005, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 205730 + }, + { + "epoch": 0.7953333024075706, + "grad_norm": 0.10644976794719696, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 205740 + }, + { + "epoch": 0.7953719596109539, + "grad_norm": 0.11294444650411606, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 205750 + }, + { + "epoch": 0.7954106168143372, + "grad_norm": 0.0953550785779953, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 205760 + }, + { + "epoch": 0.7954492740177205, + "grad_norm": 0.11978161334991455, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 205770 + }, + { + "epoch": 0.7954879312211037, + "grad_norm": 0.10544034093618393, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 205780 + }, + { + "epoch": 0.795526588424487, + "grad_norm": 0.09561526030302048, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 205790 + }, + { + "epoch": 0.7955652456278703, + "grad_norm": 0.09656016528606415, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 205800 + }, + { + "epoch": 0.7956039028312536, + "grad_norm": 0.10788822174072266, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 205810 + }, + { + "epoch": 0.7956425600346368, + "grad_norm": 0.10486281663179398, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 205820 + }, + { + "epoch": 0.7956812172380201, + "grad_norm": 0.1045752465724945, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 205830 + }, + { + "epoch": 0.7957198744414035, + "grad_norm": 0.09988079220056534, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 205840 + }, + { + "epoch": 0.7957585316447867, + "grad_norm": 0.0971662700176239, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 205850 + }, + { + "epoch": 0.79579718884817, + "grad_norm": 0.10432875156402588, + "learning_rate": 0.002, + "loss": 2.339, + "step": 205860 + }, + { + "epoch": 0.7958358460515532, + "grad_norm": 0.10157095640897751, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 205870 + }, + { + "epoch": 0.7958745032549365, + "grad_norm": 0.11968620866537094, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 205880 + }, + { + "epoch": 0.7959131604583198, + "grad_norm": 0.1179286390542984, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 205890 + }, + { + "epoch": 0.7959518176617031, + "grad_norm": 0.10262852162122726, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 205900 + }, + { + "epoch": 0.7959904748650863, + "grad_norm": 0.09464697539806366, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 205910 + }, + { + "epoch": 0.7960291320684696, + "grad_norm": 0.11842875927686691, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 205920 + }, + { + "epoch": 0.796067789271853, + "grad_norm": 0.15821444988250732, + "learning_rate": 0.002, + "loss": 2.347, + "step": 205930 + }, + { + "epoch": 0.7961064464752362, + "grad_norm": 0.10689578205347061, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 205940 + }, + { + "epoch": 0.7961451036786195, + "grad_norm": 0.09144774824380875, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 205950 + }, + { + "epoch": 0.7961837608820027, + "grad_norm": 0.09909002482891083, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 205960 + }, + { + "epoch": 0.7962224180853861, + "grad_norm": 0.09510427713394165, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 205970 + }, + { + "epoch": 0.7962610752887693, + "grad_norm": 0.12517790496349335, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 205980 + }, + { + "epoch": 0.7962997324921526, + "grad_norm": 0.09638720005750656, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 205990 + }, + { + "epoch": 0.7963383896955358, + "grad_norm": 0.10534534603357315, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 206000 + }, + { + "epoch": 0.7963770468989192, + "grad_norm": 0.15926168859004974, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 206010 + }, + { + "epoch": 0.7964157041023024, + "grad_norm": 0.10147106647491455, + "learning_rate": 0.002, + "loss": 2.339, + "step": 206020 + }, + { + "epoch": 0.7964543613056857, + "grad_norm": 0.10951374471187592, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 206030 + }, + { + "epoch": 0.796493018509069, + "grad_norm": 0.11713884770870209, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 206040 + }, + { + "epoch": 0.7965316757124522, + "grad_norm": 0.0972195565700531, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 206050 + }, + { + "epoch": 0.7965703329158356, + "grad_norm": 0.0912977084517479, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 206060 + }, + { + "epoch": 0.7966089901192188, + "grad_norm": 0.12983092665672302, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 206070 + }, + { + "epoch": 0.7966476473226021, + "grad_norm": 0.09781789034605026, + "learning_rate": 0.002, + "loss": 2.353, + "step": 206080 + }, + { + "epoch": 0.7966863045259853, + "grad_norm": 0.1111692562699318, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 206090 + }, + { + "epoch": 0.7967249617293687, + "grad_norm": 0.11107601225376129, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 206100 + }, + { + "epoch": 0.7967636189327519, + "grad_norm": 0.10714568942785263, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 206110 + }, + { + "epoch": 0.7968022761361352, + "grad_norm": 0.0960366278886795, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 206120 + }, + { + "epoch": 0.7968409333395184, + "grad_norm": 0.12267144024372101, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 206130 + }, + { + "epoch": 0.7968795905429018, + "grad_norm": 0.11556356400251389, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 206140 + }, + { + "epoch": 0.7969182477462851, + "grad_norm": 0.09737807512283325, + "learning_rate": 0.002, + "loss": 2.353, + "step": 206150 + }, + { + "epoch": 0.7969569049496683, + "grad_norm": 0.13528063893318176, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 206160 + }, + { + "epoch": 0.7969955621530516, + "grad_norm": 0.09639405459165573, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 206170 + }, + { + "epoch": 0.7970342193564349, + "grad_norm": 0.0964895561337471, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 206180 + }, + { + "epoch": 0.7970728765598182, + "grad_norm": 0.10793425142765045, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 206190 + }, + { + "epoch": 0.7971115337632014, + "grad_norm": 0.10876777023077011, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 206200 + }, + { + "epoch": 0.7971501909665847, + "grad_norm": 0.11629603058099747, + "learning_rate": 0.002, + "loss": 2.339, + "step": 206210 + }, + { + "epoch": 0.797188848169968, + "grad_norm": 0.09371237456798553, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 206220 + }, + { + "epoch": 0.7972275053733513, + "grad_norm": 0.11186051368713379, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 206230 + }, + { + "epoch": 0.7972661625767345, + "grad_norm": 0.107077457010746, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 206240 + }, + { + "epoch": 0.7973048197801178, + "grad_norm": 0.12543454766273499, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 206250 + }, + { + "epoch": 0.797343476983501, + "grad_norm": 0.10010968148708344, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 206260 + }, + { + "epoch": 0.7973821341868844, + "grad_norm": 0.12213873863220215, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 206270 + }, + { + "epoch": 0.7974207913902677, + "grad_norm": 0.1030530333518982, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 206280 + }, + { + "epoch": 0.7974594485936509, + "grad_norm": 0.1313806176185608, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 206290 + }, + { + "epoch": 0.7974981057970342, + "grad_norm": 0.09398311376571655, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 206300 + }, + { + "epoch": 0.7975367630004175, + "grad_norm": 0.11330290138721466, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 206310 + }, + { + "epoch": 0.7975754202038008, + "grad_norm": 0.11221243441104889, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 206320 + }, + { + "epoch": 0.797614077407184, + "grad_norm": 0.11574556678533554, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 206330 + }, + { + "epoch": 0.7976527346105673, + "grad_norm": 0.11567448079586029, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 206340 + }, + { + "epoch": 0.7976913918139507, + "grad_norm": 0.09949743002653122, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 206350 + }, + { + "epoch": 0.7977300490173339, + "grad_norm": 0.10788501054048538, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 206360 + }, + { + "epoch": 0.7977687062207172, + "grad_norm": 0.1038108840584755, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 206370 + }, + { + "epoch": 0.7978073634241004, + "grad_norm": 0.09708569943904877, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 206380 + }, + { + "epoch": 0.7978460206274838, + "grad_norm": 0.10167262703180313, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 206390 + }, + { + "epoch": 0.797884677830867, + "grad_norm": 0.12423945218324661, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 206400 + }, + { + "epoch": 0.7979233350342503, + "grad_norm": 0.10096772015094757, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 206410 + }, + { + "epoch": 0.7979619922376335, + "grad_norm": 0.10546797513961792, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 206420 + }, + { + "epoch": 0.7980006494410168, + "grad_norm": 0.10690139979124069, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 206430 + }, + { + "epoch": 0.7980393066444001, + "grad_norm": 0.10070551186800003, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 206440 + }, + { + "epoch": 0.7980779638477834, + "grad_norm": 0.11277178674936295, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 206450 + }, + { + "epoch": 0.7981166210511667, + "grad_norm": 0.1061372309923172, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 206460 + }, + { + "epoch": 0.7981552782545499, + "grad_norm": 0.11042293906211853, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 206470 + }, + { + "epoch": 0.7981939354579333, + "grad_norm": 0.09617602825164795, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 206480 + }, + { + "epoch": 0.7982325926613165, + "grad_norm": 0.11727935075759888, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 206490 + }, + { + "epoch": 0.7982712498646998, + "grad_norm": 0.10299355536699295, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 206500 + }, + { + "epoch": 0.798309907068083, + "grad_norm": 0.10291710495948792, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 206510 + }, + { + "epoch": 0.7983485642714664, + "grad_norm": 0.09439451992511749, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 206520 + }, + { + "epoch": 0.7983872214748496, + "grad_norm": 0.10197708010673523, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 206530 + }, + { + "epoch": 0.7984258786782329, + "grad_norm": 0.11696817725896835, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 206540 + }, + { + "epoch": 0.7984645358816161, + "grad_norm": 0.10836425423622131, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 206550 + }, + { + "epoch": 0.7985031930849995, + "grad_norm": 0.10485026985406876, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 206560 + }, + { + "epoch": 0.7985418502883828, + "grad_norm": 0.09178721159696579, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 206570 + }, + { + "epoch": 0.798580507491766, + "grad_norm": 0.11446939408779144, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 206580 + }, + { + "epoch": 0.7986191646951493, + "grad_norm": 0.11932355165481567, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 206590 + }, + { + "epoch": 0.7986578218985326, + "grad_norm": 0.10004696249961853, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 206600 + }, + { + "epoch": 0.7986964791019159, + "grad_norm": 0.10023650527000427, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 206610 + }, + { + "epoch": 0.7987351363052991, + "grad_norm": 0.11644736677408218, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 206620 + }, + { + "epoch": 0.7987737935086824, + "grad_norm": 0.10645174980163574, + "learning_rate": 0.002, + "loss": 2.346, + "step": 206630 + }, + { + "epoch": 0.7988124507120656, + "grad_norm": 0.09977910667657852, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 206640 + }, + { + "epoch": 0.798851107915449, + "grad_norm": 0.12423045188188553, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 206650 + }, + { + "epoch": 0.7988897651188323, + "grad_norm": 0.09672603756189346, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 206660 + }, + { + "epoch": 0.7989284223222155, + "grad_norm": 0.09629687666893005, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 206670 + }, + { + "epoch": 0.7989670795255988, + "grad_norm": 0.1061956062912941, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 206680 + }, + { + "epoch": 0.7990057367289821, + "grad_norm": 0.09424664080142975, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 206690 + }, + { + "epoch": 0.7990443939323654, + "grad_norm": 0.11218780279159546, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 206700 + }, + { + "epoch": 0.7990830511357486, + "grad_norm": 0.1013312116265297, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 206710 + }, + { + "epoch": 0.7991217083391319, + "grad_norm": 0.11477039009332657, + "learning_rate": 0.002, + "loss": 2.343, + "step": 206720 + }, + { + "epoch": 0.7991603655425152, + "grad_norm": 0.10796338319778442, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 206730 + }, + { + "epoch": 0.7991990227458985, + "grad_norm": 0.10321065783500671, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 206740 + }, + { + "epoch": 0.7992376799492817, + "grad_norm": 0.10886584967374802, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 206750 + }, + { + "epoch": 0.799276337152665, + "grad_norm": 0.09953373670578003, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 206760 + }, + { + "epoch": 0.7993149943560484, + "grad_norm": 0.11496644467115402, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 206770 + }, + { + "epoch": 0.7993536515594316, + "grad_norm": 0.09579180181026459, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 206780 + }, + { + "epoch": 0.7993923087628149, + "grad_norm": 0.10276324301958084, + "learning_rate": 0.002, + "loss": 2.322, + "step": 206790 + }, + { + "epoch": 0.7994309659661981, + "grad_norm": 0.10177237540483475, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 206800 + }, + { + "epoch": 0.7994696231695814, + "grad_norm": 0.09037131816148758, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 206810 + }, + { + "epoch": 0.7995082803729647, + "grad_norm": 0.09242180734872818, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 206820 + }, + { + "epoch": 0.799546937576348, + "grad_norm": 0.09812980145215988, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 206830 + }, + { + "epoch": 0.7995855947797312, + "grad_norm": 0.1272347867488861, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 206840 + }, + { + "epoch": 0.7996242519831145, + "grad_norm": 0.08748415857553482, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 206850 + }, + { + "epoch": 0.7996629091864979, + "grad_norm": 0.09045463800430298, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 206860 + }, + { + "epoch": 0.7997015663898811, + "grad_norm": 0.18325842916965485, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 206870 + }, + { + "epoch": 0.7997402235932644, + "grad_norm": 0.10421092808246613, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 206880 + }, + { + "epoch": 0.7997788807966476, + "grad_norm": 0.10916467010974884, + "learning_rate": 0.002, + "loss": 2.3628, + "step": 206890 + }, + { + "epoch": 0.799817538000031, + "grad_norm": 0.10946245491504669, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 206900 + }, + { + "epoch": 0.7998561952034142, + "grad_norm": 0.11079275608062744, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 206910 + }, + { + "epoch": 0.7998948524067975, + "grad_norm": 0.10049515962600708, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 206920 + }, + { + "epoch": 0.7999335096101807, + "grad_norm": 0.10186389833688736, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 206930 + }, + { + "epoch": 0.7999721668135641, + "grad_norm": 0.11401376873254776, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 206940 + }, + { + "epoch": 0.8000108240169473, + "grad_norm": 0.11036250740289688, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 206950 + }, + { + "epoch": 0.8000494812203306, + "grad_norm": 0.10665077716112137, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 206960 + }, + { + "epoch": 0.8000881384237138, + "grad_norm": 0.10211837291717529, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 206970 + }, + { + "epoch": 0.8001267956270971, + "grad_norm": 0.12194209545850754, + "learning_rate": 0.002, + "loss": 2.335, + "step": 206980 + }, + { + "epoch": 0.8001654528304805, + "grad_norm": 0.09793148189783096, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 206990 + }, + { + "epoch": 0.8002041100338637, + "grad_norm": 0.12128207832574844, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 207000 + }, + { + "epoch": 0.800242767237247, + "grad_norm": 0.12369337677955627, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 207010 + }, + { + "epoch": 0.8002814244406302, + "grad_norm": 0.11725110560655594, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 207020 + }, + { + "epoch": 0.8003200816440136, + "grad_norm": 0.12829987704753876, + "learning_rate": 0.002, + "loss": 2.335, + "step": 207030 + }, + { + "epoch": 0.8003587388473968, + "grad_norm": 0.09357552975416183, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 207040 + }, + { + "epoch": 0.8003973960507801, + "grad_norm": 0.10493844002485275, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 207050 + }, + { + "epoch": 0.8004360532541633, + "grad_norm": 0.11441578716039658, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 207060 + }, + { + "epoch": 0.8004747104575467, + "grad_norm": 0.09632222354412079, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 207070 + }, + { + "epoch": 0.80051336766093, + "grad_norm": 0.11511845886707306, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 207080 + }, + { + "epoch": 0.8005520248643132, + "grad_norm": 0.10339998453855515, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 207090 + }, + { + "epoch": 0.8005906820676965, + "grad_norm": 0.09853748232126236, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 207100 + }, + { + "epoch": 0.8006293392710798, + "grad_norm": 0.10093691200017929, + "learning_rate": 0.002, + "loss": 2.328, + "step": 207110 + }, + { + "epoch": 0.8006679964744631, + "grad_norm": 0.10225550830364227, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 207120 + }, + { + "epoch": 0.8007066536778463, + "grad_norm": 0.11786044389009476, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 207130 + }, + { + "epoch": 0.8007453108812296, + "grad_norm": 0.09901709854602814, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 207140 + }, + { + "epoch": 0.8007839680846129, + "grad_norm": 0.09919336438179016, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 207150 + }, + { + "epoch": 0.8008226252879962, + "grad_norm": 0.11238361895084381, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 207160 + }, + { + "epoch": 0.8008612824913794, + "grad_norm": 0.10497693717479706, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 207170 + }, + { + "epoch": 0.8008999396947627, + "grad_norm": 0.13379651308059692, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 207180 + }, + { + "epoch": 0.800938596898146, + "grad_norm": 0.10189051181077957, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 207190 + }, + { + "epoch": 0.8009772541015293, + "grad_norm": 0.10896746069192886, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 207200 + }, + { + "epoch": 0.8010159113049126, + "grad_norm": 0.10068132728338242, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 207210 + }, + { + "epoch": 0.8010545685082958, + "grad_norm": 0.1001577228307724, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 207220 + }, + { + "epoch": 0.8010932257116791, + "grad_norm": 0.09665859490633011, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 207230 + }, + { + "epoch": 0.8011318829150624, + "grad_norm": 0.09988411515951157, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 207240 + }, + { + "epoch": 0.8011705401184457, + "grad_norm": 0.12435844540596008, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 207250 + }, + { + "epoch": 0.8012091973218289, + "grad_norm": 0.10896141082048416, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 207260 + }, + { + "epoch": 0.8012478545252122, + "grad_norm": 0.11095938086509705, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 207270 + }, + { + "epoch": 0.8012865117285956, + "grad_norm": 0.11793738603591919, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 207280 + }, + { + "epoch": 0.8013251689319788, + "grad_norm": 0.11566495895385742, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 207290 + }, + { + "epoch": 0.8013638261353621, + "grad_norm": 0.09072500467300415, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 207300 + }, + { + "epoch": 0.8014024833387453, + "grad_norm": 0.10017773509025574, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 207310 + }, + { + "epoch": 0.8014411405421287, + "grad_norm": 0.12244265526533127, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 207320 + }, + { + "epoch": 0.8014797977455119, + "grad_norm": 0.09469608962535858, + "learning_rate": 0.002, + "loss": 2.346, + "step": 207330 + }, + { + "epoch": 0.8015184549488952, + "grad_norm": 0.13152846693992615, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 207340 + }, + { + "epoch": 0.8015571121522784, + "grad_norm": 0.09928417950868607, + "learning_rate": 0.002, + "loss": 2.346, + "step": 207350 + }, + { + "epoch": 0.8015957693556617, + "grad_norm": 0.11704422533512115, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 207360 + }, + { + "epoch": 0.801634426559045, + "grad_norm": 0.11043064296245575, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 207370 + }, + { + "epoch": 0.8016730837624283, + "grad_norm": 0.10696674138307571, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 207380 + }, + { + "epoch": 0.8017117409658115, + "grad_norm": 0.1035086140036583, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 207390 + }, + { + "epoch": 0.8017503981691948, + "grad_norm": 0.14738266170024872, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 207400 + }, + { + "epoch": 0.8017890553725782, + "grad_norm": 0.09723462909460068, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 207410 + }, + { + "epoch": 0.8018277125759614, + "grad_norm": 0.11064042150974274, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 207420 + }, + { + "epoch": 0.8018663697793447, + "grad_norm": 0.09808441996574402, + "learning_rate": 0.002, + "loss": 2.344, + "step": 207430 + }, + { + "epoch": 0.8019050269827279, + "grad_norm": 0.09821963310241699, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 207440 + }, + { + "epoch": 0.8019436841861113, + "grad_norm": 0.09830199927091599, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 207450 + }, + { + "epoch": 0.8019823413894945, + "grad_norm": 0.1040264442563057, + "learning_rate": 0.002, + "loss": 2.355, + "step": 207460 + }, + { + "epoch": 0.8020209985928778, + "grad_norm": 0.09451977163553238, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 207470 + }, + { + "epoch": 0.802059655796261, + "grad_norm": 0.09999929368495941, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 207480 + }, + { + "epoch": 0.8020983129996444, + "grad_norm": 0.1217181608080864, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 207490 + }, + { + "epoch": 0.8021369702030277, + "grad_norm": 0.09493351727724075, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 207500 + }, + { + "epoch": 0.8021756274064109, + "grad_norm": 0.09583764523267746, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 207510 + }, + { + "epoch": 0.8022142846097942, + "grad_norm": 0.1117585077881813, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 207520 + }, + { + "epoch": 0.8022529418131775, + "grad_norm": 0.28829827904701233, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 207530 + }, + { + "epoch": 0.8022915990165608, + "grad_norm": 0.13149257004261017, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 207540 + }, + { + "epoch": 0.802330256219944, + "grad_norm": 0.10232976824045181, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 207550 + }, + { + "epoch": 0.8023689134233273, + "grad_norm": 0.09590692818164825, + "learning_rate": 0.002, + "loss": 2.35, + "step": 207560 + }, + { + "epoch": 0.8024075706267105, + "grad_norm": 0.10753864794969559, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 207570 + }, + { + "epoch": 0.8024462278300939, + "grad_norm": 0.0987880751490593, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 207580 + }, + { + "epoch": 0.8024848850334771, + "grad_norm": 0.09377659857273102, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 207590 + }, + { + "epoch": 0.8025235422368604, + "grad_norm": 0.10115095973014832, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 207600 + }, + { + "epoch": 0.8025621994402437, + "grad_norm": 0.10874200612306595, + "learning_rate": 0.002, + "loss": 2.337, + "step": 207610 + }, + { + "epoch": 0.802600856643627, + "grad_norm": 0.10040194541215897, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 207620 + }, + { + "epoch": 0.8026395138470103, + "grad_norm": 0.11709320545196533, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 207630 + }, + { + "epoch": 0.8026781710503935, + "grad_norm": 0.09988772869110107, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 207640 + }, + { + "epoch": 0.8027168282537768, + "grad_norm": 0.10499367862939835, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 207650 + }, + { + "epoch": 0.8027554854571601, + "grad_norm": 0.13137167692184448, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 207660 + }, + { + "epoch": 0.8027941426605434, + "grad_norm": 0.14281345903873444, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 207670 + }, + { + "epoch": 0.8028327998639266, + "grad_norm": 0.0973275750875473, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 207680 + }, + { + "epoch": 0.8028714570673099, + "grad_norm": 0.10529021918773651, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 207690 + }, + { + "epoch": 0.8029101142706933, + "grad_norm": 0.09296000003814697, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 207700 + }, + { + "epoch": 0.8029487714740765, + "grad_norm": 0.0963568240404129, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 207710 + }, + { + "epoch": 0.8029874286774598, + "grad_norm": 0.1171727329492569, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 207720 + }, + { + "epoch": 0.803026085880843, + "grad_norm": 0.10067886859178543, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 207730 + }, + { + "epoch": 0.8030647430842263, + "grad_norm": 0.10748159140348434, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 207740 + }, + { + "epoch": 0.8031034002876096, + "grad_norm": 0.10766609013080597, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 207750 + }, + { + "epoch": 0.8031420574909929, + "grad_norm": 0.11034165322780609, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 207760 + }, + { + "epoch": 0.8031807146943761, + "grad_norm": 0.11566515266895294, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 207770 + }, + { + "epoch": 0.8032193718977594, + "grad_norm": 0.09075692296028137, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 207780 + }, + { + "epoch": 0.8032580291011427, + "grad_norm": 0.12387843430042267, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 207790 + }, + { + "epoch": 0.803296686304526, + "grad_norm": 0.1029786691069603, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 207800 + }, + { + "epoch": 0.8033353435079092, + "grad_norm": 0.09761625528335571, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 207810 + }, + { + "epoch": 0.8033740007112925, + "grad_norm": 0.11875072866678238, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 207820 + }, + { + "epoch": 0.8034126579146759, + "grad_norm": 0.12390270084142685, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 207830 + }, + { + "epoch": 0.8034513151180591, + "grad_norm": 0.14737088978290558, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 207840 + }, + { + "epoch": 0.8034899723214424, + "grad_norm": 0.11713667958974838, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 207850 + }, + { + "epoch": 0.8035286295248256, + "grad_norm": 0.10130646824836731, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 207860 + }, + { + "epoch": 0.803567286728209, + "grad_norm": 0.11106660962104797, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 207870 + }, + { + "epoch": 0.8036059439315922, + "grad_norm": 0.10673379898071289, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 207880 + }, + { + "epoch": 0.8036446011349755, + "grad_norm": 0.105867400765419, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 207890 + }, + { + "epoch": 0.8036832583383587, + "grad_norm": 0.10368683189153671, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 207900 + }, + { + "epoch": 0.803721915541742, + "grad_norm": 0.1729949712753296, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 207910 + }, + { + "epoch": 0.8037605727451254, + "grad_norm": 0.12230349332094193, + "learning_rate": 0.002, + "loss": 2.342, + "step": 207920 + }, + { + "epoch": 0.8037992299485086, + "grad_norm": 0.16800034046173096, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 207930 + }, + { + "epoch": 0.8038378871518919, + "grad_norm": 0.11105374246835709, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 207940 + }, + { + "epoch": 0.8038765443552751, + "grad_norm": 0.11874057352542877, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 207950 + }, + { + "epoch": 0.8039152015586585, + "grad_norm": 0.10733795911073685, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 207960 + }, + { + "epoch": 0.8039538587620417, + "grad_norm": 0.10871712863445282, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 207970 + }, + { + "epoch": 0.803992515965425, + "grad_norm": 0.10429523140192032, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 207980 + }, + { + "epoch": 0.8040311731688082, + "grad_norm": 0.0885714516043663, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 207990 + }, + { + "epoch": 0.8040698303721916, + "grad_norm": 0.10083181411027908, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 208000 + }, + { + "epoch": 0.8041084875755748, + "grad_norm": 0.10782654583454132, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 208010 + }, + { + "epoch": 0.8041471447789581, + "grad_norm": 0.10470322519540787, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 208020 + }, + { + "epoch": 0.8041858019823414, + "grad_norm": 0.11609132587909698, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 208030 + }, + { + "epoch": 0.8042244591857247, + "grad_norm": 0.09933394938707352, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 208040 + }, + { + "epoch": 0.804263116389108, + "grad_norm": 0.10304474085569382, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 208050 + }, + { + "epoch": 0.8043017735924912, + "grad_norm": 0.10988190770149231, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 208060 + }, + { + "epoch": 0.8043404307958745, + "grad_norm": 0.09970662742853165, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 208070 + }, + { + "epoch": 0.8043790879992578, + "grad_norm": 0.12120552361011505, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 208080 + }, + { + "epoch": 0.8044177452026411, + "grad_norm": 0.10486899316310883, + "learning_rate": 0.002, + "loss": 2.331, + "step": 208090 + }, + { + "epoch": 0.8044564024060243, + "grad_norm": 0.1178613230586052, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 208100 + }, + { + "epoch": 0.8044950596094076, + "grad_norm": 0.10001028329133987, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 208110 + }, + { + "epoch": 0.8045337168127908, + "grad_norm": 0.09504573047161102, + "learning_rate": 0.002, + "loss": 2.3626, + "step": 208120 + }, + { + "epoch": 0.8045723740161742, + "grad_norm": 0.09070435911417007, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 208130 + }, + { + "epoch": 0.8046110312195575, + "grad_norm": 0.09632452577352524, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 208140 + }, + { + "epoch": 0.8046496884229407, + "grad_norm": 0.11543071269989014, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 208150 + }, + { + "epoch": 0.804688345626324, + "grad_norm": 0.104801706969738, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 208160 + }, + { + "epoch": 0.8047270028297073, + "grad_norm": 0.09996666014194489, + "learning_rate": 0.002, + "loss": 2.341, + "step": 208170 + }, + { + "epoch": 0.8047656600330906, + "grad_norm": 0.10567789524793625, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 208180 + }, + { + "epoch": 0.8048043172364738, + "grad_norm": 0.0922398567199707, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 208190 + }, + { + "epoch": 0.8048429744398571, + "grad_norm": 0.10439912974834442, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 208200 + }, + { + "epoch": 0.8048816316432404, + "grad_norm": 0.10547574609518051, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 208210 + }, + { + "epoch": 0.8049202888466237, + "grad_norm": 0.09658071398735046, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 208220 + }, + { + "epoch": 0.804958946050007, + "grad_norm": 0.0904550701379776, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 208230 + }, + { + "epoch": 0.8049976032533902, + "grad_norm": 0.10824532806873322, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 208240 + }, + { + "epoch": 0.8050362604567736, + "grad_norm": 0.11004411429166794, + "learning_rate": 0.002, + "loss": 2.323, + "step": 208250 + }, + { + "epoch": 0.8050749176601568, + "grad_norm": 0.11853194236755371, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 208260 + }, + { + "epoch": 0.8051135748635401, + "grad_norm": 0.1130291149020195, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 208270 + }, + { + "epoch": 0.8051522320669233, + "grad_norm": 0.09779933094978333, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 208280 + }, + { + "epoch": 0.8051908892703066, + "grad_norm": 0.10665308684110641, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 208290 + }, + { + "epoch": 0.8052295464736899, + "grad_norm": 0.10099265724420547, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 208300 + }, + { + "epoch": 0.8052682036770732, + "grad_norm": 0.12212111800909042, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 208310 + }, + { + "epoch": 0.8053068608804564, + "grad_norm": 0.10795871913433075, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 208320 + }, + { + "epoch": 0.8053455180838397, + "grad_norm": 0.10837031155824661, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 208330 + }, + { + "epoch": 0.8053841752872231, + "grad_norm": 0.10169872641563416, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 208340 + }, + { + "epoch": 0.8054228324906063, + "grad_norm": 0.10496000945568085, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 208350 + }, + { + "epoch": 0.8054614896939896, + "grad_norm": 0.08883891254663467, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 208360 + }, + { + "epoch": 0.8055001468973728, + "grad_norm": 0.10567431151866913, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 208370 + }, + { + "epoch": 0.8055388041007562, + "grad_norm": 0.12218322604894638, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 208380 + }, + { + "epoch": 0.8055774613041394, + "grad_norm": 0.11944884061813354, + "learning_rate": 0.002, + "loss": 2.333, + "step": 208390 + }, + { + "epoch": 0.8056161185075227, + "grad_norm": 0.09559208154678345, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 208400 + }, + { + "epoch": 0.8056547757109059, + "grad_norm": 0.11223572492599487, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 208410 + }, + { + "epoch": 0.8056934329142893, + "grad_norm": 0.09969627857208252, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 208420 + }, + { + "epoch": 0.8057320901176726, + "grad_norm": 0.10251709073781967, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 208430 + }, + { + "epoch": 0.8057707473210558, + "grad_norm": 0.10395486652851105, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 208440 + }, + { + "epoch": 0.805809404524439, + "grad_norm": 0.10985516756772995, + "learning_rate": 0.002, + "loss": 2.348, + "step": 208450 + }, + { + "epoch": 0.8058480617278224, + "grad_norm": 0.11914392560720444, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 208460 + }, + { + "epoch": 0.8058867189312057, + "grad_norm": 0.09631257504224777, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 208470 + }, + { + "epoch": 0.8059253761345889, + "grad_norm": 0.1129194125533104, + "learning_rate": 0.002, + "loss": 2.341, + "step": 208480 + }, + { + "epoch": 0.8059640333379722, + "grad_norm": 0.09658629447221756, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 208490 + }, + { + "epoch": 0.8060026905413554, + "grad_norm": 0.11044806241989136, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 208500 + }, + { + "epoch": 0.8060413477447388, + "grad_norm": 0.1090376228094101, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 208510 + }, + { + "epoch": 0.806080004948122, + "grad_norm": 0.0996435284614563, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 208520 + }, + { + "epoch": 0.8061186621515053, + "grad_norm": 0.10276366770267487, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 208530 + }, + { + "epoch": 0.8061573193548885, + "grad_norm": 0.10204587876796722, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 208540 + }, + { + "epoch": 0.8061959765582719, + "grad_norm": 0.09693923592567444, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 208550 + }, + { + "epoch": 0.8062346337616552, + "grad_norm": 0.10240539163351059, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 208560 + }, + { + "epoch": 0.8062732909650384, + "grad_norm": 0.09578530490398407, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 208570 + }, + { + "epoch": 0.8063119481684217, + "grad_norm": 0.1167021319270134, + "learning_rate": 0.002, + "loss": 2.317, + "step": 208580 + }, + { + "epoch": 0.806350605371805, + "grad_norm": 0.09664864093065262, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 208590 + }, + { + "epoch": 0.8063892625751883, + "grad_norm": 0.09875663369894028, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 208600 + }, + { + "epoch": 0.8064279197785715, + "grad_norm": 0.09840506315231323, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 208610 + }, + { + "epoch": 0.8064665769819548, + "grad_norm": 0.08871841430664062, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 208620 + }, + { + "epoch": 0.8065052341853381, + "grad_norm": 0.08804570138454437, + "learning_rate": 0.002, + "loss": 2.339, + "step": 208630 + }, + { + "epoch": 0.8065438913887214, + "grad_norm": 0.12451464682817459, + "learning_rate": 0.002, + "loss": 2.346, + "step": 208640 + }, + { + "epoch": 0.8065825485921047, + "grad_norm": 0.09834271669387817, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 208650 + }, + { + "epoch": 0.8066212057954879, + "grad_norm": 0.10178635269403458, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 208660 + }, + { + "epoch": 0.8066598629988712, + "grad_norm": 0.09166623651981354, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 208670 + }, + { + "epoch": 0.8066985202022545, + "grad_norm": 0.09693189710378647, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 208680 + }, + { + "epoch": 0.8067371774056378, + "grad_norm": 0.09537971019744873, + "learning_rate": 0.002, + "loss": 2.334, + "step": 208690 + }, + { + "epoch": 0.806775834609021, + "grad_norm": 0.09352777153253555, + "learning_rate": 0.002, + "loss": 2.33, + "step": 208700 + }, + { + "epoch": 0.8068144918124043, + "grad_norm": 0.097089983522892, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 208710 + }, + { + "epoch": 0.8068531490157876, + "grad_norm": 0.11535824090242386, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 208720 + }, + { + "epoch": 0.8068918062191709, + "grad_norm": 0.12450321763753891, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 208730 + }, + { + "epoch": 0.8069304634225541, + "grad_norm": 0.09889606386423111, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 208740 + }, + { + "epoch": 0.8069691206259374, + "grad_norm": 0.10972964018583298, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 208750 + }, + { + "epoch": 0.8070077778293208, + "grad_norm": 0.10638296604156494, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 208760 + }, + { + "epoch": 0.807046435032704, + "grad_norm": 0.09678032994270325, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 208770 + }, + { + "epoch": 0.8070850922360873, + "grad_norm": 0.11467410624027252, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 208780 + }, + { + "epoch": 0.8071237494394705, + "grad_norm": 0.10477212816476822, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 208790 + }, + { + "epoch": 0.8071624066428539, + "grad_norm": 0.12658697366714478, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 208800 + }, + { + "epoch": 0.8072010638462371, + "grad_norm": 0.10979489237070084, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 208810 + }, + { + "epoch": 0.8072397210496204, + "grad_norm": 0.10843465477228165, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 208820 + }, + { + "epoch": 0.8072783782530036, + "grad_norm": 0.09200742840766907, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 208830 + }, + { + "epoch": 0.8073170354563869, + "grad_norm": 0.09514549374580383, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 208840 + }, + { + "epoch": 0.8073556926597703, + "grad_norm": 0.1180688887834549, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 208850 + }, + { + "epoch": 0.8073943498631535, + "grad_norm": 0.09553969651460648, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 208860 + }, + { + "epoch": 0.8074330070665368, + "grad_norm": 0.10637307912111282, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 208870 + }, + { + "epoch": 0.80747166426992, + "grad_norm": 0.10888589173555374, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 208880 + }, + { + "epoch": 0.8075103214733034, + "grad_norm": 0.09537948668003082, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 208890 + }, + { + "epoch": 0.8075489786766866, + "grad_norm": 0.10661628842353821, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 208900 + }, + { + "epoch": 0.8075876358800699, + "grad_norm": 0.13251517713069916, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 208910 + }, + { + "epoch": 0.8076262930834531, + "grad_norm": 0.10121644288301468, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 208920 + }, + { + "epoch": 0.8076649502868365, + "grad_norm": 0.27895215153694153, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 208930 + }, + { + "epoch": 0.8077036074902197, + "grad_norm": 0.09418975561857224, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 208940 + }, + { + "epoch": 0.807742264693603, + "grad_norm": 0.1138615608215332, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 208950 + }, + { + "epoch": 0.8077809218969862, + "grad_norm": 0.106838159263134, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 208960 + }, + { + "epoch": 0.8078195791003696, + "grad_norm": 0.11634216457605362, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 208970 + }, + { + "epoch": 0.8078582363037529, + "grad_norm": 0.11513422429561615, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 208980 + }, + { + "epoch": 0.8078968935071361, + "grad_norm": 0.11838242411613464, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 208990 + }, + { + "epoch": 0.8079355507105194, + "grad_norm": 0.10261435061693192, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 209000 + }, + { + "epoch": 0.8079742079139027, + "grad_norm": 0.09122615307569504, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 209010 + }, + { + "epoch": 0.808012865117286, + "grad_norm": 0.1018514409661293, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 209020 + }, + { + "epoch": 0.8080515223206692, + "grad_norm": 0.11309902369976044, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 209030 + }, + { + "epoch": 0.8080901795240525, + "grad_norm": 0.09477593749761581, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 209040 + }, + { + "epoch": 0.8081288367274357, + "grad_norm": 0.11209241300821304, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 209050 + }, + { + "epoch": 0.8081674939308191, + "grad_norm": 0.11718212068080902, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 209060 + }, + { + "epoch": 0.8082061511342024, + "grad_norm": 0.10840430110692978, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 209070 + }, + { + "epoch": 0.8082448083375856, + "grad_norm": 0.09068944305181503, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 209080 + }, + { + "epoch": 0.8082834655409689, + "grad_norm": 0.10419811308383942, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 209090 + }, + { + "epoch": 0.8083221227443522, + "grad_norm": 0.10249201208353043, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 209100 + }, + { + "epoch": 0.8083607799477355, + "grad_norm": 0.09948867559432983, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 209110 + }, + { + "epoch": 0.8083994371511187, + "grad_norm": 0.10818983614444733, + "learning_rate": 0.002, + "loss": 2.342, + "step": 209120 + }, + { + "epoch": 0.808438094354502, + "grad_norm": 0.09621796011924744, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 209130 + }, + { + "epoch": 0.8084767515578853, + "grad_norm": 0.09759146720170975, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 209140 + }, + { + "epoch": 0.8085154087612686, + "grad_norm": 0.11393880844116211, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 209150 + }, + { + "epoch": 0.8085540659646518, + "grad_norm": 0.10928820818662643, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 209160 + }, + { + "epoch": 0.8085927231680351, + "grad_norm": 0.1005977988243103, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 209170 + }, + { + "epoch": 0.8086313803714185, + "grad_norm": 0.10145454853773117, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 209180 + }, + { + "epoch": 0.8086700375748017, + "grad_norm": 0.09967648983001709, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 209190 + }, + { + "epoch": 0.808708694778185, + "grad_norm": 0.10319273173809052, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 209200 + }, + { + "epoch": 0.8087473519815682, + "grad_norm": 0.10071306675672531, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 209210 + }, + { + "epoch": 0.8087860091849515, + "grad_norm": 0.09968692809343338, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 209220 + }, + { + "epoch": 0.8088246663883348, + "grad_norm": 0.11552612483501434, + "learning_rate": 0.002, + "loss": 2.321, + "step": 209230 + }, + { + "epoch": 0.8088633235917181, + "grad_norm": 0.10566051304340363, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 209240 + }, + { + "epoch": 0.8089019807951013, + "grad_norm": 0.10294246673583984, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 209250 + }, + { + "epoch": 0.8089406379984846, + "grad_norm": 0.10880181193351746, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 209260 + }, + { + "epoch": 0.808979295201868, + "grad_norm": 0.10486720502376556, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 209270 + }, + { + "epoch": 0.8090179524052512, + "grad_norm": 0.10026592761278152, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 209280 + }, + { + "epoch": 0.8090566096086345, + "grad_norm": 0.0959942489862442, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 209290 + }, + { + "epoch": 0.8090952668120177, + "grad_norm": 0.11561115086078644, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 209300 + }, + { + "epoch": 0.8091339240154011, + "grad_norm": 0.09788880497217178, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 209310 + }, + { + "epoch": 0.8091725812187843, + "grad_norm": 0.08990046381950378, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 209320 + }, + { + "epoch": 0.8092112384221676, + "grad_norm": 0.10459993034601212, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 209330 + }, + { + "epoch": 0.8092498956255508, + "grad_norm": 0.10062652826309204, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 209340 + }, + { + "epoch": 0.8092885528289342, + "grad_norm": 0.11534670740365982, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 209350 + }, + { + "epoch": 0.8093272100323174, + "grad_norm": 0.1055721789598465, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 209360 + }, + { + "epoch": 0.8093658672357007, + "grad_norm": 0.10717543959617615, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 209370 + }, + { + "epoch": 0.809404524439084, + "grad_norm": 0.12222599238157272, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 209380 + }, + { + "epoch": 0.8094431816424672, + "grad_norm": 0.11402303725481033, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 209390 + }, + { + "epoch": 0.8094818388458506, + "grad_norm": 0.09476860612630844, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 209400 + }, + { + "epoch": 0.8095204960492338, + "grad_norm": 0.12673380970954895, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 209410 + }, + { + "epoch": 0.8095591532526171, + "grad_norm": 0.1033923402428627, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 209420 + }, + { + "epoch": 0.8095978104560003, + "grad_norm": 0.14089608192443848, + "learning_rate": 0.002, + "loss": 2.3661, + "step": 209430 + }, + { + "epoch": 0.8096364676593837, + "grad_norm": 0.10108928382396698, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 209440 + }, + { + "epoch": 0.8096751248627669, + "grad_norm": 0.11482837796211243, + "learning_rate": 0.002, + "loss": 2.354, + "step": 209450 + }, + { + "epoch": 0.8097137820661502, + "grad_norm": 0.1034383699297905, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 209460 + }, + { + "epoch": 0.8097524392695334, + "grad_norm": 0.09372849017381668, + "learning_rate": 0.002, + "loss": 2.327, + "step": 209470 + }, + { + "epoch": 0.8097910964729168, + "grad_norm": 0.10579051077365875, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 209480 + }, + { + "epoch": 0.8098297536763001, + "grad_norm": 0.09323503077030182, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 209490 + }, + { + "epoch": 0.8098684108796833, + "grad_norm": 0.09396940469741821, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 209500 + }, + { + "epoch": 0.8099070680830666, + "grad_norm": 0.09449894726276398, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 209510 + }, + { + "epoch": 0.8099457252864499, + "grad_norm": 0.10405312478542328, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 209520 + }, + { + "epoch": 0.8099843824898332, + "grad_norm": 0.10237361490726471, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 209530 + }, + { + "epoch": 0.8100230396932164, + "grad_norm": 0.11019392311573029, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 209540 + }, + { + "epoch": 0.8100616968965997, + "grad_norm": 0.12901364266872406, + "learning_rate": 0.002, + "loss": 2.329, + "step": 209550 + }, + { + "epoch": 0.810100354099983, + "grad_norm": 0.11369182914495468, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 209560 + }, + { + "epoch": 0.8101390113033663, + "grad_norm": 0.11985727399587631, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 209570 + }, + { + "epoch": 0.8101776685067495, + "grad_norm": 0.12362784147262573, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 209580 + }, + { + "epoch": 0.8102163257101328, + "grad_norm": 0.11008656769990921, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 209590 + }, + { + "epoch": 0.810254982913516, + "grad_norm": 0.09525039047002792, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 209600 + }, + { + "epoch": 0.8102936401168994, + "grad_norm": 0.09465383738279343, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 209610 + }, + { + "epoch": 0.8103322973202827, + "grad_norm": 0.12793488800525665, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 209620 + }, + { + "epoch": 0.8103709545236659, + "grad_norm": 0.11443587392568588, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 209630 + }, + { + "epoch": 0.8104096117270492, + "grad_norm": 0.09764964878559113, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 209640 + }, + { + "epoch": 0.8104482689304325, + "grad_norm": 0.10466815531253815, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 209650 + }, + { + "epoch": 0.8104869261338158, + "grad_norm": 0.09393223375082016, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 209660 + }, + { + "epoch": 0.810525583337199, + "grad_norm": 0.10785609483718872, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 209670 + }, + { + "epoch": 0.8105642405405823, + "grad_norm": 0.09427961707115173, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 209680 + }, + { + "epoch": 0.8106028977439657, + "grad_norm": 0.1001754030585289, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 209690 + }, + { + "epoch": 0.8106415549473489, + "grad_norm": 0.11797784268856049, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 209700 + }, + { + "epoch": 0.8106802121507322, + "grad_norm": 0.1397496461868286, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 209710 + }, + { + "epoch": 0.8107188693541154, + "grad_norm": 0.10363800823688507, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 209720 + }, + { + "epoch": 0.8107575265574988, + "grad_norm": 0.11160644888877869, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 209730 + }, + { + "epoch": 0.810796183760882, + "grad_norm": 0.10050149261951447, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 209740 + }, + { + "epoch": 0.8108348409642653, + "grad_norm": 0.11539889872074127, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 209750 + }, + { + "epoch": 0.8108734981676485, + "grad_norm": 0.09776457399129868, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 209760 + }, + { + "epoch": 0.8109121553710318, + "grad_norm": 0.13011224567890167, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 209770 + }, + { + "epoch": 0.8109508125744151, + "grad_norm": 0.10698943585157394, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 209780 + }, + { + "epoch": 0.8109894697777984, + "grad_norm": 0.08842163532972336, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 209790 + }, + { + "epoch": 0.8110281269811817, + "grad_norm": 0.10007239878177643, + "learning_rate": 0.002, + "loss": 2.349, + "step": 209800 + }, + { + "epoch": 0.8110667841845649, + "grad_norm": 0.09910252690315247, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 209810 + }, + { + "epoch": 0.8111054413879483, + "grad_norm": 0.10531774908304214, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 209820 + }, + { + "epoch": 0.8111440985913315, + "grad_norm": 0.09842731058597565, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 209830 + }, + { + "epoch": 0.8111827557947148, + "grad_norm": 0.1132144182920456, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 209840 + }, + { + "epoch": 0.811221412998098, + "grad_norm": 0.10505369305610657, + "learning_rate": 0.002, + "loss": 2.351, + "step": 209850 + }, + { + "epoch": 0.8112600702014814, + "grad_norm": 0.09998974204063416, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 209860 + }, + { + "epoch": 0.8112987274048646, + "grad_norm": 0.1102500855922699, + "learning_rate": 0.002, + "loss": 2.355, + "step": 209870 + }, + { + "epoch": 0.8113373846082479, + "grad_norm": 0.10504692792892456, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 209880 + }, + { + "epoch": 0.8113760418116311, + "grad_norm": 0.10265660285949707, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 209890 + }, + { + "epoch": 0.8114146990150145, + "grad_norm": 0.1298140585422516, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 209900 + }, + { + "epoch": 0.8114533562183978, + "grad_norm": 0.11420505493879318, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 209910 + }, + { + "epoch": 0.811492013421781, + "grad_norm": 0.10623852163553238, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 209920 + }, + { + "epoch": 0.8115306706251643, + "grad_norm": 0.09393060207366943, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 209930 + }, + { + "epoch": 0.8115693278285476, + "grad_norm": 0.1149723008275032, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 209940 + }, + { + "epoch": 0.8116079850319309, + "grad_norm": 0.12244723737239838, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 209950 + }, + { + "epoch": 0.8116466422353141, + "grad_norm": 0.10593932121992111, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 209960 + }, + { + "epoch": 0.8116852994386974, + "grad_norm": 0.10085232555866241, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 209970 + }, + { + "epoch": 0.8117239566420806, + "grad_norm": 0.10210473090410233, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 209980 + }, + { + "epoch": 0.811762613845464, + "grad_norm": 0.09529945999383926, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 209990 + }, + { + "epoch": 0.8118012710488473, + "grad_norm": 0.11046209186315536, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 210000 + }, + { + "epoch": 0.8118399282522305, + "grad_norm": 0.10785539448261261, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 210010 + }, + { + "epoch": 0.8118785854556138, + "grad_norm": 0.12557782232761383, + "learning_rate": 0.002, + "loss": 2.341, + "step": 210020 + }, + { + "epoch": 0.8119172426589971, + "grad_norm": 0.11159656941890717, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 210030 + }, + { + "epoch": 0.8119558998623804, + "grad_norm": 0.10711421072483063, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 210040 + }, + { + "epoch": 0.8119945570657636, + "grad_norm": 0.10146241635084152, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 210050 + }, + { + "epoch": 0.8120332142691469, + "grad_norm": 0.10713270306587219, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 210060 + }, + { + "epoch": 0.8120718714725302, + "grad_norm": 0.10487532615661621, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 210070 + }, + { + "epoch": 0.8121105286759135, + "grad_norm": 0.09500955045223236, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 210080 + }, + { + "epoch": 0.8121491858792967, + "grad_norm": 0.10562046617269516, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 210090 + }, + { + "epoch": 0.81218784308268, + "grad_norm": 0.11464356631040573, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 210100 + }, + { + "epoch": 0.8122265002860634, + "grad_norm": 0.10423224419355392, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 210110 + }, + { + "epoch": 0.8122651574894466, + "grad_norm": 0.09312635660171509, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 210120 + }, + { + "epoch": 0.8123038146928299, + "grad_norm": 0.10562204569578171, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 210130 + }, + { + "epoch": 0.8123424718962131, + "grad_norm": 0.12157256156206131, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 210140 + }, + { + "epoch": 0.8123811290995964, + "grad_norm": 0.09659356623888016, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 210150 + }, + { + "epoch": 0.8124197863029797, + "grad_norm": 0.1745530515909195, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 210160 + }, + { + "epoch": 0.812458443506363, + "grad_norm": 0.1092507392168045, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 210170 + }, + { + "epoch": 0.8124971007097462, + "grad_norm": 0.11003392189741135, + "learning_rate": 0.002, + "loss": 2.357, + "step": 210180 + }, + { + "epoch": 0.8125357579131295, + "grad_norm": 0.09807730466127396, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 210190 + }, + { + "epoch": 0.8125744151165128, + "grad_norm": 0.11290355026721954, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 210200 + }, + { + "epoch": 0.8126130723198961, + "grad_norm": 0.1134817823767662, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 210210 + }, + { + "epoch": 0.8126517295232794, + "grad_norm": 0.09779141843318939, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 210220 + }, + { + "epoch": 0.8126903867266626, + "grad_norm": 0.10998640209436417, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 210230 + }, + { + "epoch": 0.812729043930046, + "grad_norm": 0.10387808084487915, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 210240 + }, + { + "epoch": 0.8127677011334292, + "grad_norm": 0.09765990078449249, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 210250 + }, + { + "epoch": 0.8128063583368125, + "grad_norm": 0.09933991730213165, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 210260 + }, + { + "epoch": 0.8128450155401957, + "grad_norm": 0.09934469312429428, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 210270 + }, + { + "epoch": 0.8128836727435791, + "grad_norm": 0.11484673619270325, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 210280 + }, + { + "epoch": 0.8129223299469623, + "grad_norm": 0.091299869120121, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 210290 + }, + { + "epoch": 0.8129609871503456, + "grad_norm": 0.1021435409784317, + "learning_rate": 0.002, + "loss": 2.338, + "step": 210300 + }, + { + "epoch": 0.8129996443537288, + "grad_norm": 0.10851260274648666, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 210310 + }, + { + "epoch": 0.8130383015571121, + "grad_norm": 0.08935797214508057, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 210320 + }, + { + "epoch": 0.8130769587604955, + "grad_norm": 0.09947099536657333, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 210330 + }, + { + "epoch": 0.8131156159638787, + "grad_norm": 0.1050669401884079, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 210340 + }, + { + "epoch": 0.813154273167262, + "grad_norm": 0.09838138520717621, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 210350 + }, + { + "epoch": 0.8131929303706452, + "grad_norm": 0.09755048900842667, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 210360 + }, + { + "epoch": 0.8132315875740286, + "grad_norm": 0.12264610826969147, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 210370 + }, + { + "epoch": 0.8132702447774118, + "grad_norm": 0.11399409174919128, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 210380 + }, + { + "epoch": 0.8133089019807951, + "grad_norm": 0.09926419705152512, + "learning_rate": 0.002, + "loss": 2.338, + "step": 210390 + }, + { + "epoch": 0.8133475591841783, + "grad_norm": 0.095893993973732, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 210400 + }, + { + "epoch": 0.8133862163875617, + "grad_norm": 0.1031627431511879, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 210410 + }, + { + "epoch": 0.813424873590945, + "grad_norm": 0.11482734978199005, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 210420 + }, + { + "epoch": 0.8134635307943282, + "grad_norm": 0.08721525222063065, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 210430 + }, + { + "epoch": 0.8135021879977115, + "grad_norm": 0.10867653042078018, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 210440 + }, + { + "epoch": 0.8135408452010948, + "grad_norm": 0.09797938913106918, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 210450 + }, + { + "epoch": 0.8135795024044781, + "grad_norm": 0.10617242753505707, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 210460 + }, + { + "epoch": 0.8136181596078613, + "grad_norm": 0.11680958420038223, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 210470 + }, + { + "epoch": 0.8136568168112446, + "grad_norm": 0.10997258871793747, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 210480 + }, + { + "epoch": 0.8136954740146279, + "grad_norm": 0.10637106001377106, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 210490 + }, + { + "epoch": 0.8137341312180112, + "grad_norm": 0.0961093157529831, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 210500 + }, + { + "epoch": 0.8137727884213944, + "grad_norm": 0.1117602214217186, + "learning_rate": 0.002, + "loss": 2.34, + "step": 210510 + }, + { + "epoch": 0.8138114456247777, + "grad_norm": 0.09345916658639908, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 210520 + }, + { + "epoch": 0.813850102828161, + "grad_norm": 0.12478433549404144, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 210530 + }, + { + "epoch": 0.8138887600315443, + "grad_norm": 0.09902498126029968, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 210540 + }, + { + "epoch": 0.8139274172349276, + "grad_norm": 0.12334750592708588, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 210550 + }, + { + "epoch": 0.8139660744383108, + "grad_norm": 0.11691374331712723, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 210560 + }, + { + "epoch": 0.8140047316416941, + "grad_norm": 0.1074489951133728, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 210570 + }, + { + "epoch": 0.8140433888450774, + "grad_norm": 0.1104162260890007, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 210580 + }, + { + "epoch": 0.8140820460484607, + "grad_norm": 0.09845101833343506, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 210590 + }, + { + "epoch": 0.8141207032518439, + "grad_norm": 0.10408642143011093, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 210600 + }, + { + "epoch": 0.8141593604552272, + "grad_norm": 0.1344076693058014, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 210610 + }, + { + "epoch": 0.8141980176586106, + "grad_norm": 0.09189781546592712, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 210620 + }, + { + "epoch": 0.8142366748619938, + "grad_norm": 0.09596944600343704, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 210630 + }, + { + "epoch": 0.814275332065377, + "grad_norm": 0.10636798292398453, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 210640 + }, + { + "epoch": 0.8143139892687603, + "grad_norm": 0.10187642276287079, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 210650 + }, + { + "epoch": 0.8143526464721437, + "grad_norm": 0.10037054866552353, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 210660 + }, + { + "epoch": 0.8143913036755269, + "grad_norm": 0.113653264939785, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 210670 + }, + { + "epoch": 0.8144299608789102, + "grad_norm": 0.1081879585981369, + "learning_rate": 0.002, + "loss": 2.334, + "step": 210680 + }, + { + "epoch": 0.8144686180822934, + "grad_norm": 0.10552342981100082, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 210690 + }, + { + "epoch": 0.8145072752856767, + "grad_norm": 0.12662236392498016, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 210700 + }, + { + "epoch": 0.81454593248906, + "grad_norm": 0.10322510451078415, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 210710 + }, + { + "epoch": 0.8145845896924433, + "grad_norm": 0.10761570185422897, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 210720 + }, + { + "epoch": 0.8146232468958265, + "grad_norm": 0.09574377536773682, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 210730 + }, + { + "epoch": 0.8146619040992098, + "grad_norm": 0.10037858039140701, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 210740 + }, + { + "epoch": 0.8147005613025932, + "grad_norm": 0.13704144954681396, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 210750 + }, + { + "epoch": 0.8147392185059764, + "grad_norm": 0.09215878695249557, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 210760 + }, + { + "epoch": 0.8147778757093597, + "grad_norm": 0.09566718339920044, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 210770 + }, + { + "epoch": 0.8148165329127429, + "grad_norm": 0.1045469343662262, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 210780 + }, + { + "epoch": 0.8148551901161263, + "grad_norm": 0.11823946982622147, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 210790 + }, + { + "epoch": 0.8148938473195095, + "grad_norm": 0.10271833091974258, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 210800 + }, + { + "epoch": 0.8149325045228928, + "grad_norm": 0.09970005601644516, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 210810 + }, + { + "epoch": 0.814971161726276, + "grad_norm": 0.10856861621141434, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 210820 + }, + { + "epoch": 0.8150098189296594, + "grad_norm": 0.10430700331926346, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 210830 + }, + { + "epoch": 0.8150484761330427, + "grad_norm": 0.11050914973020554, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 210840 + }, + { + "epoch": 0.8150871333364259, + "grad_norm": 0.11571375280618668, + "learning_rate": 0.002, + "loss": 2.337, + "step": 210850 + }, + { + "epoch": 0.8151257905398092, + "grad_norm": 0.1321517676115036, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 210860 + }, + { + "epoch": 0.8151644477431925, + "grad_norm": 0.10609392821788788, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 210870 + }, + { + "epoch": 0.8152031049465758, + "grad_norm": 0.111605204641819, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 210880 + }, + { + "epoch": 0.815241762149959, + "grad_norm": 0.10075222700834274, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 210890 + }, + { + "epoch": 0.8152804193533423, + "grad_norm": 0.11976370215415955, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 210900 + }, + { + "epoch": 0.8153190765567255, + "grad_norm": 0.09836943447589874, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 210910 + }, + { + "epoch": 0.8153577337601089, + "grad_norm": 0.0988221988081932, + "learning_rate": 0.002, + "loss": 2.331, + "step": 210920 + }, + { + "epoch": 0.8153963909634921, + "grad_norm": 0.11077386885881424, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 210930 + }, + { + "epoch": 0.8154350481668754, + "grad_norm": 0.10229333490133286, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 210940 + }, + { + "epoch": 0.8154737053702587, + "grad_norm": 0.10310441255569458, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 210950 + }, + { + "epoch": 0.815512362573642, + "grad_norm": 0.10184182971715927, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 210960 + }, + { + "epoch": 0.8155510197770253, + "grad_norm": 0.08794326335191727, + "learning_rate": 0.002, + "loss": 2.318, + "step": 210970 + }, + { + "epoch": 0.8155896769804085, + "grad_norm": 0.09792374819517136, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 210980 + }, + { + "epoch": 0.8156283341837918, + "grad_norm": 0.11295660585165024, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 210990 + }, + { + "epoch": 0.8156669913871751, + "grad_norm": 0.09905468672513962, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 211000 + }, + { + "epoch": 0.8157056485905584, + "grad_norm": 0.09002207964658737, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 211010 + }, + { + "epoch": 0.8157443057939416, + "grad_norm": 0.10204461216926575, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 211020 + }, + { + "epoch": 0.8157829629973249, + "grad_norm": 0.10997717827558517, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 211030 + }, + { + "epoch": 0.8158216202007083, + "grad_norm": 0.09657556563615799, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 211040 + }, + { + "epoch": 0.8158602774040915, + "grad_norm": 0.1114683449268341, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 211050 + }, + { + "epoch": 0.8158989346074748, + "grad_norm": 0.08971460908651352, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 211060 + }, + { + "epoch": 0.815937591810858, + "grad_norm": 0.1059296503663063, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 211070 + }, + { + "epoch": 0.8159762490142413, + "grad_norm": 0.09650532901287079, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 211080 + }, + { + "epoch": 0.8160149062176246, + "grad_norm": 0.11085394024848938, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 211090 + }, + { + "epoch": 0.8160535634210079, + "grad_norm": 0.08948436379432678, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 211100 + }, + { + "epoch": 0.8160922206243911, + "grad_norm": 0.11666127294301987, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 211110 + }, + { + "epoch": 0.8161308778277744, + "grad_norm": 0.1133730337023735, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 211120 + }, + { + "epoch": 0.8161695350311577, + "grad_norm": 0.09404782205820084, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 211130 + }, + { + "epoch": 0.816208192234541, + "grad_norm": 0.09028097242116928, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 211140 + }, + { + "epoch": 0.8162468494379242, + "grad_norm": 0.09908846765756607, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 211150 + }, + { + "epoch": 0.8162855066413075, + "grad_norm": 0.09854086488485336, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 211160 + }, + { + "epoch": 0.8163241638446909, + "grad_norm": 0.09441866725683212, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 211170 + }, + { + "epoch": 0.8163628210480741, + "grad_norm": 0.11249484866857529, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 211180 + }, + { + "epoch": 0.8164014782514574, + "grad_norm": 0.09532604366540909, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 211190 + }, + { + "epoch": 0.8164401354548406, + "grad_norm": 0.08785288035869598, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 211200 + }, + { + "epoch": 0.816478792658224, + "grad_norm": 0.10407032072544098, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 211210 + }, + { + "epoch": 0.8165174498616072, + "grad_norm": 0.1006706953048706, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 211220 + }, + { + "epoch": 0.8165561070649905, + "grad_norm": 0.09381138533353806, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 211230 + }, + { + "epoch": 0.8165947642683737, + "grad_norm": 0.09494668990373611, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 211240 + }, + { + "epoch": 0.816633421471757, + "grad_norm": 0.10869157314300537, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 211250 + }, + { + "epoch": 0.8166720786751404, + "grad_norm": 0.09605544805526733, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 211260 + }, + { + "epoch": 0.8167107358785236, + "grad_norm": 0.1651240438222885, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 211270 + }, + { + "epoch": 0.8167493930819069, + "grad_norm": 0.11022128909826279, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 211280 + }, + { + "epoch": 0.8167880502852901, + "grad_norm": 0.11367551237344742, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 211290 + }, + { + "epoch": 0.8168267074886735, + "grad_norm": 0.09908533096313477, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 211300 + }, + { + "epoch": 0.8168653646920567, + "grad_norm": 0.15225879848003387, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 211310 + }, + { + "epoch": 0.81690402189544, + "grad_norm": 0.09795329719781876, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 211320 + }, + { + "epoch": 0.8169426790988232, + "grad_norm": 0.11668098717927933, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 211330 + }, + { + "epoch": 0.8169813363022066, + "grad_norm": 0.19748632609844208, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 211340 + }, + { + "epoch": 0.8170199935055898, + "grad_norm": 0.11172216385602951, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 211350 + }, + { + "epoch": 0.8170586507089731, + "grad_norm": 0.10102499276399612, + "learning_rate": 0.002, + "loss": 2.328, + "step": 211360 + }, + { + "epoch": 0.8170973079123564, + "grad_norm": 0.1347329169511795, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 211370 + }, + { + "epoch": 0.8171359651157397, + "grad_norm": 0.10373783856630325, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 211380 + }, + { + "epoch": 0.817174622319123, + "grad_norm": 0.10776378959417343, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 211390 + }, + { + "epoch": 0.8172132795225062, + "grad_norm": 0.09387657791376114, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 211400 + }, + { + "epoch": 0.8172519367258895, + "grad_norm": 0.11693062633275986, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 211410 + }, + { + "epoch": 0.8172905939292728, + "grad_norm": 0.10187207162380219, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 211420 + }, + { + "epoch": 0.8173292511326561, + "grad_norm": 0.10459417849779129, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 211430 + }, + { + "epoch": 0.8173679083360393, + "grad_norm": 0.11918050795793533, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 211440 + }, + { + "epoch": 0.8174065655394226, + "grad_norm": 0.11814329773187637, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 211450 + }, + { + "epoch": 0.8174452227428058, + "grad_norm": 0.08993051201105118, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 211460 + }, + { + "epoch": 0.8174838799461892, + "grad_norm": 0.10689457505941391, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 211470 + }, + { + "epoch": 0.8175225371495725, + "grad_norm": 0.10431385785341263, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 211480 + }, + { + "epoch": 0.8175611943529557, + "grad_norm": 0.09869911521673203, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 211490 + }, + { + "epoch": 0.817599851556339, + "grad_norm": 0.09722461551427841, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 211500 + }, + { + "epoch": 0.8176385087597223, + "grad_norm": 0.095127172768116, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 211510 + }, + { + "epoch": 0.8176771659631056, + "grad_norm": 0.10289538651704788, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 211520 + }, + { + "epoch": 0.8177158231664888, + "grad_norm": 0.09813522547483444, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 211530 + }, + { + "epoch": 0.8177544803698721, + "grad_norm": 0.12998157739639282, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 211540 + }, + { + "epoch": 0.8177931375732554, + "grad_norm": 0.14629608392715454, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 211550 + }, + { + "epoch": 0.8178317947766387, + "grad_norm": 0.09135743230581284, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 211560 + }, + { + "epoch": 0.817870451980022, + "grad_norm": 0.10678249597549438, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 211570 + }, + { + "epoch": 0.8179091091834052, + "grad_norm": 0.12359215319156647, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 211580 + }, + { + "epoch": 0.8179477663867886, + "grad_norm": 0.1174280121922493, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 211590 + }, + { + "epoch": 0.8179864235901718, + "grad_norm": 0.12401141971349716, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 211600 + }, + { + "epoch": 0.8180250807935551, + "grad_norm": 0.09652134776115417, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 211610 + }, + { + "epoch": 0.8180637379969383, + "grad_norm": 0.1306145042181015, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 211620 + }, + { + "epoch": 0.8181023952003216, + "grad_norm": 0.13372282683849335, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 211630 + }, + { + "epoch": 0.8181410524037049, + "grad_norm": 0.13966284692287445, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 211640 + }, + { + "epoch": 0.8181797096070882, + "grad_norm": 0.09505265206098557, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 211650 + }, + { + "epoch": 0.8182183668104714, + "grad_norm": 0.0904129296541214, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 211660 + }, + { + "epoch": 0.8182570240138547, + "grad_norm": 0.09401164203882217, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 211670 + }, + { + "epoch": 0.8182956812172381, + "grad_norm": 0.13849051296710968, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 211680 + }, + { + "epoch": 0.8183343384206213, + "grad_norm": 0.10335908085107803, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 211690 + }, + { + "epoch": 0.8183729956240046, + "grad_norm": 0.10235551744699478, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 211700 + }, + { + "epoch": 0.8184116528273878, + "grad_norm": 0.12333092093467712, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 211710 + }, + { + "epoch": 0.8184503100307712, + "grad_norm": 0.09448209404945374, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 211720 + }, + { + "epoch": 0.8184889672341544, + "grad_norm": 0.12399522960186005, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 211730 + }, + { + "epoch": 0.8185276244375377, + "grad_norm": 0.0968480035662651, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 211740 + }, + { + "epoch": 0.8185662816409209, + "grad_norm": 0.10621775686740875, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 211750 + }, + { + "epoch": 0.8186049388443043, + "grad_norm": 0.11033473163843155, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 211760 + }, + { + "epoch": 0.8186435960476875, + "grad_norm": 0.10795174539089203, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 211770 + }, + { + "epoch": 0.8186822532510708, + "grad_norm": 0.13240036368370056, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 211780 + }, + { + "epoch": 0.818720910454454, + "grad_norm": 0.09353047609329224, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 211790 + }, + { + "epoch": 0.8187595676578373, + "grad_norm": 0.09700329601764679, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 211800 + }, + { + "epoch": 0.8187982248612207, + "grad_norm": 0.4038792550563812, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 211810 + }, + { + "epoch": 0.8188368820646039, + "grad_norm": 0.11665640771389008, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 211820 + }, + { + "epoch": 0.8188755392679872, + "grad_norm": 0.08964387327432632, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 211830 + }, + { + "epoch": 0.8189141964713704, + "grad_norm": 0.18397657573223114, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 211840 + }, + { + "epoch": 0.8189528536747538, + "grad_norm": 0.13267698884010315, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 211850 + }, + { + "epoch": 0.818991510878137, + "grad_norm": 0.10803057998418808, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 211860 + }, + { + "epoch": 0.8190301680815203, + "grad_norm": 0.1194060891866684, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 211870 + }, + { + "epoch": 0.8190688252849035, + "grad_norm": 0.11083207279443741, + "learning_rate": 0.002, + "loss": 2.357, + "step": 211880 + }, + { + "epoch": 0.8191074824882869, + "grad_norm": 0.10171517729759216, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 211890 + }, + { + "epoch": 0.8191461396916702, + "grad_norm": 0.11016334593296051, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 211900 + }, + { + "epoch": 0.8191847968950534, + "grad_norm": 0.11723824590444565, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 211910 + }, + { + "epoch": 0.8192234540984367, + "grad_norm": 0.11024526506662369, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 211920 + }, + { + "epoch": 0.81926211130182, + "grad_norm": 0.10914993286132812, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 211930 + }, + { + "epoch": 0.8193007685052033, + "grad_norm": 0.1024850457906723, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 211940 + }, + { + "epoch": 0.8193394257085865, + "grad_norm": 0.09786119312047958, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 211950 + }, + { + "epoch": 0.8193780829119698, + "grad_norm": 0.08920815587043762, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 211960 + }, + { + "epoch": 0.8194167401153531, + "grad_norm": 0.11163760721683502, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 211970 + }, + { + "epoch": 0.8194553973187364, + "grad_norm": 0.10414455085992813, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 211980 + }, + { + "epoch": 0.8194940545221197, + "grad_norm": 0.10462162643671036, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 211990 + }, + { + "epoch": 0.8195327117255029, + "grad_norm": 0.11228974163532257, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 212000 + }, + { + "epoch": 0.8195713689288862, + "grad_norm": 0.10974586009979248, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 212010 + }, + { + "epoch": 0.8196100261322695, + "grad_norm": 0.10059110820293427, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 212020 + }, + { + "epoch": 0.8196486833356528, + "grad_norm": 0.3894408345222473, + "learning_rate": 0.002, + "loss": 2.338, + "step": 212030 + }, + { + "epoch": 0.819687340539036, + "grad_norm": 0.09322717040777206, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 212040 + }, + { + "epoch": 0.8197259977424193, + "grad_norm": 0.10623813420534134, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 212050 + }, + { + "epoch": 0.8197646549458026, + "grad_norm": 0.09529799222946167, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 212060 + }, + { + "epoch": 0.8198033121491859, + "grad_norm": 0.117469422519207, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 212070 + }, + { + "epoch": 0.8198419693525691, + "grad_norm": 0.09921499341726303, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 212080 + }, + { + "epoch": 0.8198806265559524, + "grad_norm": 0.09404002875089645, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 212090 + }, + { + "epoch": 0.8199192837593358, + "grad_norm": 0.11683033406734467, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 212100 + }, + { + "epoch": 0.819957940962719, + "grad_norm": 0.13784067332744598, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 212110 + }, + { + "epoch": 0.8199965981661023, + "grad_norm": 0.11979611963033676, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 212120 + }, + { + "epoch": 0.8200352553694855, + "grad_norm": 0.09688198566436768, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 212130 + }, + { + "epoch": 0.8200739125728689, + "grad_norm": 0.10285995155572891, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 212140 + }, + { + "epoch": 0.8201125697762521, + "grad_norm": 0.11738825589418411, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 212150 + }, + { + "epoch": 0.8201512269796354, + "grad_norm": 0.10605660825967789, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 212160 + }, + { + "epoch": 0.8201898841830186, + "grad_norm": 0.09839721024036407, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 212170 + }, + { + "epoch": 0.8202285413864019, + "grad_norm": 0.10906285047531128, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 212180 + }, + { + "epoch": 0.8202671985897853, + "grad_norm": 0.09535101801156998, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 212190 + }, + { + "epoch": 0.8203058557931685, + "grad_norm": 0.13287539780139923, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 212200 + }, + { + "epoch": 0.8203445129965518, + "grad_norm": 0.09444214403629303, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 212210 + }, + { + "epoch": 0.820383170199935, + "grad_norm": 0.09353778511285782, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 212220 + }, + { + "epoch": 0.8204218274033184, + "grad_norm": 0.1208491399884224, + "learning_rate": 0.002, + "loss": 2.336, + "step": 212230 + }, + { + "epoch": 0.8204604846067016, + "grad_norm": 0.09998615086078644, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 212240 + }, + { + "epoch": 0.8204991418100849, + "grad_norm": 0.11312830448150635, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 212250 + }, + { + "epoch": 0.8205377990134681, + "grad_norm": 0.09280294924974442, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 212260 + }, + { + "epoch": 0.8205764562168515, + "grad_norm": 0.1156095340847969, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 212270 + }, + { + "epoch": 0.8206151134202347, + "grad_norm": 0.1221746876835823, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 212280 + }, + { + "epoch": 0.820653770623618, + "grad_norm": 0.10551083832979202, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 212290 + }, + { + "epoch": 0.8206924278270012, + "grad_norm": 0.1070566326379776, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 212300 + }, + { + "epoch": 0.8207310850303846, + "grad_norm": 0.1362307369709015, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 212310 + }, + { + "epoch": 0.8207697422337679, + "grad_norm": 0.10047736763954163, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 212320 + }, + { + "epoch": 0.8208083994371511, + "grad_norm": 0.10491714626550674, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 212330 + }, + { + "epoch": 0.8208470566405344, + "grad_norm": 0.1041213795542717, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 212340 + }, + { + "epoch": 0.8208857138439177, + "grad_norm": 0.11715826392173767, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 212350 + }, + { + "epoch": 0.820924371047301, + "grad_norm": 0.09646627306938171, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 212360 + }, + { + "epoch": 0.8209630282506842, + "grad_norm": 0.1179962232708931, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 212370 + }, + { + "epoch": 0.8210016854540675, + "grad_norm": 0.1079191043972969, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 212380 + }, + { + "epoch": 0.8210403426574507, + "grad_norm": 0.09939951449632645, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 212390 + }, + { + "epoch": 0.8210789998608341, + "grad_norm": 0.11012696474790573, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 212400 + }, + { + "epoch": 0.8211176570642174, + "grad_norm": 0.10655669867992401, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 212410 + }, + { + "epoch": 0.8211563142676006, + "grad_norm": 0.11589882522821426, + "learning_rate": 0.002, + "loss": 2.336, + "step": 212420 + }, + { + "epoch": 0.8211949714709839, + "grad_norm": 0.10871432721614838, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 212430 + }, + { + "epoch": 0.8212336286743672, + "grad_norm": 0.12031927704811096, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 212440 + }, + { + "epoch": 0.8212722858777505, + "grad_norm": 0.10812465101480484, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 212450 + }, + { + "epoch": 0.8213109430811337, + "grad_norm": 0.10944726318120956, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 212460 + }, + { + "epoch": 0.821349600284517, + "grad_norm": 0.10045637935400009, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 212470 + }, + { + "epoch": 0.8213882574879003, + "grad_norm": 0.10013867914676666, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 212480 + }, + { + "epoch": 0.8214269146912836, + "grad_norm": 0.12191098183393478, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 212490 + }, + { + "epoch": 0.8214655718946668, + "grad_norm": 0.11149000376462936, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 212500 + }, + { + "epoch": 0.8215042290980501, + "grad_norm": 0.10808087885379791, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 212510 + }, + { + "epoch": 0.8215428863014335, + "grad_norm": 0.09506329149007797, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 212520 + }, + { + "epoch": 0.8215815435048167, + "grad_norm": 0.10009314119815826, + "learning_rate": 0.002, + "loss": 2.336, + "step": 212530 + }, + { + "epoch": 0.8216202007082, + "grad_norm": 0.1179816722869873, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 212540 + }, + { + "epoch": 0.8216588579115832, + "grad_norm": 0.10906680673360825, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 212550 + }, + { + "epoch": 0.8216975151149665, + "grad_norm": 0.10762029141187668, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 212560 + }, + { + "epoch": 0.8217361723183498, + "grad_norm": 0.10383537411689758, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 212570 + }, + { + "epoch": 0.8217748295217331, + "grad_norm": 0.09981483221054077, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 212580 + }, + { + "epoch": 0.8218134867251163, + "grad_norm": 0.1330537050962448, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 212590 + }, + { + "epoch": 0.8218521439284996, + "grad_norm": 0.09957239776849747, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 212600 + }, + { + "epoch": 0.821890801131883, + "grad_norm": 0.09773680567741394, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 212610 + }, + { + "epoch": 0.8219294583352662, + "grad_norm": 0.1022195816040039, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 212620 + }, + { + "epoch": 0.8219681155386495, + "grad_norm": 0.11773774772882462, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 212630 + }, + { + "epoch": 0.8220067727420327, + "grad_norm": 0.12010695040225983, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 212640 + }, + { + "epoch": 0.8220454299454161, + "grad_norm": 0.10011716187000275, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 212650 + }, + { + "epoch": 0.8220840871487993, + "grad_norm": 0.0979095920920372, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 212660 + }, + { + "epoch": 0.8221227443521826, + "grad_norm": 0.10783082991838455, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 212670 + }, + { + "epoch": 0.8221614015555658, + "grad_norm": 0.10398319363594055, + "learning_rate": 0.002, + "loss": 2.348, + "step": 212680 + }, + { + "epoch": 0.8222000587589492, + "grad_norm": 0.09824015945196152, + "learning_rate": 0.002, + "loss": 2.33, + "step": 212690 + }, + { + "epoch": 0.8222387159623324, + "grad_norm": 0.09613677859306335, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 212700 + }, + { + "epoch": 0.8222773731657157, + "grad_norm": 0.1030166894197464, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 212710 + }, + { + "epoch": 0.822316030369099, + "grad_norm": 0.09595217555761337, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 212720 + }, + { + "epoch": 0.8223546875724822, + "grad_norm": 0.09824813902378082, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 212730 + }, + { + "epoch": 0.8223933447758656, + "grad_norm": 0.09953426569700241, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 212740 + }, + { + "epoch": 0.8224320019792488, + "grad_norm": 0.10548051446676254, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 212750 + }, + { + "epoch": 0.8224706591826321, + "grad_norm": 0.11247394979000092, + "learning_rate": 0.002, + "loss": 2.328, + "step": 212760 + }, + { + "epoch": 0.8225093163860153, + "grad_norm": 0.11061998456716537, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 212770 + }, + { + "epoch": 0.8225479735893987, + "grad_norm": 0.11678779870271683, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 212780 + }, + { + "epoch": 0.8225866307927819, + "grad_norm": 0.10830101370811462, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 212790 + }, + { + "epoch": 0.8226252879961652, + "grad_norm": 0.08461033552885056, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 212800 + }, + { + "epoch": 0.8226639451995484, + "grad_norm": 0.10060620307922363, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 212810 + }, + { + "epoch": 0.8227026024029318, + "grad_norm": 0.11858765780925751, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 212820 + }, + { + "epoch": 0.8227412596063151, + "grad_norm": 0.11985529214143753, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 212830 + }, + { + "epoch": 0.8227799168096983, + "grad_norm": 0.11134400218725204, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 212840 + }, + { + "epoch": 0.8228185740130816, + "grad_norm": 0.11833249032497406, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 212850 + }, + { + "epoch": 0.8228572312164649, + "grad_norm": 0.11237727850675583, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 212860 + }, + { + "epoch": 0.8228958884198482, + "grad_norm": 0.12930941581726074, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 212870 + }, + { + "epoch": 0.8229345456232314, + "grad_norm": 0.11326020210981369, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 212880 + }, + { + "epoch": 0.8229732028266147, + "grad_norm": 0.10139208287000656, + "learning_rate": 0.002, + "loss": 2.343, + "step": 212890 + }, + { + "epoch": 0.823011860029998, + "grad_norm": 0.11200857162475586, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 212900 + }, + { + "epoch": 0.8230505172333813, + "grad_norm": 0.15013805031776428, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 212910 + }, + { + "epoch": 0.8230891744367645, + "grad_norm": 0.10213683545589447, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 212920 + }, + { + "epoch": 0.8231278316401478, + "grad_norm": 0.1358550786972046, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 212930 + }, + { + "epoch": 0.823166488843531, + "grad_norm": 0.09838774055242538, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 212940 + }, + { + "epoch": 0.8232051460469144, + "grad_norm": 0.10487181693315506, + "learning_rate": 0.002, + "loss": 2.347, + "step": 212950 + }, + { + "epoch": 0.8232438032502977, + "grad_norm": 0.10073231905698776, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 212960 + }, + { + "epoch": 0.8232824604536809, + "grad_norm": 0.11269784718751907, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 212970 + }, + { + "epoch": 0.8233211176570642, + "grad_norm": 0.10114607959985733, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 212980 + }, + { + "epoch": 0.8233597748604475, + "grad_norm": 0.10706079751253128, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 212990 + }, + { + "epoch": 0.8233984320638308, + "grad_norm": 0.10866525024175644, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 213000 + }, + { + "epoch": 0.823437089267214, + "grad_norm": 0.1156979352235794, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 213010 + }, + { + "epoch": 0.8234757464705973, + "grad_norm": 0.09846003353595734, + "learning_rate": 0.002, + "loss": 2.3635, + "step": 213020 + }, + { + "epoch": 0.8235144036739807, + "grad_norm": 0.13185329735279083, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 213030 + }, + { + "epoch": 0.8235530608773639, + "grad_norm": 0.10501515120267868, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 213040 + }, + { + "epoch": 0.8235917180807472, + "grad_norm": 0.10742828249931335, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 213050 + }, + { + "epoch": 0.8236303752841304, + "grad_norm": 0.10099185258150101, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 213060 + }, + { + "epoch": 0.8236690324875138, + "grad_norm": 0.1175713986158371, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 213070 + }, + { + "epoch": 0.823707689690897, + "grad_norm": 0.11294372379779816, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 213080 + }, + { + "epoch": 0.8237463468942803, + "grad_norm": 0.11598861962556839, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 213090 + }, + { + "epoch": 0.8237850040976635, + "grad_norm": 0.10174839943647385, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 213100 + }, + { + "epoch": 0.8238236613010468, + "grad_norm": 0.13224554061889648, + "learning_rate": 0.002, + "loss": 2.349, + "step": 213110 + }, + { + "epoch": 0.8238623185044301, + "grad_norm": 0.10385262966156006, + "learning_rate": 0.002, + "loss": 2.331, + "step": 213120 + }, + { + "epoch": 0.8239009757078134, + "grad_norm": 0.11136598885059357, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 213130 + }, + { + "epoch": 0.8239396329111967, + "grad_norm": 0.10111375153064728, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 213140 + }, + { + "epoch": 0.8239782901145799, + "grad_norm": 0.19907134771347046, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 213150 + }, + { + "epoch": 0.8240169473179633, + "grad_norm": 0.10010344535112381, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 213160 + }, + { + "epoch": 0.8240556045213465, + "grad_norm": 0.10503219068050385, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 213170 + }, + { + "epoch": 0.8240942617247298, + "grad_norm": 0.09305467456579208, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 213180 + }, + { + "epoch": 0.824132918928113, + "grad_norm": 0.11915779858827591, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 213190 + }, + { + "epoch": 0.8241715761314964, + "grad_norm": 0.10269779711961746, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 213200 + }, + { + "epoch": 0.8242102333348796, + "grad_norm": 0.09319688379764557, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 213210 + }, + { + "epoch": 0.8242488905382629, + "grad_norm": 0.11573675274848938, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 213220 + }, + { + "epoch": 0.8242875477416461, + "grad_norm": 0.11789800226688385, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 213230 + }, + { + "epoch": 0.8243262049450295, + "grad_norm": 0.10294772684574127, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 213240 + }, + { + "epoch": 0.8243648621484128, + "grad_norm": 0.11846683919429779, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 213250 + }, + { + "epoch": 0.824403519351796, + "grad_norm": 0.11015811562538147, + "learning_rate": 0.002, + "loss": 2.353, + "step": 213260 + }, + { + "epoch": 0.8244421765551793, + "grad_norm": 0.11703477054834366, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 213270 + }, + { + "epoch": 0.8244808337585626, + "grad_norm": 0.1198219358921051, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 213280 + }, + { + "epoch": 0.8245194909619459, + "grad_norm": 0.09992329031229019, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 213290 + }, + { + "epoch": 0.8245581481653291, + "grad_norm": 0.09687243402004242, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 213300 + }, + { + "epoch": 0.8245968053687124, + "grad_norm": 0.1290697306394577, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 213310 + }, + { + "epoch": 0.8246354625720956, + "grad_norm": 0.09441085904836655, + "learning_rate": 0.002, + "loss": 2.359, + "step": 213320 + }, + { + "epoch": 0.824674119775479, + "grad_norm": 0.09515941888093948, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 213330 + }, + { + "epoch": 0.8247127769788622, + "grad_norm": 0.1018657311797142, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 213340 + }, + { + "epoch": 0.8247514341822455, + "grad_norm": 0.10830926895141602, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 213350 + }, + { + "epoch": 0.8247900913856288, + "grad_norm": 0.09651590883731842, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 213360 + }, + { + "epoch": 0.8248287485890121, + "grad_norm": 0.10271652042865753, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 213370 + }, + { + "epoch": 0.8248674057923954, + "grad_norm": 0.09921977669000626, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 213380 + }, + { + "epoch": 0.8249060629957786, + "grad_norm": 0.11118409782648087, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 213390 + }, + { + "epoch": 0.8249447201991619, + "grad_norm": 0.10362720489501953, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 213400 + }, + { + "epoch": 0.8249833774025452, + "grad_norm": 0.09841493517160416, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 213410 + }, + { + "epoch": 0.8250220346059285, + "grad_norm": 0.11038927733898163, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 213420 + }, + { + "epoch": 0.8250606918093117, + "grad_norm": 0.09338337928056717, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 213430 + }, + { + "epoch": 0.825099349012695, + "grad_norm": 0.09041500091552734, + "learning_rate": 0.002, + "loss": 2.339, + "step": 213440 + }, + { + "epoch": 0.8251380062160784, + "grad_norm": 0.29466360807418823, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 213450 + }, + { + "epoch": 0.8251766634194616, + "grad_norm": 0.12012405693531036, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 213460 + }, + { + "epoch": 0.8252153206228449, + "grad_norm": 0.08641993254423141, + "learning_rate": 0.002, + "loss": 2.341, + "step": 213470 + }, + { + "epoch": 0.8252539778262281, + "grad_norm": 0.08934962749481201, + "learning_rate": 0.002, + "loss": 2.344, + "step": 213480 + }, + { + "epoch": 0.8252926350296114, + "grad_norm": 0.11363324522972107, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 213490 + }, + { + "epoch": 0.8253312922329947, + "grad_norm": 0.10849322378635406, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 213500 + }, + { + "epoch": 0.825369949436378, + "grad_norm": 0.10399137437343597, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 213510 + }, + { + "epoch": 0.8254086066397612, + "grad_norm": 0.10321518033742905, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 213520 + }, + { + "epoch": 0.8254472638431445, + "grad_norm": 0.11302473396062851, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 213530 + }, + { + "epoch": 0.8254859210465278, + "grad_norm": 0.12279103696346283, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 213540 + }, + { + "epoch": 0.8255245782499111, + "grad_norm": 0.10647827386856079, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 213550 + }, + { + "epoch": 0.8255632354532944, + "grad_norm": 0.1170077994465828, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 213560 + }, + { + "epoch": 0.8256018926566776, + "grad_norm": 0.09381834417581558, + "learning_rate": 0.002, + "loss": 2.335, + "step": 213570 + }, + { + "epoch": 0.825640549860061, + "grad_norm": 0.12134481966495514, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 213580 + }, + { + "epoch": 0.8256792070634442, + "grad_norm": 0.09720830619335175, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 213590 + }, + { + "epoch": 0.8257178642668275, + "grad_norm": 0.11691569536924362, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 213600 + }, + { + "epoch": 0.8257565214702107, + "grad_norm": 0.10867815464735031, + "learning_rate": 0.002, + "loss": 2.352, + "step": 213610 + }, + { + "epoch": 0.8257951786735941, + "grad_norm": 0.10696035623550415, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 213620 + }, + { + "epoch": 0.8258338358769773, + "grad_norm": 0.1083867996931076, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 213630 + }, + { + "epoch": 0.8258724930803606, + "grad_norm": 0.1000467911362648, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 213640 + }, + { + "epoch": 0.8259111502837438, + "grad_norm": 0.10876516997814178, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 213650 + }, + { + "epoch": 0.8259498074871271, + "grad_norm": 0.11239168047904968, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 213660 + }, + { + "epoch": 0.8259884646905105, + "grad_norm": 0.09718149155378342, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 213670 + }, + { + "epoch": 0.8260271218938937, + "grad_norm": 0.09330479800701141, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 213680 + }, + { + "epoch": 0.826065779097277, + "grad_norm": 0.14178209006786346, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 213690 + }, + { + "epoch": 0.8261044363006602, + "grad_norm": 0.097447469830513, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 213700 + }, + { + "epoch": 0.8261430935040436, + "grad_norm": 0.11665435135364532, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 213710 + }, + { + "epoch": 0.8261817507074268, + "grad_norm": 0.10119656473398209, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 213720 + }, + { + "epoch": 0.8262204079108101, + "grad_norm": 0.153537318110466, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 213730 + }, + { + "epoch": 0.8262590651141933, + "grad_norm": 0.09502313286066055, + "learning_rate": 0.002, + "loss": 2.329, + "step": 213740 + }, + { + "epoch": 0.8262977223175767, + "grad_norm": 0.11871633678674698, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 213750 + }, + { + "epoch": 0.82633637952096, + "grad_norm": 0.11032947897911072, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 213760 + }, + { + "epoch": 0.8263750367243432, + "grad_norm": 0.10220306366682053, + "learning_rate": 0.002, + "loss": 2.327, + "step": 213770 + }, + { + "epoch": 0.8264136939277265, + "grad_norm": 0.09602031111717224, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 213780 + }, + { + "epoch": 0.8264523511311098, + "grad_norm": 0.10085641592741013, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 213790 + }, + { + "epoch": 0.8264910083344931, + "grad_norm": 0.10796209424734116, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 213800 + }, + { + "epoch": 0.8265296655378763, + "grad_norm": 0.11179764568805695, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 213810 + }, + { + "epoch": 0.8265683227412596, + "grad_norm": 0.10923667997121811, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 213820 + }, + { + "epoch": 0.8266069799446429, + "grad_norm": 0.10238125920295715, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 213830 + }, + { + "epoch": 0.8266456371480262, + "grad_norm": 0.1121905967593193, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 213840 + }, + { + "epoch": 0.8266842943514094, + "grad_norm": 0.09311661869287491, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 213850 + }, + { + "epoch": 0.8267229515547927, + "grad_norm": 0.0946795791387558, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 213860 + }, + { + "epoch": 0.826761608758176, + "grad_norm": 0.10480405390262604, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 213870 + }, + { + "epoch": 0.8268002659615593, + "grad_norm": 0.11183927208185196, + "learning_rate": 0.002, + "loss": 2.34, + "step": 213880 + }, + { + "epoch": 0.8268389231649426, + "grad_norm": 0.10681787133216858, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 213890 + }, + { + "epoch": 0.8268775803683258, + "grad_norm": 0.09188628196716309, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 213900 + }, + { + "epoch": 0.8269162375717091, + "grad_norm": 0.11369525641202927, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 213910 + }, + { + "epoch": 0.8269548947750924, + "grad_norm": 0.11502217501401901, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 213920 + }, + { + "epoch": 0.8269935519784757, + "grad_norm": 0.12712916731834412, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 213930 + }, + { + "epoch": 0.8270322091818589, + "grad_norm": 0.11223436892032623, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 213940 + }, + { + "epoch": 0.8270708663852422, + "grad_norm": 0.0936799943447113, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 213950 + }, + { + "epoch": 0.8271095235886256, + "grad_norm": 0.10395878553390503, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 213960 + }, + { + "epoch": 0.8271481807920088, + "grad_norm": 0.10461578518152237, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 213970 + }, + { + "epoch": 0.827186837995392, + "grad_norm": 0.10377980023622513, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 213980 + }, + { + "epoch": 0.8272254951987753, + "grad_norm": 0.10748467594385147, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 213990 + }, + { + "epoch": 0.8272641524021587, + "grad_norm": 0.10781331360340118, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 214000 + }, + { + "epoch": 0.8273028096055419, + "grad_norm": 0.0984257161617279, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 214010 + }, + { + "epoch": 0.8273414668089252, + "grad_norm": 0.09564877301454544, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 214020 + }, + { + "epoch": 0.8273801240123084, + "grad_norm": 0.10228132456541061, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 214030 + }, + { + "epoch": 0.8274187812156917, + "grad_norm": 0.1266312301158905, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 214040 + }, + { + "epoch": 0.827457438419075, + "grad_norm": 0.12409737706184387, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 214050 + }, + { + "epoch": 0.8274960956224583, + "grad_norm": 0.09669151902198792, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 214060 + }, + { + "epoch": 0.8275347528258415, + "grad_norm": 0.11038026958703995, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 214070 + }, + { + "epoch": 0.8275734100292248, + "grad_norm": 0.10415840148925781, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 214080 + }, + { + "epoch": 0.8276120672326082, + "grad_norm": 0.09661370515823364, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 214090 + }, + { + "epoch": 0.8276507244359914, + "grad_norm": 0.1039520800113678, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 214100 + }, + { + "epoch": 0.8276893816393747, + "grad_norm": 0.13938656449317932, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 214110 + }, + { + "epoch": 0.8277280388427579, + "grad_norm": 0.09539224207401276, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 214120 + }, + { + "epoch": 0.8277666960461413, + "grad_norm": 0.09107252955436707, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 214130 + }, + { + "epoch": 0.8278053532495245, + "grad_norm": 0.1036781594157219, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 214140 + }, + { + "epoch": 0.8278440104529078, + "grad_norm": 0.11991392076015472, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 214150 + }, + { + "epoch": 0.827882667656291, + "grad_norm": 0.10466620326042175, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 214160 + }, + { + "epoch": 0.8279213248596744, + "grad_norm": 0.09906827658414841, + "learning_rate": 0.002, + "loss": 2.341, + "step": 214170 + }, + { + "epoch": 0.8279599820630577, + "grad_norm": 0.13791698217391968, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 214180 + }, + { + "epoch": 0.8279986392664409, + "grad_norm": 0.12032756954431534, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 214190 + }, + { + "epoch": 0.8280372964698242, + "grad_norm": 0.09512019902467728, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 214200 + }, + { + "epoch": 0.8280759536732074, + "grad_norm": 0.10837588459253311, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 214210 + }, + { + "epoch": 0.8281146108765908, + "grad_norm": 0.09497522562742233, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 214220 + }, + { + "epoch": 0.828153268079974, + "grad_norm": 0.08743920177221298, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 214230 + }, + { + "epoch": 0.8281919252833573, + "grad_norm": 0.09592652320861816, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 214240 + }, + { + "epoch": 0.8282305824867405, + "grad_norm": 0.09966947138309479, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 214250 + }, + { + "epoch": 0.8282692396901239, + "grad_norm": 0.10604499280452728, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 214260 + }, + { + "epoch": 0.8283078968935071, + "grad_norm": 0.09730489552021027, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 214270 + }, + { + "epoch": 0.8283465540968904, + "grad_norm": 0.10596129298210144, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 214280 + }, + { + "epoch": 0.8283852113002736, + "grad_norm": 0.11412282288074493, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 214290 + }, + { + "epoch": 0.828423868503657, + "grad_norm": 0.11002447456121445, + "learning_rate": 0.002, + "loss": 2.341, + "step": 214300 + }, + { + "epoch": 0.8284625257070403, + "grad_norm": 0.09060240536928177, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 214310 + }, + { + "epoch": 0.8285011829104235, + "grad_norm": 0.09924402087926865, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 214320 + }, + { + "epoch": 0.8285398401138068, + "grad_norm": 0.09792988747358322, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 214330 + }, + { + "epoch": 0.8285784973171901, + "grad_norm": 0.11095894873142242, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 214340 + }, + { + "epoch": 0.8286171545205734, + "grad_norm": 0.10736696422100067, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 214350 + }, + { + "epoch": 0.8286558117239566, + "grad_norm": 0.09914804995059967, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 214360 + }, + { + "epoch": 0.8286944689273399, + "grad_norm": 0.11982014030218124, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 214370 + }, + { + "epoch": 0.8287331261307233, + "grad_norm": 0.11260449886322021, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 214380 + }, + { + "epoch": 0.8287717833341065, + "grad_norm": 0.11211036145687103, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 214390 + }, + { + "epoch": 0.8288104405374898, + "grad_norm": 0.12835681438446045, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 214400 + }, + { + "epoch": 0.828849097740873, + "grad_norm": 0.11177512258291245, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 214410 + }, + { + "epoch": 0.8288877549442563, + "grad_norm": 0.10175733268260956, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 214420 + }, + { + "epoch": 0.8289264121476396, + "grad_norm": 0.12657196819782257, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 214430 + }, + { + "epoch": 0.8289650693510229, + "grad_norm": 0.1052093431353569, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 214440 + }, + { + "epoch": 0.8290037265544061, + "grad_norm": 0.11545445024967194, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 214450 + }, + { + "epoch": 0.8290423837577894, + "grad_norm": 0.10227680951356888, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 214460 + }, + { + "epoch": 0.8290810409611727, + "grad_norm": 0.09999417513608932, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 214470 + }, + { + "epoch": 0.829119698164556, + "grad_norm": 0.10263783484697342, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 214480 + }, + { + "epoch": 0.8291583553679392, + "grad_norm": 0.1079811379313469, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 214490 + }, + { + "epoch": 0.8291970125713225, + "grad_norm": 0.09219511598348618, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 214500 + }, + { + "epoch": 0.8292356697747059, + "grad_norm": 0.11662989109754562, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 214510 + }, + { + "epoch": 0.8292743269780891, + "grad_norm": 0.10641618072986603, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 214520 + }, + { + "epoch": 0.8293129841814724, + "grad_norm": 0.08924368768930435, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 214530 + }, + { + "epoch": 0.8293516413848556, + "grad_norm": 0.0992094874382019, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 214540 + }, + { + "epoch": 0.829390298588239, + "grad_norm": 0.1098957285284996, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 214550 + }, + { + "epoch": 0.8294289557916222, + "grad_norm": 0.11594545096158981, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 214560 + }, + { + "epoch": 0.8294676129950055, + "grad_norm": 0.11425887793302536, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 214570 + }, + { + "epoch": 0.8295062701983887, + "grad_norm": 0.08713730424642563, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 214580 + }, + { + "epoch": 0.829544927401772, + "grad_norm": 0.1614382117986679, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 214590 + }, + { + "epoch": 0.8295835846051554, + "grad_norm": 0.1060347706079483, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 214600 + }, + { + "epoch": 0.8296222418085386, + "grad_norm": 0.09657891094684601, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 214610 + }, + { + "epoch": 0.8296608990119219, + "grad_norm": 0.11805426329374313, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 214620 + }, + { + "epoch": 0.8296995562153051, + "grad_norm": 0.09428200125694275, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 214630 + }, + { + "epoch": 0.8297382134186885, + "grad_norm": 0.11019503325223923, + "learning_rate": 0.002, + "loss": 2.336, + "step": 214640 + }, + { + "epoch": 0.8297768706220717, + "grad_norm": 0.11221537739038467, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 214650 + }, + { + "epoch": 0.829815527825455, + "grad_norm": 0.1031530499458313, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 214660 + }, + { + "epoch": 0.8298541850288382, + "grad_norm": 0.11032679677009583, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 214670 + }, + { + "epoch": 0.8298928422322216, + "grad_norm": 0.10092826932668686, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 214680 + }, + { + "epoch": 0.8299314994356048, + "grad_norm": 0.1032138466835022, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 214690 + }, + { + "epoch": 0.8299701566389881, + "grad_norm": 0.11866185069084167, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 214700 + }, + { + "epoch": 0.8300088138423714, + "grad_norm": 0.12750159204006195, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 214710 + }, + { + "epoch": 0.8300474710457547, + "grad_norm": 0.10487841069698334, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 214720 + }, + { + "epoch": 0.830086128249138, + "grad_norm": 0.105401411652565, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 214730 + }, + { + "epoch": 0.8301247854525212, + "grad_norm": 0.10411553829908371, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 214740 + }, + { + "epoch": 0.8301634426559045, + "grad_norm": 0.10573433339595795, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 214750 + }, + { + "epoch": 0.8302020998592878, + "grad_norm": 0.09723833948373795, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 214760 + }, + { + "epoch": 0.8302407570626711, + "grad_norm": 0.09145607799291611, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 214770 + }, + { + "epoch": 0.8302794142660543, + "grad_norm": 0.09567610919475555, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 214780 + }, + { + "epoch": 0.8303180714694376, + "grad_norm": 0.10495653748512268, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 214790 + }, + { + "epoch": 0.8303567286728208, + "grad_norm": 0.1699315309524536, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 214800 + }, + { + "epoch": 0.8303953858762042, + "grad_norm": 0.11049603670835495, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 214810 + }, + { + "epoch": 0.8304340430795875, + "grad_norm": 0.10238435119390488, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 214820 + }, + { + "epoch": 0.8304727002829707, + "grad_norm": 0.10729759186506271, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 214830 + }, + { + "epoch": 0.830511357486354, + "grad_norm": 0.09942258894443512, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 214840 + }, + { + "epoch": 0.8305500146897373, + "grad_norm": 0.11134497076272964, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 214850 + }, + { + "epoch": 0.8305886718931206, + "grad_norm": 0.10739394277334213, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 214860 + }, + { + "epoch": 0.8306273290965038, + "grad_norm": 0.11967971920967102, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 214870 + }, + { + "epoch": 0.8306659862998871, + "grad_norm": 0.12112173438072205, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 214880 + }, + { + "epoch": 0.8307046435032704, + "grad_norm": 0.11589988321065903, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 214890 + }, + { + "epoch": 0.8307433007066537, + "grad_norm": 0.09112926572561264, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 214900 + }, + { + "epoch": 0.830781957910037, + "grad_norm": 0.11147700995206833, + "learning_rate": 0.002, + "loss": 2.3618, + "step": 214910 + }, + { + "epoch": 0.8308206151134202, + "grad_norm": 0.10066591203212738, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 214920 + }, + { + "epoch": 0.8308592723168036, + "grad_norm": 0.10681650042533875, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 214930 + }, + { + "epoch": 0.8308979295201868, + "grad_norm": 0.10110953450202942, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 214940 + }, + { + "epoch": 0.8309365867235701, + "grad_norm": 0.1249653548002243, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 214950 + }, + { + "epoch": 0.8309752439269533, + "grad_norm": 0.11149842292070389, + "learning_rate": 0.002, + "loss": 2.341, + "step": 214960 + }, + { + "epoch": 0.8310139011303366, + "grad_norm": 0.11847642064094543, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 214970 + }, + { + "epoch": 0.8310525583337199, + "grad_norm": 0.09807038307189941, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 214980 + }, + { + "epoch": 0.8310912155371032, + "grad_norm": 0.09750042855739594, + "learning_rate": 0.002, + "loss": 2.336, + "step": 214990 + }, + { + "epoch": 0.8311298727404864, + "grad_norm": 0.09801283478736877, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 215000 + }, + { + "epoch": 0.8311685299438697, + "grad_norm": 0.10606060922145844, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 215010 + }, + { + "epoch": 0.8312071871472531, + "grad_norm": 0.11397572606801987, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 215020 + }, + { + "epoch": 0.8312458443506363, + "grad_norm": 0.11736032366752625, + "learning_rate": 0.002, + "loss": 2.341, + "step": 215030 + }, + { + "epoch": 0.8312845015540196, + "grad_norm": 0.1030939519405365, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 215040 + }, + { + "epoch": 0.8313231587574028, + "grad_norm": 0.12110091745853424, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 215050 + }, + { + "epoch": 0.8313618159607862, + "grad_norm": 0.11975936591625214, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 215060 + }, + { + "epoch": 0.8314004731641694, + "grad_norm": 0.09283725917339325, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 215070 + }, + { + "epoch": 0.8314391303675527, + "grad_norm": 0.12285838276147842, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 215080 + }, + { + "epoch": 0.8314777875709359, + "grad_norm": 0.1171443834900856, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 215090 + }, + { + "epoch": 0.8315164447743193, + "grad_norm": 0.10224750638008118, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 215100 + }, + { + "epoch": 0.8315551019777025, + "grad_norm": 0.10163308680057526, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 215110 + }, + { + "epoch": 0.8315937591810858, + "grad_norm": 0.10186589509248734, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 215120 + }, + { + "epoch": 0.831632416384469, + "grad_norm": 0.10101377218961716, + "learning_rate": 0.002, + "loss": 2.337, + "step": 215130 + }, + { + "epoch": 0.8316710735878523, + "grad_norm": 0.11015063524246216, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 215140 + }, + { + "epoch": 0.8317097307912357, + "grad_norm": 0.09436852484941483, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 215150 + }, + { + "epoch": 0.8317483879946189, + "grad_norm": 0.10956919938325882, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 215160 + }, + { + "epoch": 0.8317870451980022, + "grad_norm": 0.10172809660434723, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 215170 + }, + { + "epoch": 0.8318257024013854, + "grad_norm": 0.1085454672574997, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 215180 + }, + { + "epoch": 0.8318643596047688, + "grad_norm": 0.12386401742696762, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 215190 + }, + { + "epoch": 0.831903016808152, + "grad_norm": 0.11149362474679947, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 215200 + }, + { + "epoch": 0.8319416740115353, + "grad_norm": 0.11105955392122269, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 215210 + }, + { + "epoch": 0.8319803312149185, + "grad_norm": 0.08862071484327316, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 215220 + }, + { + "epoch": 0.8320189884183019, + "grad_norm": 0.10108217597007751, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 215230 + }, + { + "epoch": 0.8320576456216852, + "grad_norm": 0.10174451768398285, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 215240 + }, + { + "epoch": 0.8320963028250684, + "grad_norm": 0.1056303083896637, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 215250 + }, + { + "epoch": 0.8321349600284517, + "grad_norm": 0.09623876959085464, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 215260 + }, + { + "epoch": 0.832173617231835, + "grad_norm": 0.10596370697021484, + "learning_rate": 0.002, + "loss": 2.327, + "step": 215270 + }, + { + "epoch": 0.8322122744352183, + "grad_norm": 0.11672472208738327, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 215280 + }, + { + "epoch": 0.8322509316386015, + "grad_norm": 0.09565582871437073, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 215290 + }, + { + "epoch": 0.8322895888419848, + "grad_norm": 0.10847000032663345, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 215300 + }, + { + "epoch": 0.8323282460453681, + "grad_norm": 0.10517299920320511, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 215310 + }, + { + "epoch": 0.8323669032487514, + "grad_norm": 0.09611780196428299, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 215320 + }, + { + "epoch": 0.8324055604521347, + "grad_norm": 0.10611071437597275, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 215330 + }, + { + "epoch": 0.8324442176555179, + "grad_norm": 0.10718956589698792, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 215340 + }, + { + "epoch": 0.8324828748589012, + "grad_norm": 0.11760836094617844, + "learning_rate": 0.002, + "loss": 2.323, + "step": 215350 + }, + { + "epoch": 0.8325215320622845, + "grad_norm": 0.10962734371423721, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 215360 + }, + { + "epoch": 0.8325601892656678, + "grad_norm": 0.10704983025789261, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 215370 + }, + { + "epoch": 0.832598846469051, + "grad_norm": 0.10682962089776993, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 215380 + }, + { + "epoch": 0.8326375036724343, + "grad_norm": 0.12136154621839523, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 215390 + }, + { + "epoch": 0.8326761608758176, + "grad_norm": 0.09624449908733368, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 215400 + }, + { + "epoch": 0.8327148180792009, + "grad_norm": 0.10110598057508469, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 215410 + }, + { + "epoch": 0.8327534752825841, + "grad_norm": 0.10344604402780533, + "learning_rate": 0.002, + "loss": 2.351, + "step": 215420 + }, + { + "epoch": 0.8327921324859674, + "grad_norm": 0.10519465059041977, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 215430 + }, + { + "epoch": 0.8328307896893508, + "grad_norm": 0.10835524648427963, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 215440 + }, + { + "epoch": 0.832869446892734, + "grad_norm": 0.09608236700296402, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 215450 + }, + { + "epoch": 0.8329081040961173, + "grad_norm": 0.09845644980669022, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 215460 + }, + { + "epoch": 0.8329467612995005, + "grad_norm": 0.11512713879346848, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 215470 + }, + { + "epoch": 0.8329854185028839, + "grad_norm": 0.10259012877941132, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 215480 + }, + { + "epoch": 0.8330240757062671, + "grad_norm": 0.08932927995920181, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 215490 + }, + { + "epoch": 0.8330627329096504, + "grad_norm": 0.11538581550121307, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 215500 + }, + { + "epoch": 0.8331013901130336, + "grad_norm": 0.13710728287696838, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 215510 + }, + { + "epoch": 0.8331400473164169, + "grad_norm": 0.0972672551870346, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 215520 + }, + { + "epoch": 0.8331787045198003, + "grad_norm": 0.09992696344852448, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 215530 + }, + { + "epoch": 0.8332173617231835, + "grad_norm": 0.1077425628900528, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 215540 + }, + { + "epoch": 0.8332560189265668, + "grad_norm": 0.10400703549385071, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 215550 + }, + { + "epoch": 0.83329467612995, + "grad_norm": 0.11263838410377502, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 215560 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.10221675783395767, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 215570 + }, + { + "epoch": 0.8333719905367166, + "grad_norm": 0.10675705224275589, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 215580 + }, + { + "epoch": 0.8334106477400999, + "grad_norm": 0.11262036114931107, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 215590 + }, + { + "epoch": 0.8334493049434831, + "grad_norm": 0.10636333376169205, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 215600 + }, + { + "epoch": 0.8334879621468665, + "grad_norm": 0.11252046376466751, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 215610 + }, + { + "epoch": 0.8335266193502497, + "grad_norm": 0.10936474055051804, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 215620 + }, + { + "epoch": 0.833565276553633, + "grad_norm": 0.09406422823667526, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 215630 + }, + { + "epoch": 0.8336039337570162, + "grad_norm": 0.10333401709794998, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 215640 + }, + { + "epoch": 0.8336425909603996, + "grad_norm": 0.10925310105085373, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 215650 + }, + { + "epoch": 0.8336812481637829, + "grad_norm": 0.16220533847808838, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 215660 + }, + { + "epoch": 0.8337199053671661, + "grad_norm": 0.1127329021692276, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 215670 + }, + { + "epoch": 0.8337585625705494, + "grad_norm": 0.10669612139463425, + "learning_rate": 0.002, + "loss": 2.329, + "step": 215680 + }, + { + "epoch": 0.8337972197739327, + "grad_norm": 0.10470252484083176, + "learning_rate": 0.002, + "loss": 2.3604, + "step": 215690 + }, + { + "epoch": 0.833835876977316, + "grad_norm": 0.10056356340646744, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 215700 + }, + { + "epoch": 0.8338745341806992, + "grad_norm": 0.09819260984659195, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 215710 + }, + { + "epoch": 0.8339131913840825, + "grad_norm": 0.09792476892471313, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 215720 + }, + { + "epoch": 0.8339518485874657, + "grad_norm": 0.09533162415027618, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 215730 + }, + { + "epoch": 0.8339905057908491, + "grad_norm": 0.11937808245420456, + "learning_rate": 0.002, + "loss": 2.3674, + "step": 215740 + }, + { + "epoch": 0.8340291629942324, + "grad_norm": 0.10256972163915634, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 215750 + }, + { + "epoch": 0.8340678201976156, + "grad_norm": 0.10601312667131424, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 215760 + }, + { + "epoch": 0.8341064774009989, + "grad_norm": 0.10224002599716187, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 215770 + }, + { + "epoch": 0.8341451346043822, + "grad_norm": 0.09572790563106537, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 215780 + }, + { + "epoch": 0.8341837918077655, + "grad_norm": 0.1392766535282135, + "learning_rate": 0.002, + "loss": 2.332, + "step": 215790 + }, + { + "epoch": 0.8342224490111487, + "grad_norm": 0.10670210421085358, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 215800 + }, + { + "epoch": 0.834261106214532, + "grad_norm": 0.10126934945583344, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 215810 + }, + { + "epoch": 0.8342997634179153, + "grad_norm": 0.10573316365480423, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 215820 + }, + { + "epoch": 0.8343384206212986, + "grad_norm": 0.09832464158535004, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 215830 + }, + { + "epoch": 0.8343770778246818, + "grad_norm": 0.12400636076927185, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 215840 + }, + { + "epoch": 0.8344157350280651, + "grad_norm": 0.10741636902093887, + "learning_rate": 0.002, + "loss": 2.343, + "step": 215850 + }, + { + "epoch": 0.8344543922314485, + "grad_norm": 0.12283851206302643, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 215860 + }, + { + "epoch": 0.8344930494348317, + "grad_norm": 0.1226603090763092, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 215870 + }, + { + "epoch": 0.834531706638215, + "grad_norm": 0.09905299544334412, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 215880 + }, + { + "epoch": 0.8345703638415982, + "grad_norm": 0.10570216923952103, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 215890 + }, + { + "epoch": 0.8346090210449815, + "grad_norm": 0.1257603019475937, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 215900 + }, + { + "epoch": 0.8346476782483648, + "grad_norm": 0.11307378858327866, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 215910 + }, + { + "epoch": 0.8346863354517481, + "grad_norm": 0.13677355647087097, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 215920 + }, + { + "epoch": 0.8347249926551313, + "grad_norm": 0.10929053276777267, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 215930 + }, + { + "epoch": 0.8347636498585146, + "grad_norm": 0.08977066725492477, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 215940 + }, + { + "epoch": 0.834802307061898, + "grad_norm": 0.10597755014896393, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 215950 + }, + { + "epoch": 0.8348409642652812, + "grad_norm": 0.10703890025615692, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 215960 + }, + { + "epoch": 0.8348796214686645, + "grad_norm": 0.09597240388393402, + "learning_rate": 0.002, + "loss": 2.336, + "step": 215970 + }, + { + "epoch": 0.8349182786720477, + "grad_norm": 0.08831729739904404, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 215980 + }, + { + "epoch": 0.8349569358754311, + "grad_norm": 0.09937921911478043, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 215990 + }, + { + "epoch": 0.8349955930788143, + "grad_norm": 0.10918296128511429, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 216000 + }, + { + "epoch": 0.8350342502821976, + "grad_norm": 0.14627370238304138, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 216010 + }, + { + "epoch": 0.8350729074855808, + "grad_norm": 0.11544416844844818, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 216020 + }, + { + "epoch": 0.8351115646889642, + "grad_norm": 0.10169287770986557, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 216030 + }, + { + "epoch": 0.8351502218923474, + "grad_norm": 0.1060442104935646, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 216040 + }, + { + "epoch": 0.8351888790957307, + "grad_norm": 0.11080343276262283, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 216050 + }, + { + "epoch": 0.835227536299114, + "grad_norm": 0.11087153106927872, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 216060 + }, + { + "epoch": 0.8352661935024972, + "grad_norm": 0.10543015599250793, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 216070 + }, + { + "epoch": 0.8353048507058806, + "grad_norm": 0.09673027694225311, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 216080 + }, + { + "epoch": 0.8353435079092638, + "grad_norm": 0.10776925086975098, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 216090 + }, + { + "epoch": 0.8353821651126471, + "grad_norm": 0.11809718608856201, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 216100 + }, + { + "epoch": 0.8354208223160303, + "grad_norm": 0.11080779135227203, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 216110 + }, + { + "epoch": 0.8354594795194137, + "grad_norm": 0.13108794391155243, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 216120 + }, + { + "epoch": 0.8354981367227969, + "grad_norm": 0.10459788143634796, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 216130 + }, + { + "epoch": 0.8355367939261802, + "grad_norm": 0.09469229727983475, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 216140 + }, + { + "epoch": 0.8355754511295634, + "grad_norm": 0.12095770239830017, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 216150 + }, + { + "epoch": 0.8356141083329468, + "grad_norm": 0.10411792248487473, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 216160 + }, + { + "epoch": 0.8356527655363301, + "grad_norm": 0.10881275683641434, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 216170 + }, + { + "epoch": 0.8356914227397133, + "grad_norm": 0.11071446537971497, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 216180 + }, + { + "epoch": 0.8357300799430966, + "grad_norm": 0.1212799996137619, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 216190 + }, + { + "epoch": 0.8357687371464799, + "grad_norm": 0.09932661801576614, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 216200 + }, + { + "epoch": 0.8358073943498632, + "grad_norm": 0.10696447640657425, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 216210 + }, + { + "epoch": 0.8358460515532464, + "grad_norm": 0.09831404685974121, + "learning_rate": 0.002, + "loss": 2.325, + "step": 216220 + }, + { + "epoch": 0.8358847087566297, + "grad_norm": 0.10389647632837296, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 216230 + }, + { + "epoch": 0.835923365960013, + "grad_norm": 0.11285470426082611, + "learning_rate": 0.002, + "loss": 2.328, + "step": 216240 + }, + { + "epoch": 0.8359620231633963, + "grad_norm": 0.11450444906949997, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 216250 + }, + { + "epoch": 0.8360006803667795, + "grad_norm": 0.10833612084388733, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 216260 + }, + { + "epoch": 0.8360393375701628, + "grad_norm": 0.08962885290384293, + "learning_rate": 0.002, + "loss": 2.347, + "step": 216270 + }, + { + "epoch": 0.836077994773546, + "grad_norm": 0.09980128705501556, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 216280 + }, + { + "epoch": 0.8361166519769294, + "grad_norm": 0.09436109662055969, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 216290 + }, + { + "epoch": 0.8361553091803127, + "grad_norm": 0.08935563266277313, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 216300 + }, + { + "epoch": 0.8361939663836959, + "grad_norm": 0.1144620031118393, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 216310 + }, + { + "epoch": 0.8362326235870792, + "grad_norm": 0.0936708003282547, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 216320 + }, + { + "epoch": 0.8362712807904625, + "grad_norm": 0.12868732213974, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 216330 + }, + { + "epoch": 0.8363099379938458, + "grad_norm": 0.09277084469795227, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 216340 + }, + { + "epoch": 0.836348595197229, + "grad_norm": 0.10452208667993546, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 216350 + }, + { + "epoch": 0.8363872524006123, + "grad_norm": 0.10119964927434921, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 216360 + }, + { + "epoch": 0.8364259096039957, + "grad_norm": 0.11663848906755447, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 216370 + }, + { + "epoch": 0.8364645668073789, + "grad_norm": 0.1097102165222168, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 216380 + }, + { + "epoch": 0.8365032240107622, + "grad_norm": 0.11153378337621689, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 216390 + }, + { + "epoch": 0.8365418812141454, + "grad_norm": 0.09548214823007584, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 216400 + }, + { + "epoch": 0.8365805384175288, + "grad_norm": 0.0946519672870636, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 216410 + }, + { + "epoch": 0.836619195620912, + "grad_norm": 0.10115214437246323, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 216420 + }, + { + "epoch": 0.8366578528242953, + "grad_norm": 0.09835871309041977, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 216430 + }, + { + "epoch": 0.8366965100276785, + "grad_norm": 0.10255956649780273, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 216440 + }, + { + "epoch": 0.8367351672310618, + "grad_norm": 0.10846435278654099, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 216450 + }, + { + "epoch": 0.8367738244344451, + "grad_norm": 0.105130136013031, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 216460 + }, + { + "epoch": 0.8368124816378284, + "grad_norm": 0.09569898247718811, + "learning_rate": 0.002, + "loss": 2.348, + "step": 216470 + }, + { + "epoch": 0.8368511388412117, + "grad_norm": 0.14089907705783844, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 216480 + }, + { + "epoch": 0.8368897960445949, + "grad_norm": 0.11543848365545273, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 216490 + }, + { + "epoch": 0.8369284532479783, + "grad_norm": 0.09327449649572372, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 216500 + }, + { + "epoch": 0.8369671104513615, + "grad_norm": 0.12228869646787643, + "learning_rate": 0.002, + "loss": 2.33, + "step": 216510 + }, + { + "epoch": 0.8370057676547448, + "grad_norm": 0.09986231476068497, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 216520 + }, + { + "epoch": 0.837044424858128, + "grad_norm": 0.10587063431739807, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 216530 + }, + { + "epoch": 0.8370830820615114, + "grad_norm": 0.11145617067813873, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 216540 + }, + { + "epoch": 0.8371217392648946, + "grad_norm": 0.1187463253736496, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 216550 + }, + { + "epoch": 0.8371603964682779, + "grad_norm": 0.1010122075676918, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 216560 + }, + { + "epoch": 0.8371990536716611, + "grad_norm": 0.09798204153776169, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 216570 + }, + { + "epoch": 0.8372377108750445, + "grad_norm": 0.10220952332019806, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 216580 + }, + { + "epoch": 0.8372763680784278, + "grad_norm": 0.09570495784282684, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 216590 + }, + { + "epoch": 0.837315025281811, + "grad_norm": 0.12634548544883728, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 216600 + }, + { + "epoch": 0.8373536824851943, + "grad_norm": 0.10543528199195862, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 216610 + }, + { + "epoch": 0.8373923396885776, + "grad_norm": 0.10621460527181625, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 216620 + }, + { + "epoch": 0.8374309968919609, + "grad_norm": 0.1278172731399536, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 216630 + }, + { + "epoch": 0.8374696540953441, + "grad_norm": 0.10291978716850281, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 216640 + }, + { + "epoch": 0.8375083112987274, + "grad_norm": 0.1192663311958313, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 216650 + }, + { + "epoch": 0.8375469685021106, + "grad_norm": 0.10618377476930618, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 216660 + }, + { + "epoch": 0.837585625705494, + "grad_norm": 0.13136693835258484, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 216670 + }, + { + "epoch": 0.8376242829088772, + "grad_norm": 0.09881991893053055, + "learning_rate": 0.002, + "loss": 2.332, + "step": 216680 + }, + { + "epoch": 0.8376629401122605, + "grad_norm": 0.11821454763412476, + "learning_rate": 0.002, + "loss": 2.344, + "step": 216690 + }, + { + "epoch": 0.8377015973156438, + "grad_norm": 0.10533761233091354, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 216700 + }, + { + "epoch": 0.8377402545190271, + "grad_norm": 0.09469486027956009, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 216710 + }, + { + "epoch": 0.8377789117224104, + "grad_norm": 0.10206017643213272, + "learning_rate": 0.002, + "loss": 2.345, + "step": 216720 + }, + { + "epoch": 0.8378175689257936, + "grad_norm": 0.11334537714719772, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 216730 + }, + { + "epoch": 0.8378562261291769, + "grad_norm": 0.11263494193553925, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 216740 + }, + { + "epoch": 0.8378948833325602, + "grad_norm": 0.11252845078706741, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 216750 + }, + { + "epoch": 0.8379335405359435, + "grad_norm": 0.11377183347940445, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 216760 + }, + { + "epoch": 0.8379721977393267, + "grad_norm": 0.12358468770980835, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 216770 + }, + { + "epoch": 0.83801085494271, + "grad_norm": 0.12280930578708649, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 216780 + }, + { + "epoch": 0.8380495121460934, + "grad_norm": 0.08837874978780746, + "learning_rate": 0.002, + "loss": 2.329, + "step": 216790 + }, + { + "epoch": 0.8380881693494766, + "grad_norm": 0.09349276125431061, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 216800 + }, + { + "epoch": 0.8381268265528599, + "grad_norm": 0.09149591624736786, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 216810 + }, + { + "epoch": 0.8381654837562431, + "grad_norm": 0.1146298423409462, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 216820 + }, + { + "epoch": 0.8382041409596264, + "grad_norm": 0.08947496861219406, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 216830 + }, + { + "epoch": 0.8382427981630097, + "grad_norm": 0.13498574495315552, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 216840 + }, + { + "epoch": 0.838281455366393, + "grad_norm": 0.09723348915576935, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 216850 + }, + { + "epoch": 0.8383201125697762, + "grad_norm": 0.0952632948756218, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 216860 + }, + { + "epoch": 0.8383587697731595, + "grad_norm": 0.10652051120996475, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 216870 + }, + { + "epoch": 0.8383974269765428, + "grad_norm": 0.11049710214138031, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 216880 + }, + { + "epoch": 0.8384360841799261, + "grad_norm": 0.12850986421108246, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 216890 + }, + { + "epoch": 0.8384747413833094, + "grad_norm": 0.0921587198972702, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 216900 + }, + { + "epoch": 0.8385133985866926, + "grad_norm": 0.1148732528090477, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 216910 + }, + { + "epoch": 0.838552055790076, + "grad_norm": 0.17332953214645386, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 216920 + }, + { + "epoch": 0.8385907129934592, + "grad_norm": 0.10540525615215302, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 216930 + }, + { + "epoch": 0.8386293701968425, + "grad_norm": 0.0966196283698082, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 216940 + }, + { + "epoch": 0.8386680274002257, + "grad_norm": 0.1091567873954773, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 216950 + }, + { + "epoch": 0.8387066846036091, + "grad_norm": 0.09678920358419418, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 216960 + }, + { + "epoch": 0.8387453418069923, + "grad_norm": 0.09217251092195511, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 216970 + }, + { + "epoch": 0.8387839990103756, + "grad_norm": 0.09982069581747055, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 216980 + }, + { + "epoch": 0.8388226562137588, + "grad_norm": 0.09687652438879013, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 216990 + }, + { + "epoch": 0.8388613134171421, + "grad_norm": 0.13824646174907684, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 217000 + }, + { + "epoch": 0.8388999706205255, + "grad_norm": 0.09609826654195786, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 217010 + }, + { + "epoch": 0.8389386278239087, + "grad_norm": 0.09718557447195053, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 217020 + }, + { + "epoch": 0.838977285027292, + "grad_norm": 0.10497701913118362, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 217030 + }, + { + "epoch": 0.8390159422306752, + "grad_norm": 0.10044204443693161, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 217040 + }, + { + "epoch": 0.8390545994340586, + "grad_norm": 0.10007092356681824, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 217050 + }, + { + "epoch": 0.8390932566374418, + "grad_norm": 0.11548102647066116, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 217060 + }, + { + "epoch": 0.8391319138408251, + "grad_norm": 0.12100327759981155, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 217070 + }, + { + "epoch": 0.8391705710442083, + "grad_norm": 0.10042814165353775, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 217080 + }, + { + "epoch": 0.8392092282475917, + "grad_norm": 0.09519599378108978, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 217090 + }, + { + "epoch": 0.839247885450975, + "grad_norm": 0.13729625940322876, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 217100 + }, + { + "epoch": 0.8392865426543582, + "grad_norm": 0.1120065376162529, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 217110 + }, + { + "epoch": 0.8393251998577415, + "grad_norm": 0.12437760829925537, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 217120 + }, + { + "epoch": 0.8393638570611248, + "grad_norm": 0.10527113825082779, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 217130 + }, + { + "epoch": 0.8394025142645081, + "grad_norm": 0.1128300353884697, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 217140 + }, + { + "epoch": 0.8394411714678913, + "grad_norm": 0.0989866778254509, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 217150 + }, + { + "epoch": 0.8394798286712746, + "grad_norm": 0.10943536460399628, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 217160 + }, + { + "epoch": 0.8395184858746579, + "grad_norm": 0.10708934813737869, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 217170 + }, + { + "epoch": 0.8395571430780412, + "grad_norm": 0.11243987083435059, + "learning_rate": 0.002, + "loss": 2.347, + "step": 217180 + }, + { + "epoch": 0.8395958002814244, + "grad_norm": 0.11842658370733261, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 217190 + }, + { + "epoch": 0.8396344574848077, + "grad_norm": 0.10376089066267014, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 217200 + }, + { + "epoch": 0.839673114688191, + "grad_norm": 0.11659283190965652, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 217210 + }, + { + "epoch": 0.8397117718915743, + "grad_norm": 0.11655773967504501, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 217220 + }, + { + "epoch": 0.8397504290949576, + "grad_norm": 0.09950608760118484, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 217230 + }, + { + "epoch": 0.8397890862983408, + "grad_norm": 0.10458651185035706, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 217240 + }, + { + "epoch": 0.8398277435017241, + "grad_norm": 0.10697665065526962, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 217250 + }, + { + "epoch": 0.8398664007051074, + "grad_norm": 0.12223963439464569, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 217260 + }, + { + "epoch": 0.8399050579084907, + "grad_norm": 0.095971018075943, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 217270 + }, + { + "epoch": 0.8399437151118739, + "grad_norm": 0.09986023604869843, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 217280 + }, + { + "epoch": 0.8399823723152572, + "grad_norm": 0.10367639362812042, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 217290 + }, + { + "epoch": 0.8400210295186405, + "grad_norm": 0.10191434621810913, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 217300 + }, + { + "epoch": 0.8400596867220238, + "grad_norm": 0.10491587221622467, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 217310 + }, + { + "epoch": 0.840098343925407, + "grad_norm": 0.11406951397657394, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 217320 + }, + { + "epoch": 0.8401370011287903, + "grad_norm": 0.10647682100534439, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 217330 + }, + { + "epoch": 0.8401756583321737, + "grad_norm": 0.10654482245445251, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 217340 + }, + { + "epoch": 0.8402143155355569, + "grad_norm": 0.11024165153503418, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 217350 + }, + { + "epoch": 0.8402529727389402, + "grad_norm": 0.09606366604566574, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 217360 + }, + { + "epoch": 0.8402916299423234, + "grad_norm": 0.1414986103773117, + "learning_rate": 0.002, + "loss": 2.341, + "step": 217370 + }, + { + "epoch": 0.8403302871457067, + "grad_norm": 0.09301739931106567, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 217380 + }, + { + "epoch": 0.84036894434909, + "grad_norm": 0.10734853148460388, + "learning_rate": 0.002, + "loss": 2.333, + "step": 217390 + }, + { + "epoch": 0.8404076015524733, + "grad_norm": 0.09502530097961426, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 217400 + }, + { + "epoch": 0.8404462587558565, + "grad_norm": 0.09801766276359558, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 217410 + }, + { + "epoch": 0.8404849159592398, + "grad_norm": 0.10311899334192276, + "learning_rate": 0.002, + "loss": 2.327, + "step": 217420 + }, + { + "epoch": 0.8405235731626232, + "grad_norm": 0.10635611414909363, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 217430 + }, + { + "epoch": 0.8405622303660064, + "grad_norm": 0.10635276883840561, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 217440 + }, + { + "epoch": 0.8406008875693897, + "grad_norm": 0.11130545288324356, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 217450 + }, + { + "epoch": 0.8406395447727729, + "grad_norm": 0.10718841850757599, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 217460 + }, + { + "epoch": 0.8406782019761563, + "grad_norm": 0.10781049728393555, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 217470 + }, + { + "epoch": 0.8407168591795395, + "grad_norm": 0.10921121388673782, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 217480 + }, + { + "epoch": 0.8407555163829228, + "grad_norm": 0.09522861242294312, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 217490 + }, + { + "epoch": 0.840794173586306, + "grad_norm": 0.10912470519542694, + "learning_rate": 0.002, + "loss": 2.348, + "step": 217500 + }, + { + "epoch": 0.8408328307896894, + "grad_norm": 0.12018518149852753, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 217510 + }, + { + "epoch": 0.8408714879930727, + "grad_norm": 0.12558038532733917, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 217520 + }, + { + "epoch": 0.8409101451964559, + "grad_norm": 0.09982512146234512, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 217530 + }, + { + "epoch": 0.8409488023998392, + "grad_norm": 0.09959869831800461, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 217540 + }, + { + "epoch": 0.8409874596032224, + "grad_norm": 0.10776180773973465, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 217550 + }, + { + "epoch": 0.8410261168066058, + "grad_norm": 0.11372330039739609, + "learning_rate": 0.002, + "loss": 2.348, + "step": 217560 + }, + { + "epoch": 0.841064774009989, + "grad_norm": 0.09761717915534973, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 217570 + }, + { + "epoch": 0.8411034312133723, + "grad_norm": 0.12191992998123169, + "learning_rate": 0.002, + "loss": 2.332, + "step": 217580 + }, + { + "epoch": 0.8411420884167555, + "grad_norm": 0.09978660941123962, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 217590 + }, + { + "epoch": 0.8411807456201389, + "grad_norm": 0.1122688353061676, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 217600 + }, + { + "epoch": 0.8412194028235221, + "grad_norm": 0.0917871966958046, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 217610 + }, + { + "epoch": 0.8412580600269054, + "grad_norm": 0.11007138341665268, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 217620 + }, + { + "epoch": 0.8412967172302886, + "grad_norm": 0.11412589251995087, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 217630 + }, + { + "epoch": 0.841335374433672, + "grad_norm": 0.09647293388843536, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 217640 + }, + { + "epoch": 0.8413740316370553, + "grad_norm": 0.10458957403898239, + "learning_rate": 0.002, + "loss": 2.334, + "step": 217650 + }, + { + "epoch": 0.8414126888404385, + "grad_norm": 0.09868065267801285, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 217660 + }, + { + "epoch": 0.8414513460438218, + "grad_norm": 0.1279451549053192, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 217670 + }, + { + "epoch": 0.8414900032472051, + "grad_norm": 0.10215198248624802, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 217680 + }, + { + "epoch": 0.8415286604505884, + "grad_norm": 0.10315291583538055, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 217690 + }, + { + "epoch": 0.8415673176539716, + "grad_norm": 0.11462391912937164, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 217700 + }, + { + "epoch": 0.8416059748573549, + "grad_norm": 0.09662192314863205, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 217710 + }, + { + "epoch": 0.8416446320607383, + "grad_norm": 0.11247068643569946, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 217720 + }, + { + "epoch": 0.8416832892641215, + "grad_norm": 0.10318507999181747, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 217730 + }, + { + "epoch": 0.8417219464675048, + "grad_norm": 0.1108061671257019, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 217740 + }, + { + "epoch": 0.841760603670888, + "grad_norm": 0.11808360368013382, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 217750 + }, + { + "epoch": 0.8417992608742713, + "grad_norm": 0.10940134525299072, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 217760 + }, + { + "epoch": 0.8418379180776546, + "grad_norm": 0.10000176727771759, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 217770 + }, + { + "epoch": 0.8418765752810379, + "grad_norm": 0.11437246203422546, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 217780 + }, + { + "epoch": 0.8419152324844211, + "grad_norm": 0.10316790640354156, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 217790 + }, + { + "epoch": 0.8419538896878044, + "grad_norm": 0.122736856341362, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 217800 + }, + { + "epoch": 0.8419925468911877, + "grad_norm": 0.09634695202112198, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 217810 + }, + { + "epoch": 0.842031204094571, + "grad_norm": 0.11506647616624832, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 217820 + }, + { + "epoch": 0.8420698612979542, + "grad_norm": 0.11156859248876572, + "learning_rate": 0.002, + "loss": 2.333, + "step": 217830 + }, + { + "epoch": 0.8421085185013375, + "grad_norm": 0.11431318521499634, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 217840 + }, + { + "epoch": 0.8421471757047209, + "grad_norm": 0.10677983611822128, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 217850 + }, + { + "epoch": 0.8421858329081041, + "grad_norm": 0.1121184453368187, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 217860 + }, + { + "epoch": 0.8422244901114874, + "grad_norm": 0.11475158482789993, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 217870 + }, + { + "epoch": 0.8422631473148706, + "grad_norm": 0.10288957506418228, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 217880 + }, + { + "epoch": 0.842301804518254, + "grad_norm": 0.12002434581518173, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 217890 + }, + { + "epoch": 0.8423404617216372, + "grad_norm": 0.09990371018648148, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 217900 + }, + { + "epoch": 0.8423791189250205, + "grad_norm": 0.09702201932668686, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 217910 + }, + { + "epoch": 0.8424177761284037, + "grad_norm": 0.10541484504938126, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 217920 + }, + { + "epoch": 0.842456433331787, + "grad_norm": 0.24822331964969635, + "learning_rate": 0.002, + "loss": 2.345, + "step": 217930 + }, + { + "epoch": 0.8424950905351704, + "grad_norm": 0.09540753811597824, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 217940 + }, + { + "epoch": 0.8425337477385536, + "grad_norm": 0.12220920622348785, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 217950 + }, + { + "epoch": 0.8425724049419369, + "grad_norm": 0.1137244924902916, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 217960 + }, + { + "epoch": 0.8426110621453201, + "grad_norm": 0.1189429759979248, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 217970 + }, + { + "epoch": 0.8426497193487035, + "grad_norm": 0.09206331521272659, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 217980 + }, + { + "epoch": 0.8426883765520867, + "grad_norm": 0.08543618023395538, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 217990 + }, + { + "epoch": 0.84272703375547, + "grad_norm": 0.13945125043392181, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 218000 + }, + { + "epoch": 0.8427656909588532, + "grad_norm": 0.12041846662759781, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 218010 + }, + { + "epoch": 0.8428043481622366, + "grad_norm": 0.10267645865678787, + "learning_rate": 0.002, + "loss": 2.347, + "step": 218020 + }, + { + "epoch": 0.8428430053656198, + "grad_norm": 0.12095742672681808, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 218030 + }, + { + "epoch": 0.8428816625690031, + "grad_norm": 0.09274441003799438, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 218040 + }, + { + "epoch": 0.8429203197723864, + "grad_norm": 0.11805014312267303, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 218050 + }, + { + "epoch": 0.8429589769757697, + "grad_norm": 0.09663267433643341, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 218060 + }, + { + "epoch": 0.842997634179153, + "grad_norm": 0.10639258474111557, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 218070 + }, + { + "epoch": 0.8430362913825362, + "grad_norm": 0.09747055172920227, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 218080 + }, + { + "epoch": 0.8430749485859195, + "grad_norm": 0.12348346412181854, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 218090 + }, + { + "epoch": 0.8431136057893028, + "grad_norm": 0.10564039647579193, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 218100 + }, + { + "epoch": 0.8431522629926861, + "grad_norm": 0.10222464054822922, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 218110 + }, + { + "epoch": 0.8431909201960693, + "grad_norm": 0.11077496409416199, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 218120 + }, + { + "epoch": 0.8432295773994526, + "grad_norm": 0.10465149581432343, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 218130 + }, + { + "epoch": 0.8432682346028358, + "grad_norm": 0.09964410960674286, + "learning_rate": 0.002, + "loss": 2.334, + "step": 218140 + }, + { + "epoch": 0.8433068918062192, + "grad_norm": 0.0931280329823494, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 218150 + }, + { + "epoch": 0.8433455490096025, + "grad_norm": 0.12744703888893127, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 218160 + }, + { + "epoch": 0.8433842062129857, + "grad_norm": 0.10196051001548767, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 218170 + }, + { + "epoch": 0.843422863416369, + "grad_norm": 0.11853543668985367, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 218180 + }, + { + "epoch": 0.8434615206197523, + "grad_norm": 0.1072535440325737, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 218190 + }, + { + "epoch": 0.8435001778231356, + "grad_norm": 0.12971144914627075, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 218200 + }, + { + "epoch": 0.8435388350265188, + "grad_norm": 0.10555911064147949, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 218210 + }, + { + "epoch": 0.8435774922299021, + "grad_norm": 0.12876932322978973, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 218220 + }, + { + "epoch": 0.8436161494332854, + "grad_norm": 0.10753574967384338, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 218230 + }, + { + "epoch": 0.8436548066366687, + "grad_norm": 0.10081668198108673, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 218240 + }, + { + "epoch": 0.843693463840052, + "grad_norm": 0.09541888535022736, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 218250 + }, + { + "epoch": 0.8437321210434352, + "grad_norm": 0.12228714674711227, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 218260 + }, + { + "epoch": 0.8437707782468186, + "grad_norm": 0.09093743562698364, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 218270 + }, + { + "epoch": 0.8438094354502018, + "grad_norm": 0.12039365619421005, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 218280 + }, + { + "epoch": 0.8438480926535851, + "grad_norm": 0.10386831313371658, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 218290 + }, + { + "epoch": 0.8438867498569683, + "grad_norm": 0.10784223675727844, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 218300 + }, + { + "epoch": 0.8439254070603516, + "grad_norm": 0.10106562077999115, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 218310 + }, + { + "epoch": 0.8439640642637349, + "grad_norm": 0.09477700293064117, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 218320 + }, + { + "epoch": 0.8440027214671182, + "grad_norm": 0.09081584215164185, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 218330 + }, + { + "epoch": 0.8440413786705014, + "grad_norm": 0.09890346229076385, + "learning_rate": 0.002, + "loss": 2.337, + "step": 218340 + }, + { + "epoch": 0.8440800358738847, + "grad_norm": 0.0994071513414383, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 218350 + }, + { + "epoch": 0.8441186930772681, + "grad_norm": 0.09795328974723816, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 218360 + }, + { + "epoch": 0.8441573502806513, + "grad_norm": 0.10990192741155624, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 218370 + }, + { + "epoch": 0.8441960074840346, + "grad_norm": 0.1020970419049263, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 218380 + }, + { + "epoch": 0.8442346646874178, + "grad_norm": 0.12286726385354996, + "learning_rate": 0.002, + "loss": 2.336, + "step": 218390 + }, + { + "epoch": 0.8442733218908012, + "grad_norm": 0.10366500914096832, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 218400 + }, + { + "epoch": 0.8443119790941844, + "grad_norm": 0.09809817373752594, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 218410 + }, + { + "epoch": 0.8443506362975677, + "grad_norm": 0.09430515021085739, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 218420 + }, + { + "epoch": 0.8443892935009509, + "grad_norm": 0.10951139777898788, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 218430 + }, + { + "epoch": 0.8444279507043343, + "grad_norm": 0.1111026182770729, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 218440 + }, + { + "epoch": 0.8444666079077175, + "grad_norm": 0.10476387292146683, + "learning_rate": 0.002, + "loss": 2.323, + "step": 218450 + }, + { + "epoch": 0.8445052651111008, + "grad_norm": 0.09188532829284668, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 218460 + }, + { + "epoch": 0.844543922314484, + "grad_norm": 0.10570862889289856, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 218470 + }, + { + "epoch": 0.8445825795178673, + "grad_norm": 0.10107731819152832, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 218480 + }, + { + "epoch": 0.8446212367212507, + "grad_norm": 0.1053171381354332, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 218490 + }, + { + "epoch": 0.8446598939246339, + "grad_norm": 0.11416930705308914, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 218500 + }, + { + "epoch": 0.8446985511280172, + "grad_norm": 0.10873345285654068, + "learning_rate": 0.002, + "loss": 2.334, + "step": 218510 + }, + { + "epoch": 0.8447372083314004, + "grad_norm": 0.11997786164283752, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 218520 + }, + { + "epoch": 0.8447758655347838, + "grad_norm": 0.09660268574953079, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 218530 + }, + { + "epoch": 0.844814522738167, + "grad_norm": 0.12605643272399902, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 218540 + }, + { + "epoch": 0.8448531799415503, + "grad_norm": 0.12503686547279358, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 218550 + }, + { + "epoch": 0.8448918371449335, + "grad_norm": 0.09812545776367188, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 218560 + }, + { + "epoch": 0.8449304943483169, + "grad_norm": 0.1045181006193161, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 218570 + }, + { + "epoch": 0.8449691515517002, + "grad_norm": 0.09139660745859146, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 218580 + }, + { + "epoch": 0.8450078087550834, + "grad_norm": 0.11241042613983154, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 218590 + }, + { + "epoch": 0.8450464659584667, + "grad_norm": 0.11408902704715729, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 218600 + }, + { + "epoch": 0.84508512316185, + "grad_norm": 0.09838565438985825, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 218610 + }, + { + "epoch": 0.8451237803652333, + "grad_norm": 0.09781533479690552, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 218620 + }, + { + "epoch": 0.8451624375686165, + "grad_norm": 0.09544527530670166, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 218630 + }, + { + "epoch": 0.8452010947719998, + "grad_norm": 0.10308004170656204, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 218640 + }, + { + "epoch": 0.8452397519753831, + "grad_norm": 0.11524269729852676, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 218650 + }, + { + "epoch": 0.8452784091787664, + "grad_norm": 0.15980708599090576, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 218660 + }, + { + "epoch": 0.8453170663821497, + "grad_norm": 0.11850898712873459, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 218670 + }, + { + "epoch": 0.8453557235855329, + "grad_norm": 0.12017089873552322, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 218680 + }, + { + "epoch": 0.8453943807889162, + "grad_norm": 0.11325656622648239, + "learning_rate": 0.002, + "loss": 2.364, + "step": 218690 + }, + { + "epoch": 0.8454330379922995, + "grad_norm": 0.10288386791944504, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 218700 + }, + { + "epoch": 0.8454716951956828, + "grad_norm": 0.10763750225305557, + "learning_rate": 0.002, + "loss": 2.348, + "step": 218710 + }, + { + "epoch": 0.845510352399066, + "grad_norm": 0.12071944773197174, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 218720 + }, + { + "epoch": 0.8455490096024493, + "grad_norm": 0.10066035389900208, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 218730 + }, + { + "epoch": 0.8455876668058326, + "grad_norm": 0.11219926178455353, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 218740 + }, + { + "epoch": 0.8456263240092159, + "grad_norm": 0.09474202990531921, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 218750 + }, + { + "epoch": 0.8456649812125991, + "grad_norm": 0.11808373034000397, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 218760 + }, + { + "epoch": 0.8457036384159824, + "grad_norm": 0.10525443404912949, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 218770 + }, + { + "epoch": 0.8457422956193658, + "grad_norm": 0.09398797154426575, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 218780 + }, + { + "epoch": 0.845780952822749, + "grad_norm": 0.11471854150295258, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 218790 + }, + { + "epoch": 0.8458196100261323, + "grad_norm": 0.10278965532779694, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 218800 + }, + { + "epoch": 0.8458582672295155, + "grad_norm": 0.10400038957595825, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 218810 + }, + { + "epoch": 0.8458969244328989, + "grad_norm": 0.09779319167137146, + "learning_rate": 0.002, + "loss": 2.345, + "step": 218820 + }, + { + "epoch": 0.8459355816362821, + "grad_norm": 0.13267405331134796, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 218830 + }, + { + "epoch": 0.8459742388396654, + "grad_norm": 0.10132269561290741, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 218840 + }, + { + "epoch": 0.8460128960430486, + "grad_norm": 0.10761400312185287, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 218850 + }, + { + "epoch": 0.8460515532464319, + "grad_norm": 0.11506160348653793, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 218860 + }, + { + "epoch": 0.8460902104498152, + "grad_norm": 0.09911599010229111, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 218870 + }, + { + "epoch": 0.8461288676531985, + "grad_norm": 0.09980878233909607, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 218880 + }, + { + "epoch": 0.8461675248565818, + "grad_norm": 0.12252838164567947, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 218890 + }, + { + "epoch": 0.846206182059965, + "grad_norm": 0.10865668207406998, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 218900 + }, + { + "epoch": 0.8462448392633484, + "grad_norm": 0.08866766840219498, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 218910 + }, + { + "epoch": 0.8462834964667316, + "grad_norm": 0.1000230684876442, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 218920 + }, + { + "epoch": 0.8463221536701149, + "grad_norm": 0.09783598780632019, + "learning_rate": 0.002, + "loss": 2.34, + "step": 218930 + }, + { + "epoch": 0.8463608108734981, + "grad_norm": 0.09208564460277557, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 218940 + }, + { + "epoch": 0.8463994680768815, + "grad_norm": 0.11278524994850159, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 218950 + }, + { + "epoch": 0.8464381252802647, + "grad_norm": 0.10701495409011841, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 218960 + }, + { + "epoch": 0.846476782483648, + "grad_norm": 0.09828449040651321, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 218970 + }, + { + "epoch": 0.8465154396870312, + "grad_norm": 0.11303871124982834, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 218980 + }, + { + "epoch": 0.8465540968904146, + "grad_norm": 0.12167762964963913, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 218990 + }, + { + "epoch": 0.8465927540937979, + "grad_norm": 0.11140464246273041, + "learning_rate": 0.002, + "loss": 2.34, + "step": 219000 + }, + { + "epoch": 0.8466314112971811, + "grad_norm": 0.10240872204303741, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 219010 + }, + { + "epoch": 0.8466700685005644, + "grad_norm": 0.11569183319807053, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 219020 + }, + { + "epoch": 0.8467087257039477, + "grad_norm": 0.118963323533535, + "learning_rate": 0.002, + "loss": 2.363, + "step": 219030 + }, + { + "epoch": 0.846747382907331, + "grad_norm": 0.09310389310121536, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 219040 + }, + { + "epoch": 0.8467860401107142, + "grad_norm": 0.10494721680879593, + "learning_rate": 0.002, + "loss": 2.33, + "step": 219050 + }, + { + "epoch": 0.8468246973140975, + "grad_norm": 0.10457353293895721, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 219060 + }, + { + "epoch": 0.8468633545174807, + "grad_norm": 0.11794960498809814, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 219070 + }, + { + "epoch": 0.8469020117208641, + "grad_norm": 0.11988719552755356, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 219080 + }, + { + "epoch": 0.8469406689242474, + "grad_norm": 0.12246638536453247, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 219090 + }, + { + "epoch": 0.8469793261276306, + "grad_norm": 0.10058612376451492, + "learning_rate": 0.002, + "loss": 2.342, + "step": 219100 + }, + { + "epoch": 0.8470179833310139, + "grad_norm": 0.10365330427885056, + "learning_rate": 0.002, + "loss": 2.348, + "step": 219110 + }, + { + "epoch": 0.8470566405343972, + "grad_norm": 0.10714240372180939, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 219120 + }, + { + "epoch": 0.8470952977377805, + "grad_norm": 0.10079375654459, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 219130 + }, + { + "epoch": 0.8471339549411637, + "grad_norm": 0.10422948002815247, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 219140 + }, + { + "epoch": 0.847172612144547, + "grad_norm": 0.15729708969593048, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 219150 + }, + { + "epoch": 0.8472112693479303, + "grad_norm": 0.09796754270792007, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 219160 + }, + { + "epoch": 0.8472499265513136, + "grad_norm": 0.09482486546039581, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 219170 + }, + { + "epoch": 0.8472885837546968, + "grad_norm": 0.0921812355518341, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 219180 + }, + { + "epoch": 0.8473272409580801, + "grad_norm": 0.09821398556232452, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 219190 + }, + { + "epoch": 0.8473658981614635, + "grad_norm": 0.10597255825996399, + "learning_rate": 0.002, + "loss": 2.347, + "step": 219200 + }, + { + "epoch": 0.8474045553648467, + "grad_norm": 0.10136741399765015, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 219210 + }, + { + "epoch": 0.84744321256823, + "grad_norm": 0.10990548878908157, + "learning_rate": 0.002, + "loss": 2.344, + "step": 219220 + }, + { + "epoch": 0.8474818697716132, + "grad_norm": 0.12417969852685928, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 219230 + }, + { + "epoch": 0.8475205269749965, + "grad_norm": 0.17158718407154083, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 219240 + }, + { + "epoch": 0.8475591841783798, + "grad_norm": 0.10663104057312012, + "learning_rate": 0.002, + "loss": 2.346, + "step": 219250 + }, + { + "epoch": 0.8475978413817631, + "grad_norm": 0.09438648074865341, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 219260 + }, + { + "epoch": 0.8476364985851463, + "grad_norm": 0.09565073251724243, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 219270 + }, + { + "epoch": 0.8476751557885296, + "grad_norm": 0.11593829095363617, + "learning_rate": 0.002, + "loss": 2.331, + "step": 219280 + }, + { + "epoch": 0.847713812991913, + "grad_norm": 0.10374920070171356, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 219290 + }, + { + "epoch": 0.8477524701952962, + "grad_norm": 0.1203596442937851, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 219300 + }, + { + "epoch": 0.8477911273986795, + "grad_norm": 0.1203400269150734, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 219310 + }, + { + "epoch": 0.8478297846020627, + "grad_norm": 0.09964600205421448, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 219320 + }, + { + "epoch": 0.8478684418054461, + "grad_norm": 0.1544189155101776, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 219330 + }, + { + "epoch": 0.8479070990088293, + "grad_norm": 0.09774701297283173, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 219340 + }, + { + "epoch": 0.8479457562122126, + "grad_norm": 0.09581021964550018, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 219350 + }, + { + "epoch": 0.8479844134155958, + "grad_norm": 0.0978235974907875, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 219360 + }, + { + "epoch": 0.8480230706189792, + "grad_norm": 0.2410297989845276, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 219370 + }, + { + "epoch": 0.8480617278223624, + "grad_norm": 0.10258159786462784, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 219380 + }, + { + "epoch": 0.8481003850257457, + "grad_norm": 0.11392287164926529, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 219390 + }, + { + "epoch": 0.848139042229129, + "grad_norm": 0.10421387106180191, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 219400 + }, + { + "epoch": 0.8481776994325122, + "grad_norm": 0.11930263787508011, + "learning_rate": 0.002, + "loss": 2.3613, + "step": 219410 + }, + { + "epoch": 0.8482163566358956, + "grad_norm": 0.11117282509803772, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 219420 + }, + { + "epoch": 0.8482550138392788, + "grad_norm": 0.10350653529167175, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 219430 + }, + { + "epoch": 0.8482936710426621, + "grad_norm": 0.10714832693338394, + "learning_rate": 0.002, + "loss": 2.346, + "step": 219440 + }, + { + "epoch": 0.8483323282460453, + "grad_norm": 0.09527911245822906, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 219450 + }, + { + "epoch": 0.8483709854494287, + "grad_norm": 0.10346433520317078, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 219460 + }, + { + "epoch": 0.8484096426528119, + "grad_norm": 0.13404396176338196, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 219470 + }, + { + "epoch": 0.8484482998561952, + "grad_norm": 0.10561177879571915, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 219480 + }, + { + "epoch": 0.8484869570595784, + "grad_norm": 0.10402088612318039, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 219490 + }, + { + "epoch": 0.8485256142629618, + "grad_norm": 0.1081523671746254, + "learning_rate": 0.002, + "loss": 2.349, + "step": 219500 + }, + { + "epoch": 0.848564271466345, + "grad_norm": 0.10165558010339737, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 219510 + }, + { + "epoch": 0.8486029286697283, + "grad_norm": 0.10864666104316711, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 219520 + }, + { + "epoch": 0.8486415858731116, + "grad_norm": 0.11919961124658585, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 219530 + }, + { + "epoch": 0.8486802430764949, + "grad_norm": 0.10787352174520493, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 219540 + }, + { + "epoch": 0.8487189002798782, + "grad_norm": 0.09709031134843826, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 219550 + }, + { + "epoch": 0.8487575574832614, + "grad_norm": 0.11438827961683273, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 219560 + }, + { + "epoch": 0.8487962146866447, + "grad_norm": 0.20422634482383728, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 219570 + }, + { + "epoch": 0.848834871890028, + "grad_norm": 0.104178287088871, + "learning_rate": 0.002, + "loss": 2.371, + "step": 219580 + }, + { + "epoch": 0.8488735290934113, + "grad_norm": 0.10315261781215668, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 219590 + }, + { + "epoch": 0.8489121862967945, + "grad_norm": 0.1082720160484314, + "learning_rate": 0.002, + "loss": 2.346, + "step": 219600 + }, + { + "epoch": 0.8489508435001778, + "grad_norm": 0.11481138318777084, + "learning_rate": 0.002, + "loss": 2.351, + "step": 219610 + }, + { + "epoch": 0.848989500703561, + "grad_norm": 0.09659004211425781, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 219620 + }, + { + "epoch": 0.8490281579069444, + "grad_norm": 0.10843570530414581, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 219630 + }, + { + "epoch": 0.8490668151103277, + "grad_norm": 0.10417255759239197, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 219640 + }, + { + "epoch": 0.8491054723137109, + "grad_norm": 0.09357796609401703, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 219650 + }, + { + "epoch": 0.8491441295170942, + "grad_norm": 0.10457409173250198, + "learning_rate": 0.002, + "loss": 2.3159, + "step": 219660 + }, + { + "epoch": 0.8491827867204775, + "grad_norm": 0.10815020650625229, + "learning_rate": 0.002, + "loss": 2.335, + "step": 219670 + }, + { + "epoch": 0.8492214439238608, + "grad_norm": 0.09615054726600647, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 219680 + }, + { + "epoch": 0.849260101127244, + "grad_norm": 0.1047089621424675, + "learning_rate": 0.002, + "loss": 2.347, + "step": 219690 + }, + { + "epoch": 0.8492987583306273, + "grad_norm": 0.1393386870622635, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 219700 + }, + { + "epoch": 0.8493374155340107, + "grad_norm": 0.12716567516326904, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 219710 + }, + { + "epoch": 0.8493760727373939, + "grad_norm": 0.14259745180606842, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 219720 + }, + { + "epoch": 0.8494147299407772, + "grad_norm": 0.11295412480831146, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 219730 + }, + { + "epoch": 0.8494533871441604, + "grad_norm": 0.11027267575263977, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 219740 + }, + { + "epoch": 0.8494920443475438, + "grad_norm": 0.11603856086730957, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 219750 + }, + { + "epoch": 0.849530701550927, + "grad_norm": 0.10948418080806732, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 219760 + }, + { + "epoch": 0.8495693587543103, + "grad_norm": 0.12260384112596512, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 219770 + }, + { + "epoch": 0.8496080159576935, + "grad_norm": 0.4248383343219757, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 219780 + }, + { + "epoch": 0.8496466731610768, + "grad_norm": 0.10164932906627655, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 219790 + }, + { + "epoch": 0.8496853303644601, + "grad_norm": 1.1660302877426147, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 219800 + }, + { + "epoch": 0.8497239875678434, + "grad_norm": 0.10743222385644913, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 219810 + }, + { + "epoch": 0.8497626447712266, + "grad_norm": 0.11784674972295761, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 219820 + }, + { + "epoch": 0.8498013019746099, + "grad_norm": 0.11492542922496796, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 219830 + }, + { + "epoch": 0.8498399591779933, + "grad_norm": 0.09285452216863632, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 219840 + }, + { + "epoch": 0.8498786163813765, + "grad_norm": 0.1043357327580452, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 219850 + }, + { + "epoch": 0.8499172735847598, + "grad_norm": 0.1173747256398201, + "learning_rate": 0.002, + "loss": 2.339, + "step": 219860 + }, + { + "epoch": 0.849955930788143, + "grad_norm": 0.12000737339258194, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 219870 + }, + { + "epoch": 0.8499945879915264, + "grad_norm": 0.0965653732419014, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 219880 + }, + { + "epoch": 0.8500332451949096, + "grad_norm": 0.09841306507587433, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 219890 + }, + { + "epoch": 0.8500719023982929, + "grad_norm": 0.09404002130031586, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 219900 + }, + { + "epoch": 0.8501105596016761, + "grad_norm": 0.09891854971647263, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 219910 + }, + { + "epoch": 0.8501492168050595, + "grad_norm": 0.10649807751178741, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 219920 + }, + { + "epoch": 0.8501878740084428, + "grad_norm": 0.10601424425840378, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 219930 + }, + { + "epoch": 0.850226531211826, + "grad_norm": 0.10602995753288269, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 219940 + }, + { + "epoch": 0.8502651884152093, + "grad_norm": 0.12841784954071045, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 219950 + }, + { + "epoch": 0.8503038456185925, + "grad_norm": 0.10099924355745316, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 219960 + }, + { + "epoch": 0.8503425028219759, + "grad_norm": 0.1031048521399498, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 219970 + }, + { + "epoch": 0.8503811600253591, + "grad_norm": 0.12088562548160553, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 219980 + }, + { + "epoch": 0.8504198172287424, + "grad_norm": 0.10422363132238388, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 219990 + }, + { + "epoch": 0.8504584744321256, + "grad_norm": 0.10978923738002777, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 220000 + }, + { + "epoch": 0.850497131635509, + "grad_norm": 0.09470580518245697, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 220010 + }, + { + "epoch": 0.8505357888388922, + "grad_norm": 0.10753884166479111, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 220020 + }, + { + "epoch": 0.8505744460422755, + "grad_norm": 0.10996290296316147, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 220030 + }, + { + "epoch": 0.8506131032456588, + "grad_norm": 0.11503701657056808, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 220040 + }, + { + "epoch": 0.8506517604490421, + "grad_norm": 0.11009927839040756, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 220050 + }, + { + "epoch": 0.8506904176524254, + "grad_norm": 0.11286523938179016, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 220060 + }, + { + "epoch": 0.8507290748558086, + "grad_norm": 0.1181272491812706, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 220070 + }, + { + "epoch": 0.8507677320591919, + "grad_norm": 0.10894773155450821, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 220080 + }, + { + "epoch": 0.8508063892625752, + "grad_norm": 0.10753612220287323, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 220090 + }, + { + "epoch": 0.8508450464659585, + "grad_norm": 0.11073257774114609, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 220100 + }, + { + "epoch": 0.8508837036693417, + "grad_norm": 0.09844937920570374, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 220110 + }, + { + "epoch": 0.850922360872725, + "grad_norm": 0.09576030820608139, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 220120 + }, + { + "epoch": 0.8509610180761084, + "grad_norm": 0.107846699655056, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 220130 + }, + { + "epoch": 0.8509996752794916, + "grad_norm": 0.10430942475795746, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 220140 + }, + { + "epoch": 0.8510383324828749, + "grad_norm": 0.09915446490049362, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 220150 + }, + { + "epoch": 0.8510769896862581, + "grad_norm": 0.12929920852184296, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 220160 + }, + { + "epoch": 0.8511156468896414, + "grad_norm": 0.10357212275266647, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 220170 + }, + { + "epoch": 0.8511543040930247, + "grad_norm": 0.1053905338048935, + "learning_rate": 0.002, + "loss": 2.3639, + "step": 220180 + }, + { + "epoch": 0.851192961296408, + "grad_norm": 0.11608556658029556, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 220190 + }, + { + "epoch": 0.8512316184997912, + "grad_norm": 0.11105789989233017, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 220200 + }, + { + "epoch": 0.8512702757031745, + "grad_norm": 0.11207444965839386, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 220210 + }, + { + "epoch": 0.8513089329065578, + "grad_norm": 0.1263352632522583, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 220220 + }, + { + "epoch": 0.8513475901099411, + "grad_norm": 0.0969938337802887, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 220230 + }, + { + "epoch": 0.8513862473133244, + "grad_norm": 0.09842877835035324, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 220240 + }, + { + "epoch": 0.8514249045167076, + "grad_norm": 0.09710202366113663, + "learning_rate": 0.002, + "loss": 2.327, + "step": 220250 + }, + { + "epoch": 0.851463561720091, + "grad_norm": 0.10513780266046524, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 220260 + }, + { + "epoch": 0.8515022189234742, + "grad_norm": 0.10045460611581802, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 220270 + }, + { + "epoch": 0.8515408761268575, + "grad_norm": 0.10101631283760071, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 220280 + }, + { + "epoch": 0.8515795333302407, + "grad_norm": 0.09537997096776962, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 220290 + }, + { + "epoch": 0.8516181905336241, + "grad_norm": 0.10045386105775833, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 220300 + }, + { + "epoch": 0.8516568477370073, + "grad_norm": 0.12381569296121597, + "learning_rate": 0.002, + "loss": 2.335, + "step": 220310 + }, + { + "epoch": 0.8516955049403906, + "grad_norm": 0.08416225016117096, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 220320 + }, + { + "epoch": 0.8517341621437738, + "grad_norm": 0.09276917576789856, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 220330 + }, + { + "epoch": 0.8517728193471571, + "grad_norm": 0.10120249539613724, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 220340 + }, + { + "epoch": 0.8518114765505405, + "grad_norm": 0.10174285620450974, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 220350 + }, + { + "epoch": 0.8518501337539237, + "grad_norm": 0.09552405774593353, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 220360 + }, + { + "epoch": 0.851888790957307, + "grad_norm": 0.09580851346254349, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 220370 + }, + { + "epoch": 0.8519274481606902, + "grad_norm": 0.1014307513833046, + "learning_rate": 0.002, + "loss": 2.347, + "step": 220380 + }, + { + "epoch": 0.8519661053640736, + "grad_norm": 0.11779122799634933, + "learning_rate": 0.002, + "loss": 2.33, + "step": 220390 + }, + { + "epoch": 0.8520047625674568, + "grad_norm": 0.11027638614177704, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 220400 + }, + { + "epoch": 0.8520434197708401, + "grad_norm": 0.0975247174501419, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 220410 + }, + { + "epoch": 0.8520820769742233, + "grad_norm": 0.12367052584886551, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 220420 + }, + { + "epoch": 0.8521207341776067, + "grad_norm": 0.09940453618764877, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 220430 + }, + { + "epoch": 0.85215939138099, + "grad_norm": 0.10287129133939743, + "learning_rate": 0.002, + "loss": 2.333, + "step": 220440 + }, + { + "epoch": 0.8521980485843732, + "grad_norm": 0.10441652685403824, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 220450 + }, + { + "epoch": 0.8522367057877565, + "grad_norm": 0.10059020668268204, + "learning_rate": 0.002, + "loss": 2.355, + "step": 220460 + }, + { + "epoch": 0.8522753629911398, + "grad_norm": 0.1422119140625, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 220470 + }, + { + "epoch": 0.8523140201945231, + "grad_norm": 0.09308356046676636, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 220480 + }, + { + "epoch": 0.8523526773979063, + "grad_norm": 0.11652087420225143, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 220490 + }, + { + "epoch": 0.8523913346012896, + "grad_norm": 0.10112705081701279, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 220500 + }, + { + "epoch": 0.8524299918046729, + "grad_norm": 0.1368659883737564, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 220510 + }, + { + "epoch": 0.8524686490080562, + "grad_norm": 0.10706181079149246, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 220520 + }, + { + "epoch": 0.8525073062114394, + "grad_norm": 0.10697004199028015, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 220530 + }, + { + "epoch": 0.8525459634148227, + "grad_norm": 0.10608157515525818, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 220540 + }, + { + "epoch": 0.852584620618206, + "grad_norm": 0.10999225825071335, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 220550 + }, + { + "epoch": 0.8526232778215893, + "grad_norm": 0.09438899904489517, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 220560 + }, + { + "epoch": 0.8526619350249726, + "grad_norm": 0.09783659875392914, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 220570 + }, + { + "epoch": 0.8527005922283558, + "grad_norm": 0.09867078810930252, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 220580 + }, + { + "epoch": 0.8527392494317391, + "grad_norm": 0.09790950268507004, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 220590 + }, + { + "epoch": 0.8527779066351224, + "grad_norm": 0.12746822834014893, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 220600 + }, + { + "epoch": 0.8528165638385057, + "grad_norm": 0.11303797364234924, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 220610 + }, + { + "epoch": 0.8528552210418889, + "grad_norm": 0.0921299010515213, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 220620 + }, + { + "epoch": 0.8528938782452722, + "grad_norm": 0.11156327277421951, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 220630 + }, + { + "epoch": 0.8529325354486555, + "grad_norm": 0.11616717278957367, + "learning_rate": 0.002, + "loss": 2.3152, + "step": 220640 + }, + { + "epoch": 0.8529711926520388, + "grad_norm": 0.09659020602703094, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 220650 + }, + { + "epoch": 0.853009849855422, + "grad_norm": 0.1045290008187294, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 220660 + }, + { + "epoch": 0.8530485070588053, + "grad_norm": 0.11413311958312988, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 220670 + }, + { + "epoch": 0.8530871642621887, + "grad_norm": 0.09833575785160065, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 220680 + }, + { + "epoch": 0.8531258214655719, + "grad_norm": 0.11460181325674057, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 220690 + }, + { + "epoch": 0.8531644786689552, + "grad_norm": 0.10402899980545044, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 220700 + }, + { + "epoch": 0.8532031358723384, + "grad_norm": 0.11590957641601562, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 220710 + }, + { + "epoch": 0.8532417930757217, + "grad_norm": 0.10896258801221848, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 220720 + }, + { + "epoch": 0.853280450279105, + "grad_norm": 0.10807967185974121, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 220730 + }, + { + "epoch": 0.8533191074824883, + "grad_norm": 0.0994463711977005, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 220740 + }, + { + "epoch": 0.8533577646858715, + "grad_norm": 0.15874207019805908, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 220750 + }, + { + "epoch": 0.8533964218892548, + "grad_norm": 0.11601907759904861, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 220760 + }, + { + "epoch": 0.8534350790926382, + "grad_norm": 0.09301315993070602, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 220770 + }, + { + "epoch": 0.8534737362960214, + "grad_norm": 0.1044008731842041, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 220780 + }, + { + "epoch": 0.8535123934994047, + "grad_norm": 0.10023225098848343, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 220790 + }, + { + "epoch": 0.8535510507027879, + "grad_norm": 0.11397735029459, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 220800 + }, + { + "epoch": 0.8535897079061713, + "grad_norm": 0.09428733587265015, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 220810 + }, + { + "epoch": 0.8536283651095545, + "grad_norm": 0.09366890043020248, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 220820 + }, + { + "epoch": 0.8536670223129378, + "grad_norm": 0.09214852005243301, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 220830 + }, + { + "epoch": 0.853705679516321, + "grad_norm": 0.09980152547359467, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 220840 + }, + { + "epoch": 0.8537443367197044, + "grad_norm": 0.09426254779100418, + "learning_rate": 0.002, + "loss": 2.357, + "step": 220850 + }, + { + "epoch": 0.8537829939230877, + "grad_norm": 0.09684693813323975, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 220860 + }, + { + "epoch": 0.8538216511264709, + "grad_norm": 0.13577967882156372, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 220870 + }, + { + "epoch": 0.8538603083298542, + "grad_norm": 0.11286691576242447, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 220880 + }, + { + "epoch": 0.8538989655332374, + "grad_norm": 0.10951905697584152, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 220890 + }, + { + "epoch": 0.8539376227366208, + "grad_norm": 0.09772135317325592, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 220900 + }, + { + "epoch": 0.853976279940004, + "grad_norm": 0.10379073023796082, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 220910 + }, + { + "epoch": 0.8540149371433873, + "grad_norm": 0.10221673548221588, + "learning_rate": 0.002, + "loss": 2.329, + "step": 220920 + }, + { + "epoch": 0.8540535943467705, + "grad_norm": 0.1037604883313179, + "learning_rate": 0.002, + "loss": 2.331, + "step": 220930 + }, + { + "epoch": 0.8540922515501539, + "grad_norm": 0.09758436679840088, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 220940 + }, + { + "epoch": 0.8541309087535371, + "grad_norm": 0.11016526073217392, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 220950 + }, + { + "epoch": 0.8541695659569204, + "grad_norm": 0.10051882266998291, + "learning_rate": 0.002, + "loss": 2.337, + "step": 220960 + }, + { + "epoch": 0.8542082231603036, + "grad_norm": 0.10580579191446304, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 220970 + }, + { + "epoch": 0.854246880363687, + "grad_norm": 0.10832671076059341, + "learning_rate": 0.002, + "loss": 2.347, + "step": 220980 + }, + { + "epoch": 0.8542855375670703, + "grad_norm": 0.08817476779222488, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 220990 + }, + { + "epoch": 0.8543241947704535, + "grad_norm": 0.10400227457284927, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 221000 + }, + { + "epoch": 0.8543628519738368, + "grad_norm": 0.23474979400634766, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 221010 + }, + { + "epoch": 0.8544015091772201, + "grad_norm": 0.09855043888092041, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 221020 + }, + { + "epoch": 0.8544401663806034, + "grad_norm": 0.10361428558826447, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 221030 + }, + { + "epoch": 0.8544788235839866, + "grad_norm": 0.10192447900772095, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 221040 + }, + { + "epoch": 0.8545174807873699, + "grad_norm": 0.0955294519662857, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 221050 + }, + { + "epoch": 0.8545561379907533, + "grad_norm": 0.08866667747497559, + "learning_rate": 0.002, + "loss": 2.316, + "step": 221060 + }, + { + "epoch": 0.8545947951941365, + "grad_norm": 0.10242463648319244, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 221070 + }, + { + "epoch": 0.8546334523975198, + "grad_norm": 0.12807013094425201, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 221080 + }, + { + "epoch": 0.854672109600903, + "grad_norm": 0.10718151181936264, + "learning_rate": 0.002, + "loss": 2.344, + "step": 221090 + }, + { + "epoch": 0.8547107668042863, + "grad_norm": 0.09337849915027618, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 221100 + }, + { + "epoch": 0.8547494240076696, + "grad_norm": 0.10777440667152405, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 221110 + }, + { + "epoch": 0.8547880812110529, + "grad_norm": 0.11653538048267365, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 221120 + }, + { + "epoch": 0.8548267384144361, + "grad_norm": 0.09051918983459473, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 221130 + }, + { + "epoch": 0.8548653956178194, + "grad_norm": 0.09129197895526886, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 221140 + }, + { + "epoch": 0.8549040528212027, + "grad_norm": 0.1159987598657608, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 221150 + }, + { + "epoch": 0.854942710024586, + "grad_norm": 0.1097775474190712, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 221160 + }, + { + "epoch": 0.8549813672279692, + "grad_norm": 0.11373516172170639, + "learning_rate": 0.002, + "loss": 2.342, + "step": 221170 + }, + { + "epoch": 0.8550200244313525, + "grad_norm": 0.10861944407224655, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 221180 + }, + { + "epoch": 0.8550586816347359, + "grad_norm": 0.09435247629880905, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 221190 + }, + { + "epoch": 0.8550973388381191, + "grad_norm": 0.11557682603597641, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 221200 + }, + { + "epoch": 0.8551359960415024, + "grad_norm": 0.09689876437187195, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 221210 + }, + { + "epoch": 0.8551746532448856, + "grad_norm": 0.18857908248901367, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 221220 + }, + { + "epoch": 0.855213310448269, + "grad_norm": 0.1228751391172409, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 221230 + }, + { + "epoch": 0.8552519676516522, + "grad_norm": 0.1002848893404007, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 221240 + }, + { + "epoch": 0.8552906248550355, + "grad_norm": 0.09509483724832535, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 221250 + }, + { + "epoch": 0.8553292820584187, + "grad_norm": 0.1349356323480606, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 221260 + }, + { + "epoch": 0.855367939261802, + "grad_norm": 0.09846282750368118, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 221270 + }, + { + "epoch": 0.8554065964651854, + "grad_norm": 0.09455190598964691, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 221280 + }, + { + "epoch": 0.8554452536685686, + "grad_norm": 0.1285431981086731, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 221290 + }, + { + "epoch": 0.8554839108719519, + "grad_norm": 0.10386514663696289, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 221300 + }, + { + "epoch": 0.8555225680753351, + "grad_norm": 0.09915978461503983, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 221310 + }, + { + "epoch": 0.8555612252787185, + "grad_norm": 0.10544169694185257, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 221320 + }, + { + "epoch": 0.8555998824821017, + "grad_norm": 0.09322534501552582, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 221330 + }, + { + "epoch": 0.855638539685485, + "grad_norm": 0.12254448235034943, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 221340 + }, + { + "epoch": 0.8556771968888682, + "grad_norm": 0.09801836311817169, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 221350 + }, + { + "epoch": 0.8557158540922516, + "grad_norm": 0.11010506004095078, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 221360 + }, + { + "epoch": 0.8557545112956348, + "grad_norm": 0.1010926365852356, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 221370 + }, + { + "epoch": 0.8557931684990181, + "grad_norm": 0.09351800382137299, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 221380 + }, + { + "epoch": 0.8558318257024013, + "grad_norm": 0.10786961019039154, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 221390 + }, + { + "epoch": 0.8558704829057847, + "grad_norm": 0.1168721541762352, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 221400 + }, + { + "epoch": 0.855909140109168, + "grad_norm": 0.13645702600479126, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 221410 + }, + { + "epoch": 0.8559477973125512, + "grad_norm": 0.09873227775096893, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 221420 + }, + { + "epoch": 0.8559864545159345, + "grad_norm": 0.10063973814249039, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 221430 + }, + { + "epoch": 0.8560251117193178, + "grad_norm": 0.10681134462356567, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 221440 + }, + { + "epoch": 0.8560637689227011, + "grad_norm": 0.12218131124973297, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 221450 + }, + { + "epoch": 0.8561024261260843, + "grad_norm": 0.12007876485586166, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 221460 + }, + { + "epoch": 0.8561410833294676, + "grad_norm": 0.09673456847667694, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 221470 + }, + { + "epoch": 0.8561797405328508, + "grad_norm": 0.10487453639507294, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 221480 + }, + { + "epoch": 0.8562183977362342, + "grad_norm": 0.09984520822763443, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 221490 + }, + { + "epoch": 0.8562570549396175, + "grad_norm": 0.10763146728277206, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 221500 + }, + { + "epoch": 0.8562957121430007, + "grad_norm": 0.10822135955095291, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 221510 + }, + { + "epoch": 0.856334369346384, + "grad_norm": 0.10311654955148697, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 221520 + }, + { + "epoch": 0.8563730265497673, + "grad_norm": 0.14389926195144653, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 221530 + }, + { + "epoch": 0.8564116837531506, + "grad_norm": 0.10449139028787613, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 221540 + }, + { + "epoch": 0.8564503409565338, + "grad_norm": 0.10372086614370346, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 221550 + }, + { + "epoch": 0.8564889981599171, + "grad_norm": 0.14530038833618164, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 221560 + }, + { + "epoch": 0.8565276553633004, + "grad_norm": 0.11310365796089172, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 221570 + }, + { + "epoch": 0.8565663125666837, + "grad_norm": 0.10484128445386887, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 221580 + }, + { + "epoch": 0.856604969770067, + "grad_norm": 0.10530244559049606, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 221590 + }, + { + "epoch": 0.8566436269734502, + "grad_norm": 0.1365823745727539, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 221600 + }, + { + "epoch": 0.8566822841768336, + "grad_norm": 0.10979634523391724, + "learning_rate": 0.002, + "loss": 2.344, + "step": 221610 + }, + { + "epoch": 0.8567209413802168, + "grad_norm": 0.10430336743593216, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 221620 + }, + { + "epoch": 0.8567595985836001, + "grad_norm": 0.11881018429994583, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 221630 + }, + { + "epoch": 0.8567982557869833, + "grad_norm": 0.11305271834135056, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 221640 + }, + { + "epoch": 0.8568369129903666, + "grad_norm": 0.10439368337392807, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 221650 + }, + { + "epoch": 0.8568755701937499, + "grad_norm": 0.10345533490180969, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 221660 + }, + { + "epoch": 0.8569142273971332, + "grad_norm": 0.11305728554725647, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 221670 + }, + { + "epoch": 0.8569528846005164, + "grad_norm": 0.10882841795682907, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 221680 + }, + { + "epoch": 0.8569915418038997, + "grad_norm": 0.09862805157899857, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 221690 + }, + { + "epoch": 0.8570301990072831, + "grad_norm": 0.11100216209888458, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 221700 + }, + { + "epoch": 0.8570688562106663, + "grad_norm": 0.09722207486629486, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 221710 + }, + { + "epoch": 0.8571075134140496, + "grad_norm": 0.091136634349823, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 221720 + }, + { + "epoch": 0.8571461706174328, + "grad_norm": 0.10493214428424835, + "learning_rate": 0.002, + "loss": 2.341, + "step": 221730 + }, + { + "epoch": 0.8571848278208162, + "grad_norm": 0.09578853100538254, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 221740 + }, + { + "epoch": 0.8572234850241994, + "grad_norm": 0.09849863499403, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 221750 + }, + { + "epoch": 0.8572621422275827, + "grad_norm": 0.11259200423955917, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 221760 + }, + { + "epoch": 0.8573007994309659, + "grad_norm": 0.09756533056497574, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 221770 + }, + { + "epoch": 0.8573394566343493, + "grad_norm": 0.12699413299560547, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 221780 + }, + { + "epoch": 0.8573781138377325, + "grad_norm": 0.11433012783527374, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 221790 + }, + { + "epoch": 0.8574167710411158, + "grad_norm": 0.11333296447992325, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 221800 + }, + { + "epoch": 0.857455428244499, + "grad_norm": 0.11023814976215363, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 221810 + }, + { + "epoch": 0.8574940854478823, + "grad_norm": 0.10318921506404877, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 221820 + }, + { + "epoch": 0.8575327426512657, + "grad_norm": 0.10669266432523727, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 221830 + }, + { + "epoch": 0.8575713998546489, + "grad_norm": 0.11468558758497238, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 221840 + }, + { + "epoch": 0.8576100570580322, + "grad_norm": 0.1044035330414772, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 221850 + }, + { + "epoch": 0.8576487142614154, + "grad_norm": 0.09796303510665894, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 221860 + }, + { + "epoch": 0.8576873714647988, + "grad_norm": 0.10812048614025116, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 221870 + }, + { + "epoch": 0.857726028668182, + "grad_norm": 0.12456179410219193, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 221880 + }, + { + "epoch": 0.8577646858715653, + "grad_norm": 0.11183394491672516, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 221890 + }, + { + "epoch": 0.8578033430749485, + "grad_norm": 0.12505802512168884, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 221900 + }, + { + "epoch": 0.8578420002783319, + "grad_norm": 0.10315761715173721, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 221910 + }, + { + "epoch": 0.8578806574817152, + "grad_norm": 0.10516806691884995, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 221920 + }, + { + "epoch": 0.8579193146850984, + "grad_norm": 0.12225325405597687, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 221930 + }, + { + "epoch": 0.8579579718884817, + "grad_norm": 0.09954703599214554, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 221940 + }, + { + "epoch": 0.857996629091865, + "grad_norm": 0.09836176037788391, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 221950 + }, + { + "epoch": 0.8580352862952483, + "grad_norm": 0.11061525344848633, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 221960 + }, + { + "epoch": 0.8580739434986315, + "grad_norm": 0.1230001151561737, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 221970 + }, + { + "epoch": 0.8581126007020148, + "grad_norm": 0.09734803438186646, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 221980 + }, + { + "epoch": 0.8581512579053981, + "grad_norm": 0.10479872673749924, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 221990 + }, + { + "epoch": 0.8581899151087814, + "grad_norm": 0.10779935121536255, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 222000 + }, + { + "epoch": 0.8582285723121647, + "grad_norm": 0.10514236986637115, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 222010 + }, + { + "epoch": 0.8582672295155479, + "grad_norm": 0.09969477355480194, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 222020 + }, + { + "epoch": 0.8583058867189312, + "grad_norm": 0.11630406975746155, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 222030 + }, + { + "epoch": 0.8583445439223145, + "grad_norm": 0.08934228122234344, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 222040 + }, + { + "epoch": 0.8583832011256978, + "grad_norm": 0.11019016057252884, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 222050 + }, + { + "epoch": 0.858421858329081, + "grad_norm": 0.11922332644462585, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 222060 + }, + { + "epoch": 0.8584605155324643, + "grad_norm": 0.09459064155817032, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 222070 + }, + { + "epoch": 0.8584991727358476, + "grad_norm": 0.10141943395137787, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 222080 + }, + { + "epoch": 0.8585378299392309, + "grad_norm": 0.09072628617286682, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 222090 + }, + { + "epoch": 0.8585764871426141, + "grad_norm": 0.09824512898921967, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 222100 + }, + { + "epoch": 0.8586151443459974, + "grad_norm": 0.10149113088846207, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 222110 + }, + { + "epoch": 0.8586538015493808, + "grad_norm": 0.09815272688865662, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 222120 + }, + { + "epoch": 0.858692458752764, + "grad_norm": 0.11693661659955978, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 222130 + }, + { + "epoch": 0.8587311159561473, + "grad_norm": 0.10831683874130249, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 222140 + }, + { + "epoch": 0.8587697731595305, + "grad_norm": 0.09843482077121735, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 222150 + }, + { + "epoch": 0.8588084303629139, + "grad_norm": 0.13081464171409607, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 222160 + }, + { + "epoch": 0.8588470875662971, + "grad_norm": 0.10841447860002518, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 222170 + }, + { + "epoch": 0.8588857447696804, + "grad_norm": 0.11414126306772232, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 222180 + }, + { + "epoch": 0.8589244019730636, + "grad_norm": 0.10653481632471085, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 222190 + }, + { + "epoch": 0.8589630591764469, + "grad_norm": 0.11468642204999924, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 222200 + }, + { + "epoch": 0.8590017163798302, + "grad_norm": 0.11711590737104416, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 222210 + }, + { + "epoch": 0.8590403735832135, + "grad_norm": 0.10254094004631042, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 222220 + }, + { + "epoch": 0.8590790307865968, + "grad_norm": 0.09667069464921951, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 222230 + }, + { + "epoch": 0.85911768798998, + "grad_norm": 0.09760228544473648, + "learning_rate": 0.002, + "loss": 2.326, + "step": 222240 + }, + { + "epoch": 0.8591563451933634, + "grad_norm": 0.1100788488984108, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 222250 + }, + { + "epoch": 0.8591950023967466, + "grad_norm": 0.11454857140779495, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 222260 + }, + { + "epoch": 0.8592336596001299, + "grad_norm": 0.11041669547557831, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 222270 + }, + { + "epoch": 0.8592723168035131, + "grad_norm": 0.1071617379784584, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 222280 + }, + { + "epoch": 0.8593109740068965, + "grad_norm": 0.0996813029050827, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 222290 + }, + { + "epoch": 0.8593496312102797, + "grad_norm": 0.10956218093633652, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 222300 + }, + { + "epoch": 0.859388288413663, + "grad_norm": 0.13127194344997406, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 222310 + }, + { + "epoch": 0.8594269456170462, + "grad_norm": 0.09963233768939972, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 222320 + }, + { + "epoch": 0.8594656028204296, + "grad_norm": 0.10281427949666977, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 222330 + }, + { + "epoch": 0.8595042600238129, + "grad_norm": 0.09876349568367004, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 222340 + }, + { + "epoch": 0.8595429172271961, + "grad_norm": 0.09714307636022568, + "learning_rate": 0.002, + "loss": 2.329, + "step": 222350 + }, + { + "epoch": 0.8595815744305794, + "grad_norm": 0.0989079400897026, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 222360 + }, + { + "epoch": 0.8596202316339627, + "grad_norm": 0.08984922617673874, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 222370 + }, + { + "epoch": 0.859658888837346, + "grad_norm": 0.10395090281963348, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 222380 + }, + { + "epoch": 0.8596975460407292, + "grad_norm": 0.13927537202835083, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 222390 + }, + { + "epoch": 0.8597362032441125, + "grad_norm": 0.11574968695640564, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 222400 + }, + { + "epoch": 0.8597748604474957, + "grad_norm": 0.11554904282093048, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 222410 + }, + { + "epoch": 0.8598135176508791, + "grad_norm": 0.0983390286564827, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 222420 + }, + { + "epoch": 0.8598521748542624, + "grad_norm": 0.09339765459299088, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 222430 + }, + { + "epoch": 0.8598908320576456, + "grad_norm": 0.11920101940631866, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 222440 + }, + { + "epoch": 0.8599294892610289, + "grad_norm": 0.10644099116325378, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 222450 + }, + { + "epoch": 0.8599681464644122, + "grad_norm": 0.10699266940355301, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 222460 + }, + { + "epoch": 0.8600068036677955, + "grad_norm": 0.11919573694467545, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 222470 + }, + { + "epoch": 0.8600454608711787, + "grad_norm": 0.11993969976902008, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 222480 + }, + { + "epoch": 0.860084118074562, + "grad_norm": 0.10443676263093948, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 222490 + }, + { + "epoch": 0.8601227752779453, + "grad_norm": 0.11422792077064514, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 222500 + }, + { + "epoch": 0.8601614324813286, + "grad_norm": 0.11695490777492523, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 222510 + }, + { + "epoch": 0.8602000896847118, + "grad_norm": 0.09599953144788742, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 222520 + }, + { + "epoch": 0.8602387468880951, + "grad_norm": 0.09750067442655563, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 222530 + }, + { + "epoch": 0.8602774040914785, + "grad_norm": 0.09763391315937042, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 222540 + }, + { + "epoch": 0.8603160612948617, + "grad_norm": 0.10374131798744202, + "learning_rate": 0.002, + "loss": 2.335, + "step": 222550 + }, + { + "epoch": 0.860354718498245, + "grad_norm": 0.10062684118747711, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 222560 + }, + { + "epoch": 0.8603933757016282, + "grad_norm": 0.1127900630235672, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 222570 + }, + { + "epoch": 0.8604320329050115, + "grad_norm": 0.09476776421070099, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 222580 + }, + { + "epoch": 0.8604706901083948, + "grad_norm": 0.11507733166217804, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 222590 + }, + { + "epoch": 0.8605093473117781, + "grad_norm": 0.10111179202795029, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 222600 + }, + { + "epoch": 0.8605480045151613, + "grad_norm": 0.09470687061548233, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 222610 + }, + { + "epoch": 0.8605866617185446, + "grad_norm": 0.1021670550107956, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 222620 + }, + { + "epoch": 0.860625318921928, + "grad_norm": 0.08839169889688492, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 222630 + }, + { + "epoch": 0.8606639761253112, + "grad_norm": 0.11978471279144287, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 222640 + }, + { + "epoch": 0.8607026333286945, + "grad_norm": 0.11308436840772629, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 222650 + }, + { + "epoch": 0.8607412905320777, + "grad_norm": 0.09787823259830475, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 222660 + }, + { + "epoch": 0.8607799477354611, + "grad_norm": 0.1060105636715889, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 222670 + }, + { + "epoch": 0.8608186049388443, + "grad_norm": 0.09345019608736038, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 222680 + }, + { + "epoch": 0.8608572621422276, + "grad_norm": 0.1005011722445488, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 222690 + }, + { + "epoch": 0.8608959193456108, + "grad_norm": 0.1362825334072113, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 222700 + }, + { + "epoch": 0.8609345765489942, + "grad_norm": 0.1013195663690567, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 222710 + }, + { + "epoch": 0.8609732337523774, + "grad_norm": 0.1083022877573967, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 222720 + }, + { + "epoch": 0.8610118909557607, + "grad_norm": 0.12598514556884766, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 222730 + }, + { + "epoch": 0.861050548159144, + "grad_norm": 0.09817186743021011, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 222740 + }, + { + "epoch": 0.8610892053625272, + "grad_norm": 0.13273663818836212, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 222750 + }, + { + "epoch": 0.8611278625659106, + "grad_norm": 0.08212157338857651, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 222760 + }, + { + "epoch": 0.8611665197692938, + "grad_norm": 0.11281108111143112, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 222770 + }, + { + "epoch": 0.8612051769726771, + "grad_norm": 0.12281344830989838, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 222780 + }, + { + "epoch": 0.8612438341760603, + "grad_norm": 0.09948590397834778, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 222790 + }, + { + "epoch": 0.8612824913794437, + "grad_norm": 0.09139905869960785, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 222800 + }, + { + "epoch": 0.8613211485828269, + "grad_norm": 0.09847621619701385, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 222810 + }, + { + "epoch": 0.8613598057862102, + "grad_norm": 0.2321125864982605, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 222820 + }, + { + "epoch": 0.8613984629895934, + "grad_norm": 0.11758513748645782, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 222830 + }, + { + "epoch": 0.8614371201929768, + "grad_norm": 0.09759420156478882, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 222840 + }, + { + "epoch": 0.86147577739636, + "grad_norm": 0.12681740522384644, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 222850 + }, + { + "epoch": 0.8615144345997433, + "grad_norm": 0.09722145646810532, + "learning_rate": 0.002, + "loss": 2.333, + "step": 222860 + }, + { + "epoch": 0.8615530918031266, + "grad_norm": 0.12716875970363617, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 222870 + }, + { + "epoch": 0.8615917490065099, + "grad_norm": 0.12183672189712524, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 222880 + }, + { + "epoch": 0.8616304062098932, + "grad_norm": 0.09976981580257416, + "learning_rate": 0.002, + "loss": 2.348, + "step": 222890 + }, + { + "epoch": 0.8616690634132764, + "grad_norm": 0.0947190448641777, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 222900 + }, + { + "epoch": 0.8617077206166597, + "grad_norm": 0.10142937302589417, + "learning_rate": 0.002, + "loss": 2.3156, + "step": 222910 + }, + { + "epoch": 0.861746377820043, + "grad_norm": 0.10353469103574753, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 222920 + }, + { + "epoch": 0.8617850350234263, + "grad_norm": 0.11019370704889297, + "learning_rate": 0.002, + "loss": 2.33, + "step": 222930 + }, + { + "epoch": 0.8618236922268095, + "grad_norm": 0.10573454201221466, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 222940 + }, + { + "epoch": 0.8618623494301928, + "grad_norm": 0.09382513165473938, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 222950 + }, + { + "epoch": 0.861901006633576, + "grad_norm": 0.1011807918548584, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 222960 + }, + { + "epoch": 0.8619396638369594, + "grad_norm": 0.10121998190879822, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 222970 + }, + { + "epoch": 0.8619783210403427, + "grad_norm": 0.15870223939418793, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 222980 + }, + { + "epoch": 0.8620169782437259, + "grad_norm": 0.12690460681915283, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 222990 + }, + { + "epoch": 0.8620556354471092, + "grad_norm": 0.10708886384963989, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 223000 + }, + { + "epoch": 0.8620942926504925, + "grad_norm": 0.0971582904458046, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 223010 + }, + { + "epoch": 0.8621329498538758, + "grad_norm": 0.10086013376712799, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 223020 + }, + { + "epoch": 0.862171607057259, + "grad_norm": 0.11198851466178894, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 223030 + }, + { + "epoch": 0.8622102642606423, + "grad_norm": 0.11701015383005142, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 223040 + }, + { + "epoch": 0.8622489214640257, + "grad_norm": 0.10343601554632187, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 223050 + }, + { + "epoch": 0.8622875786674089, + "grad_norm": 0.09298793226480484, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 223060 + }, + { + "epoch": 0.8623262358707922, + "grad_norm": 0.08753270655870438, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 223070 + }, + { + "epoch": 0.8623648930741754, + "grad_norm": 0.1008063554763794, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 223080 + }, + { + "epoch": 0.8624035502775588, + "grad_norm": 0.10008800774812698, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 223090 + }, + { + "epoch": 0.862442207480942, + "grad_norm": 0.09982694685459137, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 223100 + }, + { + "epoch": 0.8624808646843253, + "grad_norm": 0.09783685952425003, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 223110 + }, + { + "epoch": 0.8625195218877085, + "grad_norm": 0.10794012248516083, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 223120 + }, + { + "epoch": 0.8625581790910918, + "grad_norm": 0.10148628056049347, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 223130 + }, + { + "epoch": 0.8625968362944751, + "grad_norm": 0.10609929263591766, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 223140 + }, + { + "epoch": 0.8626354934978584, + "grad_norm": 0.12248008698225021, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 223150 + }, + { + "epoch": 0.8626741507012416, + "grad_norm": 0.10261505842208862, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 223160 + }, + { + "epoch": 0.8627128079046249, + "grad_norm": 0.10075739026069641, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 223170 + }, + { + "epoch": 0.8627514651080083, + "grad_norm": 0.10503147542476654, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 223180 + }, + { + "epoch": 0.8627901223113915, + "grad_norm": 0.1116899847984314, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 223190 + }, + { + "epoch": 0.8628287795147748, + "grad_norm": 0.09337859600782394, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 223200 + }, + { + "epoch": 0.862867436718158, + "grad_norm": 0.10986005514860153, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 223210 + }, + { + "epoch": 0.8629060939215414, + "grad_norm": 0.09611022472381592, + "learning_rate": 0.002, + "loss": 2.339, + "step": 223220 + }, + { + "epoch": 0.8629447511249246, + "grad_norm": 0.09995223581790924, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 223230 + }, + { + "epoch": 0.8629834083283079, + "grad_norm": 0.10946323722600937, + "learning_rate": 0.002, + "loss": 2.341, + "step": 223240 + }, + { + "epoch": 0.8630220655316911, + "grad_norm": 0.10604280233383179, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 223250 + }, + { + "epoch": 0.8630607227350745, + "grad_norm": 0.10414547473192215, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 223260 + }, + { + "epoch": 0.8630993799384578, + "grad_norm": 0.12171155214309692, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 223270 + }, + { + "epoch": 0.863138037141841, + "grad_norm": 0.10600202530622482, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 223280 + }, + { + "epoch": 0.8631766943452243, + "grad_norm": 0.10572559386491776, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 223290 + }, + { + "epoch": 0.8632153515486075, + "grad_norm": 0.09074997901916504, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 223300 + }, + { + "epoch": 0.8632540087519909, + "grad_norm": 0.10141141712665558, + "learning_rate": 0.002, + "loss": 2.329, + "step": 223310 + }, + { + "epoch": 0.8632926659553741, + "grad_norm": 0.1120740994811058, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 223320 + }, + { + "epoch": 0.8633313231587574, + "grad_norm": 0.10076904296875, + "learning_rate": 0.002, + "loss": 2.329, + "step": 223330 + }, + { + "epoch": 0.8633699803621406, + "grad_norm": 0.11237254738807678, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 223340 + }, + { + "epoch": 0.863408637565524, + "grad_norm": 0.10964491218328476, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 223350 + }, + { + "epoch": 0.8634472947689072, + "grad_norm": 0.12141384929418564, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 223360 + }, + { + "epoch": 0.8634859519722905, + "grad_norm": 0.1059795618057251, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 223370 + }, + { + "epoch": 0.8635246091756738, + "grad_norm": 0.09433835744857788, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 223380 + }, + { + "epoch": 0.8635632663790571, + "grad_norm": 0.12807735800743103, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 223390 + }, + { + "epoch": 0.8636019235824404, + "grad_norm": 0.10353527218103409, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 223400 + }, + { + "epoch": 0.8636405807858236, + "grad_norm": 0.08917547762393951, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 223410 + }, + { + "epoch": 0.8636792379892069, + "grad_norm": 0.09677743911743164, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 223420 + }, + { + "epoch": 0.8637178951925902, + "grad_norm": 0.09181633591651917, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 223430 + }, + { + "epoch": 0.8637565523959735, + "grad_norm": 0.10283015668392181, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 223440 + }, + { + "epoch": 0.8637952095993567, + "grad_norm": 0.0930963084101677, + "learning_rate": 0.002, + "loss": 2.337, + "step": 223450 + }, + { + "epoch": 0.86383386680274, + "grad_norm": 0.11149480938911438, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 223460 + }, + { + "epoch": 0.8638725240061234, + "grad_norm": 0.10241681337356567, + "learning_rate": 0.002, + "loss": 2.345, + "step": 223470 + }, + { + "epoch": 0.8639111812095066, + "grad_norm": 0.09172318130731583, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 223480 + }, + { + "epoch": 0.8639498384128899, + "grad_norm": 0.09435839205980301, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 223490 + }, + { + "epoch": 0.8639884956162731, + "grad_norm": 0.09670911729335785, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 223500 + }, + { + "epoch": 0.8640271528196564, + "grad_norm": 0.10295765846967697, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 223510 + }, + { + "epoch": 0.8640658100230397, + "grad_norm": 0.09967496991157532, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 223520 + }, + { + "epoch": 0.864104467226423, + "grad_norm": 0.08990298956632614, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 223530 + }, + { + "epoch": 0.8641431244298062, + "grad_norm": 0.12398983538150787, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 223540 + }, + { + "epoch": 0.8641817816331895, + "grad_norm": 0.09743223339319229, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 223550 + }, + { + "epoch": 0.8642204388365728, + "grad_norm": 0.09881418943405151, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 223560 + }, + { + "epoch": 0.8642590960399561, + "grad_norm": 0.11653521656990051, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 223570 + }, + { + "epoch": 0.8642977532433394, + "grad_norm": 0.10108964145183563, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 223580 + }, + { + "epoch": 0.8643364104467226, + "grad_norm": 0.10443811118602753, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 223590 + }, + { + "epoch": 0.864375067650106, + "grad_norm": 0.12487456202507019, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 223600 + }, + { + "epoch": 0.8644137248534892, + "grad_norm": 0.0982515960931778, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 223610 + }, + { + "epoch": 0.8644523820568725, + "grad_norm": 0.09217357635498047, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 223620 + }, + { + "epoch": 0.8644910392602557, + "grad_norm": 0.11840027570724487, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 223630 + }, + { + "epoch": 0.8645296964636391, + "grad_norm": 0.1063842624425888, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 223640 + }, + { + "epoch": 0.8645683536670223, + "grad_norm": 0.10393808037042618, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 223650 + }, + { + "epoch": 0.8646070108704056, + "grad_norm": 0.10301019251346588, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 223660 + }, + { + "epoch": 0.8646456680737888, + "grad_norm": 0.09212903678417206, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 223670 + }, + { + "epoch": 0.8646843252771721, + "grad_norm": 0.08990156650543213, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 223680 + }, + { + "epoch": 0.8647229824805555, + "grad_norm": 0.10661827027797699, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 223690 + }, + { + "epoch": 0.8647616396839387, + "grad_norm": 0.1021680161356926, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 223700 + }, + { + "epoch": 0.864800296887322, + "grad_norm": 0.12244638800621033, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 223710 + }, + { + "epoch": 0.8648389540907052, + "grad_norm": 0.10838084667921066, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 223720 + }, + { + "epoch": 0.8648776112940886, + "grad_norm": 0.10406170040369034, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 223730 + }, + { + "epoch": 0.8649162684974718, + "grad_norm": 0.10603002458810806, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 223740 + }, + { + "epoch": 0.8649549257008551, + "grad_norm": 0.09609860181808472, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 223750 + }, + { + "epoch": 0.8649935829042383, + "grad_norm": 0.12051209807395935, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 223760 + }, + { + "epoch": 0.8650322401076217, + "grad_norm": 0.09325770288705826, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 223770 + }, + { + "epoch": 0.865070897311005, + "grad_norm": 0.11577421426773071, + "learning_rate": 0.002, + "loss": 2.339, + "step": 223780 + }, + { + "epoch": 0.8651095545143882, + "grad_norm": 0.1013086810708046, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 223790 + }, + { + "epoch": 0.8651482117177715, + "grad_norm": 0.09731609374284744, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 223800 + }, + { + "epoch": 0.8651868689211548, + "grad_norm": 0.10846145451068878, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 223810 + }, + { + "epoch": 0.8652255261245381, + "grad_norm": 0.1096300408244133, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 223820 + }, + { + "epoch": 0.8652641833279213, + "grad_norm": 0.11329855024814606, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 223830 + }, + { + "epoch": 0.8653028405313046, + "grad_norm": 0.1072562038898468, + "learning_rate": 0.002, + "loss": 2.323, + "step": 223840 + }, + { + "epoch": 0.8653414977346879, + "grad_norm": 0.13581515848636627, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 223850 + }, + { + "epoch": 0.8653801549380712, + "grad_norm": 0.10407847166061401, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 223860 + }, + { + "epoch": 0.8654188121414544, + "grad_norm": 0.10293195396661758, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 223870 + }, + { + "epoch": 0.8654574693448377, + "grad_norm": 0.11165141314268112, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 223880 + }, + { + "epoch": 0.865496126548221, + "grad_norm": 0.11360661685466766, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 223890 + }, + { + "epoch": 0.8655347837516043, + "grad_norm": 0.09765450656414032, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 223900 + }, + { + "epoch": 0.8655734409549876, + "grad_norm": 0.1077166348695755, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 223910 + }, + { + "epoch": 0.8656120981583708, + "grad_norm": 0.09532441943883896, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 223920 + }, + { + "epoch": 0.8656507553617541, + "grad_norm": 0.09879755228757858, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 223930 + }, + { + "epoch": 0.8656894125651374, + "grad_norm": 0.10443105548620224, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 223940 + }, + { + "epoch": 0.8657280697685207, + "grad_norm": 0.11443766206502914, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 223950 + }, + { + "epoch": 0.8657667269719039, + "grad_norm": 0.1095224916934967, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 223960 + }, + { + "epoch": 0.8658053841752872, + "grad_norm": 0.11135567724704742, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 223970 + }, + { + "epoch": 0.8658440413786705, + "grad_norm": 0.09788601845502853, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 223980 + }, + { + "epoch": 0.8658826985820538, + "grad_norm": 0.10934220254421234, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 223990 + }, + { + "epoch": 0.865921355785437, + "grad_norm": 0.09874802827835083, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 224000 + }, + { + "epoch": 0.8659600129888203, + "grad_norm": 0.11080588400363922, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 224010 + }, + { + "epoch": 0.8659986701922037, + "grad_norm": 0.09544280171394348, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 224020 + }, + { + "epoch": 0.8660373273955869, + "grad_norm": 0.08992662280797958, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 224030 + }, + { + "epoch": 0.8660759845989702, + "grad_norm": 0.12229316681623459, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 224040 + }, + { + "epoch": 0.8661146418023534, + "grad_norm": 0.11490171402692795, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 224050 + }, + { + "epoch": 0.8661532990057367, + "grad_norm": 0.09182967245578766, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 224060 + }, + { + "epoch": 0.86619195620912, + "grad_norm": 0.11167624592781067, + "learning_rate": 0.002, + "loss": 2.3673, + "step": 224070 + }, + { + "epoch": 0.8662306134125033, + "grad_norm": 0.1071360632777214, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 224080 + }, + { + "epoch": 0.8662692706158865, + "grad_norm": 0.11656167358160019, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 224090 + }, + { + "epoch": 0.8663079278192698, + "grad_norm": 0.11341526359319687, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 224100 + }, + { + "epoch": 0.8663465850226532, + "grad_norm": 0.10455506294965744, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 224110 + }, + { + "epoch": 0.8663852422260364, + "grad_norm": 0.12185147404670715, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 224120 + }, + { + "epoch": 0.8664238994294197, + "grad_norm": 0.0907309502363205, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 224130 + }, + { + "epoch": 0.8664625566328029, + "grad_norm": 0.13204310834407806, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 224140 + }, + { + "epoch": 0.8665012138361863, + "grad_norm": 0.09790869802236557, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 224150 + }, + { + "epoch": 0.8665398710395695, + "grad_norm": 0.106050044298172, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 224160 + }, + { + "epoch": 0.8665785282429528, + "grad_norm": 0.10837047547101974, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 224170 + }, + { + "epoch": 0.866617185446336, + "grad_norm": 0.10271471738815308, + "learning_rate": 0.002, + "loss": 2.332, + "step": 224180 + }, + { + "epoch": 0.8666558426497194, + "grad_norm": 0.10636868327856064, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 224190 + }, + { + "epoch": 0.8666944998531027, + "grad_norm": 0.13913333415985107, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 224200 + }, + { + "epoch": 0.8667331570564859, + "grad_norm": 0.10228648781776428, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 224210 + }, + { + "epoch": 0.8667718142598692, + "grad_norm": 0.107154481112957, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 224220 + }, + { + "epoch": 0.8668104714632524, + "grad_norm": 0.09693600237369537, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 224230 + }, + { + "epoch": 0.8668491286666358, + "grad_norm": 0.1205499917268753, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 224240 + }, + { + "epoch": 0.866887785870019, + "grad_norm": 0.1094214916229248, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 224250 + }, + { + "epoch": 0.8669264430734023, + "grad_norm": 0.09406102448701859, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 224260 + }, + { + "epoch": 0.8669651002767855, + "grad_norm": 0.12342895567417145, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 224270 + }, + { + "epoch": 0.8670037574801689, + "grad_norm": 0.1090339794754982, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 224280 + }, + { + "epoch": 0.8670424146835521, + "grad_norm": 0.09570826590061188, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 224290 + }, + { + "epoch": 0.8670810718869354, + "grad_norm": 0.10601980984210968, + "learning_rate": 0.002, + "loss": 2.352, + "step": 224300 + }, + { + "epoch": 0.8671197290903186, + "grad_norm": 0.09096609055995941, + "learning_rate": 0.002, + "loss": 2.3138, + "step": 224310 + }, + { + "epoch": 0.867158386293702, + "grad_norm": 0.12070676684379578, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 224320 + }, + { + "epoch": 0.8671970434970853, + "grad_norm": 0.12272123992443085, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 224330 + }, + { + "epoch": 0.8672357007004685, + "grad_norm": 0.09796576201915741, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 224340 + }, + { + "epoch": 0.8672743579038518, + "grad_norm": 0.09902673214673996, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 224350 + }, + { + "epoch": 0.8673130151072351, + "grad_norm": 0.10000674426555634, + "learning_rate": 0.002, + "loss": 2.3157, + "step": 224360 + }, + { + "epoch": 0.8673516723106184, + "grad_norm": 0.08922852575778961, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 224370 + }, + { + "epoch": 0.8673903295140016, + "grad_norm": 0.11467423290014267, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 224380 + }, + { + "epoch": 0.8674289867173849, + "grad_norm": 0.08990102261304855, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 224390 + }, + { + "epoch": 0.8674676439207682, + "grad_norm": 0.2590179145336151, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 224400 + }, + { + "epoch": 0.8675063011241515, + "grad_norm": 0.11853017657995224, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 224410 + }, + { + "epoch": 0.8675449583275348, + "grad_norm": 0.10964160412549973, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 224420 + }, + { + "epoch": 0.867583615530918, + "grad_norm": 0.10733091831207275, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 224430 + }, + { + "epoch": 0.8676222727343013, + "grad_norm": 0.10910530388355255, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 224440 + }, + { + "epoch": 0.8676609299376846, + "grad_norm": 0.10902879387140274, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 224450 + }, + { + "epoch": 0.8676995871410679, + "grad_norm": 0.10836129635572433, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 224460 + }, + { + "epoch": 0.8677382443444511, + "grad_norm": 0.10329374670982361, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 224470 + }, + { + "epoch": 0.8677769015478344, + "grad_norm": 0.09962306916713715, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 224480 + }, + { + "epoch": 0.8678155587512177, + "grad_norm": 0.09849024564027786, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 224490 + }, + { + "epoch": 0.867854215954601, + "grad_norm": 0.10376264154911041, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 224500 + }, + { + "epoch": 0.8678928731579842, + "grad_norm": 0.10472551733255386, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 224510 + }, + { + "epoch": 0.8679315303613675, + "grad_norm": 0.1245124563574791, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 224520 + }, + { + "epoch": 0.8679701875647509, + "grad_norm": 0.09382300078868866, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 224530 + }, + { + "epoch": 0.8680088447681341, + "grad_norm": 0.1148284450173378, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 224540 + }, + { + "epoch": 0.8680475019715174, + "grad_norm": 0.09577217698097229, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 224550 + }, + { + "epoch": 0.8680861591749006, + "grad_norm": 0.12388365715742111, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 224560 + }, + { + "epoch": 0.868124816378284, + "grad_norm": 0.1013205349445343, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 224570 + }, + { + "epoch": 0.8681634735816672, + "grad_norm": 0.09336186200380325, + "learning_rate": 0.002, + "loss": 2.322, + "step": 224580 + }, + { + "epoch": 0.8682021307850505, + "grad_norm": 0.12024692445993423, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 224590 + }, + { + "epoch": 0.8682407879884337, + "grad_norm": 0.11582981795072556, + "learning_rate": 0.002, + "loss": 2.332, + "step": 224600 + }, + { + "epoch": 0.868279445191817, + "grad_norm": 0.10193216800689697, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 224610 + }, + { + "epoch": 0.8683181023952004, + "grad_norm": 0.10503973811864853, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 224620 + }, + { + "epoch": 0.8683567595985836, + "grad_norm": 0.10338109731674194, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 224630 + }, + { + "epoch": 0.8683954168019669, + "grad_norm": 0.11863572895526886, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 224640 + }, + { + "epoch": 0.8684340740053501, + "grad_norm": 0.14006471633911133, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 224650 + }, + { + "epoch": 0.8684727312087335, + "grad_norm": 0.08928750455379486, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 224660 + }, + { + "epoch": 0.8685113884121167, + "grad_norm": 0.16105005145072937, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 224670 + }, + { + "epoch": 0.8685500456155, + "grad_norm": 0.1085537001490593, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 224680 + }, + { + "epoch": 0.8685887028188832, + "grad_norm": 0.10102713853120804, + "learning_rate": 0.002, + "loss": 2.345, + "step": 224690 + }, + { + "epoch": 0.8686273600222666, + "grad_norm": 0.128278449177742, + "learning_rate": 0.002, + "loss": 2.347, + "step": 224700 + }, + { + "epoch": 0.8686660172256498, + "grad_norm": 0.13314104080200195, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 224710 + }, + { + "epoch": 0.8687046744290331, + "grad_norm": 0.11651406437158585, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 224720 + }, + { + "epoch": 0.8687433316324163, + "grad_norm": 0.11891312897205353, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 224730 + }, + { + "epoch": 0.8687819888357997, + "grad_norm": 0.09145008027553558, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 224740 + }, + { + "epoch": 0.868820646039183, + "grad_norm": 0.1225404292345047, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 224750 + }, + { + "epoch": 0.8688593032425662, + "grad_norm": 0.1320568174123764, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 224760 + }, + { + "epoch": 0.8688979604459495, + "grad_norm": 0.09423022717237473, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 224770 + }, + { + "epoch": 0.8689366176493328, + "grad_norm": 0.09961433708667755, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 224780 + }, + { + "epoch": 0.8689752748527161, + "grad_norm": 0.14324261248111725, + "learning_rate": 0.002, + "loss": 2.336, + "step": 224790 + }, + { + "epoch": 0.8690139320560993, + "grad_norm": 0.13467486202716827, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 224800 + }, + { + "epoch": 0.8690525892594826, + "grad_norm": 0.10713488608598709, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 224810 + }, + { + "epoch": 0.8690912464628658, + "grad_norm": 0.11601117253303528, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 224820 + }, + { + "epoch": 0.8691299036662492, + "grad_norm": 0.11640362441539764, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 224830 + }, + { + "epoch": 0.8691685608696325, + "grad_norm": 0.13760827481746674, + "learning_rate": 0.002, + "loss": 2.329, + "step": 224840 + }, + { + "epoch": 0.8692072180730157, + "grad_norm": 0.11234764754772186, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 224850 + }, + { + "epoch": 0.869245875276399, + "grad_norm": 0.10185302793979645, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 224860 + }, + { + "epoch": 0.8692845324797823, + "grad_norm": 0.15274551510810852, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 224870 + }, + { + "epoch": 0.8693231896831656, + "grad_norm": 0.0994364470243454, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 224880 + }, + { + "epoch": 0.8693618468865488, + "grad_norm": 0.12746405601501465, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 224890 + }, + { + "epoch": 0.8694005040899321, + "grad_norm": 0.24766580760478973, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 224900 + }, + { + "epoch": 0.8694391612933154, + "grad_norm": 0.1405581384897232, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 224910 + }, + { + "epoch": 0.8694778184966987, + "grad_norm": 0.11151156574487686, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 224920 + }, + { + "epoch": 0.869516475700082, + "grad_norm": 0.11751139163970947, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 224930 + }, + { + "epoch": 0.8695551329034652, + "grad_norm": 0.11526947468519211, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 224940 + }, + { + "epoch": 0.8695937901068486, + "grad_norm": 0.10381560027599335, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 224950 + }, + { + "epoch": 0.8696324473102318, + "grad_norm": 0.09702971577644348, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 224960 + }, + { + "epoch": 0.8696711045136151, + "grad_norm": 0.11534032225608826, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 224970 + }, + { + "epoch": 0.8697097617169983, + "grad_norm": 0.12234510481357574, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 224980 + }, + { + "epoch": 0.8697484189203816, + "grad_norm": 0.10677313804626465, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 224990 + }, + { + "epoch": 0.8697870761237649, + "grad_norm": 0.09780658036470413, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 225000 + }, + { + "epoch": 0.8698257333271482, + "grad_norm": 0.10465943068265915, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 225010 + }, + { + "epoch": 0.8698643905305314, + "grad_norm": 0.11495811492204666, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 225020 + }, + { + "epoch": 0.8699030477339147, + "grad_norm": 0.10129937529563904, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 225030 + }, + { + "epoch": 0.869941704937298, + "grad_norm": 0.11727405339479446, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 225040 + }, + { + "epoch": 0.8699803621406813, + "grad_norm": 0.11523114889860153, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 225050 + }, + { + "epoch": 0.8700190193440646, + "grad_norm": 0.10869179666042328, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 225060 + }, + { + "epoch": 0.8700576765474478, + "grad_norm": 0.11668886244297028, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 225070 + }, + { + "epoch": 0.8700963337508312, + "grad_norm": 0.10371943563222885, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 225080 + }, + { + "epoch": 0.8701349909542144, + "grad_norm": 0.11870495975017548, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 225090 + }, + { + "epoch": 0.8701736481575977, + "grad_norm": 0.1018364354968071, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 225100 + }, + { + "epoch": 0.8702123053609809, + "grad_norm": 0.09280882030725479, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 225110 + }, + { + "epoch": 0.8702509625643643, + "grad_norm": 0.11336492002010345, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 225120 + }, + { + "epoch": 0.8702896197677475, + "grad_norm": 0.12220898270606995, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 225130 + }, + { + "epoch": 0.8703282769711308, + "grad_norm": 0.10010644793510437, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 225140 + }, + { + "epoch": 0.870366934174514, + "grad_norm": 0.11782457679510117, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 225150 + }, + { + "epoch": 0.8704055913778973, + "grad_norm": 0.09656857699155807, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 225160 + }, + { + "epoch": 0.8704442485812807, + "grad_norm": 0.10837922245264053, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 225170 + }, + { + "epoch": 0.8704829057846639, + "grad_norm": 0.09899026900529861, + "learning_rate": 0.002, + "loss": 2.339, + "step": 225180 + }, + { + "epoch": 0.8705215629880472, + "grad_norm": 0.10472776740789413, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 225190 + }, + { + "epoch": 0.8705602201914304, + "grad_norm": 0.09675848484039307, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 225200 + }, + { + "epoch": 0.8705988773948138, + "grad_norm": 0.10160231590270996, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 225210 + }, + { + "epoch": 0.870637534598197, + "grad_norm": 0.10469699651002884, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 225220 + }, + { + "epoch": 0.8706761918015803, + "grad_norm": 0.09993157535791397, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 225230 + }, + { + "epoch": 0.8707148490049635, + "grad_norm": 0.10199610888957977, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 225240 + }, + { + "epoch": 0.8707535062083469, + "grad_norm": 0.09740594029426575, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 225250 + }, + { + "epoch": 0.8707921634117302, + "grad_norm": 0.10081925988197327, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 225260 + }, + { + "epoch": 0.8708308206151134, + "grad_norm": 0.098286472260952, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 225270 + }, + { + "epoch": 0.8708694778184967, + "grad_norm": 0.10415423661470413, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 225280 + }, + { + "epoch": 0.87090813502188, + "grad_norm": 0.10316446423530579, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 225290 + }, + { + "epoch": 0.8709467922252633, + "grad_norm": 0.0970345288515091, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 225300 + }, + { + "epoch": 0.8709854494286465, + "grad_norm": 0.1323097199201584, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 225310 + }, + { + "epoch": 0.8710241066320298, + "grad_norm": 0.11852015554904938, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 225320 + }, + { + "epoch": 0.8710627638354131, + "grad_norm": 0.09876296669244766, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 225330 + }, + { + "epoch": 0.8711014210387964, + "grad_norm": 0.15328599512577057, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 225340 + }, + { + "epoch": 0.8711400782421796, + "grad_norm": 0.12386604398488998, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 225350 + }, + { + "epoch": 0.8711787354455629, + "grad_norm": 0.09469375014305115, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 225360 + }, + { + "epoch": 0.8712173926489462, + "grad_norm": 0.10191961377859116, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 225370 + }, + { + "epoch": 0.8712560498523295, + "grad_norm": 0.15213637053966522, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 225380 + }, + { + "epoch": 0.8712947070557128, + "grad_norm": 0.10185971856117249, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 225390 + }, + { + "epoch": 0.871333364259096, + "grad_norm": 0.12874957919120789, + "learning_rate": 0.002, + "loss": 2.355, + "step": 225400 + }, + { + "epoch": 0.8713720214624793, + "grad_norm": 0.10777001827955246, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 225410 + }, + { + "epoch": 0.8714106786658626, + "grad_norm": 0.11325021833181381, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 225420 + }, + { + "epoch": 0.8714493358692459, + "grad_norm": 0.10347343236207962, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 225430 + }, + { + "epoch": 0.8714879930726291, + "grad_norm": 0.1121433898806572, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 225440 + }, + { + "epoch": 0.8715266502760124, + "grad_norm": 0.09990080446004868, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 225450 + }, + { + "epoch": 0.8715653074793958, + "grad_norm": 0.13435986638069153, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 225460 + }, + { + "epoch": 0.871603964682779, + "grad_norm": 0.15727294981479645, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 225470 + }, + { + "epoch": 0.8716426218861623, + "grad_norm": 0.08477748930454254, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 225480 + }, + { + "epoch": 0.8716812790895455, + "grad_norm": 0.1023939922451973, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 225490 + }, + { + "epoch": 0.8717199362929289, + "grad_norm": 0.10732561349868774, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 225500 + }, + { + "epoch": 0.8717585934963121, + "grad_norm": 0.12268977612257004, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 225510 + }, + { + "epoch": 0.8717972506996954, + "grad_norm": 0.1006249263882637, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 225520 + }, + { + "epoch": 0.8718359079030786, + "grad_norm": 0.09923502057790756, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 225530 + }, + { + "epoch": 0.8718745651064619, + "grad_norm": 0.1083303838968277, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 225540 + }, + { + "epoch": 0.8719132223098452, + "grad_norm": 0.10976643115282059, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 225550 + }, + { + "epoch": 0.8719518795132285, + "grad_norm": 0.12189637869596481, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 225560 + }, + { + "epoch": 0.8719905367166118, + "grad_norm": 0.10473313927650452, + "learning_rate": 0.002, + "loss": 2.331, + "step": 225570 + }, + { + "epoch": 0.872029193919995, + "grad_norm": 0.09554488211870193, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 225580 + }, + { + "epoch": 0.8720678511233784, + "grad_norm": 0.1166779100894928, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 225590 + }, + { + "epoch": 0.8721065083267616, + "grad_norm": 0.10591023415327072, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 225600 + }, + { + "epoch": 0.8721451655301449, + "grad_norm": 0.09874525666236877, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 225610 + }, + { + "epoch": 0.8721838227335281, + "grad_norm": 0.11275798082351685, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 225620 + }, + { + "epoch": 0.8722224799369115, + "grad_norm": 0.10221873968839645, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 225630 + }, + { + "epoch": 0.8722611371402947, + "grad_norm": 0.2800368368625641, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 225640 + }, + { + "epoch": 0.872299794343678, + "grad_norm": 0.10905713587999344, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 225650 + }, + { + "epoch": 0.8723384515470612, + "grad_norm": 0.10796914994716644, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 225660 + }, + { + "epoch": 0.8723771087504446, + "grad_norm": 0.13260188698768616, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 225670 + }, + { + "epoch": 0.8724157659538279, + "grad_norm": 0.10988302528858185, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 225680 + }, + { + "epoch": 0.8724544231572111, + "grad_norm": 0.09902926534414291, + "learning_rate": 0.002, + "loss": 2.325, + "step": 225690 + }, + { + "epoch": 0.8724930803605944, + "grad_norm": 0.1075626090168953, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 225700 + }, + { + "epoch": 0.8725317375639776, + "grad_norm": 0.11184363812208176, + "learning_rate": 0.002, + "loss": 2.342, + "step": 225710 + }, + { + "epoch": 0.872570394767361, + "grad_norm": 0.1105227917432785, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 225720 + }, + { + "epoch": 0.8726090519707442, + "grad_norm": 0.1228136196732521, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 225730 + }, + { + "epoch": 0.8726477091741275, + "grad_norm": 0.09833686798810959, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 225740 + }, + { + "epoch": 0.8726863663775107, + "grad_norm": 0.10563940554857254, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 225750 + }, + { + "epoch": 0.8727250235808941, + "grad_norm": 0.10530819743871689, + "learning_rate": 0.002, + "loss": 2.325, + "step": 225760 + }, + { + "epoch": 0.8727636807842774, + "grad_norm": 0.1024976521730423, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 225770 + }, + { + "epoch": 0.8728023379876606, + "grad_norm": 0.11485017091035843, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 225780 + }, + { + "epoch": 0.8728409951910439, + "grad_norm": 0.09713538736104965, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 225790 + }, + { + "epoch": 0.8728796523944272, + "grad_norm": 0.100540891289711, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 225800 + }, + { + "epoch": 0.8729183095978105, + "grad_norm": 0.11147937923669815, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 225810 + }, + { + "epoch": 0.8729569668011937, + "grad_norm": 0.09945949912071228, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 225820 + }, + { + "epoch": 0.872995624004577, + "grad_norm": 0.1261347085237503, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 225830 + }, + { + "epoch": 0.8730342812079603, + "grad_norm": 0.10962780565023422, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 225840 + }, + { + "epoch": 0.8730729384113436, + "grad_norm": 0.11201975494623184, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 225850 + }, + { + "epoch": 0.8731115956147268, + "grad_norm": 0.1256123185157776, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 225860 + }, + { + "epoch": 0.8731502528181101, + "grad_norm": 0.11174170672893524, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 225870 + }, + { + "epoch": 0.8731889100214935, + "grad_norm": 0.10957685858011246, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 225880 + }, + { + "epoch": 0.8732275672248767, + "grad_norm": 0.11033196747303009, + "learning_rate": 0.002, + "loss": 2.341, + "step": 225890 + }, + { + "epoch": 0.87326622442826, + "grad_norm": 0.11110338568687439, + "learning_rate": 0.002, + "loss": 2.334, + "step": 225900 + }, + { + "epoch": 0.8733048816316432, + "grad_norm": 0.10742199420928955, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 225910 + }, + { + "epoch": 0.8733435388350265, + "grad_norm": 0.10513104498386383, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 225920 + }, + { + "epoch": 0.8733821960384098, + "grad_norm": 0.10281091928482056, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 225930 + }, + { + "epoch": 0.8734208532417931, + "grad_norm": 0.10133229941129684, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 225940 + }, + { + "epoch": 0.8734595104451763, + "grad_norm": 0.10914817452430725, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 225950 + }, + { + "epoch": 0.8734981676485596, + "grad_norm": 0.11362199485301971, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 225960 + }, + { + "epoch": 0.873536824851943, + "grad_norm": 0.1011771410703659, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 225970 + }, + { + "epoch": 0.8735754820553262, + "grad_norm": 0.09920097142457962, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 225980 + }, + { + "epoch": 0.8736141392587095, + "grad_norm": 0.1007387787103653, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 225990 + }, + { + "epoch": 0.8736527964620927, + "grad_norm": 0.09851537644863129, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 226000 + }, + { + "epoch": 0.8736914536654761, + "grad_norm": 0.096402108669281, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 226010 + }, + { + "epoch": 0.8737301108688593, + "grad_norm": 0.10415016859769821, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 226020 + }, + { + "epoch": 0.8737687680722426, + "grad_norm": 0.10720174014568329, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 226030 + }, + { + "epoch": 0.8738074252756258, + "grad_norm": 0.09929098933935165, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 226040 + }, + { + "epoch": 0.8738460824790092, + "grad_norm": 0.11904928088188171, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 226050 + }, + { + "epoch": 0.8738847396823924, + "grad_norm": 0.0928359255194664, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 226060 + }, + { + "epoch": 0.8739233968857757, + "grad_norm": 0.11214454472064972, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 226070 + }, + { + "epoch": 0.873962054089159, + "grad_norm": 0.1066681444644928, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 226080 + }, + { + "epoch": 0.8740007112925422, + "grad_norm": 0.10091183334589005, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 226090 + }, + { + "epoch": 0.8740393684959256, + "grad_norm": 0.11797045916318893, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 226100 + }, + { + "epoch": 0.8740780256993088, + "grad_norm": 0.0893377959728241, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 226110 + }, + { + "epoch": 0.8741166829026921, + "grad_norm": 0.11226366460323334, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 226120 + }, + { + "epoch": 0.8741553401060753, + "grad_norm": 0.11060541868209839, + "learning_rate": 0.002, + "loss": 2.348, + "step": 226130 + }, + { + "epoch": 0.8741939973094587, + "grad_norm": 0.09058070182800293, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 226140 + }, + { + "epoch": 0.8742326545128419, + "grad_norm": 0.11443872004747391, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 226150 + }, + { + "epoch": 0.8742713117162252, + "grad_norm": 0.09787355363368988, + "learning_rate": 0.002, + "loss": 2.339, + "step": 226160 + }, + { + "epoch": 0.8743099689196084, + "grad_norm": 0.130850151181221, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 226170 + }, + { + "epoch": 0.8743486261229918, + "grad_norm": 0.11858386546373367, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 226180 + }, + { + "epoch": 0.874387283326375, + "grad_norm": 0.09746160358190536, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 226190 + }, + { + "epoch": 0.8744259405297583, + "grad_norm": 0.09695523232221603, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 226200 + }, + { + "epoch": 0.8744645977331416, + "grad_norm": 0.09290745109319687, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 226210 + }, + { + "epoch": 0.8745032549365249, + "grad_norm": 0.1002887487411499, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 226220 + }, + { + "epoch": 0.8745419121399082, + "grad_norm": 0.10883896052837372, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 226230 + }, + { + "epoch": 0.8745805693432914, + "grad_norm": 0.11657164990901947, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 226240 + }, + { + "epoch": 0.8746192265466747, + "grad_norm": 0.09341996163129807, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 226250 + }, + { + "epoch": 0.874657883750058, + "grad_norm": 0.10273752361536026, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 226260 + }, + { + "epoch": 0.8746965409534413, + "grad_norm": 0.08396825194358826, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 226270 + }, + { + "epoch": 0.8747351981568245, + "grad_norm": 0.10608824342489243, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 226280 + }, + { + "epoch": 0.8747738553602078, + "grad_norm": 0.09661603718996048, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 226290 + }, + { + "epoch": 0.874812512563591, + "grad_norm": 0.11775680631399155, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 226300 + }, + { + "epoch": 0.8748511697669744, + "grad_norm": 0.10600218921899796, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 226310 + }, + { + "epoch": 0.8748898269703577, + "grad_norm": 0.09572699666023254, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 226320 + }, + { + "epoch": 0.8749284841737409, + "grad_norm": 0.11705332249403, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 226330 + }, + { + "epoch": 0.8749671413771242, + "grad_norm": 0.11242889612913132, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 226340 + }, + { + "epoch": 0.8750057985805075, + "grad_norm": 0.1230028048157692, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 226350 + }, + { + "epoch": 0.8750444557838908, + "grad_norm": 0.09983881562948227, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 226360 + }, + { + "epoch": 0.875083112987274, + "grad_norm": 0.08863212168216705, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 226370 + }, + { + "epoch": 0.8751217701906573, + "grad_norm": 0.0950503945350647, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 226380 + }, + { + "epoch": 0.8751604273940407, + "grad_norm": 0.11235824972391129, + "learning_rate": 0.002, + "loss": 2.344, + "step": 226390 + }, + { + "epoch": 0.8751990845974239, + "grad_norm": 0.11355585604906082, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 226400 + }, + { + "epoch": 0.8752377418008072, + "grad_norm": 0.1016106829047203, + "learning_rate": 0.002, + "loss": 2.34, + "step": 226410 + }, + { + "epoch": 0.8752763990041904, + "grad_norm": 0.10593704879283905, + "learning_rate": 0.002, + "loss": 2.329, + "step": 226420 + }, + { + "epoch": 0.8753150562075738, + "grad_norm": 0.12088311463594437, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 226430 + }, + { + "epoch": 0.875353713410957, + "grad_norm": 0.09646426886320114, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 226440 + }, + { + "epoch": 0.8753923706143403, + "grad_norm": 0.09106966108083725, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 226450 + }, + { + "epoch": 0.8754310278177235, + "grad_norm": 0.10133379697799683, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 226460 + }, + { + "epoch": 0.8754696850211068, + "grad_norm": 0.10170179605484009, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 226470 + }, + { + "epoch": 0.8755083422244901, + "grad_norm": 0.1035342887043953, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 226480 + }, + { + "epoch": 0.8755469994278734, + "grad_norm": 0.09977155178785324, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 226490 + }, + { + "epoch": 0.8755856566312566, + "grad_norm": 0.10518094152212143, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 226500 + }, + { + "epoch": 0.8756243138346399, + "grad_norm": 0.1063164472579956, + "learning_rate": 0.002, + "loss": 2.339, + "step": 226510 + }, + { + "epoch": 0.8756629710380233, + "grad_norm": 0.09273169934749603, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 226520 + }, + { + "epoch": 0.8757016282414065, + "grad_norm": 0.11271461099386215, + "learning_rate": 0.002, + "loss": 2.352, + "step": 226530 + }, + { + "epoch": 0.8757402854447898, + "grad_norm": 0.12785807251930237, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 226540 + }, + { + "epoch": 0.875778942648173, + "grad_norm": 0.10536123067140579, + "learning_rate": 0.002, + "loss": 2.33, + "step": 226550 + }, + { + "epoch": 0.8758175998515564, + "grad_norm": 0.10244054347276688, + "learning_rate": 0.002, + "loss": 2.339, + "step": 226560 + }, + { + "epoch": 0.8758562570549396, + "grad_norm": 0.1381578892469406, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 226570 + }, + { + "epoch": 0.8758949142583229, + "grad_norm": 0.11038932204246521, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 226580 + }, + { + "epoch": 0.8759335714617061, + "grad_norm": 0.12924733757972717, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 226590 + }, + { + "epoch": 0.8759722286650895, + "grad_norm": 0.09875442832708359, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 226600 + }, + { + "epoch": 0.8760108858684728, + "grad_norm": 0.11456121504306793, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 226610 + }, + { + "epoch": 0.876049543071856, + "grad_norm": 0.09922508150339127, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 226620 + }, + { + "epoch": 0.8760882002752393, + "grad_norm": 0.09993966668844223, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 226630 + }, + { + "epoch": 0.8761268574786225, + "grad_norm": 0.09752309322357178, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 226640 + }, + { + "epoch": 0.8761655146820059, + "grad_norm": 0.0955125242471695, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 226650 + }, + { + "epoch": 0.8762041718853891, + "grad_norm": 0.10003511607646942, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 226660 + }, + { + "epoch": 0.8762428290887724, + "grad_norm": 0.13894720375537872, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 226670 + }, + { + "epoch": 0.8762814862921556, + "grad_norm": 0.10475655645132065, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 226680 + }, + { + "epoch": 0.876320143495539, + "grad_norm": 0.10792479664087296, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 226690 + }, + { + "epoch": 0.8763588006989222, + "grad_norm": 0.12203353643417358, + "learning_rate": 0.002, + "loss": 2.334, + "step": 226700 + }, + { + "epoch": 0.8763974579023055, + "grad_norm": 0.09583789110183716, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 226710 + }, + { + "epoch": 0.8764361151056888, + "grad_norm": 0.10303744673728943, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 226720 + }, + { + "epoch": 0.8764747723090721, + "grad_norm": 0.09134358167648315, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 226730 + }, + { + "epoch": 0.8765134295124554, + "grad_norm": 0.5620478391647339, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 226740 + }, + { + "epoch": 0.8765520867158386, + "grad_norm": 0.10164141654968262, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 226750 + }, + { + "epoch": 0.8765907439192219, + "grad_norm": 0.10489300638437271, + "learning_rate": 0.002, + "loss": 2.339, + "step": 226760 + }, + { + "epoch": 0.8766294011226052, + "grad_norm": 0.09423074871301651, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 226770 + }, + { + "epoch": 0.8766680583259885, + "grad_norm": 0.13391481339931488, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 226780 + }, + { + "epoch": 0.8767067155293717, + "grad_norm": 0.09959197789430618, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 226790 + }, + { + "epoch": 0.876745372732755, + "grad_norm": 0.1476932317018509, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 226800 + }, + { + "epoch": 0.8767840299361384, + "grad_norm": 0.1119222566485405, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 226810 + }, + { + "epoch": 0.8768226871395216, + "grad_norm": 0.10764788091182709, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 226820 + }, + { + "epoch": 0.8768613443429049, + "grad_norm": 0.09944967925548553, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 226830 + }, + { + "epoch": 0.8769000015462881, + "grad_norm": 0.09813541173934937, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 226840 + }, + { + "epoch": 0.8769386587496714, + "grad_norm": 0.12044399231672287, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 226850 + }, + { + "epoch": 0.8769773159530547, + "grad_norm": 0.10897082835435867, + "learning_rate": 0.002, + "loss": 2.333, + "step": 226860 + }, + { + "epoch": 0.877015973156438, + "grad_norm": 0.10732237249612808, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 226870 + }, + { + "epoch": 0.8770546303598212, + "grad_norm": 0.11734417825937271, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 226880 + }, + { + "epoch": 0.8770932875632045, + "grad_norm": 0.09896469116210938, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 226890 + }, + { + "epoch": 0.8771319447665878, + "grad_norm": 0.12109851837158203, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 226900 + }, + { + "epoch": 0.8771706019699711, + "grad_norm": 0.10177912563085556, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 226910 + }, + { + "epoch": 0.8772092591733543, + "grad_norm": 0.10985736548900604, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 226920 + }, + { + "epoch": 0.8772479163767376, + "grad_norm": 0.10610976070165634, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 226930 + }, + { + "epoch": 0.877286573580121, + "grad_norm": 0.10284141451120377, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 226940 + }, + { + "epoch": 0.8773252307835042, + "grad_norm": 0.10084030777215958, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 226950 + }, + { + "epoch": 0.8773638879868875, + "grad_norm": 0.11514756828546524, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 226960 + }, + { + "epoch": 0.8774025451902707, + "grad_norm": 0.10323145985603333, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 226970 + }, + { + "epoch": 0.8774412023936541, + "grad_norm": 0.09738577902317047, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 226980 + }, + { + "epoch": 0.8774798595970373, + "grad_norm": 0.09809859097003937, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 226990 + }, + { + "epoch": 0.8775185168004206, + "grad_norm": 0.11053669452667236, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 227000 + }, + { + "epoch": 0.8775571740038038, + "grad_norm": 0.11902974545955658, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 227010 + }, + { + "epoch": 0.8775958312071871, + "grad_norm": 0.10271196067333221, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 227020 + }, + { + "epoch": 0.8776344884105705, + "grad_norm": 0.11581193655729294, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 227030 + }, + { + "epoch": 0.8776731456139537, + "grad_norm": 0.1064939945936203, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 227040 + }, + { + "epoch": 0.877711802817337, + "grad_norm": 0.10863006114959717, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 227050 + }, + { + "epoch": 0.8777504600207202, + "grad_norm": 0.10866818577051163, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 227060 + }, + { + "epoch": 0.8777891172241036, + "grad_norm": 0.09740670770406723, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 227070 + }, + { + "epoch": 0.8778277744274868, + "grad_norm": 0.11110842227935791, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 227080 + }, + { + "epoch": 0.8778664316308701, + "grad_norm": 0.10708796977996826, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 227090 + }, + { + "epoch": 0.8779050888342533, + "grad_norm": 0.10029701143503189, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 227100 + }, + { + "epoch": 0.8779437460376367, + "grad_norm": 0.10857430845499039, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 227110 + }, + { + "epoch": 0.87798240324102, + "grad_norm": 0.12921090424060822, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 227120 + }, + { + "epoch": 0.8780210604444032, + "grad_norm": 0.11042632162570953, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 227130 + }, + { + "epoch": 0.8780597176477865, + "grad_norm": 0.09481260925531387, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 227140 + }, + { + "epoch": 0.8780983748511698, + "grad_norm": 0.10456386208534241, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 227150 + }, + { + "epoch": 0.8781370320545531, + "grad_norm": 0.09098687022924423, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 227160 + }, + { + "epoch": 0.8781756892579363, + "grad_norm": 0.11767854541540146, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 227170 + }, + { + "epoch": 0.8782143464613196, + "grad_norm": 0.11023345589637756, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 227180 + }, + { + "epoch": 0.8782530036647029, + "grad_norm": 0.10239629447460175, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 227190 + }, + { + "epoch": 0.8782916608680862, + "grad_norm": 0.11235344409942627, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 227200 + }, + { + "epoch": 0.8783303180714694, + "grad_norm": 0.13639125227928162, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 227210 + }, + { + "epoch": 0.8783689752748527, + "grad_norm": 0.11406679451465607, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 227220 + }, + { + "epoch": 0.878407632478236, + "grad_norm": 0.12473028898239136, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 227230 + }, + { + "epoch": 0.8784462896816193, + "grad_norm": 0.09570764005184174, + "learning_rate": 0.002, + "loss": 2.328, + "step": 227240 + }, + { + "epoch": 0.8784849468850026, + "grad_norm": 0.0939561128616333, + "learning_rate": 0.002, + "loss": 2.3128, + "step": 227250 + }, + { + "epoch": 0.8785236040883858, + "grad_norm": 0.12944932281970978, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 227260 + }, + { + "epoch": 0.8785622612917691, + "grad_norm": 0.1073397770524025, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 227270 + }, + { + "epoch": 0.8786009184951524, + "grad_norm": 0.09914367645978928, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 227280 + }, + { + "epoch": 0.8786395756985357, + "grad_norm": 0.1429779827594757, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 227290 + }, + { + "epoch": 0.8786782329019189, + "grad_norm": 0.09743501991033554, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 227300 + }, + { + "epoch": 0.8787168901053022, + "grad_norm": 0.10444758087396622, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 227310 + }, + { + "epoch": 0.8787555473086855, + "grad_norm": 0.10072892159223557, + "learning_rate": 0.002, + "loss": 2.331, + "step": 227320 + }, + { + "epoch": 0.8787942045120688, + "grad_norm": 0.11068283766508102, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 227330 + }, + { + "epoch": 0.878832861715452, + "grad_norm": 0.11059466749429703, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 227340 + }, + { + "epoch": 0.8788715189188353, + "grad_norm": 0.1134902760386467, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 227350 + }, + { + "epoch": 0.8789101761222187, + "grad_norm": 0.11368384957313538, + "learning_rate": 0.002, + "loss": 2.331, + "step": 227360 + }, + { + "epoch": 0.8789488333256019, + "grad_norm": 0.10789523273706436, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 227370 + }, + { + "epoch": 0.8789874905289852, + "grad_norm": 0.11604959517717361, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 227380 + }, + { + "epoch": 0.8790261477323684, + "grad_norm": 0.09539544582366943, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 227390 + }, + { + "epoch": 0.8790648049357517, + "grad_norm": 0.10180415958166122, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 227400 + }, + { + "epoch": 0.879103462139135, + "grad_norm": 0.10947983711957932, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 227410 + }, + { + "epoch": 0.8791421193425183, + "grad_norm": 0.10848617553710938, + "learning_rate": 0.002, + "loss": 2.329, + "step": 227420 + }, + { + "epoch": 0.8791807765459015, + "grad_norm": 0.12041208893060684, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 227430 + }, + { + "epoch": 0.8792194337492848, + "grad_norm": 0.09465758502483368, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 227440 + }, + { + "epoch": 0.8792580909526682, + "grad_norm": 0.14059308171272278, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 227450 + }, + { + "epoch": 0.8792967481560514, + "grad_norm": 0.10597053915262222, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 227460 + }, + { + "epoch": 0.8793354053594347, + "grad_norm": 0.10933779925107956, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 227470 + }, + { + "epoch": 0.8793740625628179, + "grad_norm": 0.09205158799886703, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 227480 + }, + { + "epoch": 0.8794127197662013, + "grad_norm": 0.10398532450199127, + "learning_rate": 0.002, + "loss": 2.332, + "step": 227490 + }, + { + "epoch": 0.8794513769695845, + "grad_norm": 0.10922596603631973, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 227500 + }, + { + "epoch": 0.8794900341729678, + "grad_norm": 0.11087905615568161, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 227510 + }, + { + "epoch": 0.879528691376351, + "grad_norm": 0.11130758374929428, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 227520 + }, + { + "epoch": 0.8795673485797344, + "grad_norm": 0.09835169464349747, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 227530 + }, + { + "epoch": 0.8796060057831177, + "grad_norm": 0.11270011216402054, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 227540 + }, + { + "epoch": 0.8796446629865009, + "grad_norm": 0.08977176249027252, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 227550 + }, + { + "epoch": 0.8796833201898842, + "grad_norm": 0.10379301011562347, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 227560 + }, + { + "epoch": 0.8797219773932674, + "grad_norm": 0.10165230184793472, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 227570 + }, + { + "epoch": 0.8797606345966508, + "grad_norm": 0.10444454848766327, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 227580 + }, + { + "epoch": 0.879799291800034, + "grad_norm": 0.10805295407772064, + "learning_rate": 0.002, + "loss": 2.341, + "step": 227590 + }, + { + "epoch": 0.8798379490034173, + "grad_norm": 0.0940571278333664, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 227600 + }, + { + "epoch": 0.8798766062068005, + "grad_norm": 0.10650225728750229, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 227610 + }, + { + "epoch": 0.8799152634101839, + "grad_norm": 0.09061005711555481, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 227620 + }, + { + "epoch": 0.8799539206135671, + "grad_norm": 0.11038070172071457, + "learning_rate": 0.002, + "loss": 2.327, + "step": 227630 + }, + { + "epoch": 0.8799925778169504, + "grad_norm": 0.11455640941858292, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 227640 + }, + { + "epoch": 0.8800312350203336, + "grad_norm": 0.10514939576387405, + "learning_rate": 0.002, + "loss": 2.335, + "step": 227650 + }, + { + "epoch": 0.880069892223717, + "grad_norm": 0.10529999434947968, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 227660 + }, + { + "epoch": 0.8801085494271003, + "grad_norm": 0.10474987328052521, + "learning_rate": 0.002, + "loss": 2.345, + "step": 227670 + }, + { + "epoch": 0.8801472066304835, + "grad_norm": 0.12214624136686325, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 227680 + }, + { + "epoch": 0.8801858638338668, + "grad_norm": 0.11123227328062057, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 227690 + }, + { + "epoch": 0.8802245210372501, + "grad_norm": 0.09702645987272263, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 227700 + }, + { + "epoch": 0.8802631782406334, + "grad_norm": 0.10156147181987762, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 227710 + }, + { + "epoch": 0.8803018354440166, + "grad_norm": 0.10677109658718109, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 227720 + }, + { + "epoch": 0.8803404926473999, + "grad_norm": 0.09380175173282623, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 227730 + }, + { + "epoch": 0.8803791498507832, + "grad_norm": 0.0992397591471672, + "learning_rate": 0.002, + "loss": 2.333, + "step": 227740 + }, + { + "epoch": 0.8804178070541665, + "grad_norm": 0.10190727561712265, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 227750 + }, + { + "epoch": 0.8804564642575498, + "grad_norm": 0.11910542100667953, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 227760 + }, + { + "epoch": 0.880495121460933, + "grad_norm": 0.1118072047829628, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 227770 + }, + { + "epoch": 0.8805337786643163, + "grad_norm": 0.10263451933860779, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 227780 + }, + { + "epoch": 0.8805724358676996, + "grad_norm": 0.09938156604766846, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 227790 + }, + { + "epoch": 0.8806110930710829, + "grad_norm": 0.11796408146619797, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 227800 + }, + { + "epoch": 0.8806497502744661, + "grad_norm": 0.08841456472873688, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 227810 + }, + { + "epoch": 0.8806884074778494, + "grad_norm": 0.11047626286745071, + "learning_rate": 0.002, + "loss": 2.35, + "step": 227820 + }, + { + "epoch": 0.8807270646812327, + "grad_norm": 0.10162214189767838, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 227830 + }, + { + "epoch": 0.880765721884616, + "grad_norm": 0.11247002333402634, + "learning_rate": 0.002, + "loss": 2.328, + "step": 227840 + }, + { + "epoch": 0.8808043790879992, + "grad_norm": 0.09718988835811615, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 227850 + }, + { + "epoch": 0.8808430362913825, + "grad_norm": 0.13489054143428802, + "learning_rate": 0.002, + "loss": 2.338, + "step": 227860 + }, + { + "epoch": 0.8808816934947659, + "grad_norm": 0.1064859926700592, + "learning_rate": 0.002, + "loss": 2.336, + "step": 227870 + }, + { + "epoch": 0.8809203506981491, + "grad_norm": 0.10662670433521271, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 227880 + }, + { + "epoch": 0.8809590079015324, + "grad_norm": 0.10369833558797836, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 227890 + }, + { + "epoch": 0.8809976651049156, + "grad_norm": 0.11877351999282837, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 227900 + }, + { + "epoch": 0.881036322308299, + "grad_norm": 0.09558946639299393, + "learning_rate": 0.002, + "loss": 2.3677, + "step": 227910 + }, + { + "epoch": 0.8810749795116822, + "grad_norm": 0.09941543638706207, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 227920 + }, + { + "epoch": 0.8811136367150655, + "grad_norm": 0.10346577316522598, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 227930 + }, + { + "epoch": 0.8811522939184487, + "grad_norm": 0.09705489128828049, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 227940 + }, + { + "epoch": 0.881190951121832, + "grad_norm": 0.107711561024189, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 227950 + }, + { + "epoch": 0.8812296083252154, + "grad_norm": 0.10006747394800186, + "learning_rate": 0.002, + "loss": 2.355, + "step": 227960 + }, + { + "epoch": 0.8812682655285986, + "grad_norm": 0.10859206318855286, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 227970 + }, + { + "epoch": 0.8813069227319819, + "grad_norm": 0.09800279885530472, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 227980 + }, + { + "epoch": 0.8813455799353651, + "grad_norm": 0.10673708468675613, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 227990 + }, + { + "epoch": 0.8813842371387485, + "grad_norm": 0.10194409638643265, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 228000 + }, + { + "epoch": 0.8814228943421317, + "grad_norm": 0.08737658709287643, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 228010 + }, + { + "epoch": 0.881461551545515, + "grad_norm": 0.0974418893456459, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 228020 + }, + { + "epoch": 0.8815002087488982, + "grad_norm": 0.1162291020154953, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 228030 + }, + { + "epoch": 0.8815388659522816, + "grad_norm": 0.11426295340061188, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 228040 + }, + { + "epoch": 0.8815775231556648, + "grad_norm": 0.09751128405332565, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 228050 + }, + { + "epoch": 0.8816161803590481, + "grad_norm": 0.1150236502289772, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 228060 + }, + { + "epoch": 0.8816548375624313, + "grad_norm": 0.10207517445087433, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 228070 + }, + { + "epoch": 0.8816934947658147, + "grad_norm": 0.09584974497556686, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 228080 + }, + { + "epoch": 0.881732151969198, + "grad_norm": 0.08666618168354034, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 228090 + }, + { + "epoch": 0.8817708091725812, + "grad_norm": 0.11309351027011871, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 228100 + }, + { + "epoch": 0.8818094663759645, + "grad_norm": 0.11253513395786285, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 228110 + }, + { + "epoch": 0.8818481235793477, + "grad_norm": 0.12086072564125061, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 228120 + }, + { + "epoch": 0.8818867807827311, + "grad_norm": 0.12018612772226334, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 228130 + }, + { + "epoch": 0.8819254379861143, + "grad_norm": 0.09976814687252045, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 228140 + }, + { + "epoch": 0.8819640951894976, + "grad_norm": 0.10137394070625305, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 228150 + }, + { + "epoch": 0.8820027523928808, + "grad_norm": 0.1139528751373291, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 228160 + }, + { + "epoch": 0.8820414095962642, + "grad_norm": 0.09931164979934692, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 228170 + }, + { + "epoch": 0.8820800667996475, + "grad_norm": 0.12239488959312439, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 228180 + }, + { + "epoch": 0.8821187240030307, + "grad_norm": 0.10625220090150833, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 228190 + }, + { + "epoch": 0.882157381206414, + "grad_norm": 0.09328392893075943, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 228200 + }, + { + "epoch": 0.8821960384097973, + "grad_norm": 0.11561544239521027, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 228210 + }, + { + "epoch": 0.8822346956131806, + "grad_norm": 0.1072704866528511, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 228220 + }, + { + "epoch": 0.8822733528165638, + "grad_norm": 0.11116018146276474, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 228230 + }, + { + "epoch": 0.8823120100199471, + "grad_norm": 0.10479837656021118, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 228240 + }, + { + "epoch": 0.8823506672233304, + "grad_norm": 0.1300334632396698, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 228250 + }, + { + "epoch": 0.8823893244267137, + "grad_norm": 0.12884414196014404, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 228260 + }, + { + "epoch": 0.882427981630097, + "grad_norm": 0.11360549926757812, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 228270 + }, + { + "epoch": 0.8824666388334802, + "grad_norm": 0.09666585922241211, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 228280 + }, + { + "epoch": 0.8825052960368636, + "grad_norm": 0.0928926020860672, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 228290 + }, + { + "epoch": 0.8825439532402468, + "grad_norm": 0.10815281420946121, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 228300 + }, + { + "epoch": 0.8825826104436301, + "grad_norm": 0.1050979271531105, + "learning_rate": 0.002, + "loss": 2.348, + "step": 228310 + }, + { + "epoch": 0.8826212676470133, + "grad_norm": 0.08897456526756287, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 228320 + }, + { + "epoch": 0.8826599248503966, + "grad_norm": 0.12232557684183121, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 228330 + }, + { + "epoch": 0.8826985820537799, + "grad_norm": 0.0979345440864563, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 228340 + }, + { + "epoch": 0.8827372392571632, + "grad_norm": 0.10978517681360245, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 228350 + }, + { + "epoch": 0.8827758964605464, + "grad_norm": 0.10496382415294647, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 228360 + }, + { + "epoch": 0.8828145536639297, + "grad_norm": 0.09739359468221664, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 228370 + }, + { + "epoch": 0.882853210867313, + "grad_norm": 0.10366075485944748, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 228380 + }, + { + "epoch": 0.8828918680706963, + "grad_norm": 0.0847225934267044, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 228390 + }, + { + "epoch": 0.8829305252740796, + "grad_norm": 0.10986627638339996, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 228400 + }, + { + "epoch": 0.8829691824774628, + "grad_norm": 0.1198427677154541, + "learning_rate": 0.002, + "loss": 2.333, + "step": 228410 + }, + { + "epoch": 0.8830078396808462, + "grad_norm": 0.11189264059066772, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 228420 + }, + { + "epoch": 0.8830464968842294, + "grad_norm": 0.11902923882007599, + "learning_rate": 0.002, + "loss": 2.341, + "step": 228430 + }, + { + "epoch": 0.8830851540876127, + "grad_norm": 0.09806467592716217, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 228440 + }, + { + "epoch": 0.8831238112909959, + "grad_norm": 0.12040401995182037, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 228450 + }, + { + "epoch": 0.8831624684943793, + "grad_norm": 0.11240734905004501, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 228460 + }, + { + "epoch": 0.8832011256977625, + "grad_norm": 0.13707761466503143, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 228470 + }, + { + "epoch": 0.8832397829011458, + "grad_norm": 0.10513965040445328, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 228480 + }, + { + "epoch": 0.883278440104529, + "grad_norm": 0.11313371360301971, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 228490 + }, + { + "epoch": 0.8833170973079123, + "grad_norm": 0.09669902175664902, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 228500 + }, + { + "epoch": 0.8833557545112957, + "grad_norm": 0.10225246101617813, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 228510 + }, + { + "epoch": 0.8833944117146789, + "grad_norm": 0.09568794071674347, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 228520 + }, + { + "epoch": 0.8834330689180622, + "grad_norm": 0.12730160355567932, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 228530 + }, + { + "epoch": 0.8834717261214454, + "grad_norm": 0.11025600880384445, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 228540 + }, + { + "epoch": 0.8835103833248288, + "grad_norm": 0.09465350210666656, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 228550 + }, + { + "epoch": 0.883549040528212, + "grad_norm": 0.09230029582977295, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 228560 + }, + { + "epoch": 0.8835876977315953, + "grad_norm": 0.10580102354288101, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 228570 + }, + { + "epoch": 0.8836263549349785, + "grad_norm": 0.1314316838979721, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 228580 + }, + { + "epoch": 0.8836650121383619, + "grad_norm": 0.11994590610265732, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 228590 + }, + { + "epoch": 0.8837036693417452, + "grad_norm": 0.11196423321962357, + "learning_rate": 0.002, + "loss": 2.359, + "step": 228600 + }, + { + "epoch": 0.8837423265451284, + "grad_norm": 0.09601790457963943, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 228610 + }, + { + "epoch": 0.8837809837485117, + "grad_norm": 0.08654838055372238, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 228620 + }, + { + "epoch": 0.883819640951895, + "grad_norm": 0.10150102525949478, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 228630 + }, + { + "epoch": 0.8838582981552783, + "grad_norm": 0.10870946198701859, + "learning_rate": 0.002, + "loss": 2.357, + "step": 228640 + }, + { + "epoch": 0.8838969553586615, + "grad_norm": 0.11671384423971176, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 228650 + }, + { + "epoch": 0.8839356125620448, + "grad_norm": 0.0986432358622551, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 228660 + }, + { + "epoch": 0.8839742697654281, + "grad_norm": 0.11731009930372238, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 228670 + }, + { + "epoch": 0.8840129269688114, + "grad_norm": 0.09436880052089691, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 228680 + }, + { + "epoch": 0.8840515841721946, + "grad_norm": 0.19049400091171265, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 228690 + }, + { + "epoch": 0.8840902413755779, + "grad_norm": 0.11192015558481216, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 228700 + }, + { + "epoch": 0.8841288985789612, + "grad_norm": 0.09574125707149506, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 228710 + }, + { + "epoch": 0.8841675557823445, + "grad_norm": 0.1215815544128418, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 228720 + }, + { + "epoch": 0.8842062129857278, + "grad_norm": 0.11466271430253983, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 228730 + }, + { + "epoch": 0.884244870189111, + "grad_norm": 0.11675561219453812, + "learning_rate": 0.002, + "loss": 2.349, + "step": 228740 + }, + { + "epoch": 0.8842835273924943, + "grad_norm": 0.11158723384141922, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 228750 + }, + { + "epoch": 0.8843221845958776, + "grad_norm": 0.10511514544487, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 228760 + }, + { + "epoch": 0.8843608417992609, + "grad_norm": 0.11220479011535645, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 228770 + }, + { + "epoch": 0.8843994990026441, + "grad_norm": 0.0958787351846695, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 228780 + }, + { + "epoch": 0.8844381562060274, + "grad_norm": 0.11027432233095169, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 228790 + }, + { + "epoch": 0.8844768134094108, + "grad_norm": 0.09384578466415405, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 228800 + }, + { + "epoch": 0.884515470612794, + "grad_norm": 0.11357222497463226, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 228810 + }, + { + "epoch": 0.8845541278161773, + "grad_norm": 0.10767655819654465, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 228820 + }, + { + "epoch": 0.8845927850195605, + "grad_norm": 0.10896115750074387, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 228830 + }, + { + "epoch": 0.8846314422229439, + "grad_norm": 0.12012704461812973, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 228840 + }, + { + "epoch": 0.8846700994263271, + "grad_norm": 0.09226809442043304, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 228850 + }, + { + "epoch": 0.8847087566297104, + "grad_norm": 0.10555747151374817, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 228860 + }, + { + "epoch": 0.8847474138330936, + "grad_norm": 0.09659525752067566, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 228870 + }, + { + "epoch": 0.8847860710364769, + "grad_norm": 0.11739001423120499, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 228880 + }, + { + "epoch": 0.8848247282398602, + "grad_norm": 0.10839199274778366, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 228890 + }, + { + "epoch": 0.8848633854432435, + "grad_norm": 0.09374792128801346, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 228900 + }, + { + "epoch": 0.8849020426466268, + "grad_norm": 0.10925695300102234, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 228910 + }, + { + "epoch": 0.88494069985001, + "grad_norm": 0.10080505162477493, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 228920 + }, + { + "epoch": 0.8849793570533934, + "grad_norm": 0.10030867159366608, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 228930 + }, + { + "epoch": 0.8850180142567766, + "grad_norm": 0.11349781602621078, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 228940 + }, + { + "epoch": 0.8850566714601599, + "grad_norm": 0.11237026751041412, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 228950 + }, + { + "epoch": 0.8850953286635431, + "grad_norm": 0.112160824239254, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 228960 + }, + { + "epoch": 0.8851339858669265, + "grad_norm": 0.09719818085432053, + "learning_rate": 0.002, + "loss": 2.357, + "step": 228970 + }, + { + "epoch": 0.8851726430703097, + "grad_norm": 0.07541675865650177, + "learning_rate": 0.002, + "loss": 2.325, + "step": 228980 + }, + { + "epoch": 0.885211300273693, + "grad_norm": 0.11679107695817947, + "learning_rate": 0.002, + "loss": 2.348, + "step": 228990 + }, + { + "epoch": 0.8852499574770762, + "grad_norm": 0.10213762521743774, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 229000 + }, + { + "epoch": 0.8852886146804596, + "grad_norm": 0.10680190473794937, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 229010 + }, + { + "epoch": 0.8853272718838429, + "grad_norm": 0.10340744256973267, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 229020 + }, + { + "epoch": 0.8853659290872261, + "grad_norm": 0.10625316947698593, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 229030 + }, + { + "epoch": 0.8854045862906094, + "grad_norm": 0.11096010357141495, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 229040 + }, + { + "epoch": 0.8854432434939926, + "grad_norm": 0.10733357816934586, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 229050 + }, + { + "epoch": 0.885481900697376, + "grad_norm": 0.10386716574430466, + "learning_rate": 0.002, + "loss": 2.341, + "step": 229060 + }, + { + "epoch": 0.8855205579007592, + "grad_norm": 0.08777379244565964, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 229070 + }, + { + "epoch": 0.8855592151041425, + "grad_norm": 0.12602606415748596, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 229080 + }, + { + "epoch": 0.8855978723075257, + "grad_norm": 0.12142560631036758, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 229090 + }, + { + "epoch": 0.8856365295109091, + "grad_norm": 0.10715343803167343, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 229100 + }, + { + "epoch": 0.8856751867142924, + "grad_norm": 0.11046537011861801, + "learning_rate": 0.002, + "loss": 2.351, + "step": 229110 + }, + { + "epoch": 0.8857138439176756, + "grad_norm": 0.12889137864112854, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 229120 + }, + { + "epoch": 0.8857525011210589, + "grad_norm": 0.09733462333679199, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 229130 + }, + { + "epoch": 0.8857911583244422, + "grad_norm": 0.10241176933050156, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 229140 + }, + { + "epoch": 0.8858298155278255, + "grad_norm": 0.10478372871875763, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 229150 + }, + { + "epoch": 0.8858684727312087, + "grad_norm": 0.1303471326828003, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 229160 + }, + { + "epoch": 0.885907129934592, + "grad_norm": 0.12433631718158722, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 229170 + }, + { + "epoch": 0.8859457871379753, + "grad_norm": 0.10237149894237518, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 229180 + }, + { + "epoch": 0.8859844443413586, + "grad_norm": 0.10052412748336792, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 229190 + }, + { + "epoch": 0.8860231015447418, + "grad_norm": 0.09713948518037796, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 229200 + }, + { + "epoch": 0.8860617587481251, + "grad_norm": 0.1421917825937271, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 229210 + }, + { + "epoch": 0.8861004159515085, + "grad_norm": 0.12245440483093262, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 229220 + }, + { + "epoch": 0.8861390731548917, + "grad_norm": 0.1105659231543541, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 229230 + }, + { + "epoch": 0.886177730358275, + "grad_norm": 0.11617681384086609, + "learning_rate": 0.002, + "loss": 2.3104, + "step": 229240 + }, + { + "epoch": 0.8862163875616582, + "grad_norm": 0.10943305492401123, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 229250 + }, + { + "epoch": 0.8862550447650415, + "grad_norm": 0.10044894367456436, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 229260 + }, + { + "epoch": 0.8862937019684248, + "grad_norm": 0.10492462664842606, + "learning_rate": 0.002, + "loss": 2.336, + "step": 229270 + }, + { + "epoch": 0.8863323591718081, + "grad_norm": 0.10984104126691818, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 229280 + }, + { + "epoch": 0.8863710163751913, + "grad_norm": 0.10741350799798965, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 229290 + }, + { + "epoch": 0.8864096735785746, + "grad_norm": 0.08082353323698044, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 229300 + }, + { + "epoch": 0.886448330781958, + "grad_norm": 0.10525934398174286, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 229310 + }, + { + "epoch": 0.8864869879853412, + "grad_norm": 0.10217452049255371, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 229320 + }, + { + "epoch": 0.8865256451887245, + "grad_norm": 0.1082843765616417, + "learning_rate": 0.002, + "loss": 2.335, + "step": 229330 + }, + { + "epoch": 0.8865643023921077, + "grad_norm": 0.10454316437244415, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 229340 + }, + { + "epoch": 0.8866029595954911, + "grad_norm": 0.10445873439311981, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 229350 + }, + { + "epoch": 0.8866416167988743, + "grad_norm": 0.0989517867565155, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 229360 + }, + { + "epoch": 0.8866802740022576, + "grad_norm": 0.10514253377914429, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 229370 + }, + { + "epoch": 0.8867189312056408, + "grad_norm": 0.10348337143659592, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 229380 + }, + { + "epoch": 0.8867575884090242, + "grad_norm": 0.0983634814620018, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 229390 + }, + { + "epoch": 0.8867962456124074, + "grad_norm": 0.10358452051877975, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 229400 + }, + { + "epoch": 0.8868349028157907, + "grad_norm": 0.10264099389314651, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 229410 + }, + { + "epoch": 0.886873560019174, + "grad_norm": 0.11348433047533035, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 229420 + }, + { + "epoch": 0.8869122172225572, + "grad_norm": 0.09199924767017365, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 229430 + }, + { + "epoch": 0.8869508744259406, + "grad_norm": 0.09641484171152115, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 229440 + }, + { + "epoch": 0.8869895316293238, + "grad_norm": 0.11211732029914856, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 229450 + }, + { + "epoch": 0.8870281888327071, + "grad_norm": 0.1105748862028122, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 229460 + }, + { + "epoch": 0.8870668460360903, + "grad_norm": 0.11591217666864395, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 229470 + }, + { + "epoch": 0.8871055032394737, + "grad_norm": 0.11064792424440384, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 229480 + }, + { + "epoch": 0.8871441604428569, + "grad_norm": 0.09471415728330612, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 229490 + }, + { + "epoch": 0.8871828176462402, + "grad_norm": 0.10499259829521179, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 229500 + }, + { + "epoch": 0.8872214748496234, + "grad_norm": 0.10962316393852234, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 229510 + }, + { + "epoch": 0.8872601320530068, + "grad_norm": 0.09535825252532959, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 229520 + }, + { + "epoch": 0.88729878925639, + "grad_norm": 0.09556851536035538, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 229530 + }, + { + "epoch": 0.8873374464597733, + "grad_norm": 0.09536069631576538, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 229540 + }, + { + "epoch": 0.8873761036631566, + "grad_norm": 0.10342884063720703, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 229550 + }, + { + "epoch": 0.8874147608665399, + "grad_norm": 0.0994892343878746, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 229560 + }, + { + "epoch": 0.8874534180699232, + "grad_norm": 0.09657128155231476, + "learning_rate": 0.002, + "loss": 2.326, + "step": 229570 + }, + { + "epoch": 0.8874920752733064, + "grad_norm": 0.11561530083417892, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 229580 + }, + { + "epoch": 0.8875307324766897, + "grad_norm": 0.11136766523122787, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 229590 + }, + { + "epoch": 0.887569389680073, + "grad_norm": 0.11479032784700394, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 229600 + }, + { + "epoch": 0.8876080468834563, + "grad_norm": 0.10477154701948166, + "learning_rate": 0.002, + "loss": 2.349, + "step": 229610 + }, + { + "epoch": 0.8876467040868395, + "grad_norm": 0.09890048950910568, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 229620 + }, + { + "epoch": 0.8876853612902228, + "grad_norm": 0.1130228266119957, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 229630 + }, + { + "epoch": 0.887724018493606, + "grad_norm": 0.1031917929649353, + "learning_rate": 0.002, + "loss": 2.332, + "step": 229640 + }, + { + "epoch": 0.8877626756969894, + "grad_norm": 0.12092357873916626, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 229650 + }, + { + "epoch": 0.8878013329003727, + "grad_norm": 0.11220254004001617, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 229660 + }, + { + "epoch": 0.8878399901037559, + "grad_norm": 0.10315951704978943, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 229670 + }, + { + "epoch": 0.8878786473071392, + "grad_norm": 0.09558948129415512, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 229680 + }, + { + "epoch": 0.8879173045105225, + "grad_norm": 0.11425267159938812, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 229690 + }, + { + "epoch": 0.8879559617139058, + "grad_norm": 0.0957653596997261, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 229700 + }, + { + "epoch": 0.887994618917289, + "grad_norm": 0.09852249175310135, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 229710 + }, + { + "epoch": 0.8880332761206723, + "grad_norm": 0.12549278140068054, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 229720 + }, + { + "epoch": 0.8880719333240557, + "grad_norm": 0.10537635535001755, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 229730 + }, + { + "epoch": 0.8881105905274389, + "grad_norm": 0.10095790773630142, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 229740 + }, + { + "epoch": 0.8881492477308222, + "grad_norm": 0.11167895048856735, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 229750 + }, + { + "epoch": 0.8881879049342054, + "grad_norm": 0.09808744490146637, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 229760 + }, + { + "epoch": 0.8882265621375888, + "grad_norm": 0.10633957386016846, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 229770 + }, + { + "epoch": 0.888265219340972, + "grad_norm": 0.1152457445859909, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 229780 + }, + { + "epoch": 0.8883038765443553, + "grad_norm": 0.09889443218708038, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 229790 + }, + { + "epoch": 0.8883425337477385, + "grad_norm": 0.11010714620351791, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 229800 + }, + { + "epoch": 0.8883811909511218, + "grad_norm": 0.15187367796897888, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 229810 + }, + { + "epoch": 0.8884198481545051, + "grad_norm": 0.11343226581811905, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 229820 + }, + { + "epoch": 0.8884585053578884, + "grad_norm": 0.11588403582572937, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 229830 + }, + { + "epoch": 0.8884971625612716, + "grad_norm": 0.10823334753513336, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 229840 + }, + { + "epoch": 0.8885358197646549, + "grad_norm": 0.11281239241361618, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 229850 + }, + { + "epoch": 0.8885744769680383, + "grad_norm": 0.10507730394601822, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 229860 + }, + { + "epoch": 0.8886131341714215, + "grad_norm": 0.09830960631370544, + "learning_rate": 0.002, + "loss": 2.3584, + "step": 229870 + }, + { + "epoch": 0.8886517913748048, + "grad_norm": 0.12222305685281754, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 229880 + }, + { + "epoch": 0.888690448578188, + "grad_norm": 0.11914224177598953, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 229890 + }, + { + "epoch": 0.8887291057815714, + "grad_norm": 0.09565926343202591, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 229900 + }, + { + "epoch": 0.8887677629849546, + "grad_norm": 0.10244297236204147, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 229910 + }, + { + "epoch": 0.8888064201883379, + "grad_norm": 0.09705287963151932, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 229920 + }, + { + "epoch": 0.8888450773917211, + "grad_norm": 0.09104941040277481, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 229930 + }, + { + "epoch": 0.8888837345951045, + "grad_norm": 0.10585571080446243, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 229940 + }, + { + "epoch": 0.8889223917984878, + "grad_norm": 0.1275555044412613, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 229950 + }, + { + "epoch": 0.888961049001871, + "grad_norm": 0.09641186147928238, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 229960 + }, + { + "epoch": 0.8889997062052543, + "grad_norm": 0.14952360093593597, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 229970 + }, + { + "epoch": 0.8890383634086375, + "grad_norm": 0.10552582889795303, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 229980 + }, + { + "epoch": 0.8890770206120209, + "grad_norm": 0.09293745458126068, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 229990 + }, + { + "epoch": 0.8891156778154041, + "grad_norm": 0.10250788927078247, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 230000 + }, + { + "epoch": 0.8891543350187874, + "grad_norm": 0.12834201753139496, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 230010 + }, + { + "epoch": 0.8891929922221706, + "grad_norm": 0.1091959998011589, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 230020 + }, + { + "epoch": 0.889231649425554, + "grad_norm": 0.11037223041057587, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 230030 + }, + { + "epoch": 0.8892703066289372, + "grad_norm": 0.1013093888759613, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 230040 + }, + { + "epoch": 0.8893089638323205, + "grad_norm": 0.10540314763784409, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 230050 + }, + { + "epoch": 0.8893476210357038, + "grad_norm": 0.10271942615509033, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 230060 + }, + { + "epoch": 0.8893862782390871, + "grad_norm": 0.09896955639123917, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 230070 + }, + { + "epoch": 0.8894249354424704, + "grad_norm": 0.11004067212343216, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 230080 + }, + { + "epoch": 0.8894635926458536, + "grad_norm": 0.11132332682609558, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 230090 + }, + { + "epoch": 0.8895022498492369, + "grad_norm": 0.11204580962657928, + "learning_rate": 0.002, + "loss": 2.338, + "step": 230100 + }, + { + "epoch": 0.8895409070526202, + "grad_norm": 0.10116194188594818, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 230110 + }, + { + "epoch": 0.8895795642560035, + "grad_norm": 0.09769944846630096, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 230120 + }, + { + "epoch": 0.8896182214593867, + "grad_norm": 0.11125995218753815, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 230130 + }, + { + "epoch": 0.88965687866277, + "grad_norm": 0.09720432758331299, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 230140 + }, + { + "epoch": 0.8896955358661534, + "grad_norm": 0.09110531210899353, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 230150 + }, + { + "epoch": 0.8897341930695366, + "grad_norm": 0.11076952517032623, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 230160 + }, + { + "epoch": 0.8897728502729199, + "grad_norm": 0.10560379177331924, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 230170 + }, + { + "epoch": 0.8898115074763031, + "grad_norm": 0.1136026680469513, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 230180 + }, + { + "epoch": 0.8898501646796864, + "grad_norm": 0.11075719445943832, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 230190 + }, + { + "epoch": 0.8898888218830697, + "grad_norm": 0.10945750027894974, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 230200 + }, + { + "epoch": 0.889927479086453, + "grad_norm": 0.15665782988071442, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 230210 + }, + { + "epoch": 0.8899661362898362, + "grad_norm": 0.10520077496767044, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 230220 + }, + { + "epoch": 0.8900047934932195, + "grad_norm": 0.09022749960422516, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 230230 + }, + { + "epoch": 0.8900434506966028, + "grad_norm": 0.12738844752311707, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 230240 + }, + { + "epoch": 0.8900821078999861, + "grad_norm": 0.09396673738956451, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 230250 + }, + { + "epoch": 0.8901207651033693, + "grad_norm": 0.1027170792222023, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 230260 + }, + { + "epoch": 0.8901594223067526, + "grad_norm": 0.11400040239095688, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 230270 + }, + { + "epoch": 0.890198079510136, + "grad_norm": 0.13193011283874512, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 230280 + }, + { + "epoch": 0.8902367367135192, + "grad_norm": 0.09875988215208054, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 230290 + }, + { + "epoch": 0.8902753939169025, + "grad_norm": 0.11976584792137146, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 230300 + }, + { + "epoch": 0.8903140511202857, + "grad_norm": 0.11035315692424774, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 230310 + }, + { + "epoch": 0.8903527083236691, + "grad_norm": 0.0940026044845581, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 230320 + }, + { + "epoch": 0.8903913655270523, + "grad_norm": 0.10700642317533493, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 230330 + }, + { + "epoch": 0.8904300227304356, + "grad_norm": 0.09496472775936127, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 230340 + }, + { + "epoch": 0.8904686799338188, + "grad_norm": 0.11273860931396484, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 230350 + }, + { + "epoch": 0.8905073371372021, + "grad_norm": 0.09876589477062225, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 230360 + }, + { + "epoch": 0.8905459943405855, + "grad_norm": 0.10113933682441711, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 230370 + }, + { + "epoch": 0.8905846515439687, + "grad_norm": 0.09982689470052719, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 230380 + }, + { + "epoch": 0.890623308747352, + "grad_norm": 0.09933792799711227, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 230390 + }, + { + "epoch": 0.8906619659507352, + "grad_norm": 0.10531254857778549, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 230400 + }, + { + "epoch": 0.8907006231541186, + "grad_norm": 0.10806956887245178, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 230410 + }, + { + "epoch": 0.8907392803575018, + "grad_norm": 0.10214032977819443, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 230420 + }, + { + "epoch": 0.8907779375608851, + "grad_norm": 0.09792348742485046, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 230430 + }, + { + "epoch": 0.8908165947642683, + "grad_norm": 0.12945565581321716, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 230440 + }, + { + "epoch": 0.8908552519676517, + "grad_norm": 0.1092633530497551, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 230450 + }, + { + "epoch": 0.890893909171035, + "grad_norm": 0.09912041574716568, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 230460 + }, + { + "epoch": 0.8909325663744182, + "grad_norm": 0.10581175237894058, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 230470 + }, + { + "epoch": 0.8909712235778015, + "grad_norm": 0.10082611441612244, + "learning_rate": 0.002, + "loss": 2.337, + "step": 230480 + }, + { + "epoch": 0.8910098807811848, + "grad_norm": 0.11029192805290222, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 230490 + }, + { + "epoch": 0.8910485379845681, + "grad_norm": 0.12048397958278656, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 230500 + }, + { + "epoch": 0.8910871951879513, + "grad_norm": 0.11811503767967224, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 230510 + }, + { + "epoch": 0.8911258523913346, + "grad_norm": 0.10998043417930603, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 230520 + }, + { + "epoch": 0.8911645095947179, + "grad_norm": 0.1065039113163948, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 230530 + }, + { + "epoch": 0.8912031667981012, + "grad_norm": 0.11100541800260544, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 230540 + }, + { + "epoch": 0.8912418240014844, + "grad_norm": 0.09994493424892426, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 230550 + }, + { + "epoch": 0.8912804812048677, + "grad_norm": 0.10134243965148926, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 230560 + }, + { + "epoch": 0.8913191384082509, + "grad_norm": 0.11106912046670914, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 230570 + }, + { + "epoch": 0.8913577956116343, + "grad_norm": 0.1081485003232956, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 230580 + }, + { + "epoch": 0.8913964528150176, + "grad_norm": 0.1305999904870987, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 230590 + }, + { + "epoch": 0.8914351100184008, + "grad_norm": 0.09498662501573563, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 230600 + }, + { + "epoch": 0.8914737672217841, + "grad_norm": 0.101992167532444, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 230610 + }, + { + "epoch": 0.8915124244251674, + "grad_norm": 0.10639800131320953, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 230620 + }, + { + "epoch": 0.8915510816285507, + "grad_norm": 0.09898429363965988, + "learning_rate": 0.002, + "loss": 2.314, + "step": 230630 + }, + { + "epoch": 0.8915897388319339, + "grad_norm": 0.09582017362117767, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 230640 + }, + { + "epoch": 0.8916283960353172, + "grad_norm": 0.10886652022600174, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 230650 + }, + { + "epoch": 0.8916670532387005, + "grad_norm": 0.10178403556346893, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 230660 + }, + { + "epoch": 0.8917057104420838, + "grad_norm": 0.1168789491057396, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 230670 + }, + { + "epoch": 0.891744367645467, + "grad_norm": 0.09374828636646271, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 230680 + }, + { + "epoch": 0.8917830248488503, + "grad_norm": 0.10166969150304794, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 230690 + }, + { + "epoch": 0.8918216820522337, + "grad_norm": 0.11109510809183121, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 230700 + }, + { + "epoch": 0.8918603392556169, + "grad_norm": 0.09495483338832855, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 230710 + }, + { + "epoch": 0.8918989964590002, + "grad_norm": 0.10241233557462692, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 230720 + }, + { + "epoch": 0.8919376536623834, + "grad_norm": 0.10209019482135773, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 230730 + }, + { + "epoch": 0.8919763108657667, + "grad_norm": 0.1326477825641632, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 230740 + }, + { + "epoch": 0.89201496806915, + "grad_norm": 0.09626547992229462, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 230750 + }, + { + "epoch": 0.8920536252725333, + "grad_norm": 0.10015839338302612, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 230760 + }, + { + "epoch": 0.8920922824759165, + "grad_norm": 0.09552304446697235, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 230770 + }, + { + "epoch": 0.8921309396792998, + "grad_norm": 0.11490238457918167, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 230780 + }, + { + "epoch": 0.8921695968826832, + "grad_norm": 0.12381553649902344, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 230790 + }, + { + "epoch": 0.8922082540860664, + "grad_norm": 0.10215188562870026, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 230800 + }, + { + "epoch": 0.8922469112894497, + "grad_norm": 0.09393597394227982, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 230810 + }, + { + "epoch": 0.8922855684928329, + "grad_norm": 0.10499109327793121, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 230820 + }, + { + "epoch": 0.8923242256962163, + "grad_norm": 0.10030298680067062, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 230830 + }, + { + "epoch": 0.8923628828995995, + "grad_norm": 0.09288015961647034, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 230840 + }, + { + "epoch": 0.8924015401029828, + "grad_norm": 0.09977603703737259, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 230850 + }, + { + "epoch": 0.892440197306366, + "grad_norm": 0.09825431555509567, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 230860 + }, + { + "epoch": 0.8924788545097494, + "grad_norm": 0.11791310459375381, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 230870 + }, + { + "epoch": 0.8925175117131326, + "grad_norm": 0.09519605338573456, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 230880 + }, + { + "epoch": 0.8925561689165159, + "grad_norm": 0.10679690539836884, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 230890 + }, + { + "epoch": 0.8925948261198992, + "grad_norm": 0.102602519094944, + "learning_rate": 0.002, + "loss": 2.337, + "step": 230900 + }, + { + "epoch": 0.8926334833232824, + "grad_norm": 0.11841274797916412, + "learning_rate": 0.002, + "loss": 2.331, + "step": 230910 + }, + { + "epoch": 0.8926721405266658, + "grad_norm": 0.114153191447258, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 230920 + }, + { + "epoch": 0.892710797730049, + "grad_norm": 0.08805635571479797, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 230930 + }, + { + "epoch": 0.8927494549334323, + "grad_norm": 0.12973535060882568, + "learning_rate": 0.002, + "loss": 2.348, + "step": 230940 + }, + { + "epoch": 0.8927881121368155, + "grad_norm": 0.1187136098742485, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 230950 + }, + { + "epoch": 0.8928267693401989, + "grad_norm": 0.1086641252040863, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 230960 + }, + { + "epoch": 0.8928654265435821, + "grad_norm": 0.11849938333034515, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 230970 + }, + { + "epoch": 0.8929040837469654, + "grad_norm": 0.11114994436502457, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 230980 + }, + { + "epoch": 0.8929427409503486, + "grad_norm": 0.12062875926494598, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 230990 + }, + { + "epoch": 0.892981398153732, + "grad_norm": 0.09971782565116882, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 231000 + }, + { + "epoch": 0.8930200553571153, + "grad_norm": 0.10399482399225235, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 231010 + }, + { + "epoch": 0.8930587125604985, + "grad_norm": 0.0989309698343277, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 231020 + }, + { + "epoch": 0.8930973697638818, + "grad_norm": 0.1022104024887085, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 231030 + }, + { + "epoch": 0.8931360269672651, + "grad_norm": 0.12379945814609528, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 231040 + }, + { + "epoch": 0.8931746841706484, + "grad_norm": 0.11856026947498322, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 231050 + }, + { + "epoch": 0.8932133413740316, + "grad_norm": 0.0948251411318779, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 231060 + }, + { + "epoch": 0.8932519985774149, + "grad_norm": 0.09422452747821808, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 231070 + }, + { + "epoch": 0.8932906557807982, + "grad_norm": 0.10392773896455765, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 231080 + }, + { + "epoch": 0.8933293129841815, + "grad_norm": 0.09506283700466156, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 231090 + }, + { + "epoch": 0.8933679701875648, + "grad_norm": 0.10650750249624252, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 231100 + }, + { + "epoch": 0.893406627390948, + "grad_norm": 0.11290176957845688, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 231110 + }, + { + "epoch": 0.8934452845943313, + "grad_norm": 0.1044296845793724, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 231120 + }, + { + "epoch": 0.8934839417977146, + "grad_norm": 0.09647175669670105, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 231130 + }, + { + "epoch": 0.8935225990010979, + "grad_norm": 0.09704600274562836, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 231140 + }, + { + "epoch": 0.8935612562044811, + "grad_norm": 0.14066311717033386, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 231150 + }, + { + "epoch": 0.8935999134078644, + "grad_norm": 0.09952470660209656, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 231160 + }, + { + "epoch": 0.8936385706112477, + "grad_norm": 0.09438595175743103, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 231170 + }, + { + "epoch": 0.893677227814631, + "grad_norm": 0.09615175426006317, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 231180 + }, + { + "epoch": 0.8937158850180142, + "grad_norm": 0.38761892914772034, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 231190 + }, + { + "epoch": 0.8937545422213975, + "grad_norm": 0.11786754429340363, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 231200 + }, + { + "epoch": 0.8937931994247809, + "grad_norm": 0.10287931561470032, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 231210 + }, + { + "epoch": 0.8938318566281641, + "grad_norm": 0.10862020403146744, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 231220 + }, + { + "epoch": 0.8938705138315474, + "grad_norm": 0.09461180865764618, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 231230 + }, + { + "epoch": 0.8939091710349306, + "grad_norm": 0.10775598138570786, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 231240 + }, + { + "epoch": 0.893947828238314, + "grad_norm": 0.09789089858531952, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 231250 + }, + { + "epoch": 0.8939864854416972, + "grad_norm": 0.10795722156763077, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 231260 + }, + { + "epoch": 0.8940251426450805, + "grad_norm": 0.10861648619174957, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 231270 + }, + { + "epoch": 0.8940637998484637, + "grad_norm": 0.11098982393741608, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 231280 + }, + { + "epoch": 0.894102457051847, + "grad_norm": 0.09414079785346985, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 231290 + }, + { + "epoch": 0.8941411142552304, + "grad_norm": 0.11190325766801834, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 231300 + }, + { + "epoch": 0.8941797714586136, + "grad_norm": 0.10887134075164795, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 231310 + }, + { + "epoch": 0.8942184286619969, + "grad_norm": 0.09470488131046295, + "learning_rate": 0.002, + "loss": 2.328, + "step": 231320 + }, + { + "epoch": 0.8942570858653801, + "grad_norm": 0.10217797756195068, + "learning_rate": 0.002, + "loss": 2.356, + "step": 231330 + }, + { + "epoch": 0.8942957430687635, + "grad_norm": 0.0979577824473381, + "learning_rate": 0.002, + "loss": 2.34, + "step": 231340 + }, + { + "epoch": 0.8943344002721467, + "grad_norm": 0.10248088836669922, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 231350 + }, + { + "epoch": 0.89437305747553, + "grad_norm": 0.09941322356462479, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 231360 + }, + { + "epoch": 0.8944117146789132, + "grad_norm": 0.12089301645755768, + "learning_rate": 0.002, + "loss": 2.323, + "step": 231370 + }, + { + "epoch": 0.8944503718822966, + "grad_norm": 0.1151440218091011, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 231380 + }, + { + "epoch": 0.8944890290856798, + "grad_norm": 0.116435207426548, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 231390 + }, + { + "epoch": 0.8945276862890631, + "grad_norm": 0.10244743525981903, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 231400 + }, + { + "epoch": 0.8945663434924463, + "grad_norm": 0.13515597581863403, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 231410 + }, + { + "epoch": 0.8946050006958297, + "grad_norm": 0.09385132789611816, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 231420 + }, + { + "epoch": 0.894643657899213, + "grad_norm": 0.10678960382938385, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 231430 + }, + { + "epoch": 0.8946823151025962, + "grad_norm": 0.10624562203884125, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 231440 + }, + { + "epoch": 0.8947209723059795, + "grad_norm": 0.1100616380572319, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 231450 + }, + { + "epoch": 0.8947596295093627, + "grad_norm": 0.0967680811882019, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 231460 + }, + { + "epoch": 0.8947982867127461, + "grad_norm": 0.12556961178779602, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 231470 + }, + { + "epoch": 0.8948369439161293, + "grad_norm": 0.11757630109786987, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 231480 + }, + { + "epoch": 0.8948756011195126, + "grad_norm": 0.09802540391683578, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 231490 + }, + { + "epoch": 0.8949142583228958, + "grad_norm": 0.10392263531684875, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 231500 + }, + { + "epoch": 0.8949529155262792, + "grad_norm": 0.11355400085449219, + "learning_rate": 0.002, + "loss": 2.339, + "step": 231510 + }, + { + "epoch": 0.8949915727296625, + "grad_norm": 0.09654173254966736, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 231520 + }, + { + "epoch": 0.8950302299330457, + "grad_norm": 0.1176266297698021, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 231530 + }, + { + "epoch": 0.895068887136429, + "grad_norm": 0.10907765477895737, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 231540 + }, + { + "epoch": 0.8951075443398123, + "grad_norm": 0.10915853828191757, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 231550 + }, + { + "epoch": 0.8951462015431956, + "grad_norm": 0.11131073534488678, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 231560 + }, + { + "epoch": 0.8951848587465788, + "grad_norm": 0.11446822434663773, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 231570 + }, + { + "epoch": 0.8952235159499621, + "grad_norm": 0.12288888543844223, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 231580 + }, + { + "epoch": 0.8952621731533454, + "grad_norm": 0.3278125524520874, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 231590 + }, + { + "epoch": 0.8953008303567287, + "grad_norm": 0.11354555934667587, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 231600 + }, + { + "epoch": 0.895339487560112, + "grad_norm": 0.10242932289838791, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 231610 + }, + { + "epoch": 0.8953781447634952, + "grad_norm": 0.1100035160779953, + "learning_rate": 0.002, + "loss": 2.341, + "step": 231620 + }, + { + "epoch": 0.8954168019668786, + "grad_norm": 0.09858231991529465, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 231630 + }, + { + "epoch": 0.8954554591702618, + "grad_norm": 0.10620801895856857, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 231640 + }, + { + "epoch": 0.8954941163736451, + "grad_norm": 0.12707732617855072, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 231650 + }, + { + "epoch": 0.8955327735770283, + "grad_norm": 0.12811803817749023, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 231660 + }, + { + "epoch": 0.8955714307804116, + "grad_norm": 0.1011258214712143, + "learning_rate": 0.002, + "loss": 2.351, + "step": 231670 + }, + { + "epoch": 0.8956100879837949, + "grad_norm": 0.10836885124444962, + "learning_rate": 0.002, + "loss": 2.345, + "step": 231680 + }, + { + "epoch": 0.8956487451871782, + "grad_norm": 0.11384893953800201, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 231690 + }, + { + "epoch": 0.8956874023905614, + "grad_norm": 0.10242240130901337, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 231700 + }, + { + "epoch": 0.8957260595939447, + "grad_norm": 0.10519468039274216, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 231710 + }, + { + "epoch": 0.895764716797328, + "grad_norm": 0.1039244532585144, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 231720 + }, + { + "epoch": 0.8958033740007113, + "grad_norm": 0.1218051016330719, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 231730 + }, + { + "epoch": 0.8958420312040946, + "grad_norm": 0.13029955327510834, + "learning_rate": 0.002, + "loss": 2.322, + "step": 231740 + }, + { + "epoch": 0.8958806884074778, + "grad_norm": 0.10035617649555206, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 231750 + }, + { + "epoch": 0.8959193456108612, + "grad_norm": 0.0961938425898552, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 231760 + }, + { + "epoch": 0.8959580028142444, + "grad_norm": 0.11479473859071732, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 231770 + }, + { + "epoch": 0.8959966600176277, + "grad_norm": 0.11282454431056976, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 231780 + }, + { + "epoch": 0.8960353172210109, + "grad_norm": 0.10895568132400513, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 231790 + }, + { + "epoch": 0.8960739744243943, + "grad_norm": 0.12822161614894867, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 231800 + }, + { + "epoch": 0.8961126316277775, + "grad_norm": 0.11044373363256454, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 231810 + }, + { + "epoch": 0.8961512888311608, + "grad_norm": 0.09917205572128296, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 231820 + }, + { + "epoch": 0.896189946034544, + "grad_norm": 0.10342488437891006, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 231830 + }, + { + "epoch": 0.8962286032379273, + "grad_norm": 0.11677366495132446, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 231840 + }, + { + "epoch": 0.8962672604413107, + "grad_norm": 0.10196980088949203, + "learning_rate": 0.002, + "loss": 2.3159, + "step": 231850 + }, + { + "epoch": 0.8963059176446939, + "grad_norm": 0.2261200100183487, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 231860 + }, + { + "epoch": 0.8963445748480772, + "grad_norm": 0.10797779262065887, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 231870 + }, + { + "epoch": 0.8963832320514604, + "grad_norm": 0.1007605567574501, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 231880 + }, + { + "epoch": 0.8964218892548438, + "grad_norm": 0.10831963270902634, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 231890 + }, + { + "epoch": 0.896460546458227, + "grad_norm": 0.1089211255311966, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 231900 + }, + { + "epoch": 0.8964992036616103, + "grad_norm": 0.11141608655452728, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 231910 + }, + { + "epoch": 0.8965378608649935, + "grad_norm": 0.11648708581924438, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 231920 + }, + { + "epoch": 0.8965765180683769, + "grad_norm": 0.09122157096862793, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 231930 + }, + { + "epoch": 0.8966151752717602, + "grad_norm": 0.10319860279560089, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 231940 + }, + { + "epoch": 0.8966538324751434, + "grad_norm": 0.1191958412528038, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 231950 + }, + { + "epoch": 0.8966924896785267, + "grad_norm": 0.09657797962427139, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 231960 + }, + { + "epoch": 0.89673114688191, + "grad_norm": 0.11388564109802246, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 231970 + }, + { + "epoch": 0.8967698040852933, + "grad_norm": 0.09544172137975693, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 231980 + }, + { + "epoch": 0.8968084612886765, + "grad_norm": 0.11244893074035645, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 231990 + }, + { + "epoch": 0.8968471184920598, + "grad_norm": 0.12543711066246033, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 232000 + }, + { + "epoch": 0.8968857756954431, + "grad_norm": 0.0969160795211792, + "learning_rate": 0.002, + "loss": 2.341, + "step": 232010 + }, + { + "epoch": 0.8969244328988264, + "grad_norm": 0.10747043043375015, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 232020 + }, + { + "epoch": 0.8969630901022096, + "grad_norm": 0.10407809168100357, + "learning_rate": 0.002, + "loss": 2.339, + "step": 232030 + }, + { + "epoch": 0.8970017473055929, + "grad_norm": 0.1038336455821991, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 232040 + }, + { + "epoch": 0.8970404045089762, + "grad_norm": 0.13403278589248657, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 232050 + }, + { + "epoch": 0.8970790617123595, + "grad_norm": 0.09164881706237793, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 232060 + }, + { + "epoch": 0.8971177189157428, + "grad_norm": 0.1021568551659584, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 232070 + }, + { + "epoch": 0.897156376119126, + "grad_norm": 0.10171541571617126, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 232080 + }, + { + "epoch": 0.8971950333225093, + "grad_norm": 0.10120661556720734, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 232090 + }, + { + "epoch": 0.8972336905258926, + "grad_norm": 0.11354358494281769, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 232100 + }, + { + "epoch": 0.8972723477292759, + "grad_norm": 0.11167127639055252, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 232110 + }, + { + "epoch": 0.8973110049326591, + "grad_norm": 0.10344775021076202, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 232120 + }, + { + "epoch": 0.8973496621360424, + "grad_norm": 0.09466537833213806, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 232130 + }, + { + "epoch": 0.8973883193394258, + "grad_norm": 0.11126919835805893, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 232140 + }, + { + "epoch": 0.897426976542809, + "grad_norm": 0.1044890433549881, + "learning_rate": 0.002, + "loss": 2.332, + "step": 232150 + }, + { + "epoch": 0.8974656337461923, + "grad_norm": 0.10409554839134216, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 232160 + }, + { + "epoch": 0.8975042909495755, + "grad_norm": 0.11487068235874176, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 232170 + }, + { + "epoch": 0.8975429481529589, + "grad_norm": 0.09771732240915298, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 232180 + }, + { + "epoch": 0.8975816053563421, + "grad_norm": 0.13147693872451782, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 232190 + }, + { + "epoch": 0.8976202625597254, + "grad_norm": 0.12580551207065582, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 232200 + }, + { + "epoch": 0.8976589197631086, + "grad_norm": 0.10050297528505325, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 232210 + }, + { + "epoch": 0.8976975769664919, + "grad_norm": 0.12342985719442368, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 232220 + }, + { + "epoch": 0.8977362341698752, + "grad_norm": 0.09766845405101776, + "learning_rate": 0.002, + "loss": 2.348, + "step": 232230 + }, + { + "epoch": 0.8977748913732585, + "grad_norm": 0.09380897879600525, + "learning_rate": 0.002, + "loss": 2.344, + "step": 232240 + }, + { + "epoch": 0.8978135485766418, + "grad_norm": 0.12180311977863312, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 232250 + }, + { + "epoch": 0.897852205780025, + "grad_norm": 0.12489345669746399, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 232260 + }, + { + "epoch": 0.8978908629834084, + "grad_norm": 0.10110122710466385, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 232270 + }, + { + "epoch": 0.8979295201867916, + "grad_norm": 0.10600592195987701, + "learning_rate": 0.002, + "loss": 2.334, + "step": 232280 + }, + { + "epoch": 0.8979681773901749, + "grad_norm": 0.10433664172887802, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 232290 + }, + { + "epoch": 0.8980068345935581, + "grad_norm": 0.09368140250444412, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 232300 + }, + { + "epoch": 0.8980454917969415, + "grad_norm": 0.11990946531295776, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 232310 + }, + { + "epoch": 0.8980841490003247, + "grad_norm": 0.09606378525495529, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 232320 + }, + { + "epoch": 0.898122806203708, + "grad_norm": 0.10541123896837234, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 232330 + }, + { + "epoch": 0.8981614634070912, + "grad_norm": 0.10391794145107269, + "learning_rate": 0.002, + "loss": 2.331, + "step": 232340 + }, + { + "epoch": 0.8982001206104746, + "grad_norm": 0.10537216812372208, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 232350 + }, + { + "epoch": 0.8982387778138579, + "grad_norm": 0.09761541336774826, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 232360 + }, + { + "epoch": 0.8982774350172411, + "grad_norm": 0.10396650433540344, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 232370 + }, + { + "epoch": 0.8983160922206244, + "grad_norm": 0.10859867930412292, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 232380 + }, + { + "epoch": 0.8983547494240076, + "grad_norm": 0.10541373491287231, + "learning_rate": 0.002, + "loss": 2.346, + "step": 232390 + }, + { + "epoch": 0.898393406627391, + "grad_norm": 0.1089557409286499, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 232400 + }, + { + "epoch": 0.8984320638307742, + "grad_norm": 0.1103198453783989, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 232410 + }, + { + "epoch": 0.8984707210341575, + "grad_norm": 0.10214130580425262, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 232420 + }, + { + "epoch": 0.8985093782375407, + "grad_norm": 0.09987477213144302, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 232430 + }, + { + "epoch": 0.8985480354409241, + "grad_norm": 0.1155705377459526, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 232440 + }, + { + "epoch": 0.8985866926443073, + "grad_norm": 0.09893004596233368, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 232450 + }, + { + "epoch": 0.8986253498476906, + "grad_norm": 0.09296012669801712, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 232460 + }, + { + "epoch": 0.8986640070510739, + "grad_norm": 0.09634601324796677, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 232470 + }, + { + "epoch": 0.8987026642544572, + "grad_norm": 0.1043325737118721, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 232480 + }, + { + "epoch": 0.8987413214578405, + "grad_norm": 0.11548048257827759, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 232490 + }, + { + "epoch": 0.8987799786612237, + "grad_norm": 0.10021772980690002, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 232500 + }, + { + "epoch": 0.898818635864607, + "grad_norm": 0.11486779898405075, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 232510 + }, + { + "epoch": 0.8988572930679903, + "grad_norm": 0.10646865516901016, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 232520 + }, + { + "epoch": 0.8988959502713736, + "grad_norm": 0.08879152685403824, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 232530 + }, + { + "epoch": 0.8989346074747568, + "grad_norm": 0.10799049586057663, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 232540 + }, + { + "epoch": 0.8989732646781401, + "grad_norm": 0.09489186108112335, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 232550 + }, + { + "epoch": 0.8990119218815235, + "grad_norm": 0.10621532797813416, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 232560 + }, + { + "epoch": 0.8990505790849067, + "grad_norm": 0.09988352656364441, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 232570 + }, + { + "epoch": 0.89908923628829, + "grad_norm": 0.10915430635213852, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 232580 + }, + { + "epoch": 0.8991278934916732, + "grad_norm": 0.1114613264799118, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 232590 + }, + { + "epoch": 0.8991665506950565, + "grad_norm": 0.13072876632213593, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 232600 + }, + { + "epoch": 0.8992052078984398, + "grad_norm": 0.09026247262954712, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 232610 + }, + { + "epoch": 0.8992438651018231, + "grad_norm": 0.09996059536933899, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 232620 + }, + { + "epoch": 0.8992825223052063, + "grad_norm": 0.09811939299106598, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 232630 + }, + { + "epoch": 0.8993211795085896, + "grad_norm": 0.11068232357501984, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 232640 + }, + { + "epoch": 0.899359836711973, + "grad_norm": 0.09158763289451599, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 232650 + }, + { + "epoch": 0.8993984939153562, + "grad_norm": 0.5038883090019226, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 232660 + }, + { + "epoch": 0.8994371511187395, + "grad_norm": 0.10323592275381088, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 232670 + }, + { + "epoch": 0.8994758083221227, + "grad_norm": 0.12048277258872986, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 232680 + }, + { + "epoch": 0.8995144655255061, + "grad_norm": 0.11678492277860641, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 232690 + }, + { + "epoch": 0.8995531227288893, + "grad_norm": 0.10575645416975021, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 232700 + }, + { + "epoch": 0.8995917799322726, + "grad_norm": 0.11219353973865509, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 232710 + }, + { + "epoch": 0.8996304371356558, + "grad_norm": 0.10328909009695053, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 232720 + }, + { + "epoch": 0.8996690943390392, + "grad_norm": 0.10199911147356033, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 232730 + }, + { + "epoch": 0.8997077515424224, + "grad_norm": 0.1011134684085846, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 232740 + }, + { + "epoch": 0.8997464087458057, + "grad_norm": 0.11138148605823517, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 232750 + }, + { + "epoch": 0.899785065949189, + "grad_norm": 0.13523036241531372, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 232760 + }, + { + "epoch": 0.8998237231525722, + "grad_norm": 0.09843233972787857, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 232770 + }, + { + "epoch": 0.8998623803559556, + "grad_norm": 0.08956612646579742, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 232780 + }, + { + "epoch": 0.8999010375593388, + "grad_norm": 0.09797809273004532, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 232790 + }, + { + "epoch": 0.8999396947627221, + "grad_norm": 0.14079497754573822, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 232800 + }, + { + "epoch": 0.8999783519661053, + "grad_norm": 0.10322416573762894, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 232810 + }, + { + "epoch": 0.9000170091694887, + "grad_norm": 0.1083340048789978, + "learning_rate": 0.002, + "loss": 2.334, + "step": 232820 + }, + { + "epoch": 0.9000556663728719, + "grad_norm": 0.09649095684289932, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 232830 + }, + { + "epoch": 0.9000943235762552, + "grad_norm": 0.0971124917268753, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 232840 + }, + { + "epoch": 0.9001329807796384, + "grad_norm": 0.10010434687137604, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 232850 + }, + { + "epoch": 0.9001716379830218, + "grad_norm": 0.0972244068980217, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 232860 + }, + { + "epoch": 0.900210295186405, + "grad_norm": 0.11431008577346802, + "learning_rate": 0.002, + "loss": 2.343, + "step": 232870 + }, + { + "epoch": 0.9002489523897883, + "grad_norm": 0.10072416067123413, + "learning_rate": 0.002, + "loss": 2.351, + "step": 232880 + }, + { + "epoch": 0.9002876095931716, + "grad_norm": 0.10842856019735336, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 232890 + }, + { + "epoch": 0.9003262667965549, + "grad_norm": 0.10901232063770294, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 232900 + }, + { + "epoch": 0.9003649239999382, + "grad_norm": 0.09080494195222855, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 232910 + }, + { + "epoch": 0.9004035812033214, + "grad_norm": 0.10430239886045456, + "learning_rate": 0.002, + "loss": 2.33, + "step": 232920 + }, + { + "epoch": 0.9004422384067047, + "grad_norm": 0.11572965234518051, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 232930 + }, + { + "epoch": 0.900480895610088, + "grad_norm": 0.12283935397863388, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 232940 + }, + { + "epoch": 0.9005195528134713, + "grad_norm": 0.11047730594873428, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 232950 + }, + { + "epoch": 0.9005582100168545, + "grad_norm": 0.11393677443265915, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 232960 + }, + { + "epoch": 0.9005968672202378, + "grad_norm": 0.11150172352790833, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 232970 + }, + { + "epoch": 0.900635524423621, + "grad_norm": 0.10802357643842697, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 232980 + }, + { + "epoch": 0.9006741816270044, + "grad_norm": 0.10877017676830292, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 232990 + }, + { + "epoch": 0.9007128388303877, + "grad_norm": 0.11371221393346786, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 233000 + }, + { + "epoch": 0.9007514960337709, + "grad_norm": 0.0988493487238884, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 233010 + }, + { + "epoch": 0.9007901532371542, + "grad_norm": 0.11515526473522186, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 233020 + }, + { + "epoch": 0.9008288104405375, + "grad_norm": 0.113592229783535, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 233030 + }, + { + "epoch": 0.9008674676439208, + "grad_norm": 0.09327522665262222, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 233040 + }, + { + "epoch": 0.900906124847304, + "grad_norm": 0.10999246686697006, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 233050 + }, + { + "epoch": 0.9009447820506873, + "grad_norm": 0.10084047168493271, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 233060 + }, + { + "epoch": 0.9009834392540707, + "grad_norm": 0.10265267640352249, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 233070 + }, + { + "epoch": 0.9010220964574539, + "grad_norm": 0.11654006689786911, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 233080 + }, + { + "epoch": 0.9010607536608372, + "grad_norm": 0.10442501306533813, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 233090 + }, + { + "epoch": 0.9010994108642204, + "grad_norm": 0.0955716222524643, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 233100 + }, + { + "epoch": 0.9011380680676038, + "grad_norm": 0.09556197375059128, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 233110 + }, + { + "epoch": 0.901176725270987, + "grad_norm": 0.10069194436073303, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 233120 + }, + { + "epoch": 0.9012153824743703, + "grad_norm": 0.10242833942174911, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 233130 + }, + { + "epoch": 0.9012540396777535, + "grad_norm": 0.10300853103399277, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 233140 + }, + { + "epoch": 0.9012926968811368, + "grad_norm": 0.11195770651102066, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 233150 + }, + { + "epoch": 0.9013313540845201, + "grad_norm": 0.09703370183706284, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 233160 + }, + { + "epoch": 0.9013700112879034, + "grad_norm": 0.10532054305076599, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 233170 + }, + { + "epoch": 0.9014086684912866, + "grad_norm": 0.09363120049238205, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 233180 + }, + { + "epoch": 0.9014473256946699, + "grad_norm": 0.10732677578926086, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 233190 + }, + { + "epoch": 0.9014859828980533, + "grad_norm": 0.09427095949649811, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 233200 + }, + { + "epoch": 0.9015246401014365, + "grad_norm": 0.13079047203063965, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 233210 + }, + { + "epoch": 0.9015632973048198, + "grad_norm": 0.11108674108982086, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 233220 + }, + { + "epoch": 0.901601954508203, + "grad_norm": 0.09413529932498932, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 233230 + }, + { + "epoch": 0.9016406117115864, + "grad_norm": 0.10972938686609268, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 233240 + }, + { + "epoch": 0.9016792689149696, + "grad_norm": 0.11578845232725143, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 233250 + }, + { + "epoch": 0.9017179261183529, + "grad_norm": 0.1254650205373764, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 233260 + }, + { + "epoch": 0.9017565833217361, + "grad_norm": 0.09579180181026459, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 233270 + }, + { + "epoch": 0.9017952405251195, + "grad_norm": 0.09996544569730759, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 233280 + }, + { + "epoch": 0.9018338977285028, + "grad_norm": 0.09571780264377594, + "learning_rate": 0.002, + "loss": 2.336, + "step": 233290 + }, + { + "epoch": 0.901872554931886, + "grad_norm": 0.0973309725522995, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 233300 + }, + { + "epoch": 0.9019112121352693, + "grad_norm": 0.09493835270404816, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 233310 + }, + { + "epoch": 0.9019498693386525, + "grad_norm": 0.09489183127880096, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 233320 + }, + { + "epoch": 0.9019885265420359, + "grad_norm": 0.09764562547206879, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 233330 + }, + { + "epoch": 0.9020271837454191, + "grad_norm": 0.10663142800331116, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 233340 + }, + { + "epoch": 0.9020658409488024, + "grad_norm": 0.10935185104608536, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 233350 + }, + { + "epoch": 0.9021044981521856, + "grad_norm": 0.09917981177568436, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 233360 + }, + { + "epoch": 0.902143155355569, + "grad_norm": 0.12524504959583282, + "learning_rate": 0.002, + "loss": 2.338, + "step": 233370 + }, + { + "epoch": 0.9021818125589522, + "grad_norm": 0.10528213530778885, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 233380 + }, + { + "epoch": 0.9022204697623355, + "grad_norm": 0.12502014636993408, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 233390 + }, + { + "epoch": 0.9022591269657187, + "grad_norm": 0.10130679607391357, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 233400 + }, + { + "epoch": 0.9022977841691021, + "grad_norm": 0.09227457642555237, + "learning_rate": 0.002, + "loss": 2.322, + "step": 233410 + }, + { + "epoch": 0.9023364413724854, + "grad_norm": 0.09661556035280228, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 233420 + }, + { + "epoch": 0.9023750985758686, + "grad_norm": 0.10869093984365463, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 233430 + }, + { + "epoch": 0.9024137557792519, + "grad_norm": 0.11370524019002914, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 233440 + }, + { + "epoch": 0.9024524129826352, + "grad_norm": 0.10741494596004486, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 233450 + }, + { + "epoch": 0.9024910701860185, + "grad_norm": 0.11763168126344681, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 233460 + }, + { + "epoch": 0.9025297273894017, + "grad_norm": 0.09434393793344498, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 233470 + }, + { + "epoch": 0.902568384592785, + "grad_norm": 0.10174699127674103, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 233480 + }, + { + "epoch": 0.9026070417961684, + "grad_norm": 0.09130481630563736, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 233490 + }, + { + "epoch": 0.9026456989995516, + "grad_norm": 0.09817611426115036, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 233500 + }, + { + "epoch": 0.9026843562029349, + "grad_norm": 0.12424413859844208, + "learning_rate": 0.002, + "loss": 2.342, + "step": 233510 + }, + { + "epoch": 0.9027230134063181, + "grad_norm": 0.11780202388763428, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 233520 + }, + { + "epoch": 0.9027616706097014, + "grad_norm": 0.10289761424064636, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 233530 + }, + { + "epoch": 0.9028003278130847, + "grad_norm": 0.08934229612350464, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 233540 + }, + { + "epoch": 0.902838985016468, + "grad_norm": 0.11549816280603409, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 233550 + }, + { + "epoch": 0.9028776422198512, + "grad_norm": 0.10548436641693115, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 233560 + }, + { + "epoch": 0.9029162994232345, + "grad_norm": 0.09892436861991882, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 233570 + }, + { + "epoch": 0.9029549566266178, + "grad_norm": 0.11846853047609329, + "learning_rate": 0.002, + "loss": 2.344, + "step": 233580 + }, + { + "epoch": 0.9029936138300011, + "grad_norm": 0.10741759091615677, + "learning_rate": 0.002, + "loss": 2.345, + "step": 233590 + }, + { + "epoch": 0.9030322710333843, + "grad_norm": 0.09817928075790405, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 233600 + }, + { + "epoch": 0.9030709282367676, + "grad_norm": 0.1170048788189888, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 233610 + }, + { + "epoch": 0.903109585440151, + "grad_norm": 0.10460609942674637, + "learning_rate": 0.002, + "loss": 2.33, + "step": 233620 + }, + { + "epoch": 0.9031482426435342, + "grad_norm": 0.11310793459415436, + "learning_rate": 0.002, + "loss": 2.339, + "step": 233630 + }, + { + "epoch": 0.9031868998469175, + "grad_norm": 0.12793530523777008, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 233640 + }, + { + "epoch": 0.9032255570503007, + "grad_norm": 0.11377539485692978, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 233650 + }, + { + "epoch": 0.9032642142536841, + "grad_norm": 0.1026814803481102, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 233660 + }, + { + "epoch": 0.9033028714570673, + "grad_norm": 0.11845576763153076, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 233670 + }, + { + "epoch": 0.9033415286604506, + "grad_norm": 0.10374502837657928, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 233680 + }, + { + "epoch": 0.9033801858638338, + "grad_norm": 0.11612123996019363, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 233690 + }, + { + "epoch": 0.9034188430672171, + "grad_norm": 0.11562748998403549, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 233700 + }, + { + "epoch": 0.9034575002706005, + "grad_norm": 0.12471418082714081, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 233710 + }, + { + "epoch": 0.9034961574739837, + "grad_norm": 0.09950396418571472, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 233720 + }, + { + "epoch": 0.903534814677367, + "grad_norm": 0.10906396806240082, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 233730 + }, + { + "epoch": 0.9035734718807502, + "grad_norm": 0.11370264738798141, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 233740 + }, + { + "epoch": 0.9036121290841336, + "grad_norm": 0.11147429049015045, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 233750 + }, + { + "epoch": 0.9036507862875168, + "grad_norm": 0.10699936747550964, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 233760 + }, + { + "epoch": 0.9036894434909001, + "grad_norm": 0.12339533865451813, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 233770 + }, + { + "epoch": 0.9037281006942833, + "grad_norm": 0.118447445333004, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 233780 + }, + { + "epoch": 0.9037667578976667, + "grad_norm": 0.10083477944135666, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 233790 + }, + { + "epoch": 0.90380541510105, + "grad_norm": 0.1282907873392105, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 233800 + }, + { + "epoch": 0.9038440723044332, + "grad_norm": 0.11394553631544113, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 233810 + }, + { + "epoch": 0.9038827295078165, + "grad_norm": 0.0914125069975853, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 233820 + }, + { + "epoch": 0.9039213867111998, + "grad_norm": 0.10464632511138916, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 233830 + }, + { + "epoch": 0.9039600439145831, + "grad_norm": 0.11399640142917633, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 233840 + }, + { + "epoch": 0.9039987011179663, + "grad_norm": 0.11424519866704941, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 233850 + }, + { + "epoch": 0.9040373583213496, + "grad_norm": 0.12120144814252853, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 233860 + }, + { + "epoch": 0.9040760155247328, + "grad_norm": 0.10877831280231476, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 233870 + }, + { + "epoch": 0.9041146727281162, + "grad_norm": 0.15334224700927734, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 233880 + }, + { + "epoch": 0.9041533299314994, + "grad_norm": 0.12217187136411667, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 233890 + }, + { + "epoch": 0.9041919871348827, + "grad_norm": 0.11143625527620316, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 233900 + }, + { + "epoch": 0.9042306443382659, + "grad_norm": 0.1174347847700119, + "learning_rate": 0.002, + "loss": 2.334, + "step": 233910 + }, + { + "epoch": 0.9042693015416493, + "grad_norm": 0.09945125877857208, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 233920 + }, + { + "epoch": 0.9043079587450326, + "grad_norm": 0.09746522456407547, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 233930 + }, + { + "epoch": 0.9043466159484158, + "grad_norm": 0.12990349531173706, + "learning_rate": 0.002, + "loss": 2.34, + "step": 233940 + }, + { + "epoch": 0.9043852731517991, + "grad_norm": 0.11263786256313324, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 233950 + }, + { + "epoch": 0.9044239303551824, + "grad_norm": 0.11403274536132812, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 233960 + }, + { + "epoch": 0.9044625875585657, + "grad_norm": 0.08966249227523804, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 233970 + }, + { + "epoch": 0.9045012447619489, + "grad_norm": 0.1288156658411026, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 233980 + }, + { + "epoch": 0.9045399019653322, + "grad_norm": 0.09932001680135727, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 233990 + }, + { + "epoch": 0.9045785591687155, + "grad_norm": 0.10278764367103577, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 234000 + }, + { + "epoch": 0.9046172163720988, + "grad_norm": 0.10722273588180542, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 234010 + }, + { + "epoch": 0.904655873575482, + "grad_norm": 0.1080775111913681, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 234020 + }, + { + "epoch": 0.9046945307788653, + "grad_norm": 0.11944068223237991, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 234030 + }, + { + "epoch": 0.9047331879822487, + "grad_norm": 0.11086134612560272, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 234040 + }, + { + "epoch": 0.9047718451856319, + "grad_norm": 0.11324305832386017, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 234050 + }, + { + "epoch": 0.9048105023890152, + "grad_norm": 0.09748173505067825, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 234060 + }, + { + "epoch": 0.9048491595923984, + "grad_norm": 0.11554388701915741, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 234070 + }, + { + "epoch": 0.9048878167957817, + "grad_norm": 0.107016921043396, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 234080 + }, + { + "epoch": 0.904926473999165, + "grad_norm": 0.2644636631011963, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 234090 + }, + { + "epoch": 0.9049651312025483, + "grad_norm": 0.09860588610172272, + "learning_rate": 0.002, + "loss": 2.345, + "step": 234100 + }, + { + "epoch": 0.9050037884059315, + "grad_norm": 0.11851309984922409, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 234110 + }, + { + "epoch": 0.9050424456093148, + "grad_norm": 0.16849271953105927, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 234120 + }, + { + "epoch": 0.9050811028126982, + "grad_norm": 0.11365487426519394, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 234130 + }, + { + "epoch": 0.9051197600160814, + "grad_norm": 0.1182205006480217, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 234140 + }, + { + "epoch": 0.9051584172194647, + "grad_norm": 0.10879401117563248, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 234150 + }, + { + "epoch": 0.9051970744228479, + "grad_norm": 0.12132114171981812, + "learning_rate": 0.002, + "loss": 2.335, + "step": 234160 + }, + { + "epoch": 0.9052357316262313, + "grad_norm": 0.10799846798181534, + "learning_rate": 0.002, + "loss": 2.324, + "step": 234170 + }, + { + "epoch": 0.9052743888296145, + "grad_norm": 0.10171600431203842, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 234180 + }, + { + "epoch": 0.9053130460329978, + "grad_norm": 0.11105701327323914, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 234190 + }, + { + "epoch": 0.905351703236381, + "grad_norm": 0.11268547922372818, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 234200 + }, + { + "epoch": 0.9053903604397644, + "grad_norm": 0.11618811637163162, + "learning_rate": 0.002, + "loss": 2.327, + "step": 234210 + }, + { + "epoch": 0.9054290176431476, + "grad_norm": 0.11327293515205383, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 234220 + }, + { + "epoch": 0.9054676748465309, + "grad_norm": 0.44370654225349426, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 234230 + }, + { + "epoch": 0.9055063320499142, + "grad_norm": 0.12347027659416199, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 234240 + }, + { + "epoch": 0.9055449892532974, + "grad_norm": 0.10137667506933212, + "learning_rate": 0.002, + "loss": 2.353, + "step": 234250 + }, + { + "epoch": 0.9055836464566808, + "grad_norm": 0.11867160350084305, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 234260 + }, + { + "epoch": 0.905622303660064, + "grad_norm": 0.10813513398170471, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 234270 + }, + { + "epoch": 0.9056609608634473, + "grad_norm": 0.10353584587574005, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 234280 + }, + { + "epoch": 0.9056996180668305, + "grad_norm": 0.10264535993337631, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 234290 + }, + { + "epoch": 0.9057382752702139, + "grad_norm": 0.12715892493724823, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 234300 + }, + { + "epoch": 0.9057769324735971, + "grad_norm": 0.11884993314743042, + "learning_rate": 0.002, + "loss": 2.334, + "step": 234310 + }, + { + "epoch": 0.9058155896769804, + "grad_norm": 0.1040872186422348, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 234320 + }, + { + "epoch": 0.9058542468803636, + "grad_norm": 0.09612486511468887, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 234330 + }, + { + "epoch": 0.905892904083747, + "grad_norm": 0.10156971961259842, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 234340 + }, + { + "epoch": 0.9059315612871303, + "grad_norm": 0.09877484291791916, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 234350 + }, + { + "epoch": 0.9059702184905135, + "grad_norm": 0.1371445655822754, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 234360 + }, + { + "epoch": 0.9060088756938968, + "grad_norm": 0.0945514440536499, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 234370 + }, + { + "epoch": 0.9060475328972801, + "grad_norm": 0.10155311971902847, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 234380 + }, + { + "epoch": 0.9060861901006634, + "grad_norm": 0.0989159569144249, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 234390 + }, + { + "epoch": 0.9061248473040466, + "grad_norm": 0.11544256657361984, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 234400 + }, + { + "epoch": 0.9061635045074299, + "grad_norm": 0.09864155203104019, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 234410 + }, + { + "epoch": 0.9062021617108132, + "grad_norm": 0.10460689663887024, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 234420 + }, + { + "epoch": 0.9062408189141965, + "grad_norm": 0.1467406451702118, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 234430 + }, + { + "epoch": 0.9062794761175798, + "grad_norm": 0.10551851242780685, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 234440 + }, + { + "epoch": 0.906318133320963, + "grad_norm": 0.1010807529091835, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 234450 + }, + { + "epoch": 0.9063567905243463, + "grad_norm": 0.10369355976581573, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 234460 + }, + { + "epoch": 0.9063954477277296, + "grad_norm": 0.10652629286050797, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 234470 + }, + { + "epoch": 0.9064341049311129, + "grad_norm": 0.10208172351121902, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 234480 + }, + { + "epoch": 0.9064727621344961, + "grad_norm": 0.10453961789608002, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 234490 + }, + { + "epoch": 0.9065114193378794, + "grad_norm": 0.09997710585594177, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 234500 + }, + { + "epoch": 0.9065500765412627, + "grad_norm": 0.11260194331407547, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 234510 + }, + { + "epoch": 0.906588733744646, + "grad_norm": 0.11529659479856491, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 234520 + }, + { + "epoch": 0.9066273909480292, + "grad_norm": 0.09845767170190811, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 234530 + }, + { + "epoch": 0.9066660481514125, + "grad_norm": 0.10353818535804749, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 234540 + }, + { + "epoch": 0.9067047053547959, + "grad_norm": 0.09427450597286224, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 234550 + }, + { + "epoch": 0.9067433625581791, + "grad_norm": 0.11889142543077469, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 234560 + }, + { + "epoch": 0.9067820197615624, + "grad_norm": 0.10010946542024612, + "learning_rate": 0.002, + "loss": 2.35, + "step": 234570 + }, + { + "epoch": 0.9068206769649456, + "grad_norm": 0.09039853513240814, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 234580 + }, + { + "epoch": 0.906859334168329, + "grad_norm": 0.10822974890470505, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 234590 + }, + { + "epoch": 0.9068979913717122, + "grad_norm": 0.10804063826799393, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 234600 + }, + { + "epoch": 0.9069366485750955, + "grad_norm": 0.1527954339981079, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 234610 + }, + { + "epoch": 0.9069753057784787, + "grad_norm": 0.11379213631153107, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 234620 + }, + { + "epoch": 0.907013962981862, + "grad_norm": 0.11318988353013992, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 234630 + }, + { + "epoch": 0.9070526201852454, + "grad_norm": 0.1034546047449112, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 234640 + }, + { + "epoch": 0.9070912773886286, + "grad_norm": 0.09208408743143082, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 234650 + }, + { + "epoch": 0.9071299345920119, + "grad_norm": 0.10531508177518845, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 234660 + }, + { + "epoch": 0.9071685917953951, + "grad_norm": 0.13348685204982758, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 234670 + }, + { + "epoch": 0.9072072489987785, + "grad_norm": 0.09402325004339218, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 234680 + }, + { + "epoch": 0.9072459062021617, + "grad_norm": 0.10708408802747726, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 234690 + }, + { + "epoch": 0.907284563405545, + "grad_norm": 0.13726216554641724, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 234700 + }, + { + "epoch": 0.9073232206089282, + "grad_norm": 0.10366878658533096, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 234710 + }, + { + "epoch": 0.9073618778123116, + "grad_norm": 0.10182026773691177, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 234720 + }, + { + "epoch": 0.9074005350156948, + "grad_norm": 0.1012740358710289, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 234730 + }, + { + "epoch": 0.9074391922190781, + "grad_norm": 0.10341080278158188, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 234740 + }, + { + "epoch": 0.9074778494224613, + "grad_norm": 0.09370353817939758, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 234750 + }, + { + "epoch": 0.9075165066258447, + "grad_norm": 0.10617724061012268, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 234760 + }, + { + "epoch": 0.907555163829228, + "grad_norm": 0.09987720102071762, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 234770 + }, + { + "epoch": 0.9075938210326112, + "grad_norm": 0.11386090517044067, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 234780 + }, + { + "epoch": 0.9076324782359945, + "grad_norm": 0.09530167281627655, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 234790 + }, + { + "epoch": 0.9076711354393777, + "grad_norm": 0.12116479128599167, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 234800 + }, + { + "epoch": 0.9077097926427611, + "grad_norm": 0.10751676559448242, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 234810 + }, + { + "epoch": 0.9077484498461443, + "grad_norm": 0.10586011409759521, + "learning_rate": 0.002, + "loss": 2.3598, + "step": 234820 + }, + { + "epoch": 0.9077871070495276, + "grad_norm": 0.09758689254522324, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 234830 + }, + { + "epoch": 0.9078257642529108, + "grad_norm": 0.11259765177965164, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 234840 + }, + { + "epoch": 0.9078644214562942, + "grad_norm": 0.1003914549946785, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 234850 + }, + { + "epoch": 0.9079030786596775, + "grad_norm": 0.09935523569583893, + "learning_rate": 0.002, + "loss": 2.341, + "step": 234860 + }, + { + "epoch": 0.9079417358630607, + "grad_norm": 0.10035855323076248, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 234870 + }, + { + "epoch": 0.907980393066444, + "grad_norm": 0.08553393930196762, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 234880 + }, + { + "epoch": 0.9080190502698273, + "grad_norm": 0.1025664210319519, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 234890 + }, + { + "epoch": 0.9080577074732106, + "grad_norm": 0.1198197677731514, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 234900 + }, + { + "epoch": 0.9080963646765938, + "grad_norm": 0.11244381964206696, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 234910 + }, + { + "epoch": 0.9081350218799771, + "grad_norm": 0.10571009665727615, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 234920 + }, + { + "epoch": 0.9081736790833604, + "grad_norm": 0.12540297210216522, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 234930 + }, + { + "epoch": 0.9082123362867437, + "grad_norm": 0.10460840165615082, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 234940 + }, + { + "epoch": 0.908250993490127, + "grad_norm": 0.11541231721639633, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 234950 + }, + { + "epoch": 0.9082896506935102, + "grad_norm": 0.11149951815605164, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 234960 + }, + { + "epoch": 0.9083283078968936, + "grad_norm": 0.12732405960559845, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 234970 + }, + { + "epoch": 0.9083669651002768, + "grad_norm": 0.09354390949010849, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 234980 + }, + { + "epoch": 0.9084056223036601, + "grad_norm": 0.11542489379644394, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 234990 + }, + { + "epoch": 0.9084442795070433, + "grad_norm": 0.09353950619697571, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 235000 + }, + { + "epoch": 0.9084829367104266, + "grad_norm": 0.12837208807468414, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 235010 + }, + { + "epoch": 0.9085215939138099, + "grad_norm": 0.11282764375209808, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 235020 + }, + { + "epoch": 0.9085602511171932, + "grad_norm": 0.12078604847192764, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 235030 + }, + { + "epoch": 0.9085989083205764, + "grad_norm": 0.11550453305244446, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 235040 + }, + { + "epoch": 0.9086375655239597, + "grad_norm": 0.10718785226345062, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 235050 + }, + { + "epoch": 0.908676222727343, + "grad_norm": 0.21481357514858246, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 235060 + }, + { + "epoch": 0.9087148799307263, + "grad_norm": 0.12408475577831268, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 235070 + }, + { + "epoch": 0.9087535371341096, + "grad_norm": 0.10488598793745041, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 235080 + }, + { + "epoch": 0.9087921943374928, + "grad_norm": 0.10466601699590683, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 235090 + }, + { + "epoch": 0.9088308515408762, + "grad_norm": 0.1314753293991089, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 235100 + }, + { + "epoch": 0.9088695087442594, + "grad_norm": 0.11760780215263367, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 235110 + }, + { + "epoch": 0.9089081659476427, + "grad_norm": 0.10353752225637436, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 235120 + }, + { + "epoch": 0.9089468231510259, + "grad_norm": 0.1124408170580864, + "learning_rate": 0.002, + "loss": 2.3095, + "step": 235130 + }, + { + "epoch": 0.9089854803544093, + "grad_norm": 0.11053520441055298, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 235140 + }, + { + "epoch": 0.9090241375577925, + "grad_norm": 0.10394348949193954, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 235150 + }, + { + "epoch": 0.9090627947611758, + "grad_norm": 0.10440591722726822, + "learning_rate": 0.002, + "loss": 2.342, + "step": 235160 + }, + { + "epoch": 0.909101451964559, + "grad_norm": 0.09316018968820572, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 235170 + }, + { + "epoch": 0.9091401091679423, + "grad_norm": 0.14168131351470947, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 235180 + }, + { + "epoch": 0.9091787663713257, + "grad_norm": 0.09957494586706161, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 235190 + }, + { + "epoch": 0.9092174235747089, + "grad_norm": 0.09146034717559814, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 235200 + }, + { + "epoch": 0.9092560807780922, + "grad_norm": 0.11028812825679779, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 235210 + }, + { + "epoch": 0.9092947379814754, + "grad_norm": 0.09550734609365463, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 235220 + }, + { + "epoch": 0.9093333951848588, + "grad_norm": 0.10537036508321762, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 235230 + }, + { + "epoch": 0.909372052388242, + "grad_norm": 0.10891729593276978, + "learning_rate": 0.002, + "loss": 2.348, + "step": 235240 + }, + { + "epoch": 0.9094107095916253, + "grad_norm": 0.10347836464643478, + "learning_rate": 0.002, + "loss": 2.341, + "step": 235250 + }, + { + "epoch": 0.9094493667950085, + "grad_norm": 0.10081382840871811, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 235260 + }, + { + "epoch": 0.9094880239983919, + "grad_norm": 0.09710804373025894, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 235270 + }, + { + "epoch": 0.9095266812017752, + "grad_norm": 0.1072625070810318, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 235280 + }, + { + "epoch": 0.9095653384051584, + "grad_norm": 0.11930724233388901, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 235290 + }, + { + "epoch": 0.9096039956085417, + "grad_norm": 0.10877171903848648, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 235300 + }, + { + "epoch": 0.909642652811925, + "grad_norm": 0.1109236478805542, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 235310 + }, + { + "epoch": 0.9096813100153083, + "grad_norm": 0.10849165171384811, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 235320 + }, + { + "epoch": 0.9097199672186915, + "grad_norm": 0.09898397326469421, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 235330 + }, + { + "epoch": 0.9097586244220748, + "grad_norm": 0.09823174774646759, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 235340 + }, + { + "epoch": 0.9097972816254581, + "grad_norm": 0.10743778198957443, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 235350 + }, + { + "epoch": 0.9098359388288414, + "grad_norm": 0.1055048406124115, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 235360 + }, + { + "epoch": 0.9098745960322246, + "grad_norm": 0.11165113747119904, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 235370 + }, + { + "epoch": 0.9099132532356079, + "grad_norm": 0.10300975292921066, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 235380 + }, + { + "epoch": 0.9099519104389912, + "grad_norm": 0.12773124873638153, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 235390 + }, + { + "epoch": 0.9099905676423745, + "grad_norm": 0.1127619743347168, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 235400 + }, + { + "epoch": 0.9100292248457578, + "grad_norm": 0.09095638245344162, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 235410 + }, + { + "epoch": 0.910067882049141, + "grad_norm": 0.10773008316755295, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 235420 + }, + { + "epoch": 0.9101065392525243, + "grad_norm": 0.09691687673330307, + "learning_rate": 0.002, + "loss": 2.349, + "step": 235430 + }, + { + "epoch": 0.9101451964559076, + "grad_norm": 0.10554266721010208, + "learning_rate": 0.002, + "loss": 2.3074, + "step": 235440 + }, + { + "epoch": 0.9101838536592909, + "grad_norm": 0.14661146700382233, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 235450 + }, + { + "epoch": 0.9102225108626741, + "grad_norm": 0.10983286052942276, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 235460 + }, + { + "epoch": 0.9102611680660574, + "grad_norm": 0.1018320843577385, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 235470 + }, + { + "epoch": 0.9102998252694408, + "grad_norm": 0.10218097269535065, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 235480 + }, + { + "epoch": 0.910338482472824, + "grad_norm": 0.10761865228414536, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 235490 + }, + { + "epoch": 0.9103771396762073, + "grad_norm": 0.1041545644402504, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 235500 + }, + { + "epoch": 0.9104157968795905, + "grad_norm": 0.11502991616725922, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 235510 + }, + { + "epoch": 0.9104544540829739, + "grad_norm": 0.08802340924739838, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 235520 + }, + { + "epoch": 0.9104931112863571, + "grad_norm": 0.0944049060344696, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 235530 + }, + { + "epoch": 0.9105317684897404, + "grad_norm": 0.11132679879665375, + "learning_rate": 0.002, + "loss": 2.343, + "step": 235540 + }, + { + "epoch": 0.9105704256931236, + "grad_norm": 0.11546406894922256, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 235550 + }, + { + "epoch": 0.9106090828965069, + "grad_norm": 0.09580505639314651, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 235560 + }, + { + "epoch": 0.9106477400998902, + "grad_norm": 0.09115248918533325, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 235570 + }, + { + "epoch": 0.9106863973032735, + "grad_norm": 0.09833600372076035, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 235580 + }, + { + "epoch": 0.9107250545066568, + "grad_norm": 0.10657432675361633, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 235590 + }, + { + "epoch": 0.91076371171004, + "grad_norm": 0.11782419681549072, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 235600 + }, + { + "epoch": 0.9108023689134234, + "grad_norm": 0.1045074462890625, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 235610 + }, + { + "epoch": 0.9108410261168066, + "grad_norm": 0.09903860092163086, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 235620 + }, + { + "epoch": 0.9108796833201899, + "grad_norm": 0.1405385136604309, + "learning_rate": 0.002, + "loss": 2.34, + "step": 235630 + }, + { + "epoch": 0.9109183405235731, + "grad_norm": 0.1074899360537529, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 235640 + }, + { + "epoch": 0.9109569977269565, + "grad_norm": 0.11212822794914246, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 235650 + }, + { + "epoch": 0.9109956549303397, + "grad_norm": 0.0984254702925682, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 235660 + }, + { + "epoch": 0.911034312133723, + "grad_norm": 0.11358426511287689, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 235670 + }, + { + "epoch": 0.9110729693371062, + "grad_norm": 0.11881627887487411, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 235680 + }, + { + "epoch": 0.9111116265404896, + "grad_norm": 0.09670265763998032, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 235690 + }, + { + "epoch": 0.9111502837438729, + "grad_norm": 0.0945124551653862, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 235700 + }, + { + "epoch": 0.9111889409472561, + "grad_norm": 0.09245097637176514, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 235710 + }, + { + "epoch": 0.9112275981506394, + "grad_norm": 0.11580207198858261, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 235720 + }, + { + "epoch": 0.9112662553540226, + "grad_norm": 0.11975334584712982, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 235730 + }, + { + "epoch": 0.911304912557406, + "grad_norm": 0.10776902735233307, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 235740 + }, + { + "epoch": 0.9113435697607892, + "grad_norm": 0.1209128350019455, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 235750 + }, + { + "epoch": 0.9113822269641725, + "grad_norm": 0.10492858290672302, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 235760 + }, + { + "epoch": 0.9114208841675557, + "grad_norm": 0.10763996839523315, + "learning_rate": 0.002, + "loss": 2.324, + "step": 235770 + }, + { + "epoch": 0.9114595413709391, + "grad_norm": 0.12180914729833603, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 235780 + }, + { + "epoch": 0.9114981985743223, + "grad_norm": 0.1048932746052742, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 235790 + }, + { + "epoch": 0.9115368557777056, + "grad_norm": 0.1038237065076828, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 235800 + }, + { + "epoch": 0.9115755129810889, + "grad_norm": 0.10732347518205643, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 235810 + }, + { + "epoch": 0.9116141701844722, + "grad_norm": 0.08621811866760254, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 235820 + }, + { + "epoch": 0.9116528273878555, + "grad_norm": 0.12064923346042633, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 235830 + }, + { + "epoch": 0.9116914845912387, + "grad_norm": 0.09688537567853928, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 235840 + }, + { + "epoch": 0.911730141794622, + "grad_norm": 0.11360147595405579, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 235850 + }, + { + "epoch": 0.9117687989980053, + "grad_norm": 0.11028148233890533, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 235860 + }, + { + "epoch": 0.9118074562013886, + "grad_norm": 0.10759235918521881, + "learning_rate": 0.002, + "loss": 2.324, + "step": 235870 + }, + { + "epoch": 0.9118461134047718, + "grad_norm": 0.09140212833881378, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 235880 + }, + { + "epoch": 0.9118847706081551, + "grad_norm": 0.15058700740337372, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 235890 + }, + { + "epoch": 0.9119234278115385, + "grad_norm": 0.11045285314321518, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 235900 + }, + { + "epoch": 0.9119620850149217, + "grad_norm": 0.09746697545051575, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 235910 + }, + { + "epoch": 0.912000742218305, + "grad_norm": 0.11804035305976868, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 235920 + }, + { + "epoch": 0.9120393994216882, + "grad_norm": 0.098781518638134, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 235930 + }, + { + "epoch": 0.9120780566250715, + "grad_norm": 0.11542218923568726, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 235940 + }, + { + "epoch": 0.9121167138284548, + "grad_norm": 0.10973048210144043, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 235950 + }, + { + "epoch": 0.9121553710318381, + "grad_norm": 0.100016288459301, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 235960 + }, + { + "epoch": 0.9121940282352213, + "grad_norm": 0.1159529760479927, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 235970 + }, + { + "epoch": 0.9122326854386046, + "grad_norm": 0.09956318140029907, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 235980 + }, + { + "epoch": 0.912271342641988, + "grad_norm": 0.1090196743607521, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 235990 + }, + { + "epoch": 0.9123099998453712, + "grad_norm": 0.10806025564670563, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 236000 + }, + { + "epoch": 0.9123486570487545, + "grad_norm": 0.12057144939899445, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 236010 + }, + { + "epoch": 0.9123873142521377, + "grad_norm": 0.09386761486530304, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 236020 + }, + { + "epoch": 0.9124259714555211, + "grad_norm": 0.09779149293899536, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 236030 + }, + { + "epoch": 0.9124646286589043, + "grad_norm": 0.11577455699443817, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 236040 + }, + { + "epoch": 0.9125032858622876, + "grad_norm": 0.09398800879716873, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 236050 + }, + { + "epoch": 0.9125419430656708, + "grad_norm": 0.09047604352235794, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 236060 + }, + { + "epoch": 0.9125806002690542, + "grad_norm": 0.12920865416526794, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 236070 + }, + { + "epoch": 0.9126192574724374, + "grad_norm": 0.10851120203733444, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 236080 + }, + { + "epoch": 0.9126579146758207, + "grad_norm": 0.1102474182844162, + "learning_rate": 0.002, + "loss": 2.356, + "step": 236090 + }, + { + "epoch": 0.9126965718792039, + "grad_norm": 0.10902708768844604, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 236100 + }, + { + "epoch": 0.9127352290825872, + "grad_norm": 0.0967753529548645, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 236110 + }, + { + "epoch": 0.9127738862859706, + "grad_norm": 0.11337198317050934, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 236120 + }, + { + "epoch": 0.9128125434893538, + "grad_norm": 0.11363617330789566, + "learning_rate": 0.002, + "loss": 2.323, + "step": 236130 + }, + { + "epoch": 0.9128512006927371, + "grad_norm": 0.1053314283490181, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 236140 + }, + { + "epoch": 0.9128898578961203, + "grad_norm": 0.11603876203298569, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 236150 + }, + { + "epoch": 0.9129285150995037, + "grad_norm": 0.10162254422903061, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 236160 + }, + { + "epoch": 0.9129671723028869, + "grad_norm": 0.10282709449529648, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 236170 + }, + { + "epoch": 0.9130058295062702, + "grad_norm": 0.11641443520784378, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 236180 + }, + { + "epoch": 0.9130444867096534, + "grad_norm": 0.11190035939216614, + "learning_rate": 0.002, + "loss": 2.335, + "step": 236190 + }, + { + "epoch": 0.9130831439130368, + "grad_norm": 0.10466290265321732, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 236200 + }, + { + "epoch": 0.91312180111642, + "grad_norm": 0.10124977678060532, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 236210 + }, + { + "epoch": 0.9131604583198033, + "grad_norm": 0.12094393372535706, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 236220 + }, + { + "epoch": 0.9131991155231866, + "grad_norm": 0.1088256910443306, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 236230 + }, + { + "epoch": 0.9132377727265699, + "grad_norm": 0.12817198038101196, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 236240 + }, + { + "epoch": 0.9132764299299532, + "grad_norm": 0.0982198640704155, + "learning_rate": 0.002, + "loss": 2.34, + "step": 236250 + }, + { + "epoch": 0.9133150871333364, + "grad_norm": 0.1014394536614418, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 236260 + }, + { + "epoch": 0.9133537443367197, + "grad_norm": 0.11380444467067719, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 236270 + }, + { + "epoch": 0.913392401540103, + "grad_norm": 0.10722827911376953, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 236280 + }, + { + "epoch": 0.9134310587434863, + "grad_norm": 0.1058383509516716, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 236290 + }, + { + "epoch": 0.9134697159468695, + "grad_norm": 0.11134719848632812, + "learning_rate": 0.002, + "loss": 2.328, + "step": 236300 + }, + { + "epoch": 0.9135083731502528, + "grad_norm": 0.10544317960739136, + "learning_rate": 0.002, + "loss": 2.334, + "step": 236310 + }, + { + "epoch": 0.913547030353636, + "grad_norm": 0.10530325025320053, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 236320 + }, + { + "epoch": 0.9135856875570194, + "grad_norm": 0.11673657596111298, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 236330 + }, + { + "epoch": 0.9136243447604027, + "grad_norm": 0.10748418420553207, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 236340 + }, + { + "epoch": 0.9136630019637859, + "grad_norm": 0.10341091454029083, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 236350 + }, + { + "epoch": 0.9137016591671692, + "grad_norm": 0.09703026711940765, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 236360 + }, + { + "epoch": 0.9137403163705525, + "grad_norm": 0.12050410360097885, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 236370 + }, + { + "epoch": 0.9137789735739358, + "grad_norm": 0.09898030012845993, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 236380 + }, + { + "epoch": 0.913817630777319, + "grad_norm": 0.12472639232873917, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 236390 + }, + { + "epoch": 0.9138562879807023, + "grad_norm": 0.10752347111701965, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 236400 + }, + { + "epoch": 0.9138949451840856, + "grad_norm": 0.10269351303577423, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 236410 + }, + { + "epoch": 0.9139336023874689, + "grad_norm": 0.12358276546001434, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 236420 + }, + { + "epoch": 0.9139722595908522, + "grad_norm": 0.10591723769903183, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 236430 + }, + { + "epoch": 0.9140109167942354, + "grad_norm": 0.11765004694461823, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 236440 + }, + { + "epoch": 0.9140495739976188, + "grad_norm": 0.10490332543849945, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 236450 + }, + { + "epoch": 0.914088231201002, + "grad_norm": 0.09827043116092682, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 236460 + }, + { + "epoch": 0.9141268884043853, + "grad_norm": 0.1087154895067215, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 236470 + }, + { + "epoch": 0.9141655456077685, + "grad_norm": 0.10880590975284576, + "learning_rate": 0.002, + "loss": 2.343, + "step": 236480 + }, + { + "epoch": 0.9142042028111518, + "grad_norm": 0.10290855914354324, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 236490 + }, + { + "epoch": 0.9142428600145351, + "grad_norm": 0.10896014422178268, + "learning_rate": 0.002, + "loss": 2.352, + "step": 236500 + }, + { + "epoch": 0.9142815172179184, + "grad_norm": 0.12035848945379257, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 236510 + }, + { + "epoch": 0.9143201744213016, + "grad_norm": 0.10809791088104248, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 236520 + }, + { + "epoch": 0.9143588316246849, + "grad_norm": 0.10620492696762085, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 236530 + }, + { + "epoch": 0.9143974888280683, + "grad_norm": 0.10088472813367844, + "learning_rate": 0.002, + "loss": 2.341, + "step": 236540 + }, + { + "epoch": 0.9144361460314515, + "grad_norm": 0.09428628534078598, + "learning_rate": 0.002, + "loss": 2.345, + "step": 236550 + }, + { + "epoch": 0.9144748032348348, + "grad_norm": 0.11219991743564606, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 236560 + }, + { + "epoch": 0.914513460438218, + "grad_norm": 0.10495533049106598, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 236570 + }, + { + "epoch": 0.9145521176416014, + "grad_norm": 0.10331616550683975, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 236580 + }, + { + "epoch": 0.9145907748449846, + "grad_norm": 0.1208433285355568, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 236590 + }, + { + "epoch": 0.9146294320483679, + "grad_norm": 0.10469435900449753, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 236600 + }, + { + "epoch": 0.9146680892517511, + "grad_norm": 0.10302191227674484, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 236610 + }, + { + "epoch": 0.9147067464551345, + "grad_norm": 0.10604588687419891, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 236620 + }, + { + "epoch": 0.9147454036585178, + "grad_norm": 0.10278229415416718, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 236630 + }, + { + "epoch": 0.914784060861901, + "grad_norm": 0.10099917650222778, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 236640 + }, + { + "epoch": 0.9148227180652843, + "grad_norm": 0.10236818343400955, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 236650 + }, + { + "epoch": 0.9148613752686675, + "grad_norm": 0.1070743128657341, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 236660 + }, + { + "epoch": 0.9149000324720509, + "grad_norm": 0.10761519521474838, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 236670 + }, + { + "epoch": 0.9149386896754341, + "grad_norm": 0.1201874390244484, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 236680 + }, + { + "epoch": 0.9149773468788174, + "grad_norm": 0.10888718068599701, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 236690 + }, + { + "epoch": 0.9150160040822006, + "grad_norm": 0.10654252767562866, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 236700 + }, + { + "epoch": 0.915054661285584, + "grad_norm": 0.10304878652095795, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 236710 + }, + { + "epoch": 0.9150933184889672, + "grad_norm": 0.10618226230144501, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 236720 + }, + { + "epoch": 0.9151319756923505, + "grad_norm": 0.09469097852706909, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 236730 + }, + { + "epoch": 0.9151706328957337, + "grad_norm": 0.10468887537717819, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 236740 + }, + { + "epoch": 0.9152092900991171, + "grad_norm": 0.1156468614935875, + "learning_rate": 0.002, + "loss": 2.341, + "step": 236750 + }, + { + "epoch": 0.9152479473025004, + "grad_norm": 0.11189986765384674, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 236760 + }, + { + "epoch": 0.9152866045058836, + "grad_norm": 0.10088913142681122, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 236770 + }, + { + "epoch": 0.9153252617092669, + "grad_norm": 0.09479598701000214, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 236780 + }, + { + "epoch": 0.9153639189126502, + "grad_norm": 0.09515275061130524, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 236790 + }, + { + "epoch": 0.9154025761160335, + "grad_norm": 0.10245562344789505, + "learning_rate": 0.002, + "loss": 2.331, + "step": 236800 + }, + { + "epoch": 0.9154412333194167, + "grad_norm": 0.10947719216346741, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 236810 + }, + { + "epoch": 0.9154798905228, + "grad_norm": 0.10580668598413467, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 236820 + }, + { + "epoch": 0.9155185477261834, + "grad_norm": 0.09530249238014221, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 236830 + }, + { + "epoch": 0.9155572049295666, + "grad_norm": 0.13017530739307404, + "learning_rate": 0.002, + "loss": 2.338, + "step": 236840 + }, + { + "epoch": 0.9155958621329499, + "grad_norm": 0.09841717034578323, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 236850 + }, + { + "epoch": 0.9156345193363331, + "grad_norm": 0.11240272223949432, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 236860 + }, + { + "epoch": 0.9156731765397164, + "grad_norm": 0.10746175050735474, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 236870 + }, + { + "epoch": 0.9157118337430997, + "grad_norm": 0.10742993652820587, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 236880 + }, + { + "epoch": 0.915750490946483, + "grad_norm": 0.10194318741559982, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 236890 + }, + { + "epoch": 0.9157891481498662, + "grad_norm": 0.10081067681312561, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 236900 + }, + { + "epoch": 0.9158278053532495, + "grad_norm": 0.10650233179330826, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 236910 + }, + { + "epoch": 0.9158664625566328, + "grad_norm": 0.10834755748510361, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 236920 + }, + { + "epoch": 0.9159051197600161, + "grad_norm": 0.1015695258975029, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 236930 + }, + { + "epoch": 0.9159437769633993, + "grad_norm": 0.10104040801525116, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 236940 + }, + { + "epoch": 0.9159824341667826, + "grad_norm": 0.11582430452108383, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 236950 + }, + { + "epoch": 0.916021091370166, + "grad_norm": 0.11996284872293472, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 236960 + }, + { + "epoch": 0.9160597485735492, + "grad_norm": 0.09137024730443954, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 236970 + }, + { + "epoch": 0.9160984057769325, + "grad_norm": 0.17459522187709808, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 236980 + }, + { + "epoch": 0.9161370629803157, + "grad_norm": 0.18123406171798706, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 236990 + }, + { + "epoch": 0.9161757201836991, + "grad_norm": 0.11360806226730347, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 237000 + }, + { + "epoch": 0.9162143773870823, + "grad_norm": 0.1017279401421547, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 237010 + }, + { + "epoch": 0.9162530345904656, + "grad_norm": 0.09900280088186264, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 237020 + }, + { + "epoch": 0.9162916917938488, + "grad_norm": 0.09855163097381592, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 237030 + }, + { + "epoch": 0.9163303489972321, + "grad_norm": 0.10126887261867523, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 237040 + }, + { + "epoch": 0.9163690062006155, + "grad_norm": 0.13277678191661835, + "learning_rate": 0.002, + "loss": 2.337, + "step": 237050 + }, + { + "epoch": 0.9164076634039987, + "grad_norm": 0.11973906308412552, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 237060 + }, + { + "epoch": 0.916446320607382, + "grad_norm": 0.09777180850505829, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 237070 + }, + { + "epoch": 0.9164849778107652, + "grad_norm": 0.09020551294088364, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 237080 + }, + { + "epoch": 0.9165236350141486, + "grad_norm": 0.11669912189245224, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 237090 + }, + { + "epoch": 0.9165622922175318, + "grad_norm": 0.10898596793413162, + "learning_rate": 0.002, + "loss": 2.3609, + "step": 237100 + }, + { + "epoch": 0.9166009494209151, + "grad_norm": 0.09796357899904251, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 237110 + }, + { + "epoch": 0.9166396066242983, + "grad_norm": 0.10369439423084259, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 237120 + }, + { + "epoch": 0.9166782638276817, + "grad_norm": 0.09319931268692017, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 237130 + }, + { + "epoch": 0.916716921031065, + "grad_norm": 0.08715303987264633, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 237140 + }, + { + "epoch": 0.9167555782344482, + "grad_norm": 0.09676074236631393, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 237150 + }, + { + "epoch": 0.9167942354378315, + "grad_norm": 0.10120874643325806, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 237160 + }, + { + "epoch": 0.9168328926412148, + "grad_norm": 0.10892495512962341, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 237170 + }, + { + "epoch": 0.9168715498445981, + "grad_norm": 0.10513322800397873, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 237180 + }, + { + "epoch": 0.9169102070479813, + "grad_norm": 0.11788270622491837, + "learning_rate": 0.002, + "loss": 2.328, + "step": 237190 + }, + { + "epoch": 0.9169488642513646, + "grad_norm": 0.11247450858354568, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 237200 + }, + { + "epoch": 0.9169875214547478, + "grad_norm": 0.10070424526929855, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 237210 + }, + { + "epoch": 0.9170261786581312, + "grad_norm": 0.11775757372379303, + "learning_rate": 0.002, + "loss": 2.348, + "step": 237220 + }, + { + "epoch": 0.9170648358615144, + "grad_norm": 0.09787221252918243, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 237230 + }, + { + "epoch": 0.9171034930648977, + "grad_norm": 0.10628794878721237, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 237240 + }, + { + "epoch": 0.9171421502682809, + "grad_norm": 0.11313777416944504, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 237250 + }, + { + "epoch": 0.9171808074716643, + "grad_norm": 0.12426883727312088, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 237260 + }, + { + "epoch": 0.9172194646750476, + "grad_norm": 0.11160732060670853, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 237270 + }, + { + "epoch": 0.9172581218784308, + "grad_norm": 0.13596603274345398, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 237280 + }, + { + "epoch": 0.9172967790818141, + "grad_norm": 0.11814679205417633, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 237290 + }, + { + "epoch": 0.9173354362851974, + "grad_norm": 0.10138362646102905, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 237300 + }, + { + "epoch": 0.9173740934885807, + "grad_norm": 0.09971548616886139, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 237310 + }, + { + "epoch": 0.9174127506919639, + "grad_norm": 0.12172117829322815, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 237320 + }, + { + "epoch": 0.9174514078953472, + "grad_norm": 0.10093490779399872, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 237330 + }, + { + "epoch": 0.9174900650987305, + "grad_norm": 0.13062350451946259, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 237340 + }, + { + "epoch": 0.9175287223021138, + "grad_norm": 0.09960246086120605, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 237350 + }, + { + "epoch": 0.917567379505497, + "grad_norm": 0.13497884571552277, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 237360 + }, + { + "epoch": 0.9176060367088803, + "grad_norm": 0.09554940462112427, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 237370 + }, + { + "epoch": 0.9176446939122637, + "grad_norm": 0.1013389453291893, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 237380 + }, + { + "epoch": 0.9176833511156469, + "grad_norm": 0.11148456484079361, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 237390 + }, + { + "epoch": 0.9177220083190302, + "grad_norm": 0.08915894478559494, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 237400 + }, + { + "epoch": 0.9177606655224134, + "grad_norm": 0.1016797125339508, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 237410 + }, + { + "epoch": 0.9177993227257967, + "grad_norm": 0.09685633331537247, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 237420 + }, + { + "epoch": 0.91783797992918, + "grad_norm": 0.11382238566875458, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 237430 + }, + { + "epoch": 0.9178766371325633, + "grad_norm": 0.11097710579633713, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 237440 + }, + { + "epoch": 0.9179152943359465, + "grad_norm": 0.09930465370416641, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 237450 + }, + { + "epoch": 0.9179539515393298, + "grad_norm": 0.11404252797365189, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 237460 + }, + { + "epoch": 0.9179926087427132, + "grad_norm": 0.10939346253871918, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 237470 + }, + { + "epoch": 0.9180312659460964, + "grad_norm": 0.11002951115369797, + "learning_rate": 0.002, + "loss": 2.351, + "step": 237480 + }, + { + "epoch": 0.9180699231494797, + "grad_norm": 0.09177740663290024, + "learning_rate": 0.002, + "loss": 2.34, + "step": 237490 + }, + { + "epoch": 0.9181085803528629, + "grad_norm": 0.09964168071746826, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 237500 + }, + { + "epoch": 0.9181472375562463, + "grad_norm": 0.11763204634189606, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 237510 + }, + { + "epoch": 0.9181858947596295, + "grad_norm": 0.1089107096195221, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 237520 + }, + { + "epoch": 0.9182245519630128, + "grad_norm": 0.10073132067918777, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 237530 + }, + { + "epoch": 0.918263209166396, + "grad_norm": 0.09807328134775162, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 237540 + }, + { + "epoch": 0.9183018663697794, + "grad_norm": 0.12475041300058365, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 237550 + }, + { + "epoch": 0.9183405235731626, + "grad_norm": 0.12477726489305496, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 237560 + }, + { + "epoch": 0.9183791807765459, + "grad_norm": 0.2605946362018585, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 237570 + }, + { + "epoch": 0.9184178379799292, + "grad_norm": 0.09945245087146759, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 237580 + }, + { + "epoch": 0.9184564951833124, + "grad_norm": 0.10772482305765152, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 237590 + }, + { + "epoch": 0.9184951523866958, + "grad_norm": 0.1343308538198471, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 237600 + }, + { + "epoch": 0.918533809590079, + "grad_norm": 0.10860157757997513, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 237610 + }, + { + "epoch": 0.9185724667934623, + "grad_norm": 0.23557095229625702, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 237620 + }, + { + "epoch": 0.9186111239968455, + "grad_norm": 0.09972846508026123, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 237630 + }, + { + "epoch": 0.9186497812002289, + "grad_norm": 0.1198374405503273, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 237640 + }, + { + "epoch": 0.9186884384036121, + "grad_norm": 0.09672702848911285, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 237650 + }, + { + "epoch": 0.9187270956069954, + "grad_norm": 0.10464288294315338, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 237660 + }, + { + "epoch": 0.9187657528103786, + "grad_norm": 0.11569831520318985, + "learning_rate": 0.002, + "loss": 2.325, + "step": 237670 + }, + { + "epoch": 0.918804410013762, + "grad_norm": 0.10307703167200089, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 237680 + }, + { + "epoch": 0.9188430672171453, + "grad_norm": 0.10956721007823944, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 237690 + }, + { + "epoch": 0.9188817244205285, + "grad_norm": 0.11558805406093597, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 237700 + }, + { + "epoch": 0.9189203816239118, + "grad_norm": 0.09959175437688828, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 237710 + }, + { + "epoch": 0.9189590388272951, + "grad_norm": 0.11553435772657394, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 237720 + }, + { + "epoch": 0.9189976960306784, + "grad_norm": 0.10117180645465851, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 237730 + }, + { + "epoch": 0.9190363532340616, + "grad_norm": 0.09056204557418823, + "learning_rate": 0.002, + "loss": 2.329, + "step": 237740 + }, + { + "epoch": 0.9190750104374449, + "grad_norm": 0.10344156622886658, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 237750 + }, + { + "epoch": 0.9191136676408282, + "grad_norm": 0.12726548314094543, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 237760 + }, + { + "epoch": 0.9191523248442115, + "grad_norm": 0.10069930553436279, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 237770 + }, + { + "epoch": 0.9191909820475948, + "grad_norm": 0.1296139508485794, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 237780 + }, + { + "epoch": 0.919229639250978, + "grad_norm": 0.10524491220712662, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 237790 + }, + { + "epoch": 0.9192682964543613, + "grad_norm": 0.08840267360210419, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 237800 + }, + { + "epoch": 0.9193069536577446, + "grad_norm": 0.10867176949977875, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 237810 + }, + { + "epoch": 0.9193456108611279, + "grad_norm": 0.10205422341823578, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 237820 + }, + { + "epoch": 0.9193842680645111, + "grad_norm": 0.0953567698597908, + "learning_rate": 0.002, + "loss": 2.327, + "step": 237830 + }, + { + "epoch": 0.9194229252678944, + "grad_norm": 0.16813983023166656, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 237840 + }, + { + "epoch": 0.9194615824712777, + "grad_norm": 0.10395313054323196, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 237850 + }, + { + "epoch": 0.919500239674661, + "grad_norm": 0.10055957734584808, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 237860 + }, + { + "epoch": 0.9195388968780442, + "grad_norm": 0.09994012117385864, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 237870 + }, + { + "epoch": 0.9195775540814275, + "grad_norm": 0.10582417994737625, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 237880 + }, + { + "epoch": 0.9196162112848109, + "grad_norm": 0.1077481359243393, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 237890 + }, + { + "epoch": 0.9196548684881941, + "grad_norm": 0.10383408516645432, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 237900 + }, + { + "epoch": 0.9196935256915774, + "grad_norm": 0.09966985136270523, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 237910 + }, + { + "epoch": 0.9197321828949606, + "grad_norm": 0.09650438278913498, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 237920 + }, + { + "epoch": 0.919770840098344, + "grad_norm": 0.11974448710680008, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 237930 + }, + { + "epoch": 0.9198094973017272, + "grad_norm": 0.10671281069517136, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 237940 + }, + { + "epoch": 0.9198481545051105, + "grad_norm": 0.10487055033445358, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 237950 + }, + { + "epoch": 0.9198868117084937, + "grad_norm": 0.10726416110992432, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 237960 + }, + { + "epoch": 0.919925468911877, + "grad_norm": 0.10068479925394058, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 237970 + }, + { + "epoch": 0.9199641261152603, + "grad_norm": 0.10443327575922012, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 237980 + }, + { + "epoch": 0.9200027833186436, + "grad_norm": 0.09864545613527298, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 237990 + }, + { + "epoch": 0.9200414405220269, + "grad_norm": 0.09579648077487946, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 238000 + }, + { + "epoch": 0.9200800977254101, + "grad_norm": 0.12088295072317123, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 238010 + }, + { + "epoch": 0.9201187549287935, + "grad_norm": 0.11437676101922989, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 238020 + }, + { + "epoch": 0.9201574121321767, + "grad_norm": 0.09059667587280273, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 238030 + }, + { + "epoch": 0.92019606933556, + "grad_norm": 0.11837775260210037, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 238040 + }, + { + "epoch": 0.9202347265389432, + "grad_norm": 0.10595916956663132, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 238050 + }, + { + "epoch": 0.9202733837423266, + "grad_norm": 0.12788908183574677, + "learning_rate": 0.002, + "loss": 2.352, + "step": 238060 + }, + { + "epoch": 0.9203120409457098, + "grad_norm": 0.102462038397789, + "learning_rate": 0.002, + "loss": 2.342, + "step": 238070 + }, + { + "epoch": 0.9203506981490931, + "grad_norm": 0.10429543256759644, + "learning_rate": 0.002, + "loss": 2.347, + "step": 238080 + }, + { + "epoch": 0.9203893553524763, + "grad_norm": 0.10784517228603363, + "learning_rate": 0.002, + "loss": 2.333, + "step": 238090 + }, + { + "epoch": 0.9204280125558597, + "grad_norm": 0.12058625370264053, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 238100 + }, + { + "epoch": 0.920466669759243, + "grad_norm": 0.10487110912799835, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 238110 + }, + { + "epoch": 0.9205053269626262, + "grad_norm": 0.1079377681016922, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 238120 + }, + { + "epoch": 0.9205439841660095, + "grad_norm": 0.11104608327150345, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 238130 + }, + { + "epoch": 0.9205826413693927, + "grad_norm": 0.12503919005393982, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 238140 + }, + { + "epoch": 0.9206212985727761, + "grad_norm": 0.10990816354751587, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 238150 + }, + { + "epoch": 0.9206599557761593, + "grad_norm": 0.11753184348344803, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 238160 + }, + { + "epoch": 0.9206986129795426, + "grad_norm": 0.11438991874456406, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 238170 + }, + { + "epoch": 0.9207372701829258, + "grad_norm": 0.11788298934698105, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 238180 + }, + { + "epoch": 0.9207759273863092, + "grad_norm": 0.1082969680428505, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 238190 + }, + { + "epoch": 0.9208145845896925, + "grad_norm": 0.12433382868766785, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 238200 + }, + { + "epoch": 0.9208532417930757, + "grad_norm": 0.12203840911388397, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 238210 + }, + { + "epoch": 0.920891898996459, + "grad_norm": 0.128168523311615, + "learning_rate": 0.002, + "loss": 2.33, + "step": 238220 + }, + { + "epoch": 0.9209305561998423, + "grad_norm": 0.09707853198051453, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 238230 + }, + { + "epoch": 0.9209692134032256, + "grad_norm": 0.12274660915136337, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 238240 + }, + { + "epoch": 0.9210078706066088, + "grad_norm": 0.10505618900060654, + "learning_rate": 0.002, + "loss": 2.333, + "step": 238250 + }, + { + "epoch": 0.9210465278099921, + "grad_norm": 0.08565858006477356, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 238260 + }, + { + "epoch": 0.9210851850133754, + "grad_norm": 0.10795660316944122, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 238270 + }, + { + "epoch": 0.9211238422167587, + "grad_norm": 0.1365453600883484, + "learning_rate": 0.002, + "loss": 2.343, + "step": 238280 + }, + { + "epoch": 0.921162499420142, + "grad_norm": 0.11069447547197342, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 238290 + }, + { + "epoch": 0.9212011566235252, + "grad_norm": 0.11392568796873093, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 238300 + }, + { + "epoch": 0.9212398138269086, + "grad_norm": 0.11084969341754913, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 238310 + }, + { + "epoch": 0.9212784710302918, + "grad_norm": 0.1112273707985878, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 238320 + }, + { + "epoch": 0.9213171282336751, + "grad_norm": 0.10770048201084137, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 238330 + }, + { + "epoch": 0.9213557854370583, + "grad_norm": 0.10765349119901657, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 238340 + }, + { + "epoch": 0.9213944426404416, + "grad_norm": 0.10025139898061752, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 238350 + }, + { + "epoch": 0.9214330998438249, + "grad_norm": 0.1013595461845398, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 238360 + }, + { + "epoch": 0.9214717570472082, + "grad_norm": 0.09769086539745331, + "learning_rate": 0.002, + "loss": 2.331, + "step": 238370 + }, + { + "epoch": 0.9215104142505914, + "grad_norm": 0.09939318150281906, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 238380 + }, + { + "epoch": 0.9215490714539747, + "grad_norm": 0.10779769718647003, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 238390 + }, + { + "epoch": 0.921587728657358, + "grad_norm": 0.11342509835958481, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 238400 + }, + { + "epoch": 0.9216263858607413, + "grad_norm": 0.10702987760305405, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 238410 + }, + { + "epoch": 0.9216650430641246, + "grad_norm": 0.10754323750734329, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 238420 + }, + { + "epoch": 0.9217037002675078, + "grad_norm": 0.10378240048885345, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 238430 + }, + { + "epoch": 0.9217423574708912, + "grad_norm": 0.09795913845300674, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 238440 + }, + { + "epoch": 0.9217810146742744, + "grad_norm": 0.10659509152173996, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 238450 + }, + { + "epoch": 0.9218196718776577, + "grad_norm": 0.1028730571269989, + "learning_rate": 0.002, + "loss": 2.3131, + "step": 238460 + }, + { + "epoch": 0.9218583290810409, + "grad_norm": 0.11338159441947937, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 238470 + }, + { + "epoch": 0.9218969862844243, + "grad_norm": 0.0938754603266716, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 238480 + }, + { + "epoch": 0.9219356434878075, + "grad_norm": 0.10216149687767029, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 238490 + }, + { + "epoch": 0.9219743006911908, + "grad_norm": 0.0989452376961708, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 238500 + }, + { + "epoch": 0.922012957894574, + "grad_norm": 0.10219037532806396, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 238510 + }, + { + "epoch": 0.9220516150979573, + "grad_norm": 0.11570502817630768, + "learning_rate": 0.002, + "loss": 2.345, + "step": 238520 + }, + { + "epoch": 0.9220902723013407, + "grad_norm": 0.10561752319335938, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 238530 + }, + { + "epoch": 0.9221289295047239, + "grad_norm": 0.1113727018237114, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 238540 + }, + { + "epoch": 0.9221675867081072, + "grad_norm": 0.11061765998601913, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 238550 + }, + { + "epoch": 0.9222062439114904, + "grad_norm": 0.10693425685167313, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 238560 + }, + { + "epoch": 0.9222449011148738, + "grad_norm": 0.1509937047958374, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 238570 + }, + { + "epoch": 0.922283558318257, + "grad_norm": 0.10090623050928116, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 238580 + }, + { + "epoch": 0.9223222155216403, + "grad_norm": 0.10902487486600876, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 238590 + }, + { + "epoch": 0.9223608727250235, + "grad_norm": 0.10726834833621979, + "learning_rate": 0.002, + "loss": 2.313, + "step": 238600 + }, + { + "epoch": 0.9223995299284069, + "grad_norm": 0.12696325778961182, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 238610 + }, + { + "epoch": 0.9224381871317902, + "grad_norm": 0.10057271271944046, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 238620 + }, + { + "epoch": 0.9224768443351734, + "grad_norm": 0.10777167975902557, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 238630 + }, + { + "epoch": 0.9225155015385567, + "grad_norm": 0.1140720397233963, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 238640 + }, + { + "epoch": 0.92255415874194, + "grad_norm": 0.09188432991504669, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 238650 + }, + { + "epoch": 0.9225928159453233, + "grad_norm": 0.10969416052103043, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 238660 + }, + { + "epoch": 0.9226314731487065, + "grad_norm": 0.09891535341739655, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 238670 + }, + { + "epoch": 0.9226701303520898, + "grad_norm": 0.09819822758436203, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 238680 + }, + { + "epoch": 0.9227087875554731, + "grad_norm": 0.09815497696399689, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 238690 + }, + { + "epoch": 0.9227474447588564, + "grad_norm": 0.1160065159201622, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 238700 + }, + { + "epoch": 0.9227861019622396, + "grad_norm": 0.11586955189704895, + "learning_rate": 0.002, + "loss": 2.331, + "step": 238710 + }, + { + "epoch": 0.9228247591656229, + "grad_norm": 0.12375964969396591, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 238720 + }, + { + "epoch": 0.9228634163690062, + "grad_norm": 0.10358904302120209, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 238730 + }, + { + "epoch": 0.9229020735723895, + "grad_norm": 0.09508515149354935, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 238740 + }, + { + "epoch": 0.9229407307757728, + "grad_norm": 0.09402194619178772, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 238750 + }, + { + "epoch": 0.922979387979156, + "grad_norm": 0.11810307204723358, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 238760 + }, + { + "epoch": 0.9230180451825393, + "grad_norm": 0.11552152037620544, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 238770 + }, + { + "epoch": 0.9230567023859226, + "grad_norm": 0.10606750100851059, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 238780 + }, + { + "epoch": 0.9230953595893059, + "grad_norm": 0.08817689120769501, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 238790 + }, + { + "epoch": 0.9231340167926891, + "grad_norm": 0.11065953969955444, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 238800 + }, + { + "epoch": 0.9231726739960724, + "grad_norm": 0.1170061007142067, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 238810 + }, + { + "epoch": 0.9232113311994558, + "grad_norm": 0.11106377840042114, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 238820 + }, + { + "epoch": 0.923249988402839, + "grad_norm": 0.10830987244844437, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 238830 + }, + { + "epoch": 0.9232886456062223, + "grad_norm": 0.11701057106256485, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 238840 + }, + { + "epoch": 0.9233273028096055, + "grad_norm": 0.09716720879077911, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 238850 + }, + { + "epoch": 0.9233659600129889, + "grad_norm": 0.0941588282585144, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 238860 + }, + { + "epoch": 0.9234046172163721, + "grad_norm": 0.09816788882017136, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 238870 + }, + { + "epoch": 0.9234432744197554, + "grad_norm": 0.11149384081363678, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 238880 + }, + { + "epoch": 0.9234819316231386, + "grad_norm": 0.11008346080780029, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 238890 + }, + { + "epoch": 0.9235205888265219, + "grad_norm": 0.10621821135282516, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 238900 + }, + { + "epoch": 0.9235592460299052, + "grad_norm": 0.09759090095758438, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 238910 + }, + { + "epoch": 0.9235979032332885, + "grad_norm": 0.10635235160589218, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 238920 + }, + { + "epoch": 0.9236365604366717, + "grad_norm": 0.10297021269798279, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 238930 + }, + { + "epoch": 0.923675217640055, + "grad_norm": 0.1188259944319725, + "learning_rate": 0.002, + "loss": 2.345, + "step": 238940 + }, + { + "epoch": 0.9237138748434384, + "grad_norm": 0.11682503670454025, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 238950 + }, + { + "epoch": 0.9237525320468216, + "grad_norm": 0.11106524616479874, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 238960 + }, + { + "epoch": 0.9237911892502049, + "grad_norm": 0.09653395414352417, + "learning_rate": 0.002, + "loss": 2.348, + "step": 238970 + }, + { + "epoch": 0.9238298464535881, + "grad_norm": 0.11367765069007874, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 238980 + }, + { + "epoch": 0.9238685036569715, + "grad_norm": 0.09777499735355377, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 238990 + }, + { + "epoch": 0.9239071608603547, + "grad_norm": 0.10271391272544861, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 239000 + }, + { + "epoch": 0.923945818063738, + "grad_norm": 0.13675880432128906, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 239010 + }, + { + "epoch": 0.9239844752671212, + "grad_norm": 0.09634064137935638, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 239020 + }, + { + "epoch": 0.9240231324705046, + "grad_norm": 0.14566011726856232, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 239030 + }, + { + "epoch": 0.9240617896738879, + "grad_norm": 0.10591859370470047, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 239040 + }, + { + "epoch": 0.9241004468772711, + "grad_norm": 0.11468186974525452, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 239050 + }, + { + "epoch": 0.9241391040806544, + "grad_norm": 0.11868032813072205, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 239060 + }, + { + "epoch": 0.9241777612840376, + "grad_norm": 0.24066656827926636, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 239070 + }, + { + "epoch": 0.924216418487421, + "grad_norm": 0.10723575204610825, + "learning_rate": 0.002, + "loss": 2.335, + "step": 239080 + }, + { + "epoch": 0.9242550756908042, + "grad_norm": 0.14120464026927948, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 239090 + }, + { + "epoch": 0.9242937328941875, + "grad_norm": 0.11646062880754471, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 239100 + }, + { + "epoch": 0.9243323900975707, + "grad_norm": 0.11130773276090622, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 239110 + }, + { + "epoch": 0.9243710473009541, + "grad_norm": 0.12386900931596756, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 239120 + }, + { + "epoch": 0.9244097045043373, + "grad_norm": 0.09747873246669769, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 239130 + }, + { + "epoch": 0.9244483617077206, + "grad_norm": 0.12462751567363739, + "learning_rate": 0.002, + "loss": 2.33, + "step": 239140 + }, + { + "epoch": 0.9244870189111039, + "grad_norm": 0.09987830370664597, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 239150 + }, + { + "epoch": 0.9245256761144872, + "grad_norm": 0.08740975707769394, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 239160 + }, + { + "epoch": 0.9245643333178705, + "grad_norm": 0.10647254437208176, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 239170 + }, + { + "epoch": 0.9246029905212537, + "grad_norm": 0.11191874742507935, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 239180 + }, + { + "epoch": 0.924641647724637, + "grad_norm": 0.10716775804758072, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 239190 + }, + { + "epoch": 0.9246803049280203, + "grad_norm": 0.11306207627058029, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 239200 + }, + { + "epoch": 0.9247189621314036, + "grad_norm": 0.11484642326831818, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 239210 + }, + { + "epoch": 0.9247576193347868, + "grad_norm": 0.12202706187963486, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 239220 + }, + { + "epoch": 0.9247962765381701, + "grad_norm": 0.10952723026275635, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 239230 + }, + { + "epoch": 0.9248349337415535, + "grad_norm": 0.09301014989614487, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 239240 + }, + { + "epoch": 0.9248735909449367, + "grad_norm": 0.12277715653181076, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 239250 + }, + { + "epoch": 0.92491224814832, + "grad_norm": 0.1088462620973587, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 239260 + }, + { + "epoch": 0.9249509053517032, + "grad_norm": 0.10005609691143036, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 239270 + }, + { + "epoch": 0.9249895625550865, + "grad_norm": 0.09543203562498093, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 239280 + }, + { + "epoch": 0.9250282197584698, + "grad_norm": 0.11769011616706848, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 239290 + }, + { + "epoch": 0.9250668769618531, + "grad_norm": 0.09605714678764343, + "learning_rate": 0.002, + "loss": 2.33, + "step": 239300 + }, + { + "epoch": 0.9251055341652363, + "grad_norm": 0.10205802321434021, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 239310 + }, + { + "epoch": 0.9251441913686196, + "grad_norm": 0.10329537838697433, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 239320 + }, + { + "epoch": 0.925182848572003, + "grad_norm": 0.11891063302755356, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 239330 + }, + { + "epoch": 0.9252215057753862, + "grad_norm": 0.12052151560783386, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 239340 + }, + { + "epoch": 0.9252601629787695, + "grad_norm": 0.12861767411231995, + "learning_rate": 0.002, + "loss": 2.329, + "step": 239350 + }, + { + "epoch": 0.9252988201821527, + "grad_norm": 0.09616052359342575, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 239360 + }, + { + "epoch": 0.9253374773855361, + "grad_norm": 0.09918008744716644, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 239370 + }, + { + "epoch": 0.9253761345889193, + "grad_norm": 0.1037401482462883, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 239380 + }, + { + "epoch": 0.9254147917923026, + "grad_norm": 0.10741420835256577, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 239390 + }, + { + "epoch": 0.9254534489956858, + "grad_norm": 0.10255386680364609, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 239400 + }, + { + "epoch": 0.9254921061990692, + "grad_norm": 0.11019117385149002, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 239410 + }, + { + "epoch": 0.9255307634024524, + "grad_norm": 0.10579998791217804, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 239420 + }, + { + "epoch": 0.9255694206058357, + "grad_norm": 0.1270904242992401, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 239430 + }, + { + "epoch": 0.9256080778092189, + "grad_norm": 0.17731136083602905, + "learning_rate": 0.002, + "loss": 2.321, + "step": 239440 + }, + { + "epoch": 0.9256467350126022, + "grad_norm": 0.09464423358440399, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 239450 + }, + { + "epoch": 0.9256853922159856, + "grad_norm": 0.10702194273471832, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 239460 + }, + { + "epoch": 0.9257240494193688, + "grad_norm": 0.10974325239658356, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 239470 + }, + { + "epoch": 0.9257627066227521, + "grad_norm": 0.10220817476511002, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 239480 + }, + { + "epoch": 0.9258013638261353, + "grad_norm": 0.10747115314006805, + "learning_rate": 0.002, + "loss": 2.343, + "step": 239490 + }, + { + "epoch": 0.9258400210295187, + "grad_norm": 0.09978757798671722, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 239500 + }, + { + "epoch": 0.9258786782329019, + "grad_norm": 0.11532643437385559, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 239510 + }, + { + "epoch": 0.9259173354362852, + "grad_norm": 0.09514863044023514, + "learning_rate": 0.002, + "loss": 2.3135, + "step": 239520 + }, + { + "epoch": 0.9259559926396684, + "grad_norm": 0.09954030066728592, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 239530 + }, + { + "epoch": 0.9259946498430518, + "grad_norm": 0.12030558288097382, + "learning_rate": 0.002, + "loss": 2.356, + "step": 239540 + }, + { + "epoch": 0.926033307046435, + "grad_norm": 0.12196508795022964, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 239550 + }, + { + "epoch": 0.9260719642498183, + "grad_norm": 0.09463625401258469, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 239560 + }, + { + "epoch": 0.9261106214532016, + "grad_norm": 0.11292003840208054, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 239570 + }, + { + "epoch": 0.9261492786565849, + "grad_norm": 0.10972633212804794, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 239580 + }, + { + "epoch": 0.9261879358599682, + "grad_norm": 0.11431489139795303, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 239590 + }, + { + "epoch": 0.9262265930633514, + "grad_norm": 0.1135575994849205, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 239600 + }, + { + "epoch": 0.9262652502667347, + "grad_norm": 0.13443417847156525, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 239610 + }, + { + "epoch": 0.9263039074701179, + "grad_norm": 0.130362406373024, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 239620 + }, + { + "epoch": 0.9263425646735013, + "grad_norm": 0.09769725054502487, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 239630 + }, + { + "epoch": 0.9263812218768845, + "grad_norm": 0.11235304176807404, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 239640 + }, + { + "epoch": 0.9264198790802678, + "grad_norm": 0.11400007456541061, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 239650 + }, + { + "epoch": 0.926458536283651, + "grad_norm": 0.11323840916156769, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 239660 + }, + { + "epoch": 0.9264971934870344, + "grad_norm": 0.10057419538497925, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 239670 + }, + { + "epoch": 0.9265358506904177, + "grad_norm": 0.0986354649066925, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 239680 + }, + { + "epoch": 0.9265745078938009, + "grad_norm": 0.14333440363407135, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 239690 + }, + { + "epoch": 0.9266131650971842, + "grad_norm": 0.10923615843057632, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 239700 + }, + { + "epoch": 0.9266518223005675, + "grad_norm": 0.10601142793893814, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 239710 + }, + { + "epoch": 0.9266904795039508, + "grad_norm": 0.117704838514328, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 239720 + }, + { + "epoch": 0.926729136707334, + "grad_norm": 0.1263555884361267, + "learning_rate": 0.002, + "loss": 2.334, + "step": 239730 + }, + { + "epoch": 0.9267677939107173, + "grad_norm": 0.10091309249401093, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 239740 + }, + { + "epoch": 0.9268064511141006, + "grad_norm": 0.11265761405229568, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 239750 + }, + { + "epoch": 0.9268451083174839, + "grad_norm": 0.09833136200904846, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 239760 + }, + { + "epoch": 0.9268837655208672, + "grad_norm": 0.0956064909696579, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 239770 + }, + { + "epoch": 0.9269224227242504, + "grad_norm": 0.11916255205869675, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 239780 + }, + { + "epoch": 0.9269610799276338, + "grad_norm": 0.11998631805181503, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 239790 + }, + { + "epoch": 0.926999737131017, + "grad_norm": 0.10860449075698853, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 239800 + }, + { + "epoch": 0.9270383943344003, + "grad_norm": 0.10798289626836777, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 239810 + }, + { + "epoch": 0.9270770515377835, + "grad_norm": 0.10168136656284332, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 239820 + }, + { + "epoch": 0.9271157087411668, + "grad_norm": 0.10898247361183167, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 239830 + }, + { + "epoch": 0.9271543659445501, + "grad_norm": 0.10820508748292923, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 239840 + }, + { + "epoch": 0.9271930231479334, + "grad_norm": 0.11451074481010437, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 239850 + }, + { + "epoch": 0.9272316803513166, + "grad_norm": 0.0991031676530838, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 239860 + }, + { + "epoch": 0.9272703375546999, + "grad_norm": 0.106524758040905, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 239870 + }, + { + "epoch": 0.9273089947580833, + "grad_norm": 0.11314364522695541, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 239880 + }, + { + "epoch": 0.9273476519614665, + "grad_norm": 0.10054526478052139, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 239890 + }, + { + "epoch": 0.9273863091648498, + "grad_norm": 0.13372132182121277, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 239900 + }, + { + "epoch": 0.927424966368233, + "grad_norm": 0.09951240569353104, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 239910 + }, + { + "epoch": 0.9274636235716164, + "grad_norm": 0.10693838447332382, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 239920 + }, + { + "epoch": 0.9275022807749996, + "grad_norm": 0.11133580654859543, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 239930 + }, + { + "epoch": 0.9275409379783829, + "grad_norm": 0.11053361743688583, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 239940 + }, + { + "epoch": 0.9275795951817661, + "grad_norm": 0.10781632363796234, + "learning_rate": 0.002, + "loss": 2.334, + "step": 239950 + }, + { + "epoch": 0.9276182523851495, + "grad_norm": 0.13119134306907654, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 239960 + }, + { + "epoch": 0.9276569095885328, + "grad_norm": 0.10563421249389648, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 239970 + }, + { + "epoch": 0.927695566791916, + "grad_norm": 0.1123589426279068, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 239980 + }, + { + "epoch": 0.9277342239952993, + "grad_norm": 0.09124656766653061, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 239990 + }, + { + "epoch": 0.9277728811986825, + "grad_norm": 0.0967029258608818, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 240000 + }, + { + "epoch": 0.9278115384020659, + "grad_norm": 0.13630369305610657, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 240010 + }, + { + "epoch": 0.9278501956054491, + "grad_norm": 0.10799701511859894, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 240020 + }, + { + "epoch": 0.9278888528088324, + "grad_norm": 0.10726698487997055, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 240030 + }, + { + "epoch": 0.9279275100122156, + "grad_norm": 0.10995513200759888, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 240040 + }, + { + "epoch": 0.927966167215599, + "grad_norm": 0.11034388840198517, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 240050 + }, + { + "epoch": 0.9280048244189822, + "grad_norm": 0.10742886364459991, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 240060 + }, + { + "epoch": 0.9280434816223655, + "grad_norm": 0.108866848051548, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 240070 + }, + { + "epoch": 0.9280821388257487, + "grad_norm": 0.10716560482978821, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 240080 + }, + { + "epoch": 0.9281207960291321, + "grad_norm": 0.08997724950313568, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 240090 + }, + { + "epoch": 0.9281594532325154, + "grad_norm": 0.12874490022659302, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 240100 + }, + { + "epoch": 0.9281981104358986, + "grad_norm": 0.11026809364557266, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 240110 + }, + { + "epoch": 0.9282367676392819, + "grad_norm": 0.11839725822210312, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 240120 + }, + { + "epoch": 0.9282754248426652, + "grad_norm": 0.10574904084205627, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 240130 + }, + { + "epoch": 0.9283140820460485, + "grad_norm": 0.09895447641611099, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 240140 + }, + { + "epoch": 0.9283527392494317, + "grad_norm": 0.10029957443475723, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 240150 + }, + { + "epoch": 0.928391396452815, + "grad_norm": 0.11841758340597153, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 240160 + }, + { + "epoch": 0.9284300536561984, + "grad_norm": 0.10327313840389252, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 240170 + }, + { + "epoch": 0.9284687108595816, + "grad_norm": 0.10563918203115463, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 240180 + }, + { + "epoch": 0.9285073680629649, + "grad_norm": 0.13680148124694824, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 240190 + }, + { + "epoch": 0.9285460252663481, + "grad_norm": 0.10711419582366943, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 240200 + }, + { + "epoch": 0.9285846824697314, + "grad_norm": 0.09397318959236145, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 240210 + }, + { + "epoch": 0.9286233396731147, + "grad_norm": 0.10058710724115372, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 240220 + }, + { + "epoch": 0.928661996876498, + "grad_norm": 0.1029769703745842, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 240230 + }, + { + "epoch": 0.9287006540798812, + "grad_norm": 0.11130029708147049, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 240240 + }, + { + "epoch": 0.9287393112832645, + "grad_norm": 0.09373640269041061, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 240250 + }, + { + "epoch": 0.9287779684866478, + "grad_norm": 0.10493456572294235, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 240260 + }, + { + "epoch": 0.9288166256900311, + "grad_norm": 0.10127705335617065, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 240270 + }, + { + "epoch": 0.9288552828934143, + "grad_norm": 0.0896872952580452, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 240280 + }, + { + "epoch": 0.9288939400967976, + "grad_norm": 0.11261755228042603, + "learning_rate": 0.002, + "loss": 2.342, + "step": 240290 + }, + { + "epoch": 0.928932597300181, + "grad_norm": 0.09712719917297363, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 240300 + }, + { + "epoch": 0.9289712545035642, + "grad_norm": 0.1010216623544693, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 240310 + }, + { + "epoch": 0.9290099117069475, + "grad_norm": 0.10653354972600937, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 240320 + }, + { + "epoch": 0.9290485689103307, + "grad_norm": 0.09720896184444427, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 240330 + }, + { + "epoch": 0.9290872261137141, + "grad_norm": 0.09222602099180222, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 240340 + }, + { + "epoch": 0.9291258833170973, + "grad_norm": 0.09605705738067627, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 240350 + }, + { + "epoch": 0.9291645405204806, + "grad_norm": 0.11048048734664917, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 240360 + }, + { + "epoch": 0.9292031977238638, + "grad_norm": 0.10950416326522827, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 240370 + }, + { + "epoch": 0.9292418549272471, + "grad_norm": 0.10134270042181015, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 240380 + }, + { + "epoch": 0.9292805121306305, + "grad_norm": 0.09597347676753998, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 240390 + }, + { + "epoch": 0.9293191693340137, + "grad_norm": 0.1042867973446846, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 240400 + }, + { + "epoch": 0.929357826537397, + "grad_norm": 0.11298928409814835, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 240410 + }, + { + "epoch": 0.9293964837407802, + "grad_norm": 0.09755025804042816, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 240420 + }, + { + "epoch": 0.9294351409441636, + "grad_norm": 0.10076082497835159, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 240430 + }, + { + "epoch": 0.9294737981475468, + "grad_norm": 0.11371766030788422, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 240440 + }, + { + "epoch": 0.9295124553509301, + "grad_norm": 0.09814074635505676, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 240450 + }, + { + "epoch": 0.9295511125543133, + "grad_norm": 0.1093234196305275, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 240460 + }, + { + "epoch": 0.9295897697576967, + "grad_norm": 0.1031750962138176, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 240470 + }, + { + "epoch": 0.92962842696108, + "grad_norm": 0.11120779067277908, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 240480 + }, + { + "epoch": 0.9296670841644632, + "grad_norm": 0.11129971593618393, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 240490 + }, + { + "epoch": 0.9297057413678464, + "grad_norm": 0.12351063638925552, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 240500 + }, + { + "epoch": 0.9297443985712298, + "grad_norm": 0.0975901409983635, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 240510 + }, + { + "epoch": 0.9297830557746131, + "grad_norm": 0.10659302026033401, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 240520 + }, + { + "epoch": 0.9298217129779963, + "grad_norm": 0.13084393739700317, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 240530 + }, + { + "epoch": 0.9298603701813796, + "grad_norm": 0.10836442559957504, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 240540 + }, + { + "epoch": 0.9298990273847628, + "grad_norm": 0.11104420572519302, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 240550 + }, + { + "epoch": 0.9299376845881462, + "grad_norm": 0.1024618074297905, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 240560 + }, + { + "epoch": 0.9299763417915294, + "grad_norm": 0.11527860164642334, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 240570 + }, + { + "epoch": 0.9300149989949127, + "grad_norm": 0.08908341079950333, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 240580 + }, + { + "epoch": 0.9300536561982959, + "grad_norm": 0.09538891166448593, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 240590 + }, + { + "epoch": 0.9300923134016793, + "grad_norm": 0.09617014229297638, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 240600 + }, + { + "epoch": 0.9301309706050626, + "grad_norm": 0.12283073365688324, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 240610 + }, + { + "epoch": 0.9301696278084458, + "grad_norm": 0.099635049700737, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 240620 + }, + { + "epoch": 0.9302082850118291, + "grad_norm": 0.09536898136138916, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 240630 + }, + { + "epoch": 0.9302469422152124, + "grad_norm": 0.13080519437789917, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 240640 + }, + { + "epoch": 0.9302855994185957, + "grad_norm": 0.095926932990551, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 240650 + }, + { + "epoch": 0.9303242566219789, + "grad_norm": 0.1012723371386528, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 240660 + }, + { + "epoch": 0.9303629138253622, + "grad_norm": 0.10640578716993332, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 240670 + }, + { + "epoch": 0.9304015710287455, + "grad_norm": 0.09938972443342209, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 240680 + }, + { + "epoch": 0.9304402282321288, + "grad_norm": 0.09295855462551117, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 240690 + }, + { + "epoch": 0.930478885435512, + "grad_norm": 0.10483425855636597, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 240700 + }, + { + "epoch": 0.9305175426388953, + "grad_norm": 0.1012699156999588, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 240710 + }, + { + "epoch": 0.9305561998422787, + "grad_norm": 0.09559882432222366, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 240720 + }, + { + "epoch": 0.9305948570456619, + "grad_norm": 0.12409879267215729, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 240730 + }, + { + "epoch": 0.9306335142490452, + "grad_norm": 0.10803597420454025, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 240740 + }, + { + "epoch": 0.9306721714524284, + "grad_norm": 0.0958801731467247, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 240750 + }, + { + "epoch": 0.9307108286558117, + "grad_norm": 0.11432503163814545, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 240760 + }, + { + "epoch": 0.930749485859195, + "grad_norm": 0.11685143411159515, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 240770 + }, + { + "epoch": 0.9307881430625783, + "grad_norm": 0.11428704112768173, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 240780 + }, + { + "epoch": 0.9308268002659615, + "grad_norm": 0.10317996144294739, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 240790 + }, + { + "epoch": 0.9308654574693448, + "grad_norm": 0.10810055583715439, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 240800 + }, + { + "epoch": 0.9309041146727282, + "grad_norm": 0.1088952124118805, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 240810 + }, + { + "epoch": 0.9309427718761114, + "grad_norm": 0.09635835886001587, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 240820 + }, + { + "epoch": 0.9309814290794947, + "grad_norm": 0.1363757699728012, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 240830 + }, + { + "epoch": 0.9310200862828779, + "grad_norm": 0.10321643948554993, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 240840 + }, + { + "epoch": 0.9310587434862613, + "grad_norm": 0.08783495426177979, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 240850 + }, + { + "epoch": 0.9310974006896445, + "grad_norm": 0.1054040715098381, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 240860 + }, + { + "epoch": 0.9311360578930278, + "grad_norm": 0.1170346811413765, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 240870 + }, + { + "epoch": 0.931174715096411, + "grad_norm": 0.10788789391517639, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 240880 + }, + { + "epoch": 0.9312133722997944, + "grad_norm": 0.10687334090471268, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 240890 + }, + { + "epoch": 0.9312520295031776, + "grad_norm": 0.12458331137895584, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 240900 + }, + { + "epoch": 0.9312906867065609, + "grad_norm": 0.10293076932430267, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 240910 + }, + { + "epoch": 0.9313293439099442, + "grad_norm": 0.11584310233592987, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 240920 + }, + { + "epoch": 0.9313680011133274, + "grad_norm": 0.1265709102153778, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 240930 + }, + { + "epoch": 0.9314066583167108, + "grad_norm": 0.15816473960876465, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 240940 + }, + { + "epoch": 0.931445315520094, + "grad_norm": 0.11346606910228729, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 240950 + }, + { + "epoch": 0.9314839727234773, + "grad_norm": 0.10622525960206985, + "learning_rate": 0.002, + "loss": 2.342, + "step": 240960 + }, + { + "epoch": 0.9315226299268605, + "grad_norm": 0.09166400879621506, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 240970 + }, + { + "epoch": 0.9315612871302439, + "grad_norm": 0.1251622587442398, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 240980 + }, + { + "epoch": 0.9315999443336271, + "grad_norm": 0.09309102594852448, + "learning_rate": 0.002, + "loss": 2.36, + "step": 240990 + }, + { + "epoch": 0.9316386015370104, + "grad_norm": 0.09251026809215546, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 241000 + }, + { + "epoch": 0.9316772587403936, + "grad_norm": 0.09816301614046097, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 241010 + }, + { + "epoch": 0.931715915943777, + "grad_norm": 0.11350065469741821, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 241020 + }, + { + "epoch": 0.9317545731471603, + "grad_norm": 0.1071467250585556, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 241030 + }, + { + "epoch": 0.9317932303505435, + "grad_norm": 0.09898655116558075, + "learning_rate": 0.002, + "loss": 2.353, + "step": 241040 + }, + { + "epoch": 0.9318318875539268, + "grad_norm": 0.15836842358112335, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 241050 + }, + { + "epoch": 0.9318705447573101, + "grad_norm": 0.10508646816015244, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 241060 + }, + { + "epoch": 0.9319092019606934, + "grad_norm": 0.1096983551979065, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 241070 + }, + { + "epoch": 0.9319478591640766, + "grad_norm": 0.12404420971870422, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 241080 + }, + { + "epoch": 0.9319865163674599, + "grad_norm": 0.1084880456328392, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 241090 + }, + { + "epoch": 0.9320251735708432, + "grad_norm": 0.0965440422296524, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 241100 + }, + { + "epoch": 0.9320638307742265, + "grad_norm": 0.10915783047676086, + "learning_rate": 0.002, + "loss": 2.352, + "step": 241110 + }, + { + "epoch": 0.9321024879776098, + "grad_norm": 0.1070767492055893, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 241120 + }, + { + "epoch": 0.932141145180993, + "grad_norm": 0.09906446188688278, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 241130 + }, + { + "epoch": 0.9321798023843763, + "grad_norm": 0.10002636164426804, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 241140 + }, + { + "epoch": 0.9322184595877596, + "grad_norm": 0.10802870243787766, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 241150 + }, + { + "epoch": 0.9322571167911429, + "grad_norm": 0.10186523199081421, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 241160 + }, + { + "epoch": 0.9322957739945261, + "grad_norm": 0.10572991520166397, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 241170 + }, + { + "epoch": 0.9323344311979094, + "grad_norm": 0.10857295989990234, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 241180 + }, + { + "epoch": 0.9323730884012927, + "grad_norm": 0.09867467731237411, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 241190 + }, + { + "epoch": 0.932411745604676, + "grad_norm": 0.1304645538330078, + "learning_rate": 0.002, + "loss": 2.311, + "step": 241200 + }, + { + "epoch": 0.9324504028080592, + "grad_norm": 0.11418737471103668, + "learning_rate": 0.002, + "loss": 2.354, + "step": 241210 + }, + { + "epoch": 0.9324890600114425, + "grad_norm": 0.09130564332008362, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 241220 + }, + { + "epoch": 0.9325277172148259, + "grad_norm": 0.11547353118658066, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 241230 + }, + { + "epoch": 0.9325663744182091, + "grad_norm": 0.10222837328910828, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 241240 + }, + { + "epoch": 0.9326050316215924, + "grad_norm": 0.11761835217475891, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 241250 + }, + { + "epoch": 0.9326436888249756, + "grad_norm": 0.105369433760643, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 241260 + }, + { + "epoch": 0.932682346028359, + "grad_norm": 0.11019854247570038, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 241270 + }, + { + "epoch": 0.9327210032317422, + "grad_norm": 0.11037706583738327, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 241280 + }, + { + "epoch": 0.9327596604351255, + "grad_norm": 0.09246725589036942, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 241290 + }, + { + "epoch": 0.9327983176385087, + "grad_norm": 0.11534950137138367, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 241300 + }, + { + "epoch": 0.932836974841892, + "grad_norm": 0.09918901324272156, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 241310 + }, + { + "epoch": 0.9328756320452753, + "grad_norm": 0.09459531307220459, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 241320 + }, + { + "epoch": 0.9329142892486586, + "grad_norm": 0.10851433873176575, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 241330 + }, + { + "epoch": 0.9329529464520419, + "grad_norm": 0.10667916387319565, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 241340 + }, + { + "epoch": 0.9329916036554251, + "grad_norm": 0.0948222279548645, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 241350 + }, + { + "epoch": 0.9330302608588085, + "grad_norm": 0.1204134076833725, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 241360 + }, + { + "epoch": 0.9330689180621917, + "grad_norm": 0.09408220648765564, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 241370 + }, + { + "epoch": 0.933107575265575, + "grad_norm": 0.10862472653388977, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 241380 + }, + { + "epoch": 0.9331462324689582, + "grad_norm": 0.10759086906909943, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 241390 + }, + { + "epoch": 0.9331848896723416, + "grad_norm": 0.09432414174079895, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 241400 + }, + { + "epoch": 0.9332235468757248, + "grad_norm": 0.09516098350286484, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 241410 + }, + { + "epoch": 0.9332622040791081, + "grad_norm": 0.11452768743038177, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 241420 + }, + { + "epoch": 0.9333008612824913, + "grad_norm": 0.11694354563951492, + "learning_rate": 0.002, + "loss": 2.322, + "step": 241430 + }, + { + "epoch": 0.9333395184858747, + "grad_norm": 0.08922861516475677, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 241440 + }, + { + "epoch": 0.933378175689258, + "grad_norm": 0.10552657395601273, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 241450 + }, + { + "epoch": 0.9334168328926412, + "grad_norm": 0.0998677909374237, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 241460 + }, + { + "epoch": 0.9334554900960245, + "grad_norm": 0.12375766783952713, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 241470 + }, + { + "epoch": 0.9334941472994077, + "grad_norm": 0.09344843029975891, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 241480 + }, + { + "epoch": 0.9335328045027911, + "grad_norm": 0.10336374491453171, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 241490 + }, + { + "epoch": 0.9335714617061743, + "grad_norm": 0.11773277819156647, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 241500 + }, + { + "epoch": 0.9336101189095576, + "grad_norm": 0.1019284725189209, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 241510 + }, + { + "epoch": 0.9336487761129408, + "grad_norm": 0.09431376308202744, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 241520 + }, + { + "epoch": 0.9336874333163242, + "grad_norm": 0.10802071541547775, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 241530 + }, + { + "epoch": 0.9337260905197075, + "grad_norm": 0.10070924460887909, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 241540 + }, + { + "epoch": 0.9337647477230907, + "grad_norm": 0.10516620427370071, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 241550 + }, + { + "epoch": 0.933803404926474, + "grad_norm": 0.10847561806440353, + "learning_rate": 0.002, + "loss": 2.3128, + "step": 241560 + }, + { + "epoch": 0.9338420621298573, + "grad_norm": 0.10606521368026733, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 241570 + }, + { + "epoch": 0.9338807193332406, + "grad_norm": 0.09973947703838348, + "learning_rate": 0.002, + "loss": 2.323, + "step": 241580 + }, + { + "epoch": 0.9339193765366238, + "grad_norm": 0.0958481878042221, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 241590 + }, + { + "epoch": 0.9339580337400071, + "grad_norm": 0.10320821404457092, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 241600 + }, + { + "epoch": 0.9339966909433904, + "grad_norm": 0.10255026817321777, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 241610 + }, + { + "epoch": 0.9340353481467737, + "grad_norm": 0.09719021618366241, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 241620 + }, + { + "epoch": 0.9340740053501569, + "grad_norm": 0.10893251746892929, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 241630 + }, + { + "epoch": 0.9341126625535402, + "grad_norm": 0.10667164623737335, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 241640 + }, + { + "epoch": 0.9341513197569236, + "grad_norm": 0.13029393553733826, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 241650 + }, + { + "epoch": 0.9341899769603068, + "grad_norm": 0.10004956275224686, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 241660 + }, + { + "epoch": 0.9342286341636901, + "grad_norm": 0.11355171352624893, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 241670 + }, + { + "epoch": 0.9342672913670733, + "grad_norm": 0.09860523790121078, + "learning_rate": 0.002, + "loss": 2.334, + "step": 241680 + }, + { + "epoch": 0.9343059485704566, + "grad_norm": 0.15768641233444214, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 241690 + }, + { + "epoch": 0.9343446057738399, + "grad_norm": 0.11209391057491302, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 241700 + }, + { + "epoch": 0.9343832629772232, + "grad_norm": 0.1045207679271698, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 241710 + }, + { + "epoch": 0.9344219201806064, + "grad_norm": 0.10634932667016983, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 241720 + }, + { + "epoch": 0.9344605773839897, + "grad_norm": 0.0978906899690628, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 241730 + }, + { + "epoch": 0.934499234587373, + "grad_norm": 0.12600494921207428, + "learning_rate": 0.002, + "loss": 2.334, + "step": 241740 + }, + { + "epoch": 0.9345378917907563, + "grad_norm": 0.09943626821041107, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 241750 + }, + { + "epoch": 0.9345765489941396, + "grad_norm": 0.10073429346084595, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 241760 + }, + { + "epoch": 0.9346152061975228, + "grad_norm": 0.12771274149417877, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 241770 + }, + { + "epoch": 0.9346538634009062, + "grad_norm": 0.12943124771118164, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 241780 + }, + { + "epoch": 0.9346925206042894, + "grad_norm": 0.11174143850803375, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 241790 + }, + { + "epoch": 0.9347311778076727, + "grad_norm": 0.1090095266699791, + "learning_rate": 0.002, + "loss": 2.349, + "step": 241800 + }, + { + "epoch": 0.9347698350110559, + "grad_norm": 0.10961566120386124, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 241810 + }, + { + "epoch": 0.9348084922144393, + "grad_norm": 0.10913696140050888, + "learning_rate": 0.002, + "loss": 2.36, + "step": 241820 + }, + { + "epoch": 0.9348471494178225, + "grad_norm": 0.11105609685182571, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 241830 + }, + { + "epoch": 0.9348858066212058, + "grad_norm": 0.1042783334851265, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 241840 + }, + { + "epoch": 0.934924463824589, + "grad_norm": 0.11911788582801819, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 241850 + }, + { + "epoch": 0.9349631210279723, + "grad_norm": 0.12005186080932617, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 241860 + }, + { + "epoch": 0.9350017782313557, + "grad_norm": 0.10456695407629013, + "learning_rate": 0.002, + "loss": 2.332, + "step": 241870 + }, + { + "epoch": 0.9350404354347389, + "grad_norm": 0.10455799102783203, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 241880 + }, + { + "epoch": 0.9350790926381222, + "grad_norm": 0.09809655696153641, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 241890 + }, + { + "epoch": 0.9351177498415054, + "grad_norm": 0.1062767282128334, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 241900 + }, + { + "epoch": 0.9351564070448888, + "grad_norm": 0.10312843322753906, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 241910 + }, + { + "epoch": 0.935195064248272, + "grad_norm": 0.11013250797986984, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 241920 + }, + { + "epoch": 0.9352337214516553, + "grad_norm": 0.10098816454410553, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 241930 + }, + { + "epoch": 0.9352723786550385, + "grad_norm": 0.42135360836982727, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 241940 + }, + { + "epoch": 0.9353110358584219, + "grad_norm": 0.13825412094593048, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 241950 + }, + { + "epoch": 0.9353496930618052, + "grad_norm": 0.10749774426221848, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 241960 + }, + { + "epoch": 0.9353883502651884, + "grad_norm": 0.10179366916418076, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 241970 + }, + { + "epoch": 0.9354270074685717, + "grad_norm": 0.10680104047060013, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 241980 + }, + { + "epoch": 0.935465664671955, + "grad_norm": 0.10687895119190216, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 241990 + }, + { + "epoch": 0.9355043218753383, + "grad_norm": 0.10046619921922684, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 242000 + }, + { + "epoch": 0.9355429790787215, + "grad_norm": 0.11829427629709244, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 242010 + }, + { + "epoch": 0.9355816362821048, + "grad_norm": 0.09220101684331894, + "learning_rate": 0.002, + "loss": 2.339, + "step": 242020 + }, + { + "epoch": 0.935620293485488, + "grad_norm": 0.15147259831428528, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 242030 + }, + { + "epoch": 0.9356589506888714, + "grad_norm": 0.10518482327461243, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 242040 + }, + { + "epoch": 0.9356976078922546, + "grad_norm": 0.1036066859960556, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 242050 + }, + { + "epoch": 0.9357362650956379, + "grad_norm": 0.11259414255619049, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 242060 + }, + { + "epoch": 0.9357749222990212, + "grad_norm": 0.12898430228233337, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 242070 + }, + { + "epoch": 0.9358135795024045, + "grad_norm": 0.09404338151216507, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 242080 + }, + { + "epoch": 0.9358522367057878, + "grad_norm": 0.09791241586208344, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 242090 + }, + { + "epoch": 0.935890893909171, + "grad_norm": 0.10467846691608429, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 242100 + }, + { + "epoch": 0.9359295511125543, + "grad_norm": 0.10960084944963455, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 242110 + }, + { + "epoch": 0.9359682083159376, + "grad_norm": 0.10058752447366714, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 242120 + }, + { + "epoch": 0.9360068655193209, + "grad_norm": 0.10815969109535217, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 242130 + }, + { + "epoch": 0.9360455227227041, + "grad_norm": 0.11273110657930374, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 242140 + }, + { + "epoch": 0.9360841799260874, + "grad_norm": 0.10401853173971176, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 242150 + }, + { + "epoch": 0.9361228371294708, + "grad_norm": 0.1059192568063736, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 242160 + }, + { + "epoch": 0.936161494332854, + "grad_norm": 0.12140562385320663, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 242170 + }, + { + "epoch": 0.9362001515362373, + "grad_norm": 0.10082773119211197, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 242180 + }, + { + "epoch": 0.9362388087396205, + "grad_norm": 0.1086844950914383, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 242190 + }, + { + "epoch": 0.9362774659430039, + "grad_norm": 0.10870152711868286, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 242200 + }, + { + "epoch": 0.9363161231463871, + "grad_norm": 0.12130552530288696, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 242210 + }, + { + "epoch": 0.9363547803497704, + "grad_norm": 0.09644386917352676, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 242220 + }, + { + "epoch": 0.9363934375531536, + "grad_norm": 0.09899096190929413, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 242230 + }, + { + "epoch": 0.9364320947565369, + "grad_norm": 0.10976357012987137, + "learning_rate": 0.002, + "loss": 2.34, + "step": 242240 + }, + { + "epoch": 0.9364707519599202, + "grad_norm": 0.11089303344488144, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 242250 + }, + { + "epoch": 0.9365094091633035, + "grad_norm": 0.13811884820461273, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 242260 + }, + { + "epoch": 0.9365480663666867, + "grad_norm": 0.11065199226140976, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 242270 + }, + { + "epoch": 0.93658672357007, + "grad_norm": 0.11939693242311478, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 242280 + }, + { + "epoch": 0.9366253807734534, + "grad_norm": 0.10447102785110474, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 242290 + }, + { + "epoch": 0.9366640379768366, + "grad_norm": 0.10677524656057358, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 242300 + }, + { + "epoch": 0.9367026951802199, + "grad_norm": 0.11139500141143799, + "learning_rate": 0.002, + "loss": 2.336, + "step": 242310 + }, + { + "epoch": 0.9367413523836031, + "grad_norm": 0.09366004168987274, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 242320 + }, + { + "epoch": 0.9367800095869865, + "grad_norm": 0.09346692264080048, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 242330 + }, + { + "epoch": 0.9368186667903697, + "grad_norm": 0.10834218561649323, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 242340 + }, + { + "epoch": 0.936857323993753, + "grad_norm": 0.3928252160549164, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 242350 + }, + { + "epoch": 0.9368959811971362, + "grad_norm": 0.10552085936069489, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 242360 + }, + { + "epoch": 0.9369346384005196, + "grad_norm": 0.1024235412478447, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 242370 + }, + { + "epoch": 0.9369732956039029, + "grad_norm": 0.09039077162742615, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 242380 + }, + { + "epoch": 0.9370119528072861, + "grad_norm": 0.09835212677717209, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 242390 + }, + { + "epoch": 0.9370506100106694, + "grad_norm": 0.10888617485761642, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 242400 + }, + { + "epoch": 0.9370892672140526, + "grad_norm": 0.10096684843301773, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 242410 + }, + { + "epoch": 0.937127924417436, + "grad_norm": 0.10301858931779861, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 242420 + }, + { + "epoch": 0.9371665816208192, + "grad_norm": 0.10485974699258804, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 242430 + }, + { + "epoch": 0.9372052388242025, + "grad_norm": 0.11427116394042969, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 242440 + }, + { + "epoch": 0.9372438960275857, + "grad_norm": 0.10336142033338547, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 242450 + }, + { + "epoch": 0.9372825532309691, + "grad_norm": 0.12452462315559387, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 242460 + }, + { + "epoch": 0.9373212104343523, + "grad_norm": 0.10437518358230591, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 242470 + }, + { + "epoch": 0.9373598676377356, + "grad_norm": 0.09719526022672653, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 242480 + }, + { + "epoch": 0.9373985248411189, + "grad_norm": 0.11504451185464859, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 242490 + }, + { + "epoch": 0.9374371820445022, + "grad_norm": 0.10362989455461502, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 242500 + }, + { + "epoch": 0.9374758392478855, + "grad_norm": 0.10242421180009842, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 242510 + }, + { + "epoch": 0.9375144964512687, + "grad_norm": 0.10308767855167389, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 242520 + }, + { + "epoch": 0.937553153654652, + "grad_norm": 0.1220618262887001, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 242530 + }, + { + "epoch": 0.9375918108580353, + "grad_norm": 0.10344993323087692, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 242540 + }, + { + "epoch": 0.9376304680614186, + "grad_norm": 0.14695045351982117, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 242550 + }, + { + "epoch": 0.9376691252648018, + "grad_norm": 0.11423251032829285, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 242560 + }, + { + "epoch": 0.9377077824681851, + "grad_norm": 0.09518953412771225, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 242570 + }, + { + "epoch": 0.9377464396715685, + "grad_norm": 0.15579114854335785, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 242580 + }, + { + "epoch": 0.9377850968749517, + "grad_norm": 0.09890809655189514, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 242590 + }, + { + "epoch": 0.937823754078335, + "grad_norm": 0.11543553322553635, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 242600 + }, + { + "epoch": 0.9378624112817182, + "grad_norm": 0.10215145349502563, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 242610 + }, + { + "epoch": 0.9379010684851015, + "grad_norm": 0.09498198330402374, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 242620 + }, + { + "epoch": 0.9379397256884848, + "grad_norm": 0.09896153956651688, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 242630 + }, + { + "epoch": 0.9379783828918681, + "grad_norm": 0.1034855842590332, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 242640 + }, + { + "epoch": 0.9380170400952513, + "grad_norm": 0.11195968091487885, + "learning_rate": 0.002, + "loss": 2.334, + "step": 242650 + }, + { + "epoch": 0.9380556972986346, + "grad_norm": 0.1223968118429184, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 242660 + }, + { + "epoch": 0.938094354502018, + "grad_norm": 0.10049095004796982, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 242670 + }, + { + "epoch": 0.9381330117054012, + "grad_norm": 0.09640270471572876, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 242680 + }, + { + "epoch": 0.9381716689087845, + "grad_norm": 0.1136377602815628, + "learning_rate": 0.002, + "loss": 2.35, + "step": 242690 + }, + { + "epoch": 0.9382103261121677, + "grad_norm": 0.09889031946659088, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 242700 + }, + { + "epoch": 0.9382489833155511, + "grad_norm": 0.09716586768627167, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 242710 + }, + { + "epoch": 0.9382876405189343, + "grad_norm": 0.09732740372419357, + "learning_rate": 0.002, + "loss": 2.347, + "step": 242720 + }, + { + "epoch": 0.9383262977223176, + "grad_norm": 0.10785375535488129, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 242730 + }, + { + "epoch": 0.9383649549257008, + "grad_norm": 0.11654423177242279, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 242740 + }, + { + "epoch": 0.9384036121290842, + "grad_norm": 0.10839424282312393, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 242750 + }, + { + "epoch": 0.9384422693324674, + "grad_norm": 0.12296377122402191, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 242760 + }, + { + "epoch": 0.9384809265358507, + "grad_norm": 0.10874893516302109, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 242770 + }, + { + "epoch": 0.9385195837392339, + "grad_norm": 0.10407909005880356, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 242780 + }, + { + "epoch": 0.9385582409426172, + "grad_norm": 0.1068480983376503, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 242790 + }, + { + "epoch": 0.9385968981460006, + "grad_norm": 0.09913309663534164, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 242800 + }, + { + "epoch": 0.9386355553493838, + "grad_norm": 0.13421852886676788, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 242810 + }, + { + "epoch": 0.9386742125527671, + "grad_norm": 0.10474736988544464, + "learning_rate": 0.002, + "loss": 2.339, + "step": 242820 + }, + { + "epoch": 0.9387128697561503, + "grad_norm": 0.11090757697820663, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 242830 + }, + { + "epoch": 0.9387515269595337, + "grad_norm": 0.10732854157686234, + "learning_rate": 0.002, + "loss": 2.327, + "step": 242840 + }, + { + "epoch": 0.9387901841629169, + "grad_norm": 0.09814008325338364, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 242850 + }, + { + "epoch": 0.9388288413663002, + "grad_norm": 0.10455290973186493, + "learning_rate": 0.002, + "loss": 2.339, + "step": 242860 + }, + { + "epoch": 0.9388674985696834, + "grad_norm": 0.1159416064620018, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 242870 + }, + { + "epoch": 0.9389061557730668, + "grad_norm": 0.09911547601222992, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 242880 + }, + { + "epoch": 0.93894481297645, + "grad_norm": 0.09677939116954803, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 242890 + }, + { + "epoch": 0.9389834701798333, + "grad_norm": 0.10465750843286514, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 242900 + }, + { + "epoch": 0.9390221273832166, + "grad_norm": 0.10974880307912827, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 242910 + }, + { + "epoch": 0.9390607845865999, + "grad_norm": 0.10110744088888168, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 242920 + }, + { + "epoch": 0.9390994417899832, + "grad_norm": 0.1309683620929718, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 242930 + }, + { + "epoch": 0.9391380989933664, + "grad_norm": 0.09194327145814896, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 242940 + }, + { + "epoch": 0.9391767561967497, + "grad_norm": 0.10804415494203568, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 242950 + }, + { + "epoch": 0.9392154134001329, + "grad_norm": 0.09277147799730301, + "learning_rate": 0.002, + "loss": 2.353, + "step": 242960 + }, + { + "epoch": 0.9392540706035163, + "grad_norm": 0.1043214276432991, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 242970 + }, + { + "epoch": 0.9392927278068995, + "grad_norm": 0.10557788610458374, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 242980 + }, + { + "epoch": 0.9393313850102828, + "grad_norm": 0.09209178388118744, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 242990 + }, + { + "epoch": 0.939370042213666, + "grad_norm": 0.09748727828264236, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 243000 + }, + { + "epoch": 0.9394086994170494, + "grad_norm": 0.10211475938558578, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 243010 + }, + { + "epoch": 0.9394473566204327, + "grad_norm": 0.11180869489908218, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 243020 + }, + { + "epoch": 0.9394860138238159, + "grad_norm": 0.15321184694766998, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 243030 + }, + { + "epoch": 0.9395246710271992, + "grad_norm": 0.1107967421412468, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 243040 + }, + { + "epoch": 0.9395633282305825, + "grad_norm": 0.10820520669221878, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 243050 + }, + { + "epoch": 0.9396019854339658, + "grad_norm": 0.10260884463787079, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 243060 + }, + { + "epoch": 0.939640642637349, + "grad_norm": 0.10140696167945862, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 243070 + }, + { + "epoch": 0.9396792998407323, + "grad_norm": 0.11433785408735275, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 243080 + }, + { + "epoch": 0.9397179570441156, + "grad_norm": 0.10506769269704819, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 243090 + }, + { + "epoch": 0.9397566142474989, + "grad_norm": 0.1130407378077507, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 243100 + }, + { + "epoch": 0.9397952714508822, + "grad_norm": 0.10419058799743652, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 243110 + }, + { + "epoch": 0.9398339286542654, + "grad_norm": 0.0975937694311142, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 243120 + }, + { + "epoch": 0.9398725858576488, + "grad_norm": 0.09650372713804245, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 243130 + }, + { + "epoch": 0.939911243061032, + "grad_norm": 0.10235495120286942, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 243140 + }, + { + "epoch": 0.9399499002644153, + "grad_norm": 0.10510300099849701, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 243150 + }, + { + "epoch": 0.9399885574677985, + "grad_norm": 0.1160840168595314, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 243160 + }, + { + "epoch": 0.9400272146711818, + "grad_norm": 0.10195966064929962, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 243170 + }, + { + "epoch": 0.9400658718745651, + "grad_norm": 0.13564079999923706, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 243180 + }, + { + "epoch": 0.9401045290779484, + "grad_norm": 0.1034989058971405, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 243190 + }, + { + "epoch": 0.9401431862813316, + "grad_norm": 0.09181249141693115, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 243200 + }, + { + "epoch": 0.9401818434847149, + "grad_norm": 0.11321096867322922, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 243210 + }, + { + "epoch": 0.9402205006880983, + "grad_norm": 0.34259989857673645, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 243220 + }, + { + "epoch": 0.9402591578914815, + "grad_norm": 0.11003011465072632, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 243230 + }, + { + "epoch": 0.9402978150948648, + "grad_norm": 0.09221770614385605, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 243240 + }, + { + "epoch": 0.940336472298248, + "grad_norm": 0.10950542986392975, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 243250 + }, + { + "epoch": 0.9403751295016314, + "grad_norm": 0.09502715617418289, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 243260 + }, + { + "epoch": 0.9404137867050146, + "grad_norm": 0.0961625799536705, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 243270 + }, + { + "epoch": 0.9404524439083979, + "grad_norm": 0.13180868327617645, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 243280 + }, + { + "epoch": 0.9404911011117811, + "grad_norm": 0.09879843890666962, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 243290 + }, + { + "epoch": 0.9405297583151645, + "grad_norm": 0.10887906700372696, + "learning_rate": 0.002, + "loss": 2.339, + "step": 243300 + }, + { + "epoch": 0.9405684155185478, + "grad_norm": 0.09494494646787643, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 243310 + }, + { + "epoch": 0.940607072721931, + "grad_norm": 0.11056619137525558, + "learning_rate": 0.002, + "loss": 2.336, + "step": 243320 + }, + { + "epoch": 0.9406457299253143, + "grad_norm": 0.11053516715765, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 243330 + }, + { + "epoch": 0.9406843871286975, + "grad_norm": 0.10389503836631775, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 243340 + }, + { + "epoch": 0.9407230443320809, + "grad_norm": 0.1147523745894432, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 243350 + }, + { + "epoch": 0.9407617015354641, + "grad_norm": 0.09236140549182892, + "learning_rate": 0.002, + "loss": 2.34, + "step": 243360 + }, + { + "epoch": 0.9408003587388474, + "grad_norm": 0.10874568670988083, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 243370 + }, + { + "epoch": 0.9408390159422306, + "grad_norm": 0.10968281328678131, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 243380 + }, + { + "epoch": 0.940877673145614, + "grad_norm": 0.10738930851221085, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 243390 + }, + { + "epoch": 0.9409163303489972, + "grad_norm": 0.10887566953897476, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 243400 + }, + { + "epoch": 0.9409549875523805, + "grad_norm": 0.09930963069200516, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 243410 + }, + { + "epoch": 0.9409936447557637, + "grad_norm": 0.10712359100580215, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 243420 + }, + { + "epoch": 0.9410323019591471, + "grad_norm": 0.10104658454656601, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 243430 + }, + { + "epoch": 0.9410709591625304, + "grad_norm": 0.09679016470909119, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 243440 + }, + { + "epoch": 0.9411096163659136, + "grad_norm": 0.11958494782447815, + "learning_rate": 0.002, + "loss": 2.34, + "step": 243450 + }, + { + "epoch": 0.9411482735692969, + "grad_norm": 0.10792674124240875, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 243460 + }, + { + "epoch": 0.9411869307726802, + "grad_norm": 0.1145927906036377, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 243470 + }, + { + "epoch": 0.9412255879760635, + "grad_norm": 0.10597149282693863, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 243480 + }, + { + "epoch": 0.9412642451794467, + "grad_norm": 0.09116151183843613, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 243490 + }, + { + "epoch": 0.94130290238283, + "grad_norm": 0.12441765516996384, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 243500 + }, + { + "epoch": 0.9413415595862134, + "grad_norm": 0.13671045005321503, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 243510 + }, + { + "epoch": 0.9413802167895966, + "grad_norm": 0.09610971063375473, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 243520 + }, + { + "epoch": 0.9414188739929799, + "grad_norm": 0.1013101115822792, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 243530 + }, + { + "epoch": 0.9414575311963631, + "grad_norm": 0.10055457800626755, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 243540 + }, + { + "epoch": 0.9414961883997464, + "grad_norm": 0.12221028655767441, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 243550 + }, + { + "epoch": 0.9415348456031297, + "grad_norm": 0.10323991626501083, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 243560 + }, + { + "epoch": 0.941573502806513, + "grad_norm": 0.103202685713768, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 243570 + }, + { + "epoch": 0.9416121600098962, + "grad_norm": 0.10308035463094711, + "learning_rate": 0.002, + "loss": 2.331, + "step": 243580 + }, + { + "epoch": 0.9416508172132795, + "grad_norm": 0.09553807973861694, + "learning_rate": 0.002, + "loss": 2.337, + "step": 243590 + }, + { + "epoch": 0.9416894744166628, + "grad_norm": 0.13683472573757172, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 243600 + }, + { + "epoch": 0.9417281316200461, + "grad_norm": 0.1049049124121666, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 243610 + }, + { + "epoch": 0.9417667888234293, + "grad_norm": 0.09826502948999405, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 243620 + }, + { + "epoch": 0.9418054460268126, + "grad_norm": 0.12797874212265015, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 243630 + }, + { + "epoch": 0.941844103230196, + "grad_norm": 0.10155646502971649, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 243640 + }, + { + "epoch": 0.9418827604335792, + "grad_norm": 0.10716505348682404, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 243650 + }, + { + "epoch": 0.9419214176369625, + "grad_norm": 0.13598762452602386, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 243660 + }, + { + "epoch": 0.9419600748403457, + "grad_norm": 0.08658332377672195, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 243670 + }, + { + "epoch": 0.9419987320437291, + "grad_norm": 0.12665392458438873, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 243680 + }, + { + "epoch": 0.9420373892471123, + "grad_norm": 0.10175671428442001, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 243690 + }, + { + "epoch": 0.9420760464504956, + "grad_norm": 0.10279948264360428, + "learning_rate": 0.002, + "loss": 2.34, + "step": 243700 + }, + { + "epoch": 0.9421147036538788, + "grad_norm": 0.09339248389005661, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 243710 + }, + { + "epoch": 0.9421533608572621, + "grad_norm": 0.09639550000429153, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 243720 + }, + { + "epoch": 0.9421920180606455, + "grad_norm": 0.09585197269916534, + "learning_rate": 0.002, + "loss": 2.333, + "step": 243730 + }, + { + "epoch": 0.9422306752640287, + "grad_norm": 0.09819405525922775, + "learning_rate": 0.002, + "loss": 2.3123, + "step": 243740 + }, + { + "epoch": 0.942269332467412, + "grad_norm": 0.11895592510700226, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 243750 + }, + { + "epoch": 0.9423079896707952, + "grad_norm": 0.10645242035388947, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 243760 + }, + { + "epoch": 0.9423466468741786, + "grad_norm": 0.10574015229940414, + "learning_rate": 0.002, + "loss": 2.335, + "step": 243770 + }, + { + "epoch": 0.9423853040775618, + "grad_norm": 0.11190999299287796, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 243780 + }, + { + "epoch": 0.9424239612809451, + "grad_norm": 0.11906064301729202, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 243790 + }, + { + "epoch": 0.9424626184843283, + "grad_norm": 0.10504163056612015, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 243800 + }, + { + "epoch": 0.9425012756877117, + "grad_norm": 0.0909421443939209, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 243810 + }, + { + "epoch": 0.942539932891095, + "grad_norm": 0.10777609050273895, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 243820 + }, + { + "epoch": 0.9425785900944782, + "grad_norm": 0.09569604694843292, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 243830 + }, + { + "epoch": 0.9426172472978614, + "grad_norm": 0.11758679896593094, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 243840 + }, + { + "epoch": 0.9426559045012448, + "grad_norm": 0.10496620833873749, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 243850 + }, + { + "epoch": 0.9426945617046281, + "grad_norm": 0.13090617954730988, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 243860 + }, + { + "epoch": 0.9427332189080113, + "grad_norm": 0.11926376819610596, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 243870 + }, + { + "epoch": 0.9427718761113946, + "grad_norm": 0.100855752825737, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 243880 + }, + { + "epoch": 0.9428105333147778, + "grad_norm": 0.10731156170368195, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 243890 + }, + { + "epoch": 0.9428491905181612, + "grad_norm": 0.09543273597955704, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 243900 + }, + { + "epoch": 0.9428878477215444, + "grad_norm": 0.08852459490299225, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 243910 + }, + { + "epoch": 0.9429265049249277, + "grad_norm": 0.13325460255146027, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 243920 + }, + { + "epoch": 0.9429651621283109, + "grad_norm": 0.0993916243314743, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 243930 + }, + { + "epoch": 0.9430038193316943, + "grad_norm": 0.10384912043809891, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 243940 + }, + { + "epoch": 0.9430424765350776, + "grad_norm": 0.13025276362895966, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 243950 + }, + { + "epoch": 0.9430811337384608, + "grad_norm": 0.21691864728927612, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 243960 + }, + { + "epoch": 0.9431197909418441, + "grad_norm": 0.09893051534891129, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 243970 + }, + { + "epoch": 0.9431584481452274, + "grad_norm": 0.11579374969005585, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 243980 + }, + { + "epoch": 0.9431971053486107, + "grad_norm": 0.09239792078733444, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 243990 + }, + { + "epoch": 0.9432357625519939, + "grad_norm": 0.1062285453081131, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 244000 + }, + { + "epoch": 0.9432744197553772, + "grad_norm": 0.08503497391939163, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 244010 + }, + { + "epoch": 0.9433130769587605, + "grad_norm": 0.1249237135052681, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 244020 + }, + { + "epoch": 0.9433517341621438, + "grad_norm": 0.10373333096504211, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 244030 + }, + { + "epoch": 0.943390391365527, + "grad_norm": 0.11731874942779541, + "learning_rate": 0.002, + "loss": 2.347, + "step": 244040 + }, + { + "epoch": 0.9434290485689103, + "grad_norm": 0.11248892545700073, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 244050 + }, + { + "epoch": 0.9434677057722937, + "grad_norm": 0.0989983081817627, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 244060 + }, + { + "epoch": 0.9435063629756769, + "grad_norm": 0.09739810228347778, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 244070 + }, + { + "epoch": 0.9435450201790602, + "grad_norm": 0.12019892036914825, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 244080 + }, + { + "epoch": 0.9435836773824434, + "grad_norm": 0.09014622122049332, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 244090 + }, + { + "epoch": 0.9436223345858267, + "grad_norm": 0.10269775241613388, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 244100 + }, + { + "epoch": 0.94366099178921, + "grad_norm": 0.1557008922100067, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 244110 + }, + { + "epoch": 0.9436996489925933, + "grad_norm": 0.18072141706943512, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 244120 + }, + { + "epoch": 0.9437383061959765, + "grad_norm": 0.11029327660799026, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 244130 + }, + { + "epoch": 0.9437769633993598, + "grad_norm": 0.11124139279127121, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 244140 + }, + { + "epoch": 0.9438156206027432, + "grad_norm": 0.09908004850149155, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 244150 + }, + { + "epoch": 0.9438542778061264, + "grad_norm": 0.1112242117524147, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 244160 + }, + { + "epoch": 0.9438929350095097, + "grad_norm": 0.09297651797533035, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 244170 + }, + { + "epoch": 0.9439315922128929, + "grad_norm": 0.12331829965114594, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 244180 + }, + { + "epoch": 0.9439702494162763, + "grad_norm": 0.10122538357973099, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 244190 + }, + { + "epoch": 0.9440089066196595, + "grad_norm": 0.11608784645795822, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 244200 + }, + { + "epoch": 0.9440475638230428, + "grad_norm": 0.12176360189914703, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 244210 + }, + { + "epoch": 0.944086221026426, + "grad_norm": 0.11390931904315948, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 244220 + }, + { + "epoch": 0.9441248782298094, + "grad_norm": 0.10589968413114548, + "learning_rate": 0.002, + "loss": 2.322, + "step": 244230 + }, + { + "epoch": 0.9441635354331926, + "grad_norm": 0.10728061944246292, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 244240 + }, + { + "epoch": 0.9442021926365759, + "grad_norm": 0.107032909989357, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 244250 + }, + { + "epoch": 0.9442408498399592, + "grad_norm": 0.09229406714439392, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 244260 + }, + { + "epoch": 0.9442795070433424, + "grad_norm": 0.09907346963882446, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 244270 + }, + { + "epoch": 0.9443181642467258, + "grad_norm": 0.10639627277851105, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 244280 + }, + { + "epoch": 0.944356821450109, + "grad_norm": 0.11334112286567688, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 244290 + }, + { + "epoch": 0.9443954786534923, + "grad_norm": 0.12863516807556152, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 244300 + }, + { + "epoch": 0.9444341358568755, + "grad_norm": 0.10524982959032059, + "learning_rate": 0.002, + "loss": 2.339, + "step": 244310 + }, + { + "epoch": 0.9444727930602589, + "grad_norm": 0.10224920511245728, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 244320 + }, + { + "epoch": 0.9445114502636421, + "grad_norm": 0.11514956504106522, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 244330 + }, + { + "epoch": 0.9445501074670254, + "grad_norm": 0.10550584644079208, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 244340 + }, + { + "epoch": 0.9445887646704086, + "grad_norm": 0.10284145176410675, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 244350 + }, + { + "epoch": 0.944627421873792, + "grad_norm": 0.12884284555912018, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 244360 + }, + { + "epoch": 0.9446660790771753, + "grad_norm": 0.1060062125325203, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 244370 + }, + { + "epoch": 0.9447047362805585, + "grad_norm": 0.12581922113895416, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 244380 + }, + { + "epoch": 0.9447433934839418, + "grad_norm": 0.11938612908124924, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 244390 + }, + { + "epoch": 0.9447820506873251, + "grad_norm": 0.1125989630818367, + "learning_rate": 0.002, + "loss": 2.344, + "step": 244400 + }, + { + "epoch": 0.9448207078907084, + "grad_norm": 0.11721620708703995, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 244410 + }, + { + "epoch": 0.9448593650940916, + "grad_norm": 0.09430734813213348, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 244420 + }, + { + "epoch": 0.9448980222974749, + "grad_norm": 0.09782515466213226, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 244430 + }, + { + "epoch": 0.9449366795008582, + "grad_norm": 0.09499527513980865, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 244440 + }, + { + "epoch": 0.9449753367042415, + "grad_norm": 0.15276183187961578, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 244450 + }, + { + "epoch": 0.9450139939076247, + "grad_norm": 0.10129997134208679, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 244460 + }, + { + "epoch": 0.945052651111008, + "grad_norm": 0.10709690302610397, + "learning_rate": 0.002, + "loss": 2.327, + "step": 244470 + }, + { + "epoch": 0.9450913083143913, + "grad_norm": 0.09405800700187683, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 244480 + }, + { + "epoch": 0.9451299655177746, + "grad_norm": 0.1307350993156433, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 244490 + }, + { + "epoch": 0.9451686227211579, + "grad_norm": 0.09851517528295517, + "learning_rate": 0.002, + "loss": 2.323, + "step": 244500 + }, + { + "epoch": 0.9452072799245411, + "grad_norm": 0.1036297008395195, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 244510 + }, + { + "epoch": 0.9452459371279244, + "grad_norm": 0.11205063760280609, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 244520 + }, + { + "epoch": 0.9452845943313077, + "grad_norm": 0.12469026446342468, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 244530 + }, + { + "epoch": 0.945323251534691, + "grad_norm": 0.11297151446342468, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 244540 + }, + { + "epoch": 0.9453619087380742, + "grad_norm": 0.13889361917972565, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 244550 + }, + { + "epoch": 0.9454005659414575, + "grad_norm": 0.10334096103906631, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 244560 + }, + { + "epoch": 0.9454392231448409, + "grad_norm": 0.0992535948753357, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 244570 + }, + { + "epoch": 0.9454778803482241, + "grad_norm": 0.09402686357498169, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 244580 + }, + { + "epoch": 0.9455165375516074, + "grad_norm": 0.14916792511940002, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 244590 + }, + { + "epoch": 0.9455551947549906, + "grad_norm": 0.10576571524143219, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 244600 + }, + { + "epoch": 0.945593851958374, + "grad_norm": 0.11637571454048157, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 244610 + }, + { + "epoch": 0.9456325091617572, + "grad_norm": 0.13626307249069214, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 244620 + }, + { + "epoch": 0.9456711663651405, + "grad_norm": 0.09241092205047607, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 244630 + }, + { + "epoch": 0.9457098235685237, + "grad_norm": 0.13166047632694244, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 244640 + }, + { + "epoch": 0.945748480771907, + "grad_norm": 0.11197197437286377, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 244650 + }, + { + "epoch": 0.9457871379752903, + "grad_norm": 0.09408937394618988, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 244660 + }, + { + "epoch": 0.9458257951786736, + "grad_norm": 0.09625637531280518, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 244670 + }, + { + "epoch": 0.9458644523820569, + "grad_norm": 0.3478195369243622, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 244680 + }, + { + "epoch": 0.9459031095854401, + "grad_norm": 0.10369176417589188, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 244690 + }, + { + "epoch": 0.9459417667888235, + "grad_norm": 0.09355360269546509, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 244700 + }, + { + "epoch": 0.9459804239922067, + "grad_norm": 0.10981659591197968, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 244710 + }, + { + "epoch": 0.94601908119559, + "grad_norm": 0.1039636954665184, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 244720 + }, + { + "epoch": 0.9460577383989732, + "grad_norm": 0.1111103966832161, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 244730 + }, + { + "epoch": 0.9460963956023566, + "grad_norm": 0.1148524358868599, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 244740 + }, + { + "epoch": 0.9461350528057398, + "grad_norm": 0.0972360372543335, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 244750 + }, + { + "epoch": 0.9461737100091231, + "grad_norm": 0.11069841682910919, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 244760 + }, + { + "epoch": 0.9462123672125063, + "grad_norm": 0.09903131425380707, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 244770 + }, + { + "epoch": 0.9462510244158897, + "grad_norm": 0.10690746456384659, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 244780 + }, + { + "epoch": 0.946289681619273, + "grad_norm": 0.12674789130687714, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 244790 + }, + { + "epoch": 0.9463283388226562, + "grad_norm": 0.10173221677541733, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 244800 + }, + { + "epoch": 0.9463669960260395, + "grad_norm": 0.12079965323209763, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 244810 + }, + { + "epoch": 0.9464056532294227, + "grad_norm": 0.10707973688840866, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 244820 + }, + { + "epoch": 0.9464443104328061, + "grad_norm": 0.0917050912976265, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 244830 + }, + { + "epoch": 0.9464829676361893, + "grad_norm": 0.10524480789899826, + "learning_rate": 0.002, + "loss": 2.34, + "step": 244840 + }, + { + "epoch": 0.9465216248395726, + "grad_norm": 0.10164222121238708, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 244850 + }, + { + "epoch": 0.9465602820429558, + "grad_norm": 0.09384514391422272, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 244860 + }, + { + "epoch": 0.9465989392463392, + "grad_norm": 0.10381849110126495, + "learning_rate": 0.002, + "loss": 2.3619, + "step": 244870 + }, + { + "epoch": 0.9466375964497225, + "grad_norm": 0.10301042348146439, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 244880 + }, + { + "epoch": 0.9466762536531057, + "grad_norm": 0.09503357857465744, + "learning_rate": 0.002, + "loss": 2.34, + "step": 244890 + }, + { + "epoch": 0.946714910856489, + "grad_norm": 0.164928138256073, + "learning_rate": 0.002, + "loss": 2.333, + "step": 244900 + }, + { + "epoch": 0.9467535680598723, + "grad_norm": 0.11334381252527237, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 244910 + }, + { + "epoch": 0.9467922252632556, + "grad_norm": 0.11500794440507889, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 244920 + }, + { + "epoch": 0.9468308824666388, + "grad_norm": 0.10959529131650925, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 244930 + }, + { + "epoch": 0.9468695396700221, + "grad_norm": 0.10082434117794037, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 244940 + }, + { + "epoch": 0.9469081968734054, + "grad_norm": 0.1229538545012474, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 244950 + }, + { + "epoch": 0.9469468540767887, + "grad_norm": 0.11257387697696686, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 244960 + }, + { + "epoch": 0.9469855112801719, + "grad_norm": 0.08525936305522919, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 244970 + }, + { + "epoch": 0.9470241684835552, + "grad_norm": 0.12079308927059174, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 244980 + }, + { + "epoch": 0.9470628256869386, + "grad_norm": 0.09743160009384155, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 244990 + }, + { + "epoch": 0.9471014828903218, + "grad_norm": 0.10047302395105362, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 245000 + }, + { + "epoch": 0.9471401400937051, + "grad_norm": 0.10854795575141907, + "learning_rate": 0.002, + "loss": 2.352, + "step": 245010 + }, + { + "epoch": 0.9471787972970883, + "grad_norm": 0.11618287861347198, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 245020 + }, + { + "epoch": 0.9472174545004716, + "grad_norm": 0.10113832354545593, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 245030 + }, + { + "epoch": 0.9472561117038549, + "grad_norm": 0.2170974165201187, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 245040 + }, + { + "epoch": 0.9472947689072382, + "grad_norm": 0.10892549902200699, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 245050 + }, + { + "epoch": 0.9473334261106214, + "grad_norm": 0.09819786250591278, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 245060 + }, + { + "epoch": 0.9473720833140047, + "grad_norm": 0.12030044943094254, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 245070 + }, + { + "epoch": 0.947410740517388, + "grad_norm": 0.1042487770318985, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 245080 + }, + { + "epoch": 0.9474493977207713, + "grad_norm": 0.11309467256069183, + "learning_rate": 0.002, + "loss": 2.337, + "step": 245090 + }, + { + "epoch": 0.9474880549241546, + "grad_norm": 0.10138697177171707, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 245100 + }, + { + "epoch": 0.9475267121275378, + "grad_norm": 0.09876588732004166, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 245110 + }, + { + "epoch": 0.9475653693309212, + "grad_norm": 0.11651773750782013, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 245120 + }, + { + "epoch": 0.9476040265343044, + "grad_norm": 0.11628615111112595, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 245130 + }, + { + "epoch": 0.9476426837376877, + "grad_norm": 0.10182247310876846, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 245140 + }, + { + "epoch": 0.9476813409410709, + "grad_norm": 0.09684959799051285, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 245150 + }, + { + "epoch": 0.9477199981444543, + "grad_norm": 0.10772944241762161, + "learning_rate": 0.002, + "loss": 2.342, + "step": 245160 + }, + { + "epoch": 0.9477586553478375, + "grad_norm": 0.10510464757680893, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 245170 + }, + { + "epoch": 0.9477973125512208, + "grad_norm": 0.10853858292102814, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 245180 + }, + { + "epoch": 0.947835969754604, + "grad_norm": 0.10508356243371964, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 245190 + }, + { + "epoch": 0.9478746269579873, + "grad_norm": 0.10015291720628738, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 245200 + }, + { + "epoch": 0.9479132841613707, + "grad_norm": 0.10214115679264069, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 245210 + }, + { + "epoch": 0.9479519413647539, + "grad_norm": 0.11255674064159393, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 245220 + }, + { + "epoch": 0.9479905985681372, + "grad_norm": 0.10145910829305649, + "learning_rate": 0.002, + "loss": 2.349, + "step": 245230 + }, + { + "epoch": 0.9480292557715204, + "grad_norm": 0.11434584110975266, + "learning_rate": 0.002, + "loss": 2.3633, + "step": 245240 + }, + { + "epoch": 0.9480679129749038, + "grad_norm": 0.11106500774621964, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 245250 + }, + { + "epoch": 0.948106570178287, + "grad_norm": 0.09371323138475418, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 245260 + }, + { + "epoch": 0.9481452273816703, + "grad_norm": 0.10655788332223892, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 245270 + }, + { + "epoch": 0.9481838845850535, + "grad_norm": 0.10724660009145737, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 245280 + }, + { + "epoch": 0.9482225417884369, + "grad_norm": 0.10476400703191757, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 245290 + }, + { + "epoch": 0.9482611989918202, + "grad_norm": 0.10649838298559189, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 245300 + }, + { + "epoch": 0.9482998561952034, + "grad_norm": 0.10631929337978363, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 245310 + }, + { + "epoch": 0.9483385133985867, + "grad_norm": 0.11776553839445114, + "learning_rate": 0.002, + "loss": 2.344, + "step": 245320 + }, + { + "epoch": 0.94837717060197, + "grad_norm": 0.11228649318218231, + "learning_rate": 0.002, + "loss": 2.336, + "step": 245330 + }, + { + "epoch": 0.9484158278053533, + "grad_norm": 0.10142796486616135, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 245340 + }, + { + "epoch": 0.9484544850087365, + "grad_norm": 0.11260076612234116, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 245350 + }, + { + "epoch": 0.9484931422121198, + "grad_norm": 0.10552968829870224, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 245360 + }, + { + "epoch": 0.948531799415503, + "grad_norm": 0.08846011757850647, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 245370 + }, + { + "epoch": 0.9485704566188864, + "grad_norm": 0.11995434761047363, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 245380 + }, + { + "epoch": 0.9486091138222696, + "grad_norm": 0.10945376753807068, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 245390 + }, + { + "epoch": 0.9486477710256529, + "grad_norm": 0.10176488757133484, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 245400 + }, + { + "epoch": 0.9486864282290361, + "grad_norm": 0.10432841628789902, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 245410 + }, + { + "epoch": 0.9487250854324195, + "grad_norm": 0.10382282733917236, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 245420 + }, + { + "epoch": 0.9487637426358028, + "grad_norm": 0.09938489645719528, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 245430 + }, + { + "epoch": 0.948802399839186, + "grad_norm": 0.10734734684228897, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 245440 + }, + { + "epoch": 0.9488410570425693, + "grad_norm": 0.11344361305236816, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 245450 + }, + { + "epoch": 0.9488797142459526, + "grad_norm": 0.09626557677984238, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 245460 + }, + { + "epoch": 0.9489183714493359, + "grad_norm": 0.10190510004758835, + "learning_rate": 0.002, + "loss": 2.336, + "step": 245470 + }, + { + "epoch": 0.9489570286527191, + "grad_norm": 0.10002933442592621, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 245480 + }, + { + "epoch": 0.9489956858561024, + "grad_norm": 0.09390109032392502, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 245490 + }, + { + "epoch": 0.9490343430594858, + "grad_norm": 0.1125788688659668, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 245500 + }, + { + "epoch": 0.949073000262869, + "grad_norm": 0.09098855406045914, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 245510 + }, + { + "epoch": 0.9491116574662523, + "grad_norm": 0.09397424012422562, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 245520 + }, + { + "epoch": 0.9491503146696355, + "grad_norm": 0.1099240854382515, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 245530 + }, + { + "epoch": 0.9491889718730189, + "grad_norm": 0.10843119025230408, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 245540 + }, + { + "epoch": 0.9492276290764021, + "grad_norm": 0.11342742294073105, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 245550 + }, + { + "epoch": 0.9492662862797854, + "grad_norm": 0.1284036487340927, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 245560 + }, + { + "epoch": 0.9493049434831686, + "grad_norm": 0.09022320806980133, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 245570 + }, + { + "epoch": 0.9493436006865519, + "grad_norm": 0.09618698060512543, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 245580 + }, + { + "epoch": 0.9493822578899352, + "grad_norm": 0.09820694476366043, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 245590 + }, + { + "epoch": 0.9494209150933185, + "grad_norm": 0.10224698483943939, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 245600 + }, + { + "epoch": 0.9494595722967017, + "grad_norm": 0.11915197968482971, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 245610 + }, + { + "epoch": 0.949498229500085, + "grad_norm": 0.11021997034549713, + "learning_rate": 0.002, + "loss": 2.344, + "step": 245620 + }, + { + "epoch": 0.9495368867034684, + "grad_norm": 0.11132847517728806, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 245630 + }, + { + "epoch": 0.9495755439068516, + "grad_norm": 0.1295585036277771, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 245640 + }, + { + "epoch": 0.9496142011102349, + "grad_norm": 0.10676326602697372, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 245650 + }, + { + "epoch": 0.9496528583136181, + "grad_norm": 0.10247169435024261, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 245660 + }, + { + "epoch": 0.9496915155170015, + "grad_norm": 0.11274540424346924, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 245670 + }, + { + "epoch": 0.9497301727203847, + "grad_norm": 0.0945650115609169, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 245680 + }, + { + "epoch": 0.949768829923768, + "grad_norm": 0.09697515517473221, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 245690 + }, + { + "epoch": 0.9498074871271512, + "grad_norm": 0.10053026676177979, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 245700 + }, + { + "epoch": 0.9498461443305346, + "grad_norm": 0.10225459188222885, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 245710 + }, + { + "epoch": 0.9498848015339179, + "grad_norm": 0.10824661701917648, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 245720 + }, + { + "epoch": 0.9499234587373011, + "grad_norm": 0.09760218113660812, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 245730 + }, + { + "epoch": 0.9499621159406844, + "grad_norm": 0.09142498672008514, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 245740 + }, + { + "epoch": 0.9500007731440676, + "grad_norm": 0.11499843001365662, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 245750 + }, + { + "epoch": 0.950039430347451, + "grad_norm": 0.09203146398067474, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 245760 + }, + { + "epoch": 0.9500780875508342, + "grad_norm": 0.11725558340549469, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 245770 + }, + { + "epoch": 0.9501167447542175, + "grad_norm": 0.11182353645563126, + "learning_rate": 0.002, + "loss": 2.325, + "step": 245780 + }, + { + "epoch": 0.9501554019576007, + "grad_norm": 0.09739293158054352, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 245790 + }, + { + "epoch": 0.9501940591609841, + "grad_norm": 0.11021154373884201, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 245800 + }, + { + "epoch": 0.9502327163643673, + "grad_norm": 0.0996776819229126, + "learning_rate": 0.002, + "loss": 2.336, + "step": 245810 + }, + { + "epoch": 0.9502713735677506, + "grad_norm": 0.10235131531953812, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 245820 + }, + { + "epoch": 0.9503100307711339, + "grad_norm": 0.10729663819074631, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 245830 + }, + { + "epoch": 0.9503486879745172, + "grad_norm": 0.10996519029140472, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 245840 + }, + { + "epoch": 0.9503873451779005, + "grad_norm": 0.10459493100643158, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 245850 + }, + { + "epoch": 0.9504260023812837, + "grad_norm": 0.11086221784353256, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 245860 + }, + { + "epoch": 0.950464659584667, + "grad_norm": 0.0975433811545372, + "learning_rate": 0.002, + "loss": 2.336, + "step": 245870 + }, + { + "epoch": 0.9505033167880503, + "grad_norm": 0.10561605542898178, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 245880 + }, + { + "epoch": 0.9505419739914336, + "grad_norm": 0.11538262665271759, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 245890 + }, + { + "epoch": 0.9505806311948168, + "grad_norm": 0.20161397755146027, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 245900 + }, + { + "epoch": 0.9506192883982001, + "grad_norm": 0.11669044941663742, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 245910 + }, + { + "epoch": 0.9506579456015835, + "grad_norm": 0.09776271134614944, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 245920 + }, + { + "epoch": 0.9506966028049667, + "grad_norm": 0.1042805090546608, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 245930 + }, + { + "epoch": 0.95073526000835, + "grad_norm": 0.15709929168224335, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 245940 + }, + { + "epoch": 0.9507739172117332, + "grad_norm": 0.11055158823728561, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 245950 + }, + { + "epoch": 0.9508125744151165, + "grad_norm": 0.12504521012306213, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 245960 + }, + { + "epoch": 0.9508512316184998, + "grad_norm": 0.09743013232946396, + "learning_rate": 0.002, + "loss": 2.3701, + "step": 245970 + }, + { + "epoch": 0.9508898888218831, + "grad_norm": 0.11348041146993637, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 245980 + }, + { + "epoch": 0.9509285460252663, + "grad_norm": 0.0906619057059288, + "learning_rate": 0.002, + "loss": 2.326, + "step": 245990 + }, + { + "epoch": 0.9509672032286496, + "grad_norm": 0.10841189324855804, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 246000 + }, + { + "epoch": 0.951005860432033, + "grad_norm": 0.09922589361667633, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 246010 + }, + { + "epoch": 0.9510445176354162, + "grad_norm": 0.11780747771263123, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 246020 + }, + { + "epoch": 0.9510831748387994, + "grad_norm": 0.11119019240140915, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 246030 + }, + { + "epoch": 0.9511218320421827, + "grad_norm": 0.09635666012763977, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 246040 + }, + { + "epoch": 0.9511604892455661, + "grad_norm": 0.10957963764667511, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 246050 + }, + { + "epoch": 0.9511991464489493, + "grad_norm": 0.10559819638729095, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 246060 + }, + { + "epoch": 0.9512378036523326, + "grad_norm": 0.12483032792806625, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 246070 + }, + { + "epoch": 0.9512764608557158, + "grad_norm": 0.09919989109039307, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 246080 + }, + { + "epoch": 0.9513151180590992, + "grad_norm": 0.14158926904201508, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 246090 + }, + { + "epoch": 0.9513537752624824, + "grad_norm": 0.09532709419727325, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 246100 + }, + { + "epoch": 0.9513924324658657, + "grad_norm": 0.11871691048145294, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 246110 + }, + { + "epoch": 0.9514310896692489, + "grad_norm": 0.10590434819459915, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 246120 + }, + { + "epoch": 0.9514697468726322, + "grad_norm": 0.09818745404481888, + "learning_rate": 0.002, + "loss": 2.345, + "step": 246130 + }, + { + "epoch": 0.9515084040760156, + "grad_norm": 0.09181661903858185, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 246140 + }, + { + "epoch": 0.9515470612793988, + "grad_norm": 0.11111031472682953, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 246150 + }, + { + "epoch": 0.9515857184827821, + "grad_norm": 0.10768986493349075, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 246160 + }, + { + "epoch": 0.9516243756861653, + "grad_norm": 0.11306337267160416, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 246170 + }, + { + "epoch": 0.9516630328895487, + "grad_norm": 0.12405366450548172, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 246180 + }, + { + "epoch": 0.9517016900929319, + "grad_norm": 0.12463579326868057, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 246190 + }, + { + "epoch": 0.9517403472963152, + "grad_norm": 0.09911695122718811, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 246200 + }, + { + "epoch": 0.9517790044996984, + "grad_norm": 0.09321106225252151, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 246210 + }, + { + "epoch": 0.9518176617030818, + "grad_norm": 0.10201654583215714, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 246220 + }, + { + "epoch": 0.951856318906465, + "grad_norm": 0.11066223680973053, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 246230 + }, + { + "epoch": 0.9518949761098483, + "grad_norm": 0.10625241696834564, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 246240 + }, + { + "epoch": 0.9519336333132316, + "grad_norm": 0.10942903161048889, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 246250 + }, + { + "epoch": 0.9519722905166149, + "grad_norm": 0.08949451148509979, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 246260 + }, + { + "epoch": 0.9520109477199982, + "grad_norm": 0.10243234783411026, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 246270 + }, + { + "epoch": 0.9520496049233814, + "grad_norm": 0.09917983412742615, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 246280 + }, + { + "epoch": 0.9520882621267647, + "grad_norm": 0.37809163331985474, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 246290 + }, + { + "epoch": 0.9521269193301479, + "grad_norm": 0.09876684844493866, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 246300 + }, + { + "epoch": 0.9521655765335313, + "grad_norm": 0.09615564346313477, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 246310 + }, + { + "epoch": 0.9522042337369145, + "grad_norm": 0.09892557561397552, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 246320 + }, + { + "epoch": 0.9522428909402978, + "grad_norm": 0.12730903923511505, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 246330 + }, + { + "epoch": 0.952281548143681, + "grad_norm": 0.09999363869428635, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 246340 + }, + { + "epoch": 0.9523202053470644, + "grad_norm": 0.11508851498365402, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 246350 + }, + { + "epoch": 0.9523588625504477, + "grad_norm": 0.10577461868524551, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 246360 + }, + { + "epoch": 0.9523975197538309, + "grad_norm": 0.08935672789812088, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 246370 + }, + { + "epoch": 0.9524361769572142, + "grad_norm": 0.11016064137220383, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 246380 + }, + { + "epoch": 0.9524748341605975, + "grad_norm": 0.10751229524612427, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 246390 + }, + { + "epoch": 0.9525134913639808, + "grad_norm": 0.10001815110445023, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 246400 + }, + { + "epoch": 0.952552148567364, + "grad_norm": 0.10038241744041443, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 246410 + }, + { + "epoch": 0.9525908057707473, + "grad_norm": 0.10372153669595718, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 246420 + }, + { + "epoch": 0.9526294629741306, + "grad_norm": 0.10932927578687668, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 246430 + }, + { + "epoch": 0.9526681201775139, + "grad_norm": 0.13651612401008606, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 246440 + }, + { + "epoch": 0.9527067773808972, + "grad_norm": 0.09618911892175674, + "learning_rate": 0.002, + "loss": 2.343, + "step": 246450 + }, + { + "epoch": 0.9527454345842804, + "grad_norm": 0.10881288349628448, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 246460 + }, + { + "epoch": 0.9527840917876638, + "grad_norm": 0.09795135259628296, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 246470 + }, + { + "epoch": 0.952822748991047, + "grad_norm": 0.12024227529764175, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 246480 + }, + { + "epoch": 0.9528614061944303, + "grad_norm": 0.10273991525173187, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 246490 + }, + { + "epoch": 0.9529000633978135, + "grad_norm": 0.08923361450433731, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 246500 + }, + { + "epoch": 0.9529387206011968, + "grad_norm": 0.1068204715847969, + "learning_rate": 0.002, + "loss": 2.336, + "step": 246510 + }, + { + "epoch": 0.9529773778045801, + "grad_norm": 0.11845804005861282, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 246520 + }, + { + "epoch": 0.9530160350079634, + "grad_norm": 0.09708742052316666, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 246530 + }, + { + "epoch": 0.9530546922113466, + "grad_norm": 0.09450454264879227, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 246540 + }, + { + "epoch": 0.9530933494147299, + "grad_norm": 0.09565430879592896, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 246550 + }, + { + "epoch": 0.9531320066181133, + "grad_norm": 0.11724113672971725, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 246560 + }, + { + "epoch": 0.9531706638214965, + "grad_norm": 0.10473284125328064, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 246570 + }, + { + "epoch": 0.9532093210248798, + "grad_norm": 0.11020340025424957, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 246580 + }, + { + "epoch": 0.953247978228263, + "grad_norm": 0.09498760104179382, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 246590 + }, + { + "epoch": 0.9532866354316464, + "grad_norm": 0.09037910401821136, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 246600 + }, + { + "epoch": 0.9533252926350296, + "grad_norm": 0.09975023567676544, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 246610 + }, + { + "epoch": 0.9533639498384129, + "grad_norm": 0.1050228402018547, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 246620 + }, + { + "epoch": 0.9534026070417961, + "grad_norm": 0.10599356889724731, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 246630 + }, + { + "epoch": 0.9534412642451795, + "grad_norm": 0.10019141435623169, + "learning_rate": 0.002, + "loss": 2.325, + "step": 246640 + }, + { + "epoch": 0.9534799214485628, + "grad_norm": 0.10532790422439575, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 246650 + }, + { + "epoch": 0.953518578651946, + "grad_norm": 0.11821125447750092, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 246660 + }, + { + "epoch": 0.9535572358553293, + "grad_norm": 0.10337822139263153, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 246670 + }, + { + "epoch": 0.9535958930587125, + "grad_norm": 0.13625840842723846, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 246680 + }, + { + "epoch": 0.9536345502620959, + "grad_norm": 0.11718141287565231, + "learning_rate": 0.002, + "loss": 2.3599, + "step": 246690 + }, + { + "epoch": 0.9536732074654791, + "grad_norm": 0.10749223828315735, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 246700 + }, + { + "epoch": 0.9537118646688624, + "grad_norm": 0.13698039948940277, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 246710 + }, + { + "epoch": 0.9537505218722456, + "grad_norm": 0.10092923790216446, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 246720 + }, + { + "epoch": 0.953789179075629, + "grad_norm": 0.10021331161260605, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 246730 + }, + { + "epoch": 0.9538278362790122, + "grad_norm": 0.09901577979326248, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 246740 + }, + { + "epoch": 0.9538664934823955, + "grad_norm": 0.11905761808156967, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 246750 + }, + { + "epoch": 0.9539051506857787, + "grad_norm": 0.09991281479597092, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 246760 + }, + { + "epoch": 0.9539438078891621, + "grad_norm": 0.11390586197376251, + "learning_rate": 0.002, + "loss": 2.343, + "step": 246770 + }, + { + "epoch": 0.9539824650925454, + "grad_norm": 0.09168101847171783, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 246780 + }, + { + "epoch": 0.9540211222959286, + "grad_norm": 0.09830158948898315, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 246790 + }, + { + "epoch": 0.9540597794993119, + "grad_norm": 0.11884202063083649, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 246800 + }, + { + "epoch": 0.9540984367026952, + "grad_norm": 0.08972768485546112, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 246810 + }, + { + "epoch": 0.9541370939060785, + "grad_norm": 0.10713957995176315, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 246820 + }, + { + "epoch": 0.9541757511094617, + "grad_norm": 0.09925784915685654, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 246830 + }, + { + "epoch": 0.954214408312845, + "grad_norm": 0.10352442413568497, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 246840 + }, + { + "epoch": 0.9542530655162283, + "grad_norm": 0.1348172277212143, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 246850 + }, + { + "epoch": 0.9542917227196116, + "grad_norm": 0.09868699312210083, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 246860 + }, + { + "epoch": 0.9543303799229949, + "grad_norm": 0.08707798272371292, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 246870 + }, + { + "epoch": 0.9543690371263781, + "grad_norm": 0.10370556265115738, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 246880 + }, + { + "epoch": 0.9544076943297614, + "grad_norm": 0.10752521455287933, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 246890 + }, + { + "epoch": 0.9544463515331447, + "grad_norm": 0.10657340288162231, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 246900 + }, + { + "epoch": 0.954485008736528, + "grad_norm": 0.09765563160181046, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 246910 + }, + { + "epoch": 0.9545236659399112, + "grad_norm": 0.11601200699806213, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 246920 + }, + { + "epoch": 0.9545623231432945, + "grad_norm": 0.10320843011140823, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 246930 + }, + { + "epoch": 0.9546009803466778, + "grad_norm": 0.09450047463178635, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 246940 + }, + { + "epoch": 0.9546396375500611, + "grad_norm": 0.11228545755147934, + "learning_rate": 0.002, + "loss": 2.338, + "step": 246950 + }, + { + "epoch": 0.9546782947534443, + "grad_norm": 0.10331697016954422, + "learning_rate": 0.002, + "loss": 2.344, + "step": 246960 + }, + { + "epoch": 0.9547169519568276, + "grad_norm": 0.10116118937730789, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 246970 + }, + { + "epoch": 0.954755609160211, + "grad_norm": 0.11346136033535004, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 246980 + }, + { + "epoch": 0.9547942663635942, + "grad_norm": 0.1188848614692688, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 246990 + }, + { + "epoch": 0.9548329235669775, + "grad_norm": 0.11899816989898682, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 247000 + }, + { + "epoch": 0.9548715807703607, + "grad_norm": 0.0926302894949913, + "learning_rate": 0.002, + "loss": 2.339, + "step": 247010 + }, + { + "epoch": 0.9549102379737441, + "grad_norm": 0.11829515546560287, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 247020 + }, + { + "epoch": 0.9549488951771273, + "grad_norm": 0.10293518751859665, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 247030 + }, + { + "epoch": 0.9549875523805106, + "grad_norm": 0.1046319380402565, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 247040 + }, + { + "epoch": 0.9550262095838938, + "grad_norm": 0.11046954244375229, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 247050 + }, + { + "epoch": 0.9550648667872771, + "grad_norm": 0.10719576478004456, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 247060 + }, + { + "epoch": 0.9551035239906605, + "grad_norm": 0.1009703278541565, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 247070 + }, + { + "epoch": 0.9551421811940437, + "grad_norm": 0.09082961827516556, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 247080 + }, + { + "epoch": 0.955180838397427, + "grad_norm": 0.11020997166633606, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 247090 + }, + { + "epoch": 0.9552194956008102, + "grad_norm": 0.10537709295749664, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 247100 + }, + { + "epoch": 0.9552581528041936, + "grad_norm": 0.10682892054319382, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 247110 + }, + { + "epoch": 0.9552968100075768, + "grad_norm": 0.10063782334327698, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 247120 + }, + { + "epoch": 0.9553354672109601, + "grad_norm": 0.10315974056720734, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 247130 + }, + { + "epoch": 0.9553741244143433, + "grad_norm": 0.09505379945039749, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 247140 + }, + { + "epoch": 0.9554127816177267, + "grad_norm": 0.10029108077287674, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 247150 + }, + { + "epoch": 0.9554514388211099, + "grad_norm": 0.0997537150979042, + "learning_rate": 0.002, + "loss": 2.348, + "step": 247160 + }, + { + "epoch": 0.9554900960244932, + "grad_norm": 0.10752159357070923, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 247170 + }, + { + "epoch": 0.9555287532278764, + "grad_norm": 0.10309893637895584, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 247180 + }, + { + "epoch": 0.9555674104312598, + "grad_norm": 0.10903715342283249, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 247190 + }, + { + "epoch": 0.9556060676346431, + "grad_norm": 0.09624198824167252, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 247200 + }, + { + "epoch": 0.9556447248380263, + "grad_norm": 0.11103501915931702, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 247210 + }, + { + "epoch": 0.9556833820414096, + "grad_norm": 0.11439616978168488, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 247220 + }, + { + "epoch": 0.9557220392447928, + "grad_norm": 0.14776815474033356, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 247230 + }, + { + "epoch": 0.9557606964481762, + "grad_norm": 0.09352473169565201, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 247240 + }, + { + "epoch": 0.9557993536515594, + "grad_norm": 0.10370460897684097, + "learning_rate": 0.002, + "loss": 2.333, + "step": 247250 + }, + { + "epoch": 0.9558380108549427, + "grad_norm": 0.12870363891124725, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 247260 + }, + { + "epoch": 0.9558766680583259, + "grad_norm": 0.10398175567388535, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 247270 + }, + { + "epoch": 0.9559153252617093, + "grad_norm": 0.09815921634435654, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 247280 + }, + { + "epoch": 0.9559539824650926, + "grad_norm": 0.20846377313137054, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 247290 + }, + { + "epoch": 0.9559926396684758, + "grad_norm": 0.10216788202524185, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 247300 + }, + { + "epoch": 0.9560312968718591, + "grad_norm": 0.09926524013280869, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 247310 + }, + { + "epoch": 0.9560699540752424, + "grad_norm": 0.11557048559188843, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 247320 + }, + { + "epoch": 0.9561086112786257, + "grad_norm": 0.12979860603809357, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 247330 + }, + { + "epoch": 0.9561472684820089, + "grad_norm": 0.09846516698598862, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 247340 + }, + { + "epoch": 0.9561859256853922, + "grad_norm": 0.10615224391222, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 247350 + }, + { + "epoch": 0.9562245828887755, + "grad_norm": 0.1250700205564499, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 247360 + }, + { + "epoch": 0.9562632400921588, + "grad_norm": 0.0990319475531578, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 247370 + }, + { + "epoch": 0.956301897295542, + "grad_norm": 0.1047218069434166, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 247380 + }, + { + "epoch": 0.9563405544989253, + "grad_norm": 0.11436039954423904, + "learning_rate": 0.002, + "loss": 2.348, + "step": 247390 + }, + { + "epoch": 0.9563792117023087, + "grad_norm": 0.2365836203098297, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 247400 + }, + { + "epoch": 0.9564178689056919, + "grad_norm": 0.09409108012914658, + "learning_rate": 0.002, + "loss": 2.336, + "step": 247410 + }, + { + "epoch": 0.9564565261090752, + "grad_norm": 0.09443030506372452, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 247420 + }, + { + "epoch": 0.9564951833124584, + "grad_norm": 0.10896280407905579, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 247430 + }, + { + "epoch": 0.9565338405158417, + "grad_norm": 0.11036523431539536, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 247440 + }, + { + "epoch": 0.956572497719225, + "grad_norm": 0.09874240309000015, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 247450 + }, + { + "epoch": 0.9566111549226083, + "grad_norm": 0.11801275610923767, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 247460 + }, + { + "epoch": 0.9566498121259915, + "grad_norm": 0.09368408471345901, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 247470 + }, + { + "epoch": 0.9566884693293748, + "grad_norm": 0.11651654541492462, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 247480 + }, + { + "epoch": 0.9567271265327582, + "grad_norm": 0.09981834888458252, + "learning_rate": 0.002, + "loss": 2.335, + "step": 247490 + }, + { + "epoch": 0.9567657837361414, + "grad_norm": 0.11089000105857849, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 247500 + }, + { + "epoch": 0.9568044409395247, + "grad_norm": 0.11288763582706451, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 247510 + }, + { + "epoch": 0.9568430981429079, + "grad_norm": 0.12972544133663177, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 247520 + }, + { + "epoch": 0.9568817553462913, + "grad_norm": 0.11114118248224258, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 247530 + }, + { + "epoch": 0.9569204125496745, + "grad_norm": 0.09732469916343689, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 247540 + }, + { + "epoch": 0.9569590697530578, + "grad_norm": 0.09808913618326187, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 247550 + }, + { + "epoch": 0.956997726956441, + "grad_norm": 0.12151769548654556, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 247560 + }, + { + "epoch": 0.9570363841598244, + "grad_norm": 0.10195376724004745, + "learning_rate": 0.002, + "loss": 2.343, + "step": 247570 + }, + { + "epoch": 0.9570750413632076, + "grad_norm": 0.11638251692056656, + "learning_rate": 0.002, + "loss": 2.336, + "step": 247580 + }, + { + "epoch": 0.9571136985665909, + "grad_norm": 0.09858639538288116, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 247590 + }, + { + "epoch": 0.9571523557699742, + "grad_norm": 0.09813068062067032, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 247600 + }, + { + "epoch": 0.9571910129733574, + "grad_norm": 0.1054849699139595, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 247610 + }, + { + "epoch": 0.9572296701767408, + "grad_norm": 0.11740526556968689, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 247620 + }, + { + "epoch": 0.957268327380124, + "grad_norm": 0.09919170290231705, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 247630 + }, + { + "epoch": 0.9573069845835073, + "grad_norm": 0.09892688691616058, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 247640 + }, + { + "epoch": 0.9573456417868905, + "grad_norm": 0.09781397879123688, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 247650 + }, + { + "epoch": 0.9573842989902739, + "grad_norm": 0.09282873570919037, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 247660 + }, + { + "epoch": 0.9574229561936571, + "grad_norm": 0.08740855753421783, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 247670 + }, + { + "epoch": 0.9574616133970404, + "grad_norm": 0.11887798458337784, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 247680 + }, + { + "epoch": 0.9575002706004236, + "grad_norm": 0.1058524027466774, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 247690 + }, + { + "epoch": 0.957538927803807, + "grad_norm": 0.0968957170844078, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 247700 + }, + { + "epoch": 0.9575775850071903, + "grad_norm": 0.1604377031326294, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 247710 + }, + { + "epoch": 0.9576162422105735, + "grad_norm": 0.12406475096940994, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 247720 + }, + { + "epoch": 0.9576548994139568, + "grad_norm": 0.12045074254274368, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 247730 + }, + { + "epoch": 0.9576935566173401, + "grad_norm": 0.0910901427268982, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 247740 + }, + { + "epoch": 0.9577322138207234, + "grad_norm": 0.11357443779706955, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 247750 + }, + { + "epoch": 0.9577708710241066, + "grad_norm": 0.09268621355295181, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 247760 + }, + { + "epoch": 0.9578095282274899, + "grad_norm": 0.12930162250995636, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 247770 + }, + { + "epoch": 0.9578481854308731, + "grad_norm": 0.10533638298511505, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 247780 + }, + { + "epoch": 0.9578868426342565, + "grad_norm": 0.1110549047589302, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 247790 + }, + { + "epoch": 0.9579254998376397, + "grad_norm": 0.11005035787820816, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 247800 + }, + { + "epoch": 0.957964157041023, + "grad_norm": 0.10834237188100815, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 247810 + }, + { + "epoch": 0.9580028142444063, + "grad_norm": 0.0905105397105217, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 247820 + }, + { + "epoch": 0.9580414714477896, + "grad_norm": 0.0982719212770462, + "learning_rate": 0.002, + "loss": 2.349, + "step": 247830 + }, + { + "epoch": 0.9580801286511729, + "grad_norm": 0.0960426852107048, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 247840 + }, + { + "epoch": 0.9581187858545561, + "grad_norm": 0.0999850258231163, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 247850 + }, + { + "epoch": 0.9581574430579394, + "grad_norm": 0.11025793850421906, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 247860 + }, + { + "epoch": 0.9581961002613227, + "grad_norm": 0.08891669660806656, + "learning_rate": 0.002, + "loss": 2.338, + "step": 247870 + }, + { + "epoch": 0.958234757464706, + "grad_norm": 0.10581798851490021, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 247880 + }, + { + "epoch": 0.9582734146680892, + "grad_norm": 0.11585883796215057, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 247890 + }, + { + "epoch": 0.9583120718714725, + "grad_norm": 0.09135353565216064, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 247900 + }, + { + "epoch": 0.9583507290748559, + "grad_norm": 0.10126357525587082, + "learning_rate": 0.002, + "loss": 2.352, + "step": 247910 + }, + { + "epoch": 0.9583893862782391, + "grad_norm": 0.09940259158611298, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 247920 + }, + { + "epoch": 0.9584280434816224, + "grad_norm": 0.11455532908439636, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 247930 + }, + { + "epoch": 0.9584667006850056, + "grad_norm": 0.10387568175792694, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 247940 + }, + { + "epoch": 0.958505357888389, + "grad_norm": 0.10172967612743378, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 247950 + }, + { + "epoch": 0.9585440150917722, + "grad_norm": 0.11757977306842804, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 247960 + }, + { + "epoch": 0.9585826722951555, + "grad_norm": 0.1099797710776329, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 247970 + }, + { + "epoch": 0.9586213294985387, + "grad_norm": 0.12944968044757843, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 247980 + }, + { + "epoch": 0.958659986701922, + "grad_norm": 0.09926500171422958, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 247990 + }, + { + "epoch": 0.9586986439053053, + "grad_norm": 0.13880828022956848, + "learning_rate": 0.002, + "loss": 2.355, + "step": 248000 + }, + { + "epoch": 0.9587373011086886, + "grad_norm": 0.10396645218133926, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 248010 + }, + { + "epoch": 0.9587759583120719, + "grad_norm": 0.1147075742483139, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 248020 + }, + { + "epoch": 0.9588146155154551, + "grad_norm": 0.11550748348236084, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 248030 + }, + { + "epoch": 0.9588532727188385, + "grad_norm": 0.10915172845125198, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 248040 + }, + { + "epoch": 0.9588919299222217, + "grad_norm": 0.09901692718267441, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 248050 + }, + { + "epoch": 0.958930587125605, + "grad_norm": 0.1246907114982605, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 248060 + }, + { + "epoch": 0.9589692443289882, + "grad_norm": 0.10774242877960205, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 248070 + }, + { + "epoch": 0.9590079015323716, + "grad_norm": 0.11088234186172485, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 248080 + }, + { + "epoch": 0.9590465587357548, + "grad_norm": 0.11602111160755157, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 248090 + }, + { + "epoch": 0.9590852159391381, + "grad_norm": 0.11179874837398529, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 248100 + }, + { + "epoch": 0.9591238731425213, + "grad_norm": 0.11069482564926147, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 248110 + }, + { + "epoch": 0.9591625303459047, + "grad_norm": 0.09340058267116547, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 248120 + }, + { + "epoch": 0.959201187549288, + "grad_norm": 0.12895047664642334, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 248130 + }, + { + "epoch": 0.9592398447526712, + "grad_norm": 0.1110064834356308, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 248140 + }, + { + "epoch": 0.9592785019560545, + "grad_norm": 0.10294454544782639, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 248150 + }, + { + "epoch": 0.9593171591594377, + "grad_norm": 0.133493110537529, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 248160 + }, + { + "epoch": 0.9593558163628211, + "grad_norm": 0.11024216562509537, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 248170 + }, + { + "epoch": 0.9593944735662043, + "grad_norm": 0.11153551936149597, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 248180 + }, + { + "epoch": 0.9594331307695876, + "grad_norm": 0.11546822637319565, + "learning_rate": 0.002, + "loss": 2.328, + "step": 248190 + }, + { + "epoch": 0.9594717879729708, + "grad_norm": 0.10031704604625702, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 248200 + }, + { + "epoch": 0.9595104451763542, + "grad_norm": 0.10214558988809586, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 248210 + }, + { + "epoch": 0.9595491023797375, + "grad_norm": 0.11143733561038971, + "learning_rate": 0.002, + "loss": 2.324, + "step": 248220 + }, + { + "epoch": 0.9595877595831207, + "grad_norm": 0.10043030232191086, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 248230 + }, + { + "epoch": 0.959626416786504, + "grad_norm": 0.1303481161594391, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 248240 + }, + { + "epoch": 0.9596650739898873, + "grad_norm": 0.10182737559080124, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 248250 + }, + { + "epoch": 0.9597037311932706, + "grad_norm": 0.10109949111938477, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 248260 + }, + { + "epoch": 0.9597423883966538, + "grad_norm": 0.10766161978244781, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 248270 + }, + { + "epoch": 0.9597810456000371, + "grad_norm": 0.09698101133108139, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 248280 + }, + { + "epoch": 0.9598197028034204, + "grad_norm": 0.12188798189163208, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 248290 + }, + { + "epoch": 0.9598583600068037, + "grad_norm": 0.10488150268793106, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 248300 + }, + { + "epoch": 0.9598970172101869, + "grad_norm": 0.09532138705253601, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 248310 + }, + { + "epoch": 0.9599356744135702, + "grad_norm": 0.11081784963607788, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 248320 + }, + { + "epoch": 0.9599743316169536, + "grad_norm": 0.09944086521863937, + "learning_rate": 0.002, + "loss": 2.339, + "step": 248330 + }, + { + "epoch": 0.9600129888203368, + "grad_norm": 0.13181588053703308, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 248340 + }, + { + "epoch": 0.9600516460237201, + "grad_norm": 0.10362618416547775, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 248350 + }, + { + "epoch": 0.9600903032271033, + "grad_norm": 0.10724879801273346, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 248360 + }, + { + "epoch": 0.9601289604304866, + "grad_norm": 0.09819664061069489, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 248370 + }, + { + "epoch": 0.9601676176338699, + "grad_norm": 0.11974728852510452, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 248380 + }, + { + "epoch": 0.9602062748372532, + "grad_norm": 0.11084406077861786, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 248390 + }, + { + "epoch": 0.9602449320406364, + "grad_norm": 0.09880924224853516, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 248400 + }, + { + "epoch": 0.9602835892440197, + "grad_norm": 0.12196718156337738, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 248410 + }, + { + "epoch": 0.960322246447403, + "grad_norm": 0.12853261828422546, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 248420 + }, + { + "epoch": 0.9603609036507863, + "grad_norm": 0.10215596109628677, + "learning_rate": 0.002, + "loss": 2.333, + "step": 248430 + }, + { + "epoch": 0.9603995608541696, + "grad_norm": 0.12762248516082764, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 248440 + }, + { + "epoch": 0.9604382180575528, + "grad_norm": 0.10123594850301743, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 248450 + }, + { + "epoch": 0.9604768752609362, + "grad_norm": 0.12691077589988708, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 248460 + }, + { + "epoch": 0.9605155324643194, + "grad_norm": 0.09701501578092575, + "learning_rate": 0.002, + "loss": 2.351, + "step": 248470 + }, + { + "epoch": 0.9605541896677027, + "grad_norm": 0.12225864827632904, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 248480 + }, + { + "epoch": 0.9605928468710859, + "grad_norm": 0.10921717435121536, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 248490 + }, + { + "epoch": 0.9606315040744693, + "grad_norm": 0.11986319720745087, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 248500 + }, + { + "epoch": 0.9606701612778525, + "grad_norm": 0.09127210080623627, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 248510 + }, + { + "epoch": 0.9607088184812358, + "grad_norm": 0.13293032348155975, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 248520 + }, + { + "epoch": 0.960747475684619, + "grad_norm": 0.1585289090871811, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 248530 + }, + { + "epoch": 0.9607861328880023, + "grad_norm": 0.10031675547361374, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 248540 + }, + { + "epoch": 0.9608247900913857, + "grad_norm": 0.14133134484291077, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 248550 + }, + { + "epoch": 0.9608634472947689, + "grad_norm": 0.10595063120126724, + "learning_rate": 0.002, + "loss": 2.323, + "step": 248560 + }, + { + "epoch": 0.9609021044981522, + "grad_norm": 0.10794010013341904, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 248570 + }, + { + "epoch": 0.9609407617015354, + "grad_norm": 0.09237299859523773, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 248580 + }, + { + "epoch": 0.9609794189049188, + "grad_norm": 0.10003355890512466, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 248590 + }, + { + "epoch": 0.961018076108302, + "grad_norm": 0.11430327594280243, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 248600 + }, + { + "epoch": 0.9610567333116853, + "grad_norm": 0.09757383912801743, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 248610 + }, + { + "epoch": 0.9610953905150685, + "grad_norm": 0.11522988975048065, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 248620 + }, + { + "epoch": 0.9611340477184519, + "grad_norm": 0.09410349279642105, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 248630 + }, + { + "epoch": 0.9611727049218352, + "grad_norm": 0.11336854845285416, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 248640 + }, + { + "epoch": 0.9612113621252184, + "grad_norm": 0.1023351326584816, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 248650 + }, + { + "epoch": 0.9612500193286017, + "grad_norm": 0.11214461922645569, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 248660 + }, + { + "epoch": 0.961288676531985, + "grad_norm": 0.11730778217315674, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 248670 + }, + { + "epoch": 0.9613273337353683, + "grad_norm": 0.09071174263954163, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 248680 + }, + { + "epoch": 0.9613659909387515, + "grad_norm": 0.11421388387680054, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 248690 + }, + { + "epoch": 0.9614046481421348, + "grad_norm": 0.11856932193040848, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 248700 + }, + { + "epoch": 0.961443305345518, + "grad_norm": 0.10843708366155624, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 248710 + }, + { + "epoch": 0.9614819625489014, + "grad_norm": 0.1020749881863594, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 248720 + }, + { + "epoch": 0.9615206197522846, + "grad_norm": 0.10943154245615005, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 248730 + }, + { + "epoch": 0.9615592769556679, + "grad_norm": 0.13843214511871338, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 248740 + }, + { + "epoch": 0.9615979341590511, + "grad_norm": 0.1045226976275444, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 248750 + }, + { + "epoch": 0.9616365913624345, + "grad_norm": 0.1176600456237793, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 248760 + }, + { + "epoch": 0.9616752485658178, + "grad_norm": 0.10141972452402115, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 248770 + }, + { + "epoch": 0.961713905769201, + "grad_norm": 0.09370206296443939, + "learning_rate": 0.002, + "loss": 2.335, + "step": 248780 + }, + { + "epoch": 0.9617525629725843, + "grad_norm": 0.10429870337247849, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 248790 + }, + { + "epoch": 0.9617912201759676, + "grad_norm": 0.11017408221960068, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 248800 + }, + { + "epoch": 0.9618298773793509, + "grad_norm": 0.11044590920209885, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 248810 + }, + { + "epoch": 0.9618685345827341, + "grad_norm": 0.1153523325920105, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 248820 + }, + { + "epoch": 0.9619071917861174, + "grad_norm": 0.10669272392988205, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 248830 + }, + { + "epoch": 0.9619458489895008, + "grad_norm": 0.11303277313709259, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 248840 + }, + { + "epoch": 0.961984506192884, + "grad_norm": 0.0925389900803566, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 248850 + }, + { + "epoch": 0.9620231633962673, + "grad_norm": 0.10389024019241333, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 248860 + }, + { + "epoch": 0.9620618205996505, + "grad_norm": 0.10728508234024048, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 248870 + }, + { + "epoch": 0.9621004778030339, + "grad_norm": 0.12325850874185562, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 248880 + }, + { + "epoch": 0.9621391350064171, + "grad_norm": 0.10077033936977386, + "learning_rate": 0.002, + "loss": 2.336, + "step": 248890 + }, + { + "epoch": 0.9621777922098004, + "grad_norm": 0.10189589112997055, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 248900 + }, + { + "epoch": 0.9622164494131836, + "grad_norm": 0.10807923972606659, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 248910 + }, + { + "epoch": 0.9622551066165669, + "grad_norm": 0.10375122725963593, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 248920 + }, + { + "epoch": 0.9622937638199502, + "grad_norm": 0.09079185873270035, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 248930 + }, + { + "epoch": 0.9623324210233335, + "grad_norm": 0.10723765939474106, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 248940 + }, + { + "epoch": 0.9623710782267167, + "grad_norm": 0.11098438501358032, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 248950 + }, + { + "epoch": 0.9624097354301, + "grad_norm": 0.10970813035964966, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 248960 + }, + { + "epoch": 0.9624483926334834, + "grad_norm": 0.10687640309333801, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 248970 + }, + { + "epoch": 0.9624870498368666, + "grad_norm": 0.1009809747338295, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 248980 + }, + { + "epoch": 0.9625257070402499, + "grad_norm": 0.12127617001533508, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 248990 + }, + { + "epoch": 0.9625643642436331, + "grad_norm": 0.09800001233816147, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 249000 + }, + { + "epoch": 0.9626030214470165, + "grad_norm": 0.09789685159921646, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 249010 + }, + { + "epoch": 0.9626416786503997, + "grad_norm": 0.11263931542634964, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 249020 + }, + { + "epoch": 0.962680335853783, + "grad_norm": 0.12075403332710266, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 249030 + }, + { + "epoch": 0.9627189930571662, + "grad_norm": 0.10094056278467178, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 249040 + }, + { + "epoch": 0.9627576502605496, + "grad_norm": 0.10163767635822296, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 249050 + }, + { + "epoch": 0.9627963074639329, + "grad_norm": 0.09169764816761017, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 249060 + }, + { + "epoch": 0.9628349646673161, + "grad_norm": 0.11688553541898727, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 249070 + }, + { + "epoch": 0.9628736218706994, + "grad_norm": 0.11535628885030746, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 249080 + }, + { + "epoch": 0.9629122790740826, + "grad_norm": 0.11258542537689209, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 249090 + }, + { + "epoch": 0.962950936277466, + "grad_norm": 0.11126603186130524, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 249100 + }, + { + "epoch": 0.9629895934808492, + "grad_norm": 0.11631768196821213, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 249110 + }, + { + "epoch": 0.9630282506842325, + "grad_norm": 0.12675493955612183, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 249120 + }, + { + "epoch": 0.9630669078876157, + "grad_norm": 0.11257590353488922, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 249130 + }, + { + "epoch": 0.9631055650909991, + "grad_norm": 0.09850809723138809, + "learning_rate": 0.002, + "loss": 2.338, + "step": 249140 + }, + { + "epoch": 0.9631442222943823, + "grad_norm": 0.10485752671957016, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 249150 + }, + { + "epoch": 0.9631828794977656, + "grad_norm": 0.12969525158405304, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 249160 + }, + { + "epoch": 0.9632215367011489, + "grad_norm": 0.10446181148290634, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 249170 + }, + { + "epoch": 0.9632601939045322, + "grad_norm": 0.1030307188630104, + "learning_rate": 0.002, + "loss": 2.337, + "step": 249180 + }, + { + "epoch": 0.9632988511079155, + "grad_norm": 0.10285484790802002, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 249190 + }, + { + "epoch": 0.9633375083112987, + "grad_norm": 0.12455843389034271, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 249200 + }, + { + "epoch": 0.963376165514682, + "grad_norm": 0.11602117121219635, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 249210 + }, + { + "epoch": 0.9634148227180653, + "grad_norm": 0.12856896221637726, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 249220 + }, + { + "epoch": 0.9634534799214486, + "grad_norm": 0.09774929285049438, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 249230 + }, + { + "epoch": 0.9634921371248318, + "grad_norm": 0.13181985914707184, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 249240 + }, + { + "epoch": 0.9635307943282151, + "grad_norm": 0.10472414642572403, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 249250 + }, + { + "epoch": 0.9635694515315985, + "grad_norm": 0.09871625900268555, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 249260 + }, + { + "epoch": 0.9636081087349817, + "grad_norm": 0.0966063141822815, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 249270 + }, + { + "epoch": 0.963646765938365, + "grad_norm": 0.104548379778862, + "learning_rate": 0.002, + "loss": 2.322, + "step": 249280 + }, + { + "epoch": 0.9636854231417482, + "grad_norm": 0.09819988161325455, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 249290 + }, + { + "epoch": 0.9637240803451315, + "grad_norm": 0.10572008043527603, + "learning_rate": 0.002, + "loss": 2.357, + "step": 249300 + }, + { + "epoch": 0.9637627375485148, + "grad_norm": 0.1110527440905571, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 249310 + }, + { + "epoch": 0.9638013947518981, + "grad_norm": 0.11289434880018234, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 249320 + }, + { + "epoch": 0.9638400519552813, + "grad_norm": 0.11714992672204971, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 249330 + }, + { + "epoch": 0.9638787091586646, + "grad_norm": 0.10628482699394226, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 249340 + }, + { + "epoch": 0.963917366362048, + "grad_norm": 0.13175849616527557, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 249350 + }, + { + "epoch": 0.9639560235654312, + "grad_norm": 0.1098552718758583, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 249360 + }, + { + "epoch": 0.9639946807688144, + "grad_norm": 0.10685157775878906, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 249370 + }, + { + "epoch": 0.9640333379721977, + "grad_norm": 0.09267696738243103, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 249380 + }, + { + "epoch": 0.9640719951755811, + "grad_norm": 0.09387141466140747, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 249390 + }, + { + "epoch": 0.9641106523789643, + "grad_norm": 0.13935497403144836, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 249400 + }, + { + "epoch": 0.9641493095823476, + "grad_norm": 0.10094621777534485, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 249410 + }, + { + "epoch": 0.9641879667857308, + "grad_norm": 0.1260949969291687, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 249420 + }, + { + "epoch": 0.9642266239891142, + "grad_norm": 0.11990845948457718, + "learning_rate": 0.002, + "loss": 2.367, + "step": 249430 + }, + { + "epoch": 0.9642652811924974, + "grad_norm": 0.09575967490673065, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 249440 + }, + { + "epoch": 0.9643039383958807, + "grad_norm": 0.09636884927749634, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 249450 + }, + { + "epoch": 0.9643425955992639, + "grad_norm": 0.12748612463474274, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 249460 + }, + { + "epoch": 0.9643812528026472, + "grad_norm": 0.11946625262498856, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 249470 + }, + { + "epoch": 0.9644199100060306, + "grad_norm": 0.09697463363409042, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 249480 + }, + { + "epoch": 0.9644585672094138, + "grad_norm": 0.10112636536359787, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 249490 + }, + { + "epoch": 0.9644972244127971, + "grad_norm": 0.09348530322313309, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 249500 + }, + { + "epoch": 0.9645358816161803, + "grad_norm": 0.1333259642124176, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 249510 + }, + { + "epoch": 0.9645745388195637, + "grad_norm": 0.10638459026813507, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 249520 + }, + { + "epoch": 0.9646131960229469, + "grad_norm": 0.1124267429113388, + "learning_rate": 0.002, + "loss": 2.332, + "step": 249530 + }, + { + "epoch": 0.9646518532263302, + "grad_norm": 0.0883001908659935, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 249540 + }, + { + "epoch": 0.9646905104297134, + "grad_norm": 0.100906603038311, + "learning_rate": 0.002, + "loss": 2.335, + "step": 249550 + }, + { + "epoch": 0.9647291676330968, + "grad_norm": 0.1052638441324234, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 249560 + }, + { + "epoch": 0.96476782483648, + "grad_norm": 0.11165803670883179, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 249570 + }, + { + "epoch": 0.9648064820398633, + "grad_norm": 0.0883340910077095, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 249580 + }, + { + "epoch": 0.9648451392432466, + "grad_norm": 0.11354761570692062, + "learning_rate": 0.002, + "loss": 2.326, + "step": 249590 + }, + { + "epoch": 0.9648837964466299, + "grad_norm": 0.11444555968046188, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 249600 + }, + { + "epoch": 0.9649224536500132, + "grad_norm": 0.14036472141742706, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 249610 + }, + { + "epoch": 0.9649611108533964, + "grad_norm": 0.12895837426185608, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 249620 + }, + { + "epoch": 0.9649997680567797, + "grad_norm": 0.11023510992527008, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 249630 + }, + { + "epoch": 0.9650384252601629, + "grad_norm": 0.12974929809570312, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 249640 + }, + { + "epoch": 0.9650770824635463, + "grad_norm": 0.10311168432235718, + "learning_rate": 0.002, + "loss": 2.336, + "step": 249650 + }, + { + "epoch": 0.9651157396669295, + "grad_norm": 0.10950174182653427, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 249660 + }, + { + "epoch": 0.9651543968703128, + "grad_norm": 0.09690546244382858, + "learning_rate": 0.002, + "loss": 2.323, + "step": 249670 + }, + { + "epoch": 0.965193054073696, + "grad_norm": 0.12773922085762024, + "learning_rate": 0.002, + "loss": 2.323, + "step": 249680 + }, + { + "epoch": 0.9652317112770794, + "grad_norm": 0.09502622485160828, + "learning_rate": 0.002, + "loss": 2.327, + "step": 249690 + }, + { + "epoch": 0.9652703684804627, + "grad_norm": 0.15543179214000702, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 249700 + }, + { + "epoch": 0.9653090256838459, + "grad_norm": 0.11124072968959808, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 249710 + }, + { + "epoch": 0.9653476828872292, + "grad_norm": 0.09724342077970505, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 249720 + }, + { + "epoch": 0.9653863400906125, + "grad_norm": 0.08673544228076935, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 249730 + }, + { + "epoch": 0.9654249972939958, + "grad_norm": 0.11834105849266052, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 249740 + }, + { + "epoch": 0.965463654497379, + "grad_norm": 0.0999530702829361, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 249750 + }, + { + "epoch": 0.9655023117007623, + "grad_norm": 0.10077276080846786, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 249760 + }, + { + "epoch": 0.9655409689041456, + "grad_norm": 0.1093694195151329, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 249770 + }, + { + "epoch": 0.9655796261075289, + "grad_norm": 0.10006707161664963, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 249780 + }, + { + "epoch": 0.9656182833109122, + "grad_norm": 0.1079149842262268, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 249790 + }, + { + "epoch": 0.9656569405142954, + "grad_norm": 0.10379232466220856, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 249800 + }, + { + "epoch": 0.9656955977176788, + "grad_norm": 0.09677669405937195, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 249810 + }, + { + "epoch": 0.965734254921062, + "grad_norm": 0.09964921325445175, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 249820 + }, + { + "epoch": 0.9657729121244453, + "grad_norm": 0.09625493735074997, + "learning_rate": 0.002, + "loss": 2.34, + "step": 249830 + }, + { + "epoch": 0.9658115693278285, + "grad_norm": 0.09828057885169983, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 249840 + }, + { + "epoch": 0.9658502265312118, + "grad_norm": 0.10741102695465088, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 249850 + }, + { + "epoch": 0.9658888837345951, + "grad_norm": 0.12895941734313965, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 249860 + }, + { + "epoch": 0.9659275409379784, + "grad_norm": 0.08948575705289841, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 249870 + }, + { + "epoch": 0.9659661981413616, + "grad_norm": 0.10771122574806213, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 249880 + }, + { + "epoch": 0.9660048553447449, + "grad_norm": 0.113441102206707, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 249890 + }, + { + "epoch": 0.9660435125481283, + "grad_norm": 0.10112905502319336, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 249900 + }, + { + "epoch": 0.9660821697515115, + "grad_norm": 0.1073261946439743, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 249910 + }, + { + "epoch": 0.9661208269548948, + "grad_norm": 0.11671066284179688, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 249920 + }, + { + "epoch": 0.966159484158278, + "grad_norm": 0.12026667594909668, + "learning_rate": 0.002, + "loss": 2.356, + "step": 249930 + }, + { + "epoch": 0.9661981413616614, + "grad_norm": 0.10516425967216492, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 249940 + }, + { + "epoch": 0.9662367985650446, + "grad_norm": 0.1196758821606636, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 249950 + }, + { + "epoch": 0.9662754557684279, + "grad_norm": 0.11494259536266327, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 249960 + }, + { + "epoch": 0.9663141129718111, + "grad_norm": 0.1059424877166748, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 249970 + }, + { + "epoch": 0.9663527701751945, + "grad_norm": 0.09678887575864792, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 249980 + }, + { + "epoch": 0.9663914273785777, + "grad_norm": 0.0959455743432045, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 249990 + }, + { + "epoch": 0.966430084581961, + "grad_norm": 0.10656458884477615, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 250000 + }, + { + "epoch": 0.9664687417853443, + "grad_norm": 0.09966926276683807, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 250010 + }, + { + "epoch": 0.9665073989887275, + "grad_norm": 0.10970748960971832, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 250020 + }, + { + "epoch": 0.9665460561921109, + "grad_norm": 0.09668340533971786, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 250030 + }, + { + "epoch": 0.9665847133954941, + "grad_norm": 0.13082824647426605, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 250040 + }, + { + "epoch": 0.9666233705988774, + "grad_norm": 0.13502457737922668, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 250050 + }, + { + "epoch": 0.9666620278022606, + "grad_norm": 0.088272824883461, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 250060 + }, + { + "epoch": 0.966700685005644, + "grad_norm": 0.09272816777229309, + "learning_rate": 0.002, + "loss": 2.336, + "step": 250070 + }, + { + "epoch": 0.9667393422090272, + "grad_norm": 0.09801635891199112, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 250080 + }, + { + "epoch": 0.9667779994124105, + "grad_norm": 0.10866446048021317, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 250090 + }, + { + "epoch": 0.9668166566157937, + "grad_norm": 0.09416583180427551, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 250100 + }, + { + "epoch": 0.9668553138191771, + "grad_norm": 0.12312600016593933, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 250110 + }, + { + "epoch": 0.9668939710225604, + "grad_norm": 0.09534309059381485, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 250120 + }, + { + "epoch": 0.9669326282259436, + "grad_norm": 0.11319991201162338, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 250130 + }, + { + "epoch": 0.9669712854293269, + "grad_norm": 0.10820036381483078, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 250140 + }, + { + "epoch": 0.9670099426327102, + "grad_norm": 0.09464868903160095, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 250150 + }, + { + "epoch": 0.9670485998360935, + "grad_norm": 0.09593628346920013, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 250160 + }, + { + "epoch": 0.9670872570394767, + "grad_norm": 0.09920892864465714, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 250170 + }, + { + "epoch": 0.96712591424286, + "grad_norm": 0.10005899518728256, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 250180 + }, + { + "epoch": 0.9671645714462432, + "grad_norm": 0.11125802993774414, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 250190 + }, + { + "epoch": 0.9672032286496266, + "grad_norm": 0.09977997839450836, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 250200 + }, + { + "epoch": 0.9672418858530099, + "grad_norm": 0.1008404940366745, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 250210 + }, + { + "epoch": 0.9672805430563931, + "grad_norm": 0.1824817657470703, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 250220 + }, + { + "epoch": 0.9673192002597764, + "grad_norm": 0.11212398111820221, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 250230 + }, + { + "epoch": 0.9673578574631597, + "grad_norm": 0.10903162509202957, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 250240 + }, + { + "epoch": 0.967396514666543, + "grad_norm": 0.09185224771499634, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 250250 + }, + { + "epoch": 0.9674351718699262, + "grad_norm": 0.09937603771686554, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 250260 + }, + { + "epoch": 0.9674738290733095, + "grad_norm": 0.11099566519260406, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 250270 + }, + { + "epoch": 0.9675124862766928, + "grad_norm": 0.09390152990818024, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 250280 + }, + { + "epoch": 0.9675511434800761, + "grad_norm": 0.10876850038766861, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 250290 + }, + { + "epoch": 0.9675898006834593, + "grad_norm": 0.12110339850187302, + "learning_rate": 0.002, + "loss": 2.344, + "step": 250300 + }, + { + "epoch": 0.9676284578868426, + "grad_norm": 0.10830269008874893, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 250310 + }, + { + "epoch": 0.967667115090226, + "grad_norm": 0.09625596553087234, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 250320 + }, + { + "epoch": 0.9677057722936092, + "grad_norm": 0.09408263117074966, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 250330 + }, + { + "epoch": 0.9677444294969925, + "grad_norm": 0.10909605026245117, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 250340 + }, + { + "epoch": 0.9677830867003757, + "grad_norm": 0.09753488004207611, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 250350 + }, + { + "epoch": 0.9678217439037591, + "grad_norm": 0.10700512677431107, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 250360 + }, + { + "epoch": 0.9678604011071423, + "grad_norm": 0.12809878587722778, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 250370 + }, + { + "epoch": 0.9678990583105256, + "grad_norm": 0.11302416771650314, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 250380 + }, + { + "epoch": 0.9679377155139088, + "grad_norm": 0.0878138393163681, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 250390 + }, + { + "epoch": 0.9679763727172921, + "grad_norm": 0.10237754881381989, + "learning_rate": 0.002, + "loss": 2.339, + "step": 250400 + }, + { + "epoch": 0.9680150299206755, + "grad_norm": 0.11656080931425095, + "learning_rate": 0.002, + "loss": 2.3111, + "step": 250410 + }, + { + "epoch": 0.9680536871240587, + "grad_norm": 0.11479943245649338, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 250420 + }, + { + "epoch": 0.968092344327442, + "grad_norm": 0.08994613587856293, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 250430 + }, + { + "epoch": 0.9681310015308252, + "grad_norm": 0.09658456593751907, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 250440 + }, + { + "epoch": 0.9681696587342086, + "grad_norm": 0.10116071999073029, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 250450 + }, + { + "epoch": 0.9682083159375918, + "grad_norm": 0.10644138604402542, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 250460 + }, + { + "epoch": 0.9682469731409751, + "grad_norm": 0.1095876395702362, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 250470 + }, + { + "epoch": 0.9682856303443583, + "grad_norm": 0.10697955638170242, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 250480 + }, + { + "epoch": 0.9683242875477417, + "grad_norm": 0.09780683368444443, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 250490 + }, + { + "epoch": 0.9683629447511249, + "grad_norm": 0.09849384427070618, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 250500 + }, + { + "epoch": 0.9684016019545082, + "grad_norm": 0.09895049780607224, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 250510 + }, + { + "epoch": 0.9684402591578914, + "grad_norm": 0.10031626373529434, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 250520 + }, + { + "epoch": 0.9684789163612748, + "grad_norm": 0.1167176216840744, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 250530 + }, + { + "epoch": 0.9685175735646581, + "grad_norm": 0.11091436445713043, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 250540 + }, + { + "epoch": 0.9685562307680413, + "grad_norm": 0.09511246532201767, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 250550 + }, + { + "epoch": 0.9685948879714246, + "grad_norm": 0.09637043625116348, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 250560 + }, + { + "epoch": 0.9686335451748078, + "grad_norm": 0.10570426285266876, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 250570 + }, + { + "epoch": 0.9686722023781912, + "grad_norm": 0.08653859794139862, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 250580 + }, + { + "epoch": 0.9687108595815744, + "grad_norm": 0.11135722696781158, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 250590 + }, + { + "epoch": 0.9687495167849577, + "grad_norm": 0.10831640660762787, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 250600 + }, + { + "epoch": 0.9687881739883409, + "grad_norm": 0.10700742900371552, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 250610 + }, + { + "epoch": 0.9688268311917243, + "grad_norm": 0.09919552505016327, + "learning_rate": 0.002, + "loss": 2.334, + "step": 250620 + }, + { + "epoch": 0.9688654883951076, + "grad_norm": 0.10234495252370834, + "learning_rate": 0.002, + "loss": 2.339, + "step": 250630 + }, + { + "epoch": 0.9689041455984908, + "grad_norm": 0.1608228236436844, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 250640 + }, + { + "epoch": 0.9689428028018741, + "grad_norm": 0.11038243770599365, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 250650 + }, + { + "epoch": 0.9689814600052574, + "grad_norm": 0.13908079266548157, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 250660 + }, + { + "epoch": 0.9690201172086407, + "grad_norm": 0.11118654906749725, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 250670 + }, + { + "epoch": 0.9690587744120239, + "grad_norm": 0.09143441915512085, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 250680 + }, + { + "epoch": 0.9690974316154072, + "grad_norm": 0.1052967831492424, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 250690 + }, + { + "epoch": 0.9691360888187905, + "grad_norm": 0.10974379628896713, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 250700 + }, + { + "epoch": 0.9691747460221738, + "grad_norm": 0.10102606564760208, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 250710 + }, + { + "epoch": 0.969213403225557, + "grad_norm": 0.10798647254705429, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 250720 + }, + { + "epoch": 0.9692520604289403, + "grad_norm": 0.10051941126585007, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 250730 + }, + { + "epoch": 0.9692907176323237, + "grad_norm": 0.10196227580308914, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 250740 + }, + { + "epoch": 0.9693293748357069, + "grad_norm": 0.11750812828540802, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 250750 + }, + { + "epoch": 0.9693680320390902, + "grad_norm": 0.11583947390317917, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 250760 + }, + { + "epoch": 0.9694066892424734, + "grad_norm": 0.10498038679361343, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 250770 + }, + { + "epoch": 0.9694453464458567, + "grad_norm": 0.10385581851005554, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 250780 + }, + { + "epoch": 0.96948400364924, + "grad_norm": 0.10725849866867065, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 250790 + }, + { + "epoch": 0.9695226608526233, + "grad_norm": 0.10024876892566681, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 250800 + }, + { + "epoch": 0.9695613180560065, + "grad_norm": 0.11648895591497421, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 250810 + }, + { + "epoch": 0.9695999752593898, + "grad_norm": 0.11542441695928574, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 250820 + }, + { + "epoch": 0.9696386324627732, + "grad_norm": 0.09200196713209152, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 250830 + }, + { + "epoch": 0.9696772896661564, + "grad_norm": 0.09737569838762283, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 250840 + }, + { + "epoch": 0.9697159468695397, + "grad_norm": 0.10656151175498962, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 250850 + }, + { + "epoch": 0.9697546040729229, + "grad_norm": 0.10151994228363037, + "learning_rate": 0.002, + "loss": 2.333, + "step": 250860 + }, + { + "epoch": 0.9697932612763063, + "grad_norm": 0.09500584006309509, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 250870 + }, + { + "epoch": 0.9698319184796895, + "grad_norm": 0.10878094285726547, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 250880 + }, + { + "epoch": 0.9698705756830728, + "grad_norm": 0.12469379603862762, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 250890 + }, + { + "epoch": 0.969909232886456, + "grad_norm": 0.09622928500175476, + "learning_rate": 0.002, + "loss": 2.343, + "step": 250900 + }, + { + "epoch": 0.9699478900898394, + "grad_norm": 0.10867168009281158, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 250910 + }, + { + "epoch": 0.9699865472932226, + "grad_norm": 0.10699144005775452, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 250920 + }, + { + "epoch": 0.9700252044966059, + "grad_norm": 0.11127132177352905, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 250930 + }, + { + "epoch": 0.9700638616999891, + "grad_norm": 0.09256624430418015, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 250940 + }, + { + "epoch": 0.9701025189033724, + "grad_norm": 0.10343354195356369, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 250950 + }, + { + "epoch": 0.9701411761067558, + "grad_norm": 0.1389748454093933, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 250960 + }, + { + "epoch": 0.970179833310139, + "grad_norm": 0.11664845794439316, + "learning_rate": 0.002, + "loss": 2.335, + "step": 250970 + }, + { + "epoch": 0.9702184905135223, + "grad_norm": 0.11075331270694733, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 250980 + }, + { + "epoch": 0.9702571477169055, + "grad_norm": 0.10614953190088272, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 250990 + }, + { + "epoch": 0.9702958049202889, + "grad_norm": 0.08824150264263153, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 251000 + }, + { + "epoch": 0.9703344621236721, + "grad_norm": 0.10004694759845734, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 251010 + }, + { + "epoch": 0.9703731193270554, + "grad_norm": 0.12179727107286453, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 251020 + }, + { + "epoch": 0.9704117765304386, + "grad_norm": 0.11327700316905975, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 251030 + }, + { + "epoch": 0.970450433733822, + "grad_norm": 0.09601608663797379, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 251040 + }, + { + "epoch": 0.9704890909372053, + "grad_norm": 0.12900158762931824, + "learning_rate": 0.002, + "loss": 2.3156, + "step": 251050 + }, + { + "epoch": 0.9705277481405885, + "grad_norm": 0.0974268764257431, + "learning_rate": 0.002, + "loss": 2.343, + "step": 251060 + }, + { + "epoch": 0.9705664053439718, + "grad_norm": 0.12711255252361298, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 251070 + }, + { + "epoch": 0.9706050625473551, + "grad_norm": 0.1104290634393692, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 251080 + }, + { + "epoch": 0.9706437197507384, + "grad_norm": 0.11057502776384354, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 251090 + }, + { + "epoch": 0.9706823769541216, + "grad_norm": 0.12084945291280746, + "learning_rate": 0.002, + "loss": 2.351, + "step": 251100 + }, + { + "epoch": 0.9707210341575049, + "grad_norm": 0.11005719751119614, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 251110 + }, + { + "epoch": 0.9707596913608881, + "grad_norm": 0.10544893890619278, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 251120 + }, + { + "epoch": 0.9707983485642715, + "grad_norm": 0.1027940958738327, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 251130 + }, + { + "epoch": 0.9708370057676547, + "grad_norm": 0.09388290345668793, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 251140 + }, + { + "epoch": 0.970875662971038, + "grad_norm": 0.15099789202213287, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 251150 + }, + { + "epoch": 0.9709143201744213, + "grad_norm": 0.11640970408916473, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 251160 + }, + { + "epoch": 0.9709529773778046, + "grad_norm": 0.10199052095413208, + "learning_rate": 0.002, + "loss": 2.336, + "step": 251170 + }, + { + "epoch": 0.9709916345811879, + "grad_norm": 0.10691885650157928, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 251180 + }, + { + "epoch": 0.9710302917845711, + "grad_norm": 0.11014623939990997, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 251190 + }, + { + "epoch": 0.9710689489879544, + "grad_norm": 0.10485488176345825, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 251200 + }, + { + "epoch": 0.9711076061913377, + "grad_norm": 0.10701548308134079, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 251210 + }, + { + "epoch": 0.971146263394721, + "grad_norm": 0.10561732202768326, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 251220 + }, + { + "epoch": 0.9711849205981042, + "grad_norm": 0.11018940061330795, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 251230 + }, + { + "epoch": 0.9712235778014875, + "grad_norm": 0.1253640353679657, + "learning_rate": 0.002, + "loss": 2.3091, + "step": 251240 + }, + { + "epoch": 0.9712622350048709, + "grad_norm": 0.10480239987373352, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 251250 + }, + { + "epoch": 0.9713008922082541, + "grad_norm": 0.10991910099983215, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 251260 + }, + { + "epoch": 0.9713395494116374, + "grad_norm": 0.10583469271659851, + "learning_rate": 0.002, + "loss": 2.335, + "step": 251270 + }, + { + "epoch": 0.9713782066150206, + "grad_norm": 0.09267975389957428, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 251280 + }, + { + "epoch": 0.971416863818404, + "grad_norm": 0.10932402312755585, + "learning_rate": 0.002, + "loss": 2.331, + "step": 251290 + }, + { + "epoch": 0.9714555210217872, + "grad_norm": 0.1009744331240654, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 251300 + }, + { + "epoch": 0.9714941782251705, + "grad_norm": 0.12196720391511917, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 251310 + }, + { + "epoch": 0.9715328354285537, + "grad_norm": 0.10302551090717316, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 251320 + }, + { + "epoch": 0.971571492631937, + "grad_norm": 0.10102386027574539, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 251330 + }, + { + "epoch": 0.9716101498353203, + "grad_norm": 0.08784213662147522, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 251340 + }, + { + "epoch": 0.9716488070387036, + "grad_norm": 0.10013749450445175, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 251350 + }, + { + "epoch": 0.9716874642420869, + "grad_norm": 0.1117543876171112, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 251360 + }, + { + "epoch": 0.9717261214454701, + "grad_norm": 0.10328752547502518, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 251370 + }, + { + "epoch": 0.9717647786488535, + "grad_norm": 0.0927998349070549, + "learning_rate": 0.002, + "loss": 2.346, + "step": 251380 + }, + { + "epoch": 0.9718034358522367, + "grad_norm": 0.10032209008932114, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 251390 + }, + { + "epoch": 0.97184209305562, + "grad_norm": 0.10912362486124039, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 251400 + }, + { + "epoch": 0.9718807502590032, + "grad_norm": 0.11461266875267029, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 251410 + }, + { + "epoch": 0.9719194074623866, + "grad_norm": 0.12664832174777985, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 251420 + }, + { + "epoch": 0.9719580646657698, + "grad_norm": 0.14733320474624634, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 251430 + }, + { + "epoch": 0.9719967218691531, + "grad_norm": 0.11037435382604599, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 251440 + }, + { + "epoch": 0.9720353790725363, + "grad_norm": 0.10550345480442047, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 251450 + }, + { + "epoch": 0.9720740362759197, + "grad_norm": 0.11252820491790771, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 251460 + }, + { + "epoch": 0.972112693479303, + "grad_norm": 0.1101410761475563, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 251470 + }, + { + "epoch": 0.9721513506826862, + "grad_norm": 0.10230425000190735, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 251480 + }, + { + "epoch": 0.9721900078860695, + "grad_norm": 0.09026966243982315, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 251490 + }, + { + "epoch": 0.9722286650894527, + "grad_norm": 0.11959964781999588, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 251500 + }, + { + "epoch": 0.9722673222928361, + "grad_norm": 0.10887648910284042, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 251510 + }, + { + "epoch": 0.9723059794962193, + "grad_norm": 0.11334493011236191, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 251520 + }, + { + "epoch": 0.9723446366996026, + "grad_norm": 0.10531239211559296, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 251530 + }, + { + "epoch": 0.9723832939029858, + "grad_norm": 0.09486491978168488, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 251540 + }, + { + "epoch": 0.9724219511063692, + "grad_norm": 0.10903091728687286, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 251550 + }, + { + "epoch": 0.9724606083097525, + "grad_norm": 0.13235808908939362, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 251560 + }, + { + "epoch": 0.9724992655131357, + "grad_norm": 0.0974523052573204, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 251570 + }, + { + "epoch": 0.972537922716519, + "grad_norm": 0.1222124919295311, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 251580 + }, + { + "epoch": 0.9725765799199023, + "grad_norm": 0.09988315403461456, + "learning_rate": 0.002, + "loss": 2.348, + "step": 251590 + }, + { + "epoch": 0.9726152371232856, + "grad_norm": 0.10034121572971344, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 251600 + }, + { + "epoch": 0.9726538943266688, + "grad_norm": 0.12333368510007858, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 251610 + }, + { + "epoch": 0.9726925515300521, + "grad_norm": 0.10110745579004288, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 251620 + }, + { + "epoch": 0.9727312087334354, + "grad_norm": 0.10473684966564178, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 251630 + }, + { + "epoch": 0.9727698659368187, + "grad_norm": 0.10513075441122055, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 251640 + }, + { + "epoch": 0.9728085231402019, + "grad_norm": 0.09620826691389084, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 251650 + }, + { + "epoch": 0.9728471803435852, + "grad_norm": 0.12485495954751968, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 251660 + }, + { + "epoch": 0.9728858375469686, + "grad_norm": 0.10933785885572433, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 251670 + }, + { + "epoch": 0.9729244947503518, + "grad_norm": 0.1092258170247078, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 251680 + }, + { + "epoch": 0.9729631519537351, + "grad_norm": 0.1023181825876236, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 251690 + }, + { + "epoch": 0.9730018091571183, + "grad_norm": 0.1524294763803482, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 251700 + }, + { + "epoch": 0.9730404663605016, + "grad_norm": 0.10203384608030319, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 251710 + }, + { + "epoch": 0.9730791235638849, + "grad_norm": 0.10223396122455597, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 251720 + }, + { + "epoch": 0.9731177807672682, + "grad_norm": 0.11579997092485428, + "learning_rate": 0.002, + "loss": 2.338, + "step": 251730 + }, + { + "epoch": 0.9731564379706514, + "grad_norm": 0.09853821992874146, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 251740 + }, + { + "epoch": 0.9731950951740347, + "grad_norm": 0.10674675554037094, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 251750 + }, + { + "epoch": 0.973233752377418, + "grad_norm": 0.12423861771821976, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 251760 + }, + { + "epoch": 0.9732724095808013, + "grad_norm": 0.09702294319868088, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 251770 + }, + { + "epoch": 0.9733110667841846, + "grad_norm": 0.10267679393291473, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 251780 + }, + { + "epoch": 0.9733497239875678, + "grad_norm": 0.0920570120215416, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 251790 + }, + { + "epoch": 0.9733883811909512, + "grad_norm": 0.1116960197687149, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 251800 + }, + { + "epoch": 0.9734270383943344, + "grad_norm": 0.10197541862726212, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 251810 + }, + { + "epoch": 0.9734656955977177, + "grad_norm": 0.11533726006746292, + "learning_rate": 0.002, + "loss": 2.338, + "step": 251820 + }, + { + "epoch": 0.9735043528011009, + "grad_norm": 0.09482559561729431, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 251830 + }, + { + "epoch": 0.9735430100044843, + "grad_norm": 0.09979456663131714, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 251840 + }, + { + "epoch": 0.9735816672078675, + "grad_norm": 0.11372046917676926, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 251850 + }, + { + "epoch": 0.9736203244112508, + "grad_norm": 0.11386499553918839, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 251860 + }, + { + "epoch": 0.973658981614634, + "grad_norm": 0.09593381732702255, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 251870 + }, + { + "epoch": 0.9736976388180173, + "grad_norm": 0.10269182920455933, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 251880 + }, + { + "epoch": 0.9737362960214007, + "grad_norm": 0.12904858589172363, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 251890 + }, + { + "epoch": 0.9737749532247839, + "grad_norm": 0.11794731020927429, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 251900 + }, + { + "epoch": 0.9738136104281672, + "grad_norm": 0.11491457372903824, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 251910 + }, + { + "epoch": 0.9738522676315504, + "grad_norm": 0.09406273812055588, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 251920 + }, + { + "epoch": 0.9738909248349338, + "grad_norm": 0.12121071666479111, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 251930 + }, + { + "epoch": 0.973929582038317, + "grad_norm": 0.12540626525878906, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 251940 + }, + { + "epoch": 0.9739682392417003, + "grad_norm": 0.09689386188983917, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 251950 + }, + { + "epoch": 0.9740068964450835, + "grad_norm": 0.1502695381641388, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 251960 + }, + { + "epoch": 0.9740455536484669, + "grad_norm": 0.0994613990187645, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 251970 + }, + { + "epoch": 0.9740842108518502, + "grad_norm": 0.10326496511697769, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 251980 + }, + { + "epoch": 0.9741228680552334, + "grad_norm": 0.09585017710924149, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 251990 + }, + { + "epoch": 0.9741615252586167, + "grad_norm": 0.09650801867246628, + "learning_rate": 0.002, + "loss": 2.3131, + "step": 252000 + }, + { + "epoch": 0.974200182462, + "grad_norm": 0.11647900938987732, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 252010 + }, + { + "epoch": 0.9742388396653833, + "grad_norm": 0.12022756040096283, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 252020 + }, + { + "epoch": 0.9742774968687665, + "grad_norm": 0.11134672164916992, + "learning_rate": 0.002, + "loss": 2.3143, + "step": 252030 + }, + { + "epoch": 0.9743161540721498, + "grad_norm": 0.09073596447706223, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 252040 + }, + { + "epoch": 0.974354811275533, + "grad_norm": 0.09896473586559296, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 252050 + }, + { + "epoch": 0.9743934684789164, + "grad_norm": 0.12702728807926178, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 252060 + }, + { + "epoch": 0.9744321256822996, + "grad_norm": 0.09636173397302628, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 252070 + }, + { + "epoch": 0.9744707828856829, + "grad_norm": 0.09168794006109238, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 252080 + }, + { + "epoch": 0.9745094400890661, + "grad_norm": 0.36897939443588257, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 252090 + }, + { + "epoch": 0.9745480972924495, + "grad_norm": 0.11859555542469025, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 252100 + }, + { + "epoch": 0.9745867544958328, + "grad_norm": 0.11081647127866745, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 252110 + }, + { + "epoch": 0.974625411699216, + "grad_norm": 0.1135776937007904, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 252120 + }, + { + "epoch": 0.9746640689025993, + "grad_norm": 0.09395918995141983, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 252130 + }, + { + "epoch": 0.9747027261059826, + "grad_norm": 0.11018582433462143, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 252140 + }, + { + "epoch": 0.9747413833093659, + "grad_norm": 0.09823351353406906, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 252150 + }, + { + "epoch": 0.9747800405127491, + "grad_norm": 0.10773885250091553, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 252160 + }, + { + "epoch": 0.9748186977161324, + "grad_norm": 0.1168753057718277, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 252170 + }, + { + "epoch": 0.9748573549195158, + "grad_norm": 0.18037018179893494, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 252180 + }, + { + "epoch": 0.974896012122899, + "grad_norm": 0.11588846147060394, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 252190 + }, + { + "epoch": 0.9749346693262823, + "grad_norm": 0.10509663075208664, + "learning_rate": 0.002, + "loss": 2.344, + "step": 252200 + }, + { + "epoch": 0.9749733265296655, + "grad_norm": 0.10183600336313248, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 252210 + }, + { + "epoch": 0.9750119837330489, + "grad_norm": 0.10032964497804642, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 252220 + }, + { + "epoch": 0.9750506409364321, + "grad_norm": 0.11123006790876389, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 252230 + }, + { + "epoch": 0.9750892981398154, + "grad_norm": 0.09926677495241165, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 252240 + }, + { + "epoch": 0.9751279553431986, + "grad_norm": 0.1239267885684967, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 252250 + }, + { + "epoch": 0.9751666125465819, + "grad_norm": 0.08306685090065002, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 252260 + }, + { + "epoch": 0.9752052697499652, + "grad_norm": 0.11122148483991623, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 252270 + }, + { + "epoch": 0.9752439269533485, + "grad_norm": 0.10140670090913773, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 252280 + }, + { + "epoch": 0.9752825841567317, + "grad_norm": 0.09582722932100296, + "learning_rate": 0.002, + "loss": 2.336, + "step": 252290 + }, + { + "epoch": 0.975321241360115, + "grad_norm": 0.1066557765007019, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 252300 + }, + { + "epoch": 0.9753598985634984, + "grad_norm": 0.09405253827571869, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 252310 + }, + { + "epoch": 0.9753985557668816, + "grad_norm": 0.10288607329130173, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 252320 + }, + { + "epoch": 0.9754372129702649, + "grad_norm": 0.0957462415099144, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 252330 + }, + { + "epoch": 0.9754758701736481, + "grad_norm": 0.10872723907232285, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 252340 + }, + { + "epoch": 0.9755145273770315, + "grad_norm": 0.11865616589784622, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 252350 + }, + { + "epoch": 0.9755531845804147, + "grad_norm": 0.09794749319553375, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 252360 + }, + { + "epoch": 0.975591841783798, + "grad_norm": 0.10518945753574371, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 252370 + }, + { + "epoch": 0.9756304989871812, + "grad_norm": 0.1064748764038086, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 252380 + }, + { + "epoch": 0.9756691561905646, + "grad_norm": 0.10496281087398529, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 252390 + }, + { + "epoch": 0.9757078133939479, + "grad_norm": 0.14388880133628845, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 252400 + }, + { + "epoch": 0.9757464705973311, + "grad_norm": 0.10157719254493713, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 252410 + }, + { + "epoch": 0.9757851278007144, + "grad_norm": 0.1069631278514862, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 252420 + }, + { + "epoch": 0.9758237850040976, + "grad_norm": 0.09011048823595047, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 252430 + }, + { + "epoch": 0.975862442207481, + "grad_norm": 0.10953779518604279, + "learning_rate": 0.002, + "loss": 2.333, + "step": 252440 + }, + { + "epoch": 0.9759010994108642, + "grad_norm": 0.1189967542886734, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 252450 + }, + { + "epoch": 0.9759397566142475, + "grad_norm": 0.09564502537250519, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 252460 + }, + { + "epoch": 0.9759784138176307, + "grad_norm": 0.11470325291156769, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 252470 + }, + { + "epoch": 0.9760170710210141, + "grad_norm": 0.10929343849420547, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 252480 + }, + { + "epoch": 0.9760557282243973, + "grad_norm": 0.11872894316911697, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 252490 + }, + { + "epoch": 0.9760943854277806, + "grad_norm": 0.09920226037502289, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 252500 + }, + { + "epoch": 0.9761330426311638, + "grad_norm": 0.10016408562660217, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 252510 + }, + { + "epoch": 0.9761716998345472, + "grad_norm": 0.13373656570911407, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 252520 + }, + { + "epoch": 0.9762103570379305, + "grad_norm": 0.10544757544994354, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 252530 + }, + { + "epoch": 0.9762490142413137, + "grad_norm": 0.09542810916900635, + "learning_rate": 0.002, + "loss": 2.339, + "step": 252540 + }, + { + "epoch": 0.976287671444697, + "grad_norm": 0.09550932794809341, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 252550 + }, + { + "epoch": 0.9763263286480803, + "grad_norm": 0.10764776170253754, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 252560 + }, + { + "epoch": 0.9763649858514636, + "grad_norm": 0.11706431955099106, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 252570 + }, + { + "epoch": 0.9764036430548468, + "grad_norm": 0.09157240390777588, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 252580 + }, + { + "epoch": 0.9764423002582301, + "grad_norm": 0.09784223139286041, + "learning_rate": 0.002, + "loss": 2.338, + "step": 252590 + }, + { + "epoch": 0.9764809574616135, + "grad_norm": 0.10729385167360306, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 252600 + }, + { + "epoch": 0.9765196146649967, + "grad_norm": 0.11288652569055557, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 252610 + }, + { + "epoch": 0.97655827186838, + "grad_norm": 0.10649019479751587, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 252620 + }, + { + "epoch": 0.9765969290717632, + "grad_norm": 0.16062025725841522, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 252630 + }, + { + "epoch": 0.9766355862751465, + "grad_norm": 0.1069127693772316, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 252640 + }, + { + "epoch": 0.9766742434785298, + "grad_norm": 0.09770555049180984, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 252650 + }, + { + "epoch": 0.9767129006819131, + "grad_norm": 0.10639701783657074, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 252660 + }, + { + "epoch": 0.9767515578852963, + "grad_norm": 0.13143764436244965, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 252670 + }, + { + "epoch": 0.9767902150886796, + "grad_norm": 0.09408655762672424, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 252680 + }, + { + "epoch": 0.9768288722920629, + "grad_norm": 0.11855518072843552, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 252690 + }, + { + "epoch": 0.9768675294954462, + "grad_norm": 0.09519604593515396, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 252700 + }, + { + "epoch": 0.9769061866988294, + "grad_norm": 0.11027871817350388, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 252710 + }, + { + "epoch": 0.9769448439022127, + "grad_norm": 0.10735554248094559, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 252720 + }, + { + "epoch": 0.9769835011055961, + "grad_norm": 0.09857560694217682, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 252730 + }, + { + "epoch": 0.9770221583089793, + "grad_norm": 0.09966576844453812, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 252740 + }, + { + "epoch": 0.9770608155123626, + "grad_norm": 0.12802031636238098, + "learning_rate": 0.002, + "loss": 2.347, + "step": 252750 + }, + { + "epoch": 0.9770994727157458, + "grad_norm": 0.11255703121423721, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 252760 + }, + { + "epoch": 0.9771381299191292, + "grad_norm": 0.10530129820108414, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 252770 + }, + { + "epoch": 0.9771767871225124, + "grad_norm": 0.10851936787366867, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 252780 + }, + { + "epoch": 0.9772154443258957, + "grad_norm": 0.09338781982660294, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 252790 + }, + { + "epoch": 0.9772541015292789, + "grad_norm": 0.10688374191522598, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 252800 + }, + { + "epoch": 0.9772927587326622, + "grad_norm": 0.10073661059141159, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 252810 + }, + { + "epoch": 0.9773314159360456, + "grad_norm": 0.10849092900753021, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 252820 + }, + { + "epoch": 0.9773700731394288, + "grad_norm": 0.10362280160188675, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 252830 + }, + { + "epoch": 0.9774087303428121, + "grad_norm": 0.10138023644685745, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 252840 + }, + { + "epoch": 0.9774473875461953, + "grad_norm": 0.12903210520744324, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 252850 + }, + { + "epoch": 0.9774860447495787, + "grad_norm": 0.11094959825277328, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 252860 + }, + { + "epoch": 0.9775247019529619, + "grad_norm": 0.10609157383441925, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 252870 + }, + { + "epoch": 0.9775633591563452, + "grad_norm": 0.1097332313656807, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 252880 + }, + { + "epoch": 0.9776020163597284, + "grad_norm": 0.11047670245170593, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 252890 + }, + { + "epoch": 0.9776406735631118, + "grad_norm": 0.1258823722600937, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 252900 + }, + { + "epoch": 0.977679330766495, + "grad_norm": 0.0985330194234848, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 252910 + }, + { + "epoch": 0.9777179879698783, + "grad_norm": 0.10980021208524704, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 252920 + }, + { + "epoch": 0.9777566451732616, + "grad_norm": 0.11983881890773773, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 252930 + }, + { + "epoch": 0.9777953023766449, + "grad_norm": 0.10731696337461472, + "learning_rate": 0.002, + "loss": 2.349, + "step": 252940 + }, + { + "epoch": 0.9778339595800282, + "grad_norm": 0.10070104151964188, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 252950 + }, + { + "epoch": 0.9778726167834114, + "grad_norm": 0.11113781481981277, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 252960 + }, + { + "epoch": 0.9779112739867947, + "grad_norm": 0.09822292625904083, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 252970 + }, + { + "epoch": 0.9779499311901779, + "grad_norm": 0.101972296833992, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 252980 + }, + { + "epoch": 0.9779885883935613, + "grad_norm": 0.12044399976730347, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 252990 + }, + { + "epoch": 0.9780272455969445, + "grad_norm": 0.13268974423408508, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 253000 + }, + { + "epoch": 0.9780659028003278, + "grad_norm": 0.10569026321172714, + "learning_rate": 0.002, + "loss": 2.344, + "step": 253010 + }, + { + "epoch": 0.978104560003711, + "grad_norm": 0.10113076865673065, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 253020 + }, + { + "epoch": 0.9781432172070944, + "grad_norm": 0.09855961054563522, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 253030 + }, + { + "epoch": 0.9781818744104777, + "grad_norm": 0.0978899747133255, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 253040 + }, + { + "epoch": 0.9782205316138609, + "grad_norm": 0.10016301274299622, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 253050 + }, + { + "epoch": 0.9782591888172442, + "grad_norm": 0.11159918457269669, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 253060 + }, + { + "epoch": 0.9782978460206275, + "grad_norm": 0.09567983448505402, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 253070 + }, + { + "epoch": 0.9783365032240108, + "grad_norm": 0.12280935794115067, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 253080 + }, + { + "epoch": 0.978375160427394, + "grad_norm": 0.11773037165403366, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 253090 + }, + { + "epoch": 0.9784138176307773, + "grad_norm": 0.11330371350049973, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 253100 + }, + { + "epoch": 0.9784524748341606, + "grad_norm": 0.1054389551281929, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 253110 + }, + { + "epoch": 0.9784911320375439, + "grad_norm": 0.10864686965942383, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 253120 + }, + { + "epoch": 0.9785297892409272, + "grad_norm": 0.09636160731315613, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 253130 + }, + { + "epoch": 0.9785684464443104, + "grad_norm": 0.11868980526924133, + "learning_rate": 0.002, + "loss": 2.351, + "step": 253140 + }, + { + "epoch": 0.9786071036476938, + "grad_norm": 0.11431093513965607, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 253150 + }, + { + "epoch": 0.978645760851077, + "grad_norm": 0.0956217423081398, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 253160 + }, + { + "epoch": 0.9786844180544603, + "grad_norm": 0.11623834073543549, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 253170 + }, + { + "epoch": 0.9787230752578435, + "grad_norm": 0.10607807338237762, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 253180 + }, + { + "epoch": 0.9787617324612268, + "grad_norm": 0.10642440617084503, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 253190 + }, + { + "epoch": 0.9788003896646101, + "grad_norm": 0.11321180313825607, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 253200 + }, + { + "epoch": 0.9788390468679934, + "grad_norm": 0.1318560391664505, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 253210 + }, + { + "epoch": 0.9788777040713766, + "grad_norm": 0.10777322947978973, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 253220 + }, + { + "epoch": 0.9789163612747599, + "grad_norm": 0.10849911719560623, + "learning_rate": 0.002, + "loss": 2.324, + "step": 253230 + }, + { + "epoch": 0.9789550184781433, + "grad_norm": 0.09989572316408157, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 253240 + }, + { + "epoch": 0.9789936756815265, + "grad_norm": 0.10110043734312057, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 253250 + }, + { + "epoch": 0.9790323328849098, + "grad_norm": 0.13030365109443665, + "learning_rate": 0.002, + "loss": 2.351, + "step": 253260 + }, + { + "epoch": 0.979070990088293, + "grad_norm": 0.09885247051715851, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 253270 + }, + { + "epoch": 0.9791096472916764, + "grad_norm": 0.0909193828701973, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 253280 + }, + { + "epoch": 0.9791483044950596, + "grad_norm": 0.10934343189001083, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 253290 + }, + { + "epoch": 0.9791869616984429, + "grad_norm": 0.11860756576061249, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 253300 + }, + { + "epoch": 0.9792256189018261, + "grad_norm": 0.09530835598707199, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 253310 + }, + { + "epoch": 0.9792642761052095, + "grad_norm": 0.1112305223941803, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 253320 + }, + { + "epoch": 0.9793029333085927, + "grad_norm": 0.09348230808973312, + "learning_rate": 0.002, + "loss": 2.34, + "step": 253330 + }, + { + "epoch": 0.979341590511976, + "grad_norm": 0.14365029335021973, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 253340 + }, + { + "epoch": 0.9793802477153593, + "grad_norm": 0.19712699949741364, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 253350 + }, + { + "epoch": 0.9794189049187425, + "grad_norm": 0.11766904592514038, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 253360 + }, + { + "epoch": 0.9794575621221259, + "grad_norm": 0.11091282218694687, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 253370 + }, + { + "epoch": 0.9794962193255091, + "grad_norm": 0.09966139495372772, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 253380 + }, + { + "epoch": 0.9795348765288924, + "grad_norm": 0.10028279572725296, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 253390 + }, + { + "epoch": 0.9795735337322756, + "grad_norm": 0.10175572335720062, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 253400 + }, + { + "epoch": 0.979612190935659, + "grad_norm": 0.12000759690999985, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 253410 + }, + { + "epoch": 0.9796508481390422, + "grad_norm": 0.10719025880098343, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 253420 + }, + { + "epoch": 0.9796895053424255, + "grad_norm": 0.10947318375110626, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 253430 + }, + { + "epoch": 0.9797281625458087, + "grad_norm": 0.09750618785619736, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 253440 + }, + { + "epoch": 0.9797668197491921, + "grad_norm": 0.12034052610397339, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 253450 + }, + { + "epoch": 0.9798054769525754, + "grad_norm": 0.10206147283315659, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 253460 + }, + { + "epoch": 0.9798441341559586, + "grad_norm": 0.10553114116191864, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 253470 + }, + { + "epoch": 0.9798827913593419, + "grad_norm": 0.10624590516090393, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 253480 + }, + { + "epoch": 0.9799214485627252, + "grad_norm": 0.09161072969436646, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 253490 + }, + { + "epoch": 0.9799601057661085, + "grad_norm": 0.09625286608934402, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 253500 + }, + { + "epoch": 0.9799987629694917, + "grad_norm": 0.10236509889364243, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 253510 + }, + { + "epoch": 0.980037420172875, + "grad_norm": 0.09792248904705048, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 253520 + }, + { + "epoch": 0.9800760773762582, + "grad_norm": 0.10096298903226852, + "learning_rate": 0.002, + "loss": 2.349, + "step": 253530 + }, + { + "epoch": 0.9801147345796416, + "grad_norm": 0.10640841722488403, + "learning_rate": 0.002, + "loss": 2.325, + "step": 253540 + }, + { + "epoch": 0.9801533917830249, + "grad_norm": 0.10135815292596817, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 253550 + }, + { + "epoch": 0.9801920489864081, + "grad_norm": 0.11531524360179901, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 253560 + }, + { + "epoch": 0.9802307061897914, + "grad_norm": 0.0986768901348114, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 253570 + }, + { + "epoch": 0.9802693633931747, + "grad_norm": 0.11826640367507935, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 253580 + }, + { + "epoch": 0.980308020596558, + "grad_norm": 0.11659346520900726, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 253590 + }, + { + "epoch": 0.9803466777999412, + "grad_norm": 0.08888128399848938, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 253600 + }, + { + "epoch": 0.9803853350033245, + "grad_norm": 0.11499710381031036, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 253610 + }, + { + "epoch": 0.9804239922067078, + "grad_norm": 0.11001802980899811, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 253620 + }, + { + "epoch": 0.9804626494100911, + "grad_norm": 0.33217594027519226, + "learning_rate": 0.002, + "loss": 2.321, + "step": 253630 + }, + { + "epoch": 0.9805013066134743, + "grad_norm": 0.11237010359764099, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 253640 + }, + { + "epoch": 0.9805399638168576, + "grad_norm": 0.0909418836236, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 253650 + }, + { + "epoch": 0.980578621020241, + "grad_norm": 0.10539261251688004, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 253660 + }, + { + "epoch": 0.9806172782236242, + "grad_norm": 0.10350189357995987, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 253670 + }, + { + "epoch": 0.9806559354270075, + "grad_norm": 0.1357734650373459, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 253680 + }, + { + "epoch": 0.9806945926303907, + "grad_norm": 0.09479259699583054, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 253690 + }, + { + "epoch": 0.9807332498337741, + "grad_norm": 0.1024928092956543, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 253700 + }, + { + "epoch": 0.9807719070371573, + "grad_norm": 0.11998184770345688, + "learning_rate": 0.002, + "loss": 2.335, + "step": 253710 + }, + { + "epoch": 0.9808105642405406, + "grad_norm": 0.09331895411014557, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 253720 + }, + { + "epoch": 0.9808492214439238, + "grad_norm": 0.11315015703439713, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 253730 + }, + { + "epoch": 0.9808878786473071, + "grad_norm": 0.09995149821043015, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 253740 + }, + { + "epoch": 0.9809265358506905, + "grad_norm": 0.1109841987490654, + "learning_rate": 0.002, + "loss": 2.32, + "step": 253750 + }, + { + "epoch": 0.9809651930540737, + "grad_norm": 0.0948142409324646, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 253760 + }, + { + "epoch": 0.981003850257457, + "grad_norm": 0.10762964934110641, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 253770 + }, + { + "epoch": 0.9810425074608402, + "grad_norm": 0.11686155945062637, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 253780 + }, + { + "epoch": 0.9810811646642236, + "grad_norm": 0.1214323565363884, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 253790 + }, + { + "epoch": 0.9811198218676068, + "grad_norm": 0.10316342860460281, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 253800 + }, + { + "epoch": 0.9811584790709901, + "grad_norm": 0.10486052185297012, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 253810 + }, + { + "epoch": 0.9811971362743733, + "grad_norm": 0.11336401849985123, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 253820 + }, + { + "epoch": 0.9812357934777567, + "grad_norm": 0.1080748662352562, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 253830 + }, + { + "epoch": 0.9812744506811399, + "grad_norm": 0.12142869830131531, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 253840 + }, + { + "epoch": 0.9813131078845232, + "grad_norm": 0.1060624048113823, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 253850 + }, + { + "epoch": 0.9813517650879064, + "grad_norm": 0.10174284875392914, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 253860 + }, + { + "epoch": 0.9813904222912898, + "grad_norm": 0.10029471665620804, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 253870 + }, + { + "epoch": 0.9814290794946731, + "grad_norm": 0.108481265604496, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 253880 + }, + { + "epoch": 0.9814677366980563, + "grad_norm": 0.0938580110669136, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 253890 + }, + { + "epoch": 0.9815063939014396, + "grad_norm": 0.10277163237333298, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 253900 + }, + { + "epoch": 0.9815450511048228, + "grad_norm": 0.12451539933681488, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 253910 + }, + { + "epoch": 0.9815837083082062, + "grad_norm": 0.10546960681676865, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 253920 + }, + { + "epoch": 0.9816223655115894, + "grad_norm": 0.08655869215726852, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 253930 + }, + { + "epoch": 0.9816610227149727, + "grad_norm": 0.12251607328653336, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 253940 + }, + { + "epoch": 0.9816996799183559, + "grad_norm": 0.10581085830926895, + "learning_rate": 0.002, + "loss": 2.34, + "step": 253950 + }, + { + "epoch": 0.9817383371217393, + "grad_norm": 0.12245898693799973, + "learning_rate": 0.002, + "loss": 2.331, + "step": 253960 + }, + { + "epoch": 0.9817769943251226, + "grad_norm": 0.11692578345537186, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 253970 + }, + { + "epoch": 0.9818156515285058, + "grad_norm": 0.10638014227151871, + "learning_rate": 0.002, + "loss": 2.326, + "step": 253980 + }, + { + "epoch": 0.9818543087318891, + "grad_norm": 0.11459168046712875, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 253990 + }, + { + "epoch": 0.9818929659352724, + "grad_norm": 0.11234749853610992, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 254000 + }, + { + "epoch": 0.9819316231386557, + "grad_norm": 0.10456020385026932, + "learning_rate": 0.002, + "loss": 2.334, + "step": 254010 + }, + { + "epoch": 0.9819702803420389, + "grad_norm": 0.10688741505146027, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 254020 + }, + { + "epoch": 0.9820089375454222, + "grad_norm": 0.12189941108226776, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 254030 + }, + { + "epoch": 0.9820475947488055, + "grad_norm": 0.09207422286272049, + "learning_rate": 0.002, + "loss": 2.344, + "step": 254040 + }, + { + "epoch": 0.9820862519521888, + "grad_norm": 0.09040502458810806, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 254050 + }, + { + "epoch": 0.982124909155572, + "grad_norm": 0.08706256747245789, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 254060 + }, + { + "epoch": 0.9821635663589553, + "grad_norm": 0.10764108598232269, + "learning_rate": 0.002, + "loss": 2.337, + "step": 254070 + }, + { + "epoch": 0.9822022235623387, + "grad_norm": 0.09894927591085434, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 254080 + }, + { + "epoch": 0.9822408807657219, + "grad_norm": 0.09293508529663086, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 254090 + }, + { + "epoch": 0.9822795379691052, + "grad_norm": 0.11155544966459274, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 254100 + }, + { + "epoch": 0.9823181951724884, + "grad_norm": 0.08967789262533188, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 254110 + }, + { + "epoch": 0.9823568523758717, + "grad_norm": 0.13756652176380157, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 254120 + }, + { + "epoch": 0.982395509579255, + "grad_norm": 0.10222943872213364, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 254130 + }, + { + "epoch": 0.9824341667826383, + "grad_norm": 0.10353002697229385, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 254140 + }, + { + "epoch": 0.9824728239860215, + "grad_norm": 0.10684805363416672, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 254150 + }, + { + "epoch": 0.9825114811894048, + "grad_norm": 0.12317736446857452, + "learning_rate": 0.002, + "loss": 2.323, + "step": 254160 + }, + { + "epoch": 0.9825501383927882, + "grad_norm": 0.09487446397542953, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 254170 + }, + { + "epoch": 0.9825887955961714, + "grad_norm": 0.09926487505435944, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 254180 + }, + { + "epoch": 0.9826274527995547, + "grad_norm": 0.10189937800168991, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 254190 + }, + { + "epoch": 0.9826661100029379, + "grad_norm": 0.10605458915233612, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 254200 + }, + { + "epoch": 0.9827047672063213, + "grad_norm": 0.09959662705659866, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 254210 + }, + { + "epoch": 0.9827434244097045, + "grad_norm": 0.08425378799438477, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 254220 + }, + { + "epoch": 0.9827820816130878, + "grad_norm": 0.1034320816397667, + "learning_rate": 0.002, + "loss": 2.338, + "step": 254230 + }, + { + "epoch": 0.982820738816471, + "grad_norm": 0.11652296036481857, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 254240 + }, + { + "epoch": 0.9828593960198544, + "grad_norm": 0.09380397945642471, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 254250 + }, + { + "epoch": 0.9828980532232376, + "grad_norm": 0.124928779900074, + "learning_rate": 0.002, + "loss": 2.332, + "step": 254260 + }, + { + "epoch": 0.9829367104266209, + "grad_norm": 0.10061852633953094, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 254270 + }, + { + "epoch": 0.9829753676300041, + "grad_norm": 0.10316528379917145, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 254280 + }, + { + "epoch": 0.9830140248333874, + "grad_norm": 0.09837278723716736, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 254290 + }, + { + "epoch": 0.9830526820367708, + "grad_norm": 0.09891007095575333, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 254300 + }, + { + "epoch": 0.983091339240154, + "grad_norm": 0.10915490239858627, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 254310 + }, + { + "epoch": 0.9831299964435373, + "grad_norm": 0.10277639329433441, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 254320 + }, + { + "epoch": 0.9831686536469205, + "grad_norm": 0.10569757968187332, + "learning_rate": 0.002, + "loss": 2.334, + "step": 254330 + }, + { + "epoch": 0.9832073108503039, + "grad_norm": 0.10817868262529373, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 254340 + }, + { + "epoch": 0.9832459680536871, + "grad_norm": 0.10384424030780792, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 254350 + }, + { + "epoch": 0.9832846252570704, + "grad_norm": 0.09562724083662033, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 254360 + }, + { + "epoch": 0.9833232824604536, + "grad_norm": 0.10068861395120621, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 254370 + }, + { + "epoch": 0.983361939663837, + "grad_norm": 0.10070518404245377, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 254380 + }, + { + "epoch": 0.9834005968672203, + "grad_norm": 0.11541334539651871, + "learning_rate": 0.002, + "loss": 2.327, + "step": 254390 + }, + { + "epoch": 0.9834392540706035, + "grad_norm": 0.098110631108284, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 254400 + }, + { + "epoch": 0.9834779112739868, + "grad_norm": 0.1020338162779808, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 254410 + }, + { + "epoch": 0.9835165684773701, + "grad_norm": 0.11386211216449738, + "learning_rate": 0.002, + "loss": 2.329, + "step": 254420 + }, + { + "epoch": 0.9835552256807534, + "grad_norm": 0.11197617650032043, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 254430 + }, + { + "epoch": 0.9835938828841366, + "grad_norm": 0.11133823543787003, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 254440 + }, + { + "epoch": 0.9836325400875199, + "grad_norm": 0.11409388482570648, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 254450 + }, + { + "epoch": 0.9836711972909031, + "grad_norm": 0.09850602596998215, + "learning_rate": 0.002, + "loss": 2.329, + "step": 254460 + }, + { + "epoch": 0.9837098544942865, + "grad_norm": 0.11488984525203705, + "learning_rate": 0.002, + "loss": 2.331, + "step": 254470 + }, + { + "epoch": 0.9837485116976697, + "grad_norm": 0.09836746007204056, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 254480 + }, + { + "epoch": 0.983787168901053, + "grad_norm": 0.11667005717754364, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 254490 + }, + { + "epoch": 0.9838258261044363, + "grad_norm": 0.11494197696447372, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 254500 + }, + { + "epoch": 0.9838644833078196, + "grad_norm": 0.09573443979024887, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 254510 + }, + { + "epoch": 0.9839031405112029, + "grad_norm": 0.09925644844770432, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 254520 + }, + { + "epoch": 0.9839417977145861, + "grad_norm": 0.12344303727149963, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 254530 + }, + { + "epoch": 0.9839804549179694, + "grad_norm": 0.13273784518241882, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 254540 + }, + { + "epoch": 0.9840191121213527, + "grad_norm": 0.09237249195575714, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 254550 + }, + { + "epoch": 0.984057769324736, + "grad_norm": 0.10044827312231064, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 254560 + }, + { + "epoch": 0.9840964265281192, + "grad_norm": 0.1014859676361084, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 254570 + }, + { + "epoch": 0.9841350837315025, + "grad_norm": 0.09613264352083206, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 254580 + }, + { + "epoch": 0.9841737409348859, + "grad_norm": 0.15587982535362244, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 254590 + }, + { + "epoch": 0.9842123981382691, + "grad_norm": 0.09434341639280319, + "learning_rate": 0.002, + "loss": 2.359, + "step": 254600 + }, + { + "epoch": 0.9842510553416524, + "grad_norm": 0.09690546989440918, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 254610 + }, + { + "epoch": 0.9842897125450356, + "grad_norm": 0.2871216833591461, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 254620 + }, + { + "epoch": 0.984328369748419, + "grad_norm": 0.12504823505878448, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 254630 + }, + { + "epoch": 0.9843670269518022, + "grad_norm": 0.09755125641822815, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 254640 + }, + { + "epoch": 0.9844056841551855, + "grad_norm": 0.09232793748378754, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 254650 + }, + { + "epoch": 0.9844443413585687, + "grad_norm": 0.09949223697185516, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 254660 + }, + { + "epoch": 0.984482998561952, + "grad_norm": 0.13219931721687317, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 254670 + }, + { + "epoch": 0.9845216557653353, + "grad_norm": 0.09665770083665848, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 254680 + }, + { + "epoch": 0.9845603129687186, + "grad_norm": 0.11133351922035217, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 254690 + }, + { + "epoch": 0.9845989701721019, + "grad_norm": 0.10648233443498611, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 254700 + }, + { + "epoch": 0.9846376273754851, + "grad_norm": 0.11144927889108658, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 254710 + }, + { + "epoch": 0.9846762845788685, + "grad_norm": 0.09630986303091049, + "learning_rate": 0.002, + "loss": 2.348, + "step": 254720 + }, + { + "epoch": 0.9847149417822517, + "grad_norm": 0.11907469481229782, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 254730 + }, + { + "epoch": 0.984753598985635, + "grad_norm": 0.30501991510391235, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 254740 + }, + { + "epoch": 0.9847922561890182, + "grad_norm": 0.1148744747042656, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 254750 + }, + { + "epoch": 0.9848309133924016, + "grad_norm": 0.10225886106491089, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 254760 + }, + { + "epoch": 0.9848695705957848, + "grad_norm": 0.09008200466632843, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 254770 + }, + { + "epoch": 0.9849082277991681, + "grad_norm": 0.10727483779191971, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 254780 + }, + { + "epoch": 0.9849468850025513, + "grad_norm": 0.11305704712867737, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 254790 + }, + { + "epoch": 0.9849855422059347, + "grad_norm": 0.11197128891944885, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 254800 + }, + { + "epoch": 0.985024199409318, + "grad_norm": 0.09113840758800507, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 254810 + }, + { + "epoch": 0.9850628566127012, + "grad_norm": 0.0970667377114296, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 254820 + }, + { + "epoch": 0.9851015138160845, + "grad_norm": 0.09900952130556107, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 254830 + }, + { + "epoch": 0.9851401710194677, + "grad_norm": 0.11687123775482178, + "learning_rate": 0.002, + "loss": 2.343, + "step": 254840 + }, + { + "epoch": 0.9851788282228511, + "grad_norm": 0.09309032559394836, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 254850 + }, + { + "epoch": 0.9852174854262343, + "grad_norm": 0.09475398063659668, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 254860 + }, + { + "epoch": 0.9852561426296176, + "grad_norm": 0.09487903118133545, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 254870 + }, + { + "epoch": 0.9852947998330008, + "grad_norm": 0.11891059577465057, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 254880 + }, + { + "epoch": 0.9853334570363842, + "grad_norm": 0.10448039323091507, + "learning_rate": 0.002, + "loss": 2.327, + "step": 254890 + }, + { + "epoch": 0.9853721142397674, + "grad_norm": 0.14385157823562622, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 254900 + }, + { + "epoch": 0.9854107714431507, + "grad_norm": 0.10002294182777405, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 254910 + }, + { + "epoch": 0.985449428646534, + "grad_norm": 0.11528504639863968, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 254920 + }, + { + "epoch": 0.9854880858499173, + "grad_norm": 0.10999160259962082, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 254930 + }, + { + "epoch": 0.9855267430533006, + "grad_norm": 0.10249609500169754, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 254940 + }, + { + "epoch": 0.9855654002566838, + "grad_norm": 0.10020183771848679, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 254950 + }, + { + "epoch": 0.9856040574600671, + "grad_norm": 0.10220799595117569, + "learning_rate": 0.002, + "loss": 2.328, + "step": 254960 + }, + { + "epoch": 0.9856427146634504, + "grad_norm": 0.12004160135984421, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 254970 + }, + { + "epoch": 0.9856813718668337, + "grad_norm": 0.09195999801158905, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 254980 + }, + { + "epoch": 0.9857200290702169, + "grad_norm": 0.09734586626291275, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 254990 + }, + { + "epoch": 0.9857586862736002, + "grad_norm": 0.11250615119934082, + "learning_rate": 0.002, + "loss": 2.342, + "step": 255000 + }, + { + "epoch": 0.9857973434769836, + "grad_norm": 0.10436253994703293, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 255010 + }, + { + "epoch": 0.9858360006803668, + "grad_norm": 0.1116916686296463, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 255020 + }, + { + "epoch": 0.9858746578837501, + "grad_norm": 0.10826195776462555, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 255030 + }, + { + "epoch": 0.9859133150871333, + "grad_norm": 0.11349769681692123, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 255040 + }, + { + "epoch": 0.9859519722905166, + "grad_norm": 0.10054189711809158, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 255050 + }, + { + "epoch": 0.9859906294938999, + "grad_norm": 0.12647701799869537, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 255060 + }, + { + "epoch": 0.9860292866972832, + "grad_norm": 0.10084088146686554, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 255070 + }, + { + "epoch": 0.9860679439006664, + "grad_norm": 0.09174734354019165, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 255080 + }, + { + "epoch": 0.9861066011040497, + "grad_norm": 0.11622127145528793, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 255090 + }, + { + "epoch": 0.986145258307433, + "grad_norm": 0.12576478719711304, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 255100 + }, + { + "epoch": 0.9861839155108163, + "grad_norm": 0.09602378308773041, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 255110 + }, + { + "epoch": 0.9862225727141996, + "grad_norm": 0.115997813642025, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 255120 + }, + { + "epoch": 0.9862612299175828, + "grad_norm": 0.11473080515861511, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 255130 + }, + { + "epoch": 0.9862998871209662, + "grad_norm": 0.09651309251785278, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 255140 + }, + { + "epoch": 0.9863385443243494, + "grad_norm": 0.13024041056632996, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 255150 + }, + { + "epoch": 0.9863772015277327, + "grad_norm": 0.10573989152908325, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 255160 + }, + { + "epoch": 0.9864158587311159, + "grad_norm": 0.08838653564453125, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 255170 + }, + { + "epoch": 0.9864545159344993, + "grad_norm": 0.10317710041999817, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 255180 + }, + { + "epoch": 0.9864931731378825, + "grad_norm": 0.09811824560165405, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 255190 + }, + { + "epoch": 0.9865318303412658, + "grad_norm": 0.0969930961728096, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 255200 + }, + { + "epoch": 0.986570487544649, + "grad_norm": 0.11122286319732666, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 255210 + }, + { + "epoch": 0.9866091447480323, + "grad_norm": 0.11096516996622086, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 255220 + }, + { + "epoch": 0.9866478019514157, + "grad_norm": 0.11507048457860947, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 255230 + }, + { + "epoch": 0.9866864591547989, + "grad_norm": 0.09455648064613342, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 255240 + }, + { + "epoch": 0.9867251163581822, + "grad_norm": 0.1153191328048706, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 255250 + }, + { + "epoch": 0.9867637735615654, + "grad_norm": 0.10693097859621048, + "learning_rate": 0.002, + "loss": 2.3108, + "step": 255260 + }, + { + "epoch": 0.9868024307649488, + "grad_norm": 0.10604626685380936, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 255270 + }, + { + "epoch": 0.986841087968332, + "grad_norm": 0.11790741235017776, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 255280 + }, + { + "epoch": 0.9868797451717153, + "grad_norm": 0.10942420363426208, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 255290 + }, + { + "epoch": 0.9869184023750985, + "grad_norm": 0.12136127054691315, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 255300 + }, + { + "epoch": 0.9869570595784819, + "grad_norm": 0.09772010147571564, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 255310 + }, + { + "epoch": 0.9869957167818652, + "grad_norm": 0.10985453426837921, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 255320 + }, + { + "epoch": 0.9870343739852484, + "grad_norm": 0.11444909125566483, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 255330 + }, + { + "epoch": 0.9870730311886317, + "grad_norm": 0.1355230212211609, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 255340 + }, + { + "epoch": 0.987111688392015, + "grad_norm": 0.09789875894784927, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 255350 + }, + { + "epoch": 0.9871503455953983, + "grad_norm": 0.10760276764631271, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 255360 + }, + { + "epoch": 0.9871890027987815, + "grad_norm": 0.10690094530582428, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 255370 + }, + { + "epoch": 0.9872276600021648, + "grad_norm": 0.10276754200458527, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 255380 + }, + { + "epoch": 0.987266317205548, + "grad_norm": 0.12542317807674408, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 255390 + }, + { + "epoch": 0.9873049744089314, + "grad_norm": 0.11923978477716446, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 255400 + }, + { + "epoch": 0.9873436316123146, + "grad_norm": 0.11488944292068481, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 255410 + }, + { + "epoch": 0.9873822888156979, + "grad_norm": 0.09592791646718979, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 255420 + }, + { + "epoch": 0.9874209460190811, + "grad_norm": 0.09236177057027817, + "learning_rate": 0.002, + "loss": 2.325, + "step": 255430 + }, + { + "epoch": 0.9874596032224645, + "grad_norm": 0.09595596790313721, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 255440 + }, + { + "epoch": 0.9874982604258478, + "grad_norm": 0.15593257546424866, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 255450 + }, + { + "epoch": 0.987536917629231, + "grad_norm": 0.11803868412971497, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 255460 + }, + { + "epoch": 0.9875755748326143, + "grad_norm": 0.1036602035164833, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 255470 + }, + { + "epoch": 0.9876142320359976, + "grad_norm": 0.1197872906923294, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 255480 + }, + { + "epoch": 0.9876528892393809, + "grad_norm": 0.1072472557425499, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 255490 + }, + { + "epoch": 0.9876915464427641, + "grad_norm": 0.10736659169197083, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 255500 + }, + { + "epoch": 0.9877302036461474, + "grad_norm": 0.09682945162057877, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 255510 + }, + { + "epoch": 0.9877688608495307, + "grad_norm": 0.11917953938245773, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 255520 + }, + { + "epoch": 0.987807518052914, + "grad_norm": 0.10262294858694077, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 255530 + }, + { + "epoch": 0.9878461752562973, + "grad_norm": 0.10043973475694656, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 255540 + }, + { + "epoch": 0.9878848324596805, + "grad_norm": 0.10268932580947876, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 255550 + }, + { + "epoch": 0.9879234896630639, + "grad_norm": 0.10502326488494873, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 255560 + }, + { + "epoch": 0.9879621468664471, + "grad_norm": 0.11357767134904861, + "learning_rate": 0.002, + "loss": 2.331, + "step": 255570 + }, + { + "epoch": 0.9880008040698304, + "grad_norm": 0.11702119559049606, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 255580 + }, + { + "epoch": 0.9880394612732136, + "grad_norm": 0.11545008420944214, + "learning_rate": 0.002, + "loss": 2.3138, + "step": 255590 + }, + { + "epoch": 0.9880781184765969, + "grad_norm": 0.1151541993021965, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 255600 + }, + { + "epoch": 0.9881167756799802, + "grad_norm": 0.09862946718931198, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 255610 + }, + { + "epoch": 0.9881554328833635, + "grad_norm": 0.10757492482662201, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 255620 + }, + { + "epoch": 0.9881940900867467, + "grad_norm": 0.09744302183389664, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 255630 + }, + { + "epoch": 0.98823274729013, + "grad_norm": 0.11994163691997528, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 255640 + }, + { + "epoch": 0.9882714044935134, + "grad_norm": 0.09365137666463852, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 255650 + }, + { + "epoch": 0.9883100616968966, + "grad_norm": 0.09933947026729584, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 255660 + }, + { + "epoch": 0.9883487189002799, + "grad_norm": 0.11144649237394333, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 255670 + }, + { + "epoch": 0.9883873761036631, + "grad_norm": 0.11128731071949005, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 255680 + }, + { + "epoch": 0.9884260333070465, + "grad_norm": 0.09201089292764664, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 255690 + }, + { + "epoch": 0.9884646905104297, + "grad_norm": 0.11140462756156921, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 255700 + }, + { + "epoch": 0.988503347713813, + "grad_norm": 0.09715986996889114, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 255710 + }, + { + "epoch": 0.9885420049171962, + "grad_norm": 0.10743650048971176, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 255720 + }, + { + "epoch": 0.9885806621205796, + "grad_norm": 0.09190218895673752, + "learning_rate": 0.002, + "loss": 2.3136, + "step": 255730 + }, + { + "epoch": 0.9886193193239629, + "grad_norm": 0.10867530852556229, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 255740 + }, + { + "epoch": 0.9886579765273461, + "grad_norm": 0.2617291510105133, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 255750 + }, + { + "epoch": 0.9886966337307294, + "grad_norm": 0.11458270251750946, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 255760 + }, + { + "epoch": 0.9887352909341126, + "grad_norm": 0.254932165145874, + "learning_rate": 0.002, + "loss": 2.4027, + "step": 255770 + }, + { + "epoch": 0.988773948137496, + "grad_norm": 0.10507343709468842, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 255780 + }, + { + "epoch": 0.9888126053408792, + "grad_norm": 0.09200027585029602, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 255790 + }, + { + "epoch": 0.9888512625442625, + "grad_norm": 0.0957023873925209, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 255800 + }, + { + "epoch": 0.9888899197476457, + "grad_norm": 0.09974458813667297, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 255810 + }, + { + "epoch": 0.9889285769510291, + "grad_norm": 0.09365065395832062, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 255820 + }, + { + "epoch": 0.9889672341544123, + "grad_norm": 0.13055384159088135, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 255830 + }, + { + "epoch": 0.9890058913577956, + "grad_norm": 0.09168808907270432, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 255840 + }, + { + "epoch": 0.9890445485611788, + "grad_norm": 0.10091964900493622, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 255850 + }, + { + "epoch": 0.9890832057645622, + "grad_norm": 0.10866480320692062, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 255860 + }, + { + "epoch": 0.9891218629679455, + "grad_norm": 0.10408901423215866, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 255870 + }, + { + "epoch": 0.9891605201713287, + "grad_norm": 0.10550463944673538, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 255880 + }, + { + "epoch": 0.989199177374712, + "grad_norm": 0.09939587116241455, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 255890 + }, + { + "epoch": 0.9892378345780953, + "grad_norm": 0.11436154693365097, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 255900 + }, + { + "epoch": 0.9892764917814786, + "grad_norm": 0.10249326378107071, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 255910 + }, + { + "epoch": 0.9893151489848618, + "grad_norm": 0.15078838169574738, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 255920 + }, + { + "epoch": 0.9893538061882451, + "grad_norm": 0.10776419937610626, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 255930 + }, + { + "epoch": 0.9893924633916283, + "grad_norm": 0.10219588130712509, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 255940 + }, + { + "epoch": 0.9894311205950117, + "grad_norm": 0.10183137655258179, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 255950 + }, + { + "epoch": 0.989469777798395, + "grad_norm": 0.10559652000665665, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 255960 + }, + { + "epoch": 0.9895084350017782, + "grad_norm": 0.1247977614402771, + "learning_rate": 0.002, + "loss": 2.349, + "step": 255970 + }, + { + "epoch": 0.9895470922051615, + "grad_norm": 0.10177358239889145, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 255980 + }, + { + "epoch": 0.9895857494085448, + "grad_norm": 0.11221577972173691, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 255990 + }, + { + "epoch": 0.9896244066119281, + "grad_norm": 0.11370981484651566, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 256000 + }, + { + "epoch": 0.9896630638153113, + "grad_norm": 0.11815855652093887, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 256010 + }, + { + "epoch": 0.9897017210186946, + "grad_norm": 0.09536910802125931, + "learning_rate": 0.002, + "loss": 2.339, + "step": 256020 + }, + { + "epoch": 0.9897403782220779, + "grad_norm": 0.10650273412466049, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 256030 + }, + { + "epoch": 0.9897790354254612, + "grad_norm": 0.11287032067775726, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 256040 + }, + { + "epoch": 0.9898176926288444, + "grad_norm": 0.09238874912261963, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 256050 + }, + { + "epoch": 0.9898563498322277, + "grad_norm": 0.12202656269073486, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 256060 + }, + { + "epoch": 0.9898950070356111, + "grad_norm": 0.10122062265872955, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 256070 + }, + { + "epoch": 0.9899336642389943, + "grad_norm": 0.09519728273153305, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 256080 + }, + { + "epoch": 0.9899723214423776, + "grad_norm": 0.10702326148748398, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 256090 + }, + { + "epoch": 0.9900109786457608, + "grad_norm": 0.09455457329750061, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 256100 + }, + { + "epoch": 0.9900496358491442, + "grad_norm": 0.1183152049779892, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 256110 + }, + { + "epoch": 0.9900882930525274, + "grad_norm": 0.11568870395421982, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 256120 + }, + { + "epoch": 0.9901269502559107, + "grad_norm": 0.10681305825710297, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 256130 + }, + { + "epoch": 0.9901656074592939, + "grad_norm": 0.10074838250875473, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 256140 + }, + { + "epoch": 0.9902042646626772, + "grad_norm": 0.0929526686668396, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 256150 + }, + { + "epoch": 0.9902429218660606, + "grad_norm": 0.12454530596733093, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 256160 + }, + { + "epoch": 0.9902815790694438, + "grad_norm": 0.11118060350418091, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 256170 + }, + { + "epoch": 0.9903202362728271, + "grad_norm": 0.10692273825407028, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 256180 + }, + { + "epoch": 0.9903588934762103, + "grad_norm": 0.09192261099815369, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 256190 + }, + { + "epoch": 0.9903975506795937, + "grad_norm": 0.10425573587417603, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 256200 + }, + { + "epoch": 0.9904362078829769, + "grad_norm": 0.09590397030115128, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 256210 + }, + { + "epoch": 0.9904748650863602, + "grad_norm": 0.10007577389478683, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 256220 + }, + { + "epoch": 0.9905135222897434, + "grad_norm": 0.17764490842819214, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 256230 + }, + { + "epoch": 0.9905521794931268, + "grad_norm": 0.10875571519136429, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 256240 + }, + { + "epoch": 0.99059083669651, + "grad_norm": 0.10980333387851715, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 256250 + }, + { + "epoch": 0.9906294938998933, + "grad_norm": 0.09446674585342407, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 256260 + }, + { + "epoch": 0.9906681511032766, + "grad_norm": 0.1014462485909462, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 256270 + }, + { + "epoch": 0.9907068083066599, + "grad_norm": 0.10417579859495163, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 256280 + }, + { + "epoch": 0.9907454655100432, + "grad_norm": 0.22922813892364502, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 256290 + }, + { + "epoch": 0.9907841227134264, + "grad_norm": 0.10854848474264145, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 256300 + }, + { + "epoch": 0.9908227799168097, + "grad_norm": 0.11280990391969681, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 256310 + }, + { + "epoch": 0.9908614371201929, + "grad_norm": 0.11322092264890671, + "learning_rate": 0.002, + "loss": 2.347, + "step": 256320 + }, + { + "epoch": 0.9909000943235763, + "grad_norm": 0.10999466478824615, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 256330 + }, + { + "epoch": 0.9909387515269595, + "grad_norm": 0.10721048712730408, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 256340 + }, + { + "epoch": 0.9909774087303428, + "grad_norm": 0.09981708973646164, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 256350 + }, + { + "epoch": 0.991016065933726, + "grad_norm": 0.0954086109995842, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 256360 + }, + { + "epoch": 0.9910547231371094, + "grad_norm": 0.4622000753879547, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 256370 + }, + { + "epoch": 0.9910933803404927, + "grad_norm": 0.1136743351817131, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 256380 + }, + { + "epoch": 0.9911320375438759, + "grad_norm": 0.14929740130901337, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 256390 + }, + { + "epoch": 0.9911706947472592, + "grad_norm": 0.10695463418960571, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 256400 + }, + { + "epoch": 0.9912093519506425, + "grad_norm": 0.10876549780368805, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 256410 + }, + { + "epoch": 0.9912480091540258, + "grad_norm": 0.2180985063314438, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 256420 + }, + { + "epoch": 0.991286666357409, + "grad_norm": 0.10567354410886765, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 256430 + }, + { + "epoch": 0.9913253235607923, + "grad_norm": 0.11088450998067856, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 256440 + }, + { + "epoch": 0.9913639807641756, + "grad_norm": 0.10130184143781662, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 256450 + }, + { + "epoch": 0.9914026379675589, + "grad_norm": 0.12747453153133392, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 256460 + }, + { + "epoch": 0.9914412951709421, + "grad_norm": 0.09747633337974548, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 256470 + }, + { + "epoch": 0.9914799523743254, + "grad_norm": 0.09843626618385315, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 256480 + }, + { + "epoch": 0.9915186095777088, + "grad_norm": 0.08928278088569641, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 256490 + }, + { + "epoch": 0.991557266781092, + "grad_norm": 0.10301223397254944, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 256500 + }, + { + "epoch": 0.9915959239844753, + "grad_norm": 0.12475394457578659, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 256510 + }, + { + "epoch": 0.9916345811878585, + "grad_norm": 0.09833080321550369, + "learning_rate": 0.002, + "loss": 2.344, + "step": 256520 + }, + { + "epoch": 0.9916732383912418, + "grad_norm": 0.10509419441223145, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 256530 + }, + { + "epoch": 0.9917118955946251, + "grad_norm": 0.10112323611974716, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 256540 + }, + { + "epoch": 0.9917505527980084, + "grad_norm": 0.10240601748228073, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 256550 + }, + { + "epoch": 0.9917892100013916, + "grad_norm": 0.09574146568775177, + "learning_rate": 0.002, + "loss": 2.342, + "step": 256560 + }, + { + "epoch": 0.9918278672047749, + "grad_norm": 0.10088356584310532, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 256570 + }, + { + "epoch": 0.9918665244081583, + "grad_norm": 0.1093062236905098, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 256580 + }, + { + "epoch": 0.9919051816115415, + "grad_norm": 0.09802067279815674, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 256590 + }, + { + "epoch": 0.9919438388149248, + "grad_norm": 0.10470987111330032, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 256600 + }, + { + "epoch": 0.991982496018308, + "grad_norm": 0.1036287471652031, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 256610 + }, + { + "epoch": 0.9920211532216914, + "grad_norm": 0.11956336349248886, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 256620 + }, + { + "epoch": 0.9920598104250746, + "grad_norm": 0.08456283807754517, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 256630 + }, + { + "epoch": 0.9920984676284579, + "grad_norm": 0.0958782359957695, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 256640 + }, + { + "epoch": 0.9921371248318411, + "grad_norm": 0.09258156269788742, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 256650 + }, + { + "epoch": 0.9921757820352245, + "grad_norm": 0.09780330955982208, + "learning_rate": 0.002, + "loss": 2.34, + "step": 256660 + }, + { + "epoch": 0.9922144392386077, + "grad_norm": 0.17852285504341125, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 256670 + }, + { + "epoch": 0.992253096441991, + "grad_norm": 0.0965234786272049, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 256680 + }, + { + "epoch": 0.9922917536453743, + "grad_norm": 0.0944899246096611, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 256690 + }, + { + "epoch": 0.9923304108487575, + "grad_norm": 0.16657981276512146, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 256700 + }, + { + "epoch": 0.9923690680521409, + "grad_norm": 0.09872959554195404, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 256710 + }, + { + "epoch": 0.9924077252555241, + "grad_norm": 0.12205319851636887, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 256720 + }, + { + "epoch": 0.9924463824589074, + "grad_norm": 0.12135138362646103, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 256730 + }, + { + "epoch": 0.9924850396622906, + "grad_norm": 0.10703414678573608, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 256740 + }, + { + "epoch": 0.992523696865674, + "grad_norm": 0.12538368999958038, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 256750 + }, + { + "epoch": 0.9925623540690572, + "grad_norm": 0.10660772770643234, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 256760 + }, + { + "epoch": 0.9926010112724405, + "grad_norm": 0.13139641284942627, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 256770 + }, + { + "epoch": 0.9926396684758237, + "grad_norm": 0.12074771523475647, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 256780 + }, + { + "epoch": 0.9926783256792071, + "grad_norm": 0.10254442691802979, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 256790 + }, + { + "epoch": 0.9927169828825904, + "grad_norm": 0.1065768152475357, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 256800 + }, + { + "epoch": 0.9927556400859736, + "grad_norm": 0.09868630766868591, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 256810 + }, + { + "epoch": 0.9927942972893569, + "grad_norm": 0.10020441561937332, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 256820 + }, + { + "epoch": 0.9928329544927402, + "grad_norm": 0.10684265196323395, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 256830 + }, + { + "epoch": 0.9928716116961235, + "grad_norm": 0.11520854383707047, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 256840 + }, + { + "epoch": 0.9929102688995067, + "grad_norm": 0.11689134687185287, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 256850 + }, + { + "epoch": 0.99294892610289, + "grad_norm": 0.1267421841621399, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 256860 + }, + { + "epoch": 0.9929875833062732, + "grad_norm": 0.11704300343990326, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 256870 + }, + { + "epoch": 0.9930262405096566, + "grad_norm": 0.10719385743141174, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 256880 + }, + { + "epoch": 0.9930648977130399, + "grad_norm": 0.13121972978115082, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 256890 + }, + { + "epoch": 0.9931035549164231, + "grad_norm": 0.1221286877989769, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 256900 + }, + { + "epoch": 0.9931422121198064, + "grad_norm": 0.11407453566789627, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 256910 + }, + { + "epoch": 0.9931808693231897, + "grad_norm": 0.10373591631650925, + "learning_rate": 0.002, + "loss": 2.3589, + "step": 256920 + }, + { + "epoch": 0.993219526526573, + "grad_norm": 0.10496452450752258, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 256930 + }, + { + "epoch": 0.9932581837299562, + "grad_norm": 0.10863498598337173, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 256940 + }, + { + "epoch": 0.9932968409333395, + "grad_norm": 0.10201474279165268, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 256950 + }, + { + "epoch": 0.9933354981367228, + "grad_norm": 0.09551186114549637, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 256960 + }, + { + "epoch": 0.9933741553401061, + "grad_norm": 0.14234507083892822, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 256970 + }, + { + "epoch": 0.9934128125434893, + "grad_norm": 0.15089447796344757, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 256980 + }, + { + "epoch": 0.9934514697468726, + "grad_norm": 0.11496597528457642, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 256990 + }, + { + "epoch": 0.993490126950256, + "grad_norm": 0.10370314121246338, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 257000 + }, + { + "epoch": 0.9935287841536392, + "grad_norm": 0.13056083023548126, + "learning_rate": 0.002, + "loss": 2.334, + "step": 257010 + }, + { + "epoch": 0.9935674413570225, + "grad_norm": 0.10173903405666351, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 257020 + }, + { + "epoch": 0.9936060985604057, + "grad_norm": 0.1256549209356308, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 257030 + }, + { + "epoch": 0.9936447557637891, + "grad_norm": 0.10343006998300552, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 257040 + }, + { + "epoch": 0.9936834129671723, + "grad_norm": 0.1029481515288353, + "learning_rate": 0.002, + "loss": 2.327, + "step": 257050 + }, + { + "epoch": 0.9937220701705556, + "grad_norm": 0.10993833094835281, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 257060 + }, + { + "epoch": 0.9937607273739388, + "grad_norm": 0.11532630026340485, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 257070 + }, + { + "epoch": 0.9937993845773221, + "grad_norm": 0.10593671351671219, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 257080 + }, + { + "epoch": 0.9938380417807055, + "grad_norm": 0.0915307104587555, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 257090 + }, + { + "epoch": 0.9938766989840887, + "grad_norm": 0.22633057832717896, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 257100 + }, + { + "epoch": 0.993915356187472, + "grad_norm": 0.16773845255374908, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 257110 + }, + { + "epoch": 0.9939540133908552, + "grad_norm": 0.116732157766819, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 257120 + }, + { + "epoch": 0.9939926705942386, + "grad_norm": 0.11831034719944, + "learning_rate": 0.002, + "loss": 2.3085, + "step": 257130 + }, + { + "epoch": 0.9940313277976218, + "grad_norm": 0.09759490191936493, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 257140 + }, + { + "epoch": 0.9940699850010051, + "grad_norm": 0.11319709569215775, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 257150 + }, + { + "epoch": 0.9941086422043883, + "grad_norm": 0.10495991259813309, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 257160 + }, + { + "epoch": 0.9941472994077717, + "grad_norm": 0.10347853600978851, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 257170 + }, + { + "epoch": 0.9941859566111549, + "grad_norm": 0.11005652695894241, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 257180 + }, + { + "epoch": 0.9942246138145382, + "grad_norm": 0.1012812927365303, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 257190 + }, + { + "epoch": 0.9942632710179214, + "grad_norm": 0.12478403002023697, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 257200 + }, + { + "epoch": 0.9943019282213048, + "grad_norm": 0.09518378973007202, + "learning_rate": 0.002, + "loss": 2.333, + "step": 257210 + }, + { + "epoch": 0.9943405854246881, + "grad_norm": 0.1230977401137352, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 257220 + }, + { + "epoch": 0.9943792426280713, + "grad_norm": 0.10036604851484299, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 257230 + }, + { + "epoch": 0.9944178998314546, + "grad_norm": 0.13655316829681396, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 257240 + }, + { + "epoch": 0.9944565570348378, + "grad_norm": 0.10106083005666733, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 257250 + }, + { + "epoch": 0.9944952142382212, + "grad_norm": 0.28169330954551697, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 257260 + }, + { + "epoch": 0.9945338714416044, + "grad_norm": 0.1256679892539978, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 257270 + }, + { + "epoch": 0.9945725286449877, + "grad_norm": 0.10080401599407196, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 257280 + }, + { + "epoch": 0.9946111858483709, + "grad_norm": 0.09396111220121384, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 257290 + }, + { + "epoch": 0.9946498430517543, + "grad_norm": 0.1262788027524948, + "learning_rate": 0.002, + "loss": 2.324, + "step": 257300 + }, + { + "epoch": 0.9946885002551376, + "grad_norm": 0.12998312711715698, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 257310 + }, + { + "epoch": 0.9947271574585208, + "grad_norm": 0.10756916552782059, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 257320 + }, + { + "epoch": 0.9947658146619041, + "grad_norm": 0.10928453505039215, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 257330 + }, + { + "epoch": 0.9948044718652874, + "grad_norm": 0.1297762244939804, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 257340 + }, + { + "epoch": 0.9948431290686707, + "grad_norm": 0.11227850615978241, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 257350 + }, + { + "epoch": 0.9948817862720539, + "grad_norm": 0.10288013517856598, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 257360 + }, + { + "epoch": 0.9949204434754372, + "grad_norm": 0.08381501585245132, + "learning_rate": 0.002, + "loss": 2.3657, + "step": 257370 + }, + { + "epoch": 0.9949591006788205, + "grad_norm": 0.13382337987422943, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 257380 + }, + { + "epoch": 0.9949977578822038, + "grad_norm": 0.09629426896572113, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 257390 + }, + { + "epoch": 0.995036415085587, + "grad_norm": 0.10021605342626572, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 257400 + }, + { + "epoch": 0.9950750722889703, + "grad_norm": 0.10886742919683456, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 257410 + }, + { + "epoch": 0.9951137294923537, + "grad_norm": 0.42458826303482056, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 257420 + }, + { + "epoch": 0.9951523866957369, + "grad_norm": 0.11212582141160965, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 257430 + }, + { + "epoch": 0.9951910438991202, + "grad_norm": 0.09543899446725845, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 257440 + }, + { + "epoch": 0.9952297011025034, + "grad_norm": 0.1293252855539322, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 257450 + }, + { + "epoch": 0.9952683583058867, + "grad_norm": 0.11018099635839462, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 257460 + }, + { + "epoch": 0.99530701550927, + "grad_norm": 0.09845305234193802, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 257470 + }, + { + "epoch": 0.9953456727126533, + "grad_norm": 0.09646732360124588, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 257480 + }, + { + "epoch": 0.9953843299160365, + "grad_norm": 0.11497314274311066, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 257490 + }, + { + "epoch": 0.9954229871194198, + "grad_norm": 0.11051710695028305, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 257500 + }, + { + "epoch": 0.9954616443228032, + "grad_norm": 0.12046661227941513, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 257510 + }, + { + "epoch": 0.9955003015261864, + "grad_norm": 0.10718783736228943, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 257520 + }, + { + "epoch": 0.9955389587295697, + "grad_norm": 0.18187186121940613, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 257530 + }, + { + "epoch": 0.9955776159329529, + "grad_norm": 0.11338195949792862, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 257540 + }, + { + "epoch": 0.9956162731363363, + "grad_norm": 0.10064757615327835, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 257550 + }, + { + "epoch": 0.9956549303397195, + "grad_norm": 0.11266288161277771, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 257560 + }, + { + "epoch": 0.9956935875431028, + "grad_norm": 0.12426048517227173, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 257570 + }, + { + "epoch": 0.995732244746486, + "grad_norm": 0.10261692106723785, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 257580 + }, + { + "epoch": 0.9957709019498694, + "grad_norm": 0.12305546551942825, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 257590 + }, + { + "epoch": 0.9958095591532526, + "grad_norm": 0.1324758678674698, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 257600 + }, + { + "epoch": 0.9958482163566359, + "grad_norm": 0.13306169211864471, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 257610 + }, + { + "epoch": 0.9958868735600191, + "grad_norm": 0.09178532660007477, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 257620 + }, + { + "epoch": 0.9959255307634024, + "grad_norm": 0.13694193959236145, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 257630 + }, + { + "epoch": 0.9959641879667858, + "grad_norm": 0.09993527829647064, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 257640 + }, + { + "epoch": 0.996002845170169, + "grad_norm": 0.1072765439748764, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 257650 + }, + { + "epoch": 0.9960415023735523, + "grad_norm": 0.1019691601395607, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 257660 + }, + { + "epoch": 0.9960801595769355, + "grad_norm": 0.129447802901268, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 257670 + }, + { + "epoch": 0.9961188167803189, + "grad_norm": 0.0991927981376648, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 257680 + }, + { + "epoch": 0.9961574739837021, + "grad_norm": 0.10918325185775757, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 257690 + }, + { + "epoch": 0.9961961311870854, + "grad_norm": 0.12434779852628708, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 257700 + }, + { + "epoch": 0.9962347883904686, + "grad_norm": 0.09883172065019608, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 257710 + }, + { + "epoch": 0.996273445593852, + "grad_norm": 0.10056095570325851, + "learning_rate": 0.002, + "loss": 2.336, + "step": 257720 + }, + { + "epoch": 0.9963121027972353, + "grad_norm": 0.114386647939682, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 257730 + }, + { + "epoch": 0.9963507600006185, + "grad_norm": 0.3775371015071869, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 257740 + }, + { + "epoch": 0.9963894172040018, + "grad_norm": 0.1052108183503151, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 257750 + }, + { + "epoch": 0.9964280744073851, + "grad_norm": 0.12209247797727585, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 257760 + }, + { + "epoch": 0.9964667316107684, + "grad_norm": 0.10437662899494171, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 257770 + }, + { + "epoch": 0.9965053888141516, + "grad_norm": 0.10273770987987518, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 257780 + }, + { + "epoch": 0.9965440460175349, + "grad_norm": 0.12768568098545074, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 257790 + }, + { + "epoch": 0.9965827032209181, + "grad_norm": 0.10613568872213364, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 257800 + }, + { + "epoch": 0.9966213604243015, + "grad_norm": 0.09213811159133911, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 257810 + }, + { + "epoch": 0.9966600176276847, + "grad_norm": 0.10028841346502304, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 257820 + }, + { + "epoch": 0.996698674831068, + "grad_norm": 0.12576673924922943, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 257830 + }, + { + "epoch": 0.9967373320344513, + "grad_norm": 0.09395378828048706, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 257840 + }, + { + "epoch": 0.9967759892378346, + "grad_norm": 0.105169877409935, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 257850 + }, + { + "epoch": 0.9968146464412179, + "grad_norm": 0.10482071340084076, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 257860 + }, + { + "epoch": 0.9968533036446011, + "grad_norm": 0.11198008805513382, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 257870 + }, + { + "epoch": 0.9968919608479844, + "grad_norm": 0.09476982802152634, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 257880 + }, + { + "epoch": 0.9969306180513677, + "grad_norm": 0.18312402069568634, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 257890 + }, + { + "epoch": 0.996969275254751, + "grad_norm": 0.1149301528930664, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 257900 + }, + { + "epoch": 0.9970079324581342, + "grad_norm": 0.10394993424415588, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 257910 + }, + { + "epoch": 0.9970465896615175, + "grad_norm": 0.12158011645078659, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 257920 + }, + { + "epoch": 0.9970852468649009, + "grad_norm": 0.116344153881073, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 257930 + }, + { + "epoch": 0.9971239040682841, + "grad_norm": 0.1335514634847641, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 257940 + }, + { + "epoch": 0.9971625612716674, + "grad_norm": 0.1063564270734787, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 257950 + }, + { + "epoch": 0.9972012184750506, + "grad_norm": 0.11535150557756424, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 257960 + }, + { + "epoch": 0.997239875678434, + "grad_norm": 0.12290379405021667, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 257970 + }, + { + "epoch": 0.9972785328818172, + "grad_norm": 0.09599297493696213, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 257980 + }, + { + "epoch": 0.9973171900852005, + "grad_norm": 0.12629227340221405, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 257990 + }, + { + "epoch": 0.9973558472885837, + "grad_norm": 0.10190927237272263, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 258000 + }, + { + "epoch": 0.997394504491967, + "grad_norm": 0.09525350481271744, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 258010 + }, + { + "epoch": 0.9974331616953503, + "grad_norm": 0.14495059847831726, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 258020 + }, + { + "epoch": 0.9974718188987336, + "grad_norm": 0.10044555366039276, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 258030 + }, + { + "epoch": 0.9975104761021168, + "grad_norm": 0.10116809606552124, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 258040 + }, + { + "epoch": 0.9975491333055001, + "grad_norm": 0.11794326454401016, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 258050 + }, + { + "epoch": 0.9975877905088835, + "grad_norm": 0.10003200173377991, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 258060 + }, + { + "epoch": 0.9976264477122667, + "grad_norm": 0.11192117631435394, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 258070 + }, + { + "epoch": 0.99766510491565, + "grad_norm": 0.1364094614982605, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 258080 + }, + { + "epoch": 0.9977037621190332, + "grad_norm": 0.11394291371107101, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 258090 + }, + { + "epoch": 0.9977424193224166, + "grad_norm": 0.11761811375617981, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 258100 + }, + { + "epoch": 0.9977810765257998, + "grad_norm": 0.10144181549549103, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 258110 + }, + { + "epoch": 0.9978197337291831, + "grad_norm": 0.09599994122982025, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 258120 + }, + { + "epoch": 0.9978583909325663, + "grad_norm": 0.09742031246423721, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 258130 + }, + { + "epoch": 0.9978970481359497, + "grad_norm": 0.09783077985048294, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 258140 + }, + { + "epoch": 0.997935705339333, + "grad_norm": 0.09153680503368378, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 258150 + }, + { + "epoch": 0.9979743625427162, + "grad_norm": 0.126260906457901, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 258160 + }, + { + "epoch": 0.9980130197460995, + "grad_norm": 0.11532141268253326, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 258170 + }, + { + "epoch": 0.9980516769494827, + "grad_norm": 0.09739511460065842, + "learning_rate": 0.002, + "loss": 2.336, + "step": 258180 + }, + { + "epoch": 0.9980903341528661, + "grad_norm": 0.10290549695491791, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 258190 + }, + { + "epoch": 0.9981289913562493, + "grad_norm": 0.11158601194620132, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 258200 + }, + { + "epoch": 0.9981676485596326, + "grad_norm": 0.14581574499607086, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 258210 + }, + { + "epoch": 0.9982063057630158, + "grad_norm": 0.13148726522922516, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 258220 + }, + { + "epoch": 0.9982449629663992, + "grad_norm": 0.10082720220088959, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 258230 + }, + { + "epoch": 0.9982836201697824, + "grad_norm": 0.10381853580474854, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 258240 + }, + { + "epoch": 0.9983222773731657, + "grad_norm": 0.10770632326602936, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 258250 + }, + { + "epoch": 0.998360934576549, + "grad_norm": 0.11605487763881683, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 258260 + }, + { + "epoch": 0.9983995917799323, + "grad_norm": 0.10654358565807343, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 258270 + }, + { + "epoch": 0.9984382489833156, + "grad_norm": 0.09872237592935562, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 258280 + }, + { + "epoch": 0.9984769061866988, + "grad_norm": 0.13964754343032837, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 258290 + }, + { + "epoch": 0.9985155633900821, + "grad_norm": 0.09526875615119934, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 258300 + }, + { + "epoch": 0.9985542205934654, + "grad_norm": 0.14932149648666382, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 258310 + }, + { + "epoch": 0.9985928777968487, + "grad_norm": 0.11767978221178055, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 258320 + }, + { + "epoch": 0.9986315350002319, + "grad_norm": 0.10140397399663925, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 258330 + }, + { + "epoch": 0.9986701922036152, + "grad_norm": 0.09302809834480286, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 258340 + }, + { + "epoch": 0.9987088494069986, + "grad_norm": 0.09450498968362808, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 258350 + }, + { + "epoch": 0.9987475066103818, + "grad_norm": 0.12323355674743652, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 258360 + }, + { + "epoch": 0.9987861638137651, + "grad_norm": 0.1029418408870697, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 258370 + }, + { + "epoch": 0.9988248210171483, + "grad_norm": 0.09626761823892593, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 258380 + }, + { + "epoch": 0.9988634782205316, + "grad_norm": 0.1211252510547638, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 258390 + }, + { + "epoch": 0.9989021354239149, + "grad_norm": 0.1048799678683281, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 258400 + }, + { + "epoch": 0.9989407926272982, + "grad_norm": 0.11532396078109741, + "learning_rate": 0.002, + "loss": 2.334, + "step": 258410 + }, + { + "epoch": 0.9989794498306814, + "grad_norm": 0.10050155967473984, + "learning_rate": 0.002, + "loss": 2.33, + "step": 258420 + }, + { + "epoch": 0.9990181070340647, + "grad_norm": 0.10442036390304565, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 258430 + }, + { + "epoch": 0.999056764237448, + "grad_norm": 0.12391044199466705, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 258440 + }, + { + "epoch": 0.9990954214408313, + "grad_norm": 0.11113929003477097, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 258450 + }, + { + "epoch": 0.9991340786442146, + "grad_norm": 0.1350424885749817, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 258460 + }, + { + "epoch": 0.9991727358475978, + "grad_norm": 0.10169805586338043, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 258470 + }, + { + "epoch": 0.9992113930509812, + "grad_norm": 0.0997602790594101, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 258480 + }, + { + "epoch": 0.9992500502543644, + "grad_norm": 0.10066650807857513, + "learning_rate": 0.002, + "loss": 2.342, + "step": 258490 + }, + { + "epoch": 0.9992887074577477, + "grad_norm": 0.09506989270448685, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 258500 + }, + { + "epoch": 0.9993273646611309, + "grad_norm": 0.1080443412065506, + "learning_rate": 0.002, + "loss": 2.339, + "step": 258510 + }, + { + "epoch": 0.9993660218645143, + "grad_norm": 0.09656565636396408, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 258520 + }, + { + "epoch": 0.9994046790678975, + "grad_norm": 0.15320773422718048, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 258530 + }, + { + "epoch": 0.9994433362712808, + "grad_norm": 0.09774304926395416, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 258540 + }, + { + "epoch": 0.999481993474664, + "grad_norm": 0.10777298361063004, + "learning_rate": 0.002, + "loss": 2.347, + "step": 258550 + }, + { + "epoch": 0.9995206506780473, + "grad_norm": 0.139897882938385, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 258560 + }, + { + "epoch": 0.9995593078814307, + "grad_norm": 0.10244569927453995, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 258570 + }, + { + "epoch": 0.9995979650848139, + "grad_norm": 0.12334243953227997, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 258580 + }, + { + "epoch": 0.9996366222881972, + "grad_norm": 0.10992878675460815, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 258590 + }, + { + "epoch": 0.9996752794915804, + "grad_norm": 0.099768728017807, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 258600 + }, + { + "epoch": 0.9997139366949638, + "grad_norm": 0.11886005103588104, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 258610 + }, + { + "epoch": 0.999752593898347, + "grad_norm": 0.10944291204214096, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 258620 + }, + { + "epoch": 0.9997912511017303, + "grad_norm": 0.1227748841047287, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 258630 + }, + { + "epoch": 0.9998299083051135, + "grad_norm": 0.12421202659606934, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 258640 + }, + { + "epoch": 0.9998685655084969, + "grad_norm": 0.10084223002195358, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 258650 + }, + { + "epoch": 0.9999072227118802, + "grad_norm": 0.10667677968740463, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 258660 + }, + { + "epoch": 0.9999458799152634, + "grad_norm": 0.10123629122972488, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 258670 + }, + { + "epoch": 0.9999845371186467, + "grad_norm": 0.10575992614030838, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 258680 + }, + { + "epoch": 1.00002319432203, + "grad_norm": 0.10037583857774734, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 258690 + }, + { + "epoch": 1.0000618515254132, + "grad_norm": 0.09392248094081879, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 258700 + }, + { + "epoch": 1.0001005087287966, + "grad_norm": 0.09586737304925919, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 258710 + }, + { + "epoch": 1.0001391659321799, + "grad_norm": 0.11475492268800735, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 258720 + }, + { + "epoch": 1.0001778231355631, + "grad_norm": 0.12680473923683167, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 258730 + }, + { + "epoch": 1.0002164803389464, + "grad_norm": 0.0987839326262474, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 258740 + }, + { + "epoch": 1.0002551375423296, + "grad_norm": 0.10396572202444077, + "learning_rate": 0.002, + "loss": 2.323, + "step": 258750 + }, + { + "epoch": 1.000293794745713, + "grad_norm": 0.1215694472193718, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 258760 + }, + { + "epoch": 1.0003324519490961, + "grad_norm": 0.11470723897218704, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 258770 + }, + { + "epoch": 1.0003711091524794, + "grad_norm": 0.1048058271408081, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 258780 + }, + { + "epoch": 1.0004097663558627, + "grad_norm": 0.10398370772600174, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 258790 + }, + { + "epoch": 1.0004484235592461, + "grad_norm": 0.0954103097319603, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 258800 + }, + { + "epoch": 1.0004870807626294, + "grad_norm": 0.10868433117866516, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 258810 + }, + { + "epoch": 1.0005257379660126, + "grad_norm": 0.11799705773591995, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 258820 + }, + { + "epoch": 1.0005643951693959, + "grad_norm": 0.10871309787034988, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 258830 + }, + { + "epoch": 1.0006030523727791, + "grad_norm": 0.12025268375873566, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 258840 + }, + { + "epoch": 1.0006417095761624, + "grad_norm": 0.08687405288219452, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 258850 + }, + { + "epoch": 1.0006803667795456, + "grad_norm": 0.10406842082738876, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 258860 + }, + { + "epoch": 1.000719023982929, + "grad_norm": 0.1017787903547287, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 258870 + }, + { + "epoch": 1.0007576811863124, + "grad_norm": 0.10971411317586899, + "learning_rate": 0.002, + "loss": 2.347, + "step": 258880 + }, + { + "epoch": 1.0007963383896956, + "grad_norm": 0.1039801836013794, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 258890 + }, + { + "epoch": 1.0008349955930789, + "grad_norm": 0.10778878629207611, + "learning_rate": 0.002, + "loss": 2.325, + "step": 258900 + }, + { + "epoch": 1.0008736527964621, + "grad_norm": 0.12391100823879242, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 258910 + }, + { + "epoch": 1.0009123099998454, + "grad_norm": 0.09689292311668396, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 258920 + }, + { + "epoch": 1.0009509672032286, + "grad_norm": 0.09480072557926178, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 258930 + }, + { + "epoch": 1.0009896244066119, + "grad_norm": 0.11858639866113663, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 258940 + }, + { + "epoch": 1.0010282816099951, + "grad_norm": 0.11604545265436172, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 258950 + }, + { + "epoch": 1.0010669388133784, + "grad_norm": 0.10790597647428513, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 258960 + }, + { + "epoch": 1.0011055960167619, + "grad_norm": 0.10130016505718231, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 258970 + }, + { + "epoch": 1.001144253220145, + "grad_norm": 0.12981268763542175, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 258980 + }, + { + "epoch": 1.0011829104235284, + "grad_norm": 0.1396598368883133, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 258990 + }, + { + "epoch": 1.0012215676269116, + "grad_norm": 0.11186467111110687, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 259000 + }, + { + "epoch": 1.0012602248302949, + "grad_norm": 0.09468837082386017, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 259010 + }, + { + "epoch": 1.0012988820336781, + "grad_norm": 0.10041584819555283, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 259020 + }, + { + "epoch": 1.0013375392370614, + "grad_norm": 0.1317100077867508, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 259030 + }, + { + "epoch": 1.0013761964404446, + "grad_norm": 0.11795981228351593, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 259040 + }, + { + "epoch": 1.001414853643828, + "grad_norm": 0.09735662490129471, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 259050 + }, + { + "epoch": 1.0014535108472113, + "grad_norm": 0.12275316566228867, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 259060 + }, + { + "epoch": 1.0014921680505946, + "grad_norm": 0.10865990072488785, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 259070 + }, + { + "epoch": 1.0015308252539779, + "grad_norm": 0.1025635376572609, + "learning_rate": 0.002, + "loss": 2.354, + "step": 259080 + }, + { + "epoch": 1.001569482457361, + "grad_norm": 0.10392650216817856, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 259090 + }, + { + "epoch": 1.0016081396607444, + "grad_norm": 0.10056158155202866, + "learning_rate": 0.002, + "loss": 2.3131, + "step": 259100 + }, + { + "epoch": 1.0016467968641276, + "grad_norm": 0.10843063145875931, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 259110 + }, + { + "epoch": 1.0016854540675109, + "grad_norm": 0.09778635203838348, + "learning_rate": 0.002, + "loss": 2.337, + "step": 259120 + }, + { + "epoch": 1.0017241112708941, + "grad_norm": 0.09872616827487946, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 259130 + }, + { + "epoch": 1.0017627684742776, + "grad_norm": 0.09871741384267807, + "learning_rate": 0.002, + "loss": 2.337, + "step": 259140 + }, + { + "epoch": 1.0018014256776608, + "grad_norm": 0.10286332666873932, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 259150 + }, + { + "epoch": 1.001840082881044, + "grad_norm": 0.09847522526979446, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 259160 + }, + { + "epoch": 1.0018787400844273, + "grad_norm": 0.10867820680141449, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 259170 + }, + { + "epoch": 1.0019173972878106, + "grad_norm": 0.10226430743932724, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 259180 + }, + { + "epoch": 1.0019560544911938, + "grad_norm": 0.09926961362361908, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 259190 + }, + { + "epoch": 1.001994711694577, + "grad_norm": 0.10008293390274048, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 259200 + }, + { + "epoch": 1.0020333688979604, + "grad_norm": 0.10106901824474335, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 259210 + }, + { + "epoch": 1.0020720261013438, + "grad_norm": 0.1036130040884018, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 259220 + }, + { + "epoch": 1.002110683304727, + "grad_norm": 0.10225784033536911, + "learning_rate": 0.002, + "loss": 2.334, + "step": 259230 + }, + { + "epoch": 1.0021493405081103, + "grad_norm": 0.08691437542438507, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 259240 + }, + { + "epoch": 1.0021879977114936, + "grad_norm": 0.09674886614084244, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 259250 + }, + { + "epoch": 1.0022266549148768, + "grad_norm": 0.15050897002220154, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 259260 + }, + { + "epoch": 1.00226531211826, + "grad_norm": 0.17266719043254852, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 259270 + }, + { + "epoch": 1.0023039693216433, + "grad_norm": 0.09772799909114838, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 259280 + }, + { + "epoch": 1.0023426265250266, + "grad_norm": 0.09189768880605698, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 259290 + }, + { + "epoch": 1.0023812837284098, + "grad_norm": 0.10858861356973648, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 259300 + }, + { + "epoch": 1.0024199409317933, + "grad_norm": 0.12308477610349655, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 259310 + }, + { + "epoch": 1.0024585981351766, + "grad_norm": 0.14335885643959045, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 259320 + }, + { + "epoch": 1.0024972553385598, + "grad_norm": 0.09966814517974854, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 259330 + }, + { + "epoch": 1.002535912541943, + "grad_norm": 0.09803139418363571, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 259340 + }, + { + "epoch": 1.0025745697453263, + "grad_norm": 0.14151626825332642, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 259350 + }, + { + "epoch": 1.0026132269487096, + "grad_norm": 0.11047150939702988, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 259360 + }, + { + "epoch": 1.0026518841520928, + "grad_norm": 0.09904196858406067, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 259370 + }, + { + "epoch": 1.002690541355476, + "grad_norm": 0.10631304979324341, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 259380 + }, + { + "epoch": 1.0027291985588596, + "grad_norm": 0.1100512444972992, + "learning_rate": 0.002, + "loss": 2.328, + "step": 259390 + }, + { + "epoch": 1.0027678557622428, + "grad_norm": 0.11194992810487747, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 259400 + }, + { + "epoch": 1.002806512965626, + "grad_norm": 0.12639199197292328, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 259410 + }, + { + "epoch": 1.0028451701690093, + "grad_norm": 0.12143406271934509, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 259420 + }, + { + "epoch": 1.0028838273723926, + "grad_norm": 0.10168673098087311, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 259430 + }, + { + "epoch": 1.0029224845757758, + "grad_norm": 0.11748029291629791, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 259440 + }, + { + "epoch": 1.002961141779159, + "grad_norm": 0.10045807808637619, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 259450 + }, + { + "epoch": 1.0029997989825423, + "grad_norm": 0.09811057150363922, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 259460 + }, + { + "epoch": 1.0030384561859256, + "grad_norm": 0.11925587058067322, + "learning_rate": 0.002, + "loss": 2.333, + "step": 259470 + }, + { + "epoch": 1.003077113389309, + "grad_norm": 0.12439731508493423, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 259480 + }, + { + "epoch": 1.0031157705926923, + "grad_norm": 0.12637203931808472, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 259490 + }, + { + "epoch": 1.0031544277960756, + "grad_norm": 0.10005202889442444, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 259500 + }, + { + "epoch": 1.0031930849994588, + "grad_norm": 0.10126623511314392, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 259510 + }, + { + "epoch": 1.003231742202842, + "grad_norm": 0.12067006528377533, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 259520 + }, + { + "epoch": 1.0032703994062253, + "grad_norm": 0.11962800472974777, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 259530 + }, + { + "epoch": 1.0033090566096086, + "grad_norm": 0.09837370365858078, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 259540 + }, + { + "epoch": 1.0033477138129918, + "grad_norm": 0.11886280030012131, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 259550 + }, + { + "epoch": 1.0033863710163753, + "grad_norm": 0.09033774584531784, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 259560 + }, + { + "epoch": 1.0034250282197585, + "grad_norm": 0.10335791856050491, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 259570 + }, + { + "epoch": 1.0034636854231418, + "grad_norm": 0.09330695122480392, + "learning_rate": 0.002, + "loss": 2.338, + "step": 259580 + }, + { + "epoch": 1.003502342626525, + "grad_norm": 0.12462907284498215, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 259590 + }, + { + "epoch": 1.0035409998299083, + "grad_norm": 0.09722462296485901, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 259600 + }, + { + "epoch": 1.0035796570332916, + "grad_norm": 0.11706508696079254, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 259610 + }, + { + "epoch": 1.0036183142366748, + "grad_norm": 0.09267829358577728, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 259620 + }, + { + "epoch": 1.003656971440058, + "grad_norm": 0.08400312811136246, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 259630 + }, + { + "epoch": 1.0036956286434415, + "grad_norm": 0.1081552729010582, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 259640 + }, + { + "epoch": 1.0037342858468248, + "grad_norm": 0.10565217584371567, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 259650 + }, + { + "epoch": 1.003772943050208, + "grad_norm": 0.09283998608589172, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 259660 + }, + { + "epoch": 1.0038116002535913, + "grad_norm": 0.12594252824783325, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 259670 + }, + { + "epoch": 1.0038502574569745, + "grad_norm": 0.10351262241601944, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 259680 + }, + { + "epoch": 1.0038889146603578, + "grad_norm": 0.09758039563894272, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 259690 + }, + { + "epoch": 1.003927571863741, + "grad_norm": 0.11524217575788498, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 259700 + }, + { + "epoch": 1.0039662290671243, + "grad_norm": 0.08860163390636444, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 259710 + }, + { + "epoch": 1.0040048862705075, + "grad_norm": 0.09536821395158768, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 259720 + }, + { + "epoch": 1.004043543473891, + "grad_norm": 0.11455459147691727, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 259730 + }, + { + "epoch": 1.0040822006772743, + "grad_norm": 0.12582819163799286, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 259740 + }, + { + "epoch": 1.0041208578806575, + "grad_norm": 0.11551017314195633, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 259750 + }, + { + "epoch": 1.0041595150840408, + "grad_norm": 0.11985752731561661, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 259760 + }, + { + "epoch": 1.004198172287424, + "grad_norm": 0.09666941314935684, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 259770 + }, + { + "epoch": 1.0042368294908073, + "grad_norm": 0.11292658746242523, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 259780 + }, + { + "epoch": 1.0042754866941905, + "grad_norm": 0.10751032084226608, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 259790 + }, + { + "epoch": 1.0043141438975738, + "grad_norm": 0.09483782202005386, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 259800 + }, + { + "epoch": 1.0043528011009573, + "grad_norm": 0.10770095884799957, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 259810 + }, + { + "epoch": 1.0043914583043405, + "grad_norm": 0.10307003557682037, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 259820 + }, + { + "epoch": 1.0044301155077238, + "grad_norm": 0.1362147182226181, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 259830 + }, + { + "epoch": 1.004468772711107, + "grad_norm": 0.11033639311790466, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 259840 + }, + { + "epoch": 1.0045074299144903, + "grad_norm": 0.09604737907648087, + "learning_rate": 0.002, + "loss": 2.316, + "step": 259850 + }, + { + "epoch": 1.0045460871178735, + "grad_norm": 0.10238654911518097, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 259860 + }, + { + "epoch": 1.0045847443212568, + "grad_norm": 0.10481748729944229, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 259870 + }, + { + "epoch": 1.00462340152464, + "grad_norm": 0.11596349626779556, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 259880 + }, + { + "epoch": 1.0046620587280233, + "grad_norm": 0.13182884454727173, + "learning_rate": 0.002, + "loss": 2.344, + "step": 259890 + }, + { + "epoch": 1.0047007159314068, + "grad_norm": 0.11005352437496185, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 259900 + }, + { + "epoch": 1.00473937313479, + "grad_norm": 0.09401649236679077, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 259910 + }, + { + "epoch": 1.0047780303381733, + "grad_norm": 0.10460685193538666, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 259920 + }, + { + "epoch": 1.0048166875415565, + "grad_norm": 0.10680817812681198, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 259930 + }, + { + "epoch": 1.0048553447449398, + "grad_norm": 0.17173266410827637, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 259940 + }, + { + "epoch": 1.004894001948323, + "grad_norm": 0.08766689896583557, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 259950 + }, + { + "epoch": 1.0049326591517063, + "grad_norm": 0.11202475428581238, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 259960 + }, + { + "epoch": 1.0049713163550895, + "grad_norm": 0.10411037504673004, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 259970 + }, + { + "epoch": 1.005009973558473, + "grad_norm": 0.12385007739067078, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 259980 + }, + { + "epoch": 1.0050486307618562, + "grad_norm": 0.10399666428565979, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 259990 + }, + { + "epoch": 1.0050872879652395, + "grad_norm": 0.10041926056146622, + "learning_rate": 0.002, + "loss": 2.337, + "step": 260000 + }, + { + "epoch": 1.0051259451686227, + "grad_norm": 0.11362820863723755, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 260010 + }, + { + "epoch": 1.005164602372006, + "grad_norm": 0.09621165692806244, + "learning_rate": 0.002, + "loss": 2.343, + "step": 260020 + }, + { + "epoch": 1.0052032595753893, + "grad_norm": 0.09594116359949112, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 260030 + }, + { + "epoch": 1.0052419167787725, + "grad_norm": 0.12816180288791656, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 260040 + }, + { + "epoch": 1.0052805739821558, + "grad_norm": 0.09673391282558441, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 260050 + }, + { + "epoch": 1.005319231185539, + "grad_norm": 0.10709298402070999, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 260060 + }, + { + "epoch": 1.0053578883889225, + "grad_norm": 0.10160128772258759, + "learning_rate": 0.002, + "loss": 2.34, + "step": 260070 + }, + { + "epoch": 1.0053965455923057, + "grad_norm": 0.09211761504411697, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 260080 + }, + { + "epoch": 1.005435202795689, + "grad_norm": 0.10863563418388367, + "learning_rate": 0.002, + "loss": 2.348, + "step": 260090 + }, + { + "epoch": 1.0054738599990722, + "grad_norm": 0.1012507751584053, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 260100 + }, + { + "epoch": 1.0055125172024555, + "grad_norm": 0.10079700499773026, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 260110 + }, + { + "epoch": 1.0055511744058387, + "grad_norm": 0.11268491297960281, + "learning_rate": 0.002, + "loss": 2.331, + "step": 260120 + }, + { + "epoch": 1.005589831609222, + "grad_norm": 0.10046052187681198, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 260130 + }, + { + "epoch": 1.0056284888126052, + "grad_norm": 0.12722858786582947, + "learning_rate": 0.002, + "loss": 2.337, + "step": 260140 + }, + { + "epoch": 1.0056671460159887, + "grad_norm": 0.11442416161298752, + "learning_rate": 0.002, + "loss": 2.359, + "step": 260150 + }, + { + "epoch": 1.005705803219372, + "grad_norm": 0.09657229483127594, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 260160 + }, + { + "epoch": 1.0057444604227552, + "grad_norm": 0.11371700465679169, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 260170 + }, + { + "epoch": 1.0057831176261385, + "grad_norm": 0.11046253144741058, + "learning_rate": 0.002, + "loss": 2.3602, + "step": 260180 + }, + { + "epoch": 1.0058217748295217, + "grad_norm": 0.10766205191612244, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 260190 + }, + { + "epoch": 1.005860432032905, + "grad_norm": 0.1016693189740181, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 260200 + }, + { + "epoch": 1.0058990892362882, + "grad_norm": 0.11253286153078079, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 260210 + }, + { + "epoch": 1.0059377464396715, + "grad_norm": 0.1164514496922493, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 260220 + }, + { + "epoch": 1.0059764036430547, + "grad_norm": 0.1074240505695343, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 260230 + }, + { + "epoch": 1.0060150608464382, + "grad_norm": 0.09927522391080856, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 260240 + }, + { + "epoch": 1.0060537180498215, + "grad_norm": 0.11250677704811096, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 260250 + }, + { + "epoch": 1.0060923752532047, + "grad_norm": 0.1204577088356018, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 260260 + }, + { + "epoch": 1.006131032456588, + "grad_norm": 0.10001301020383835, + "learning_rate": 0.002, + "loss": 2.345, + "step": 260270 + }, + { + "epoch": 1.0061696896599712, + "grad_norm": 0.12086349725723267, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 260280 + }, + { + "epoch": 1.0062083468633545, + "grad_norm": 0.11192157119512558, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 260290 + }, + { + "epoch": 1.0062470040667377, + "grad_norm": 0.1001996248960495, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 260300 + }, + { + "epoch": 1.006285661270121, + "grad_norm": 0.10504887998104095, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 260310 + }, + { + "epoch": 1.0063243184735045, + "grad_norm": 0.1161704808473587, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 260320 + }, + { + "epoch": 1.0063629756768877, + "grad_norm": 0.10507213324308395, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 260330 + }, + { + "epoch": 1.006401632880271, + "grad_norm": 0.09793032705783844, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 260340 + }, + { + "epoch": 1.0064402900836542, + "grad_norm": 0.11236788332462311, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 260350 + }, + { + "epoch": 1.0064789472870375, + "grad_norm": 0.10597644746303558, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 260360 + }, + { + "epoch": 1.0065176044904207, + "grad_norm": 0.11315464228391647, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 260370 + }, + { + "epoch": 1.006556261693804, + "grad_norm": 0.1039297804236412, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 260380 + }, + { + "epoch": 1.0065949188971872, + "grad_norm": 0.09415727108716965, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 260390 + }, + { + "epoch": 1.0066335761005705, + "grad_norm": 0.11566637456417084, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 260400 + }, + { + "epoch": 1.006672233303954, + "grad_norm": 0.1017017811536789, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 260410 + }, + { + "epoch": 1.0067108905073372, + "grad_norm": 0.13970237970352173, + "learning_rate": 0.002, + "loss": 2.34, + "step": 260420 + }, + { + "epoch": 1.0067495477107204, + "grad_norm": 0.11136095970869064, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 260430 + }, + { + "epoch": 1.0067882049141037, + "grad_norm": 0.10176298767328262, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 260440 + }, + { + "epoch": 1.006826862117487, + "grad_norm": 0.08913923054933548, + "learning_rate": 0.002, + "loss": 2.336, + "step": 260450 + }, + { + "epoch": 1.0068655193208702, + "grad_norm": 0.10855156928300858, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 260460 + }, + { + "epoch": 1.0069041765242535, + "grad_norm": 0.11180077493190765, + "learning_rate": 0.002, + "loss": 2.338, + "step": 260470 + }, + { + "epoch": 1.0069428337276367, + "grad_norm": 0.10207480937242508, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 260480 + }, + { + "epoch": 1.0069814909310202, + "grad_norm": 0.08677595853805542, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 260490 + }, + { + "epoch": 1.0070201481344034, + "grad_norm": 0.13296319544315338, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 260500 + }, + { + "epoch": 1.0070588053377867, + "grad_norm": 0.08792676031589508, + "learning_rate": 0.002, + "loss": 2.34, + "step": 260510 + }, + { + "epoch": 1.00709746254117, + "grad_norm": 0.09438177943229675, + "learning_rate": 0.002, + "loss": 2.348, + "step": 260520 + }, + { + "epoch": 1.0071361197445532, + "grad_norm": 0.1061626598238945, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 260530 + }, + { + "epoch": 1.0071747769479364, + "grad_norm": 0.10752874612808228, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 260540 + }, + { + "epoch": 1.0072134341513197, + "grad_norm": 0.12163309752941132, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 260550 + }, + { + "epoch": 1.007252091354703, + "grad_norm": 0.13164447247982025, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 260560 + }, + { + "epoch": 1.0072907485580864, + "grad_norm": 0.09046389162540436, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 260570 + }, + { + "epoch": 1.0073294057614697, + "grad_norm": 0.10954193770885468, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 260580 + }, + { + "epoch": 1.007368062964853, + "grad_norm": 0.1164688989520073, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 260590 + }, + { + "epoch": 1.0074067201682362, + "grad_norm": 0.10950072854757309, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 260600 + }, + { + "epoch": 1.0074453773716194, + "grad_norm": 0.09672633558511734, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 260610 + }, + { + "epoch": 1.0074840345750027, + "grad_norm": 0.10479829460382462, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 260620 + }, + { + "epoch": 1.007522691778386, + "grad_norm": 0.09484247863292694, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 260630 + }, + { + "epoch": 1.0075613489817692, + "grad_norm": 0.09945641458034515, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 260640 + }, + { + "epoch": 1.0076000061851524, + "grad_norm": 0.11775811016559601, + "learning_rate": 0.002, + "loss": 2.331, + "step": 260650 + }, + { + "epoch": 1.007638663388536, + "grad_norm": 0.09762194007635117, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 260660 + }, + { + "epoch": 1.0076773205919192, + "grad_norm": 0.11502894759178162, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 260670 + }, + { + "epoch": 1.0077159777953024, + "grad_norm": 0.10429210960865021, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 260680 + }, + { + "epoch": 1.0077546349986857, + "grad_norm": 0.11102215945720673, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 260690 + }, + { + "epoch": 1.007793292202069, + "grad_norm": 0.10406568646430969, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 260700 + }, + { + "epoch": 1.0078319494054522, + "grad_norm": 0.0996653363108635, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 260710 + }, + { + "epoch": 1.0078706066088354, + "grad_norm": 0.10210791230201721, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 260720 + }, + { + "epoch": 1.0079092638122187, + "grad_norm": 0.10391468554735184, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 260730 + }, + { + "epoch": 1.0079479210156022, + "grad_norm": 0.09697168320417404, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 260740 + }, + { + "epoch": 1.0079865782189854, + "grad_norm": 0.10640589147806168, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 260750 + }, + { + "epoch": 1.0080252354223687, + "grad_norm": 0.12679150700569153, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 260760 + }, + { + "epoch": 1.008063892625752, + "grad_norm": 0.08968043327331543, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 260770 + }, + { + "epoch": 1.0081025498291352, + "grad_norm": 0.11260325461626053, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 260780 + }, + { + "epoch": 1.0081412070325184, + "grad_norm": 0.09950599074363708, + "learning_rate": 0.002, + "loss": 2.331, + "step": 260790 + }, + { + "epoch": 1.0081798642359017, + "grad_norm": 0.10537993907928467, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 260800 + }, + { + "epoch": 1.008218521439285, + "grad_norm": 0.10507483780384064, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 260810 + }, + { + "epoch": 1.0082571786426682, + "grad_norm": 0.09803906828165054, + "learning_rate": 0.002, + "loss": 2.345, + "step": 260820 + }, + { + "epoch": 1.0082958358460516, + "grad_norm": 0.10181978344917297, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 260830 + }, + { + "epoch": 1.008334493049435, + "grad_norm": 0.10805722326040268, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 260840 + }, + { + "epoch": 1.0083731502528182, + "grad_norm": 0.2941838204860687, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 260850 + }, + { + "epoch": 1.0084118074562014, + "grad_norm": 0.10703596472740173, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 260860 + }, + { + "epoch": 1.0084504646595847, + "grad_norm": 0.10350608825683594, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 260870 + }, + { + "epoch": 1.008489121862968, + "grad_norm": 0.09086059778928757, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 260880 + }, + { + "epoch": 1.0085277790663512, + "grad_norm": 0.09934677183628082, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 260890 + }, + { + "epoch": 1.0085664362697344, + "grad_norm": 0.10361968725919724, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 260900 + }, + { + "epoch": 1.0086050934731179, + "grad_norm": 0.10142628848552704, + "learning_rate": 0.002, + "loss": 2.345, + "step": 260910 + }, + { + "epoch": 1.0086437506765011, + "grad_norm": 0.10995934903621674, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 260920 + }, + { + "epoch": 1.0086824078798844, + "grad_norm": 0.10714113712310791, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 260930 + }, + { + "epoch": 1.0087210650832676, + "grad_norm": 0.09014647454023361, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 260940 + }, + { + "epoch": 1.008759722286651, + "grad_norm": 0.10555219650268555, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 260950 + }, + { + "epoch": 1.0087983794900341, + "grad_norm": 0.10361549258232117, + "learning_rate": 0.002, + "loss": 2.326, + "step": 260960 + }, + { + "epoch": 1.0088370366934174, + "grad_norm": 0.10561522841453552, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 260970 + }, + { + "epoch": 1.0088756938968007, + "grad_norm": 0.09323658794164658, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 260980 + }, + { + "epoch": 1.008914351100184, + "grad_norm": 0.11476288735866547, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 260990 + }, + { + "epoch": 1.0089530083035674, + "grad_norm": 0.11546418070793152, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 261000 + }, + { + "epoch": 1.0089916655069506, + "grad_norm": 0.10268153995275497, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 261010 + }, + { + "epoch": 1.0090303227103339, + "grad_norm": 0.10294366627931595, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 261020 + }, + { + "epoch": 1.0090689799137171, + "grad_norm": 0.09683190286159515, + "learning_rate": 0.002, + "loss": 2.344, + "step": 261030 + }, + { + "epoch": 1.0091076371171004, + "grad_norm": 0.098160021007061, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 261040 + }, + { + "epoch": 1.0091462943204836, + "grad_norm": 0.10156324505805969, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 261050 + }, + { + "epoch": 1.009184951523867, + "grad_norm": 0.10965844243764877, + "learning_rate": 0.002, + "loss": 2.343, + "step": 261060 + }, + { + "epoch": 1.0092236087272501, + "grad_norm": 0.1027294471859932, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 261070 + }, + { + "epoch": 1.0092622659306336, + "grad_norm": 0.0865500271320343, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 261080 + }, + { + "epoch": 1.0093009231340169, + "grad_norm": 0.13334614038467407, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 261090 + }, + { + "epoch": 1.0093395803374001, + "grad_norm": 0.11170141398906708, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 261100 + }, + { + "epoch": 1.0093782375407834, + "grad_norm": 0.1030377596616745, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 261110 + }, + { + "epoch": 1.0094168947441666, + "grad_norm": 0.11204218864440918, + "learning_rate": 0.002, + "loss": 2.344, + "step": 261120 + }, + { + "epoch": 1.0094555519475499, + "grad_norm": 0.10371985286474228, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 261130 + }, + { + "epoch": 1.0094942091509331, + "grad_norm": 0.10569999366998672, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 261140 + }, + { + "epoch": 1.0095328663543164, + "grad_norm": 0.10509838908910751, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 261150 + }, + { + "epoch": 1.0095715235576996, + "grad_norm": 0.11228816211223602, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 261160 + }, + { + "epoch": 1.009610180761083, + "grad_norm": 0.11870377510786057, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 261170 + }, + { + "epoch": 1.0096488379644664, + "grad_norm": 0.11041771620512009, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 261180 + }, + { + "epoch": 1.0096874951678496, + "grad_norm": 0.09817316383123398, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 261190 + }, + { + "epoch": 1.0097261523712329, + "grad_norm": 0.10386999696493149, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 261200 + }, + { + "epoch": 1.0097648095746161, + "grad_norm": 0.09739833325147629, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 261210 + }, + { + "epoch": 1.0098034667779994, + "grad_norm": 0.10959730297327042, + "learning_rate": 0.002, + "loss": 2.331, + "step": 261220 + }, + { + "epoch": 1.0098421239813826, + "grad_norm": 0.11688784509897232, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 261230 + }, + { + "epoch": 1.0098807811847659, + "grad_norm": 0.1169438585639, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 261240 + }, + { + "epoch": 1.0099194383881493, + "grad_norm": 0.1012745201587677, + "learning_rate": 0.002, + "loss": 2.33, + "step": 261250 + }, + { + "epoch": 1.0099580955915326, + "grad_norm": 0.09531766176223755, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 261260 + }, + { + "epoch": 1.0099967527949159, + "grad_norm": 0.10129254311323166, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 261270 + }, + { + "epoch": 1.010035409998299, + "grad_norm": 0.10263433307409286, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 261280 + }, + { + "epoch": 1.0100740672016824, + "grad_norm": 0.09622534364461899, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 261290 + }, + { + "epoch": 1.0101127244050656, + "grad_norm": 0.10295294225215912, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 261300 + }, + { + "epoch": 1.0101513816084489, + "grad_norm": 0.0946008712053299, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 261310 + }, + { + "epoch": 1.0101900388118321, + "grad_norm": 0.10893122851848602, + "learning_rate": 0.002, + "loss": 2.338, + "step": 261320 + }, + { + "epoch": 1.0102286960152154, + "grad_norm": 0.09725047647953033, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 261330 + }, + { + "epoch": 1.0102673532185988, + "grad_norm": 0.10373464226722717, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 261340 + }, + { + "epoch": 1.010306010421982, + "grad_norm": 0.4837754964828491, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 261350 + }, + { + "epoch": 1.0103446676253653, + "grad_norm": 0.14557918906211853, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 261360 + }, + { + "epoch": 1.0103833248287486, + "grad_norm": 0.13766615092754364, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 261370 + }, + { + "epoch": 1.0104219820321318, + "grad_norm": 0.09187277406454086, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 261380 + }, + { + "epoch": 1.010460639235515, + "grad_norm": 0.10806089639663696, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 261390 + }, + { + "epoch": 1.0104992964388984, + "grad_norm": 0.1004478856921196, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 261400 + }, + { + "epoch": 1.0105379536422816, + "grad_norm": 0.1010182648897171, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 261410 + }, + { + "epoch": 1.010576610845665, + "grad_norm": 0.2028651237487793, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 261420 + }, + { + "epoch": 1.0106152680490483, + "grad_norm": 0.090111143887043, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 261430 + }, + { + "epoch": 1.0106539252524316, + "grad_norm": 0.10565896332263947, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 261440 + }, + { + "epoch": 1.0106925824558148, + "grad_norm": 0.1016477420926094, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 261450 + }, + { + "epoch": 1.010731239659198, + "grad_norm": 0.09837300330400467, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 261460 + }, + { + "epoch": 1.0107698968625813, + "grad_norm": 0.10926554352045059, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 261470 + }, + { + "epoch": 1.0108085540659646, + "grad_norm": 0.10980924963951111, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 261480 + }, + { + "epoch": 1.0108472112693478, + "grad_norm": 0.10417355597019196, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 261490 + }, + { + "epoch": 1.010885868472731, + "grad_norm": 0.09010007232427597, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 261500 + }, + { + "epoch": 1.0109245256761146, + "grad_norm": 0.14487899839878082, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 261510 + }, + { + "epoch": 1.0109631828794978, + "grad_norm": 0.10193762928247452, + "learning_rate": 0.002, + "loss": 2.35, + "step": 261520 + }, + { + "epoch": 1.011001840082881, + "grad_norm": 0.09536083042621613, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 261530 + }, + { + "epoch": 1.0110404972862643, + "grad_norm": 0.10808293521404266, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 261540 + }, + { + "epoch": 1.0110791544896476, + "grad_norm": 0.10147858411073685, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 261550 + }, + { + "epoch": 1.0111178116930308, + "grad_norm": 0.14660899341106415, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 261560 + }, + { + "epoch": 1.011156468896414, + "grad_norm": 0.09802790731191635, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 261570 + }, + { + "epoch": 1.0111951260997973, + "grad_norm": 0.09951503574848175, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 261580 + }, + { + "epoch": 1.0112337833031808, + "grad_norm": 0.12643538415431976, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 261590 + }, + { + "epoch": 1.011272440506564, + "grad_norm": 0.1129726767539978, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 261600 + }, + { + "epoch": 1.0113110977099473, + "grad_norm": 0.0968206450343132, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 261610 + }, + { + "epoch": 1.0113497549133306, + "grad_norm": 0.11443905532360077, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 261620 + }, + { + "epoch": 1.0113884121167138, + "grad_norm": 0.12192221730947495, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 261630 + }, + { + "epoch": 1.011427069320097, + "grad_norm": 0.10034405440092087, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 261640 + }, + { + "epoch": 1.0114657265234803, + "grad_norm": 0.10083135962486267, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 261650 + }, + { + "epoch": 1.0115043837268636, + "grad_norm": 0.12016676366329193, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 261660 + }, + { + "epoch": 1.011543040930247, + "grad_norm": 0.09202968329191208, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 261670 + }, + { + "epoch": 1.0115816981336303, + "grad_norm": 0.09863679111003876, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 261680 + }, + { + "epoch": 1.0116203553370136, + "grad_norm": 0.11649753898382187, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 261690 + }, + { + "epoch": 1.0116590125403968, + "grad_norm": 0.10803171992301941, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 261700 + }, + { + "epoch": 1.01169766974378, + "grad_norm": 0.1016639918088913, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 261710 + }, + { + "epoch": 1.0117363269471633, + "grad_norm": 0.0989474281668663, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 261720 + }, + { + "epoch": 1.0117749841505466, + "grad_norm": 0.09995424002408981, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 261730 + }, + { + "epoch": 1.0118136413539298, + "grad_norm": 0.10237865895032883, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 261740 + }, + { + "epoch": 1.011852298557313, + "grad_norm": 0.10621260851621628, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 261750 + }, + { + "epoch": 1.0118909557606965, + "grad_norm": 0.1114802435040474, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 261760 + }, + { + "epoch": 1.0119296129640798, + "grad_norm": 0.1145864799618721, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 261770 + }, + { + "epoch": 1.011968270167463, + "grad_norm": 0.10577882826328278, + "learning_rate": 0.002, + "loss": 2.349, + "step": 261780 + }, + { + "epoch": 1.0120069273708463, + "grad_norm": 0.10166753083467484, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 261790 + }, + { + "epoch": 1.0120455845742296, + "grad_norm": 0.1139196902513504, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 261800 + }, + { + "epoch": 1.0120842417776128, + "grad_norm": 0.0951429158449173, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 261810 + }, + { + "epoch": 1.012122898980996, + "grad_norm": 0.19654862582683563, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 261820 + }, + { + "epoch": 1.0121615561843793, + "grad_norm": 0.09855328500270844, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 261830 + }, + { + "epoch": 1.0122002133877628, + "grad_norm": 0.10782697051763535, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 261840 + }, + { + "epoch": 1.012238870591146, + "grad_norm": 0.11967076361179352, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 261850 + }, + { + "epoch": 1.0122775277945293, + "grad_norm": 0.089723140001297, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 261860 + }, + { + "epoch": 1.0123161849979125, + "grad_norm": 0.09120474755764008, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 261870 + }, + { + "epoch": 1.0123548422012958, + "grad_norm": 0.09240848571062088, + "learning_rate": 0.002, + "loss": 2.326, + "step": 261880 + }, + { + "epoch": 1.012393499404679, + "grad_norm": 0.10871592164039612, + "learning_rate": 0.002, + "loss": 2.342, + "step": 261890 + }, + { + "epoch": 1.0124321566080623, + "grad_norm": 0.11382023990154266, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 261900 + }, + { + "epoch": 1.0124708138114455, + "grad_norm": 0.1023283526301384, + "learning_rate": 0.002, + "loss": 2.338, + "step": 261910 + }, + { + "epoch": 1.0125094710148288, + "grad_norm": 0.1051417887210846, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 261920 + }, + { + "epoch": 1.0125481282182123, + "grad_norm": 0.1110028624534607, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 261930 + }, + { + "epoch": 1.0125867854215955, + "grad_norm": 0.09721856564283371, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 261940 + }, + { + "epoch": 1.0126254426249788, + "grad_norm": 0.1076245903968811, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 261950 + }, + { + "epoch": 1.012664099828362, + "grad_norm": 0.09990669041872025, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 261960 + }, + { + "epoch": 1.0127027570317453, + "grad_norm": 0.14250290393829346, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 261970 + }, + { + "epoch": 1.0127414142351285, + "grad_norm": 0.10246880352497101, + "learning_rate": 0.002, + "loss": 2.327, + "step": 261980 + }, + { + "epoch": 1.0127800714385118, + "grad_norm": 0.09257150441408157, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 261990 + }, + { + "epoch": 1.012818728641895, + "grad_norm": 0.10628538578748703, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 262000 + }, + { + "epoch": 1.0128573858452785, + "grad_norm": 0.08506090193986893, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 262010 + }, + { + "epoch": 1.0128960430486618, + "grad_norm": 0.1248772144317627, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 262020 + }, + { + "epoch": 1.012934700252045, + "grad_norm": 0.10065978765487671, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 262030 + }, + { + "epoch": 1.0129733574554283, + "grad_norm": 0.09866821765899658, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 262040 + }, + { + "epoch": 1.0130120146588115, + "grad_norm": 0.10731372237205505, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 262050 + }, + { + "epoch": 1.0130506718621948, + "grad_norm": 0.10754463076591492, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 262060 + }, + { + "epoch": 1.013089329065578, + "grad_norm": 0.09564726054668427, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 262070 + }, + { + "epoch": 1.0131279862689613, + "grad_norm": 0.10373189300298691, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 262080 + }, + { + "epoch": 1.0131666434723445, + "grad_norm": 0.09389154613018036, + "learning_rate": 0.002, + "loss": 2.34, + "step": 262090 + }, + { + "epoch": 1.013205300675728, + "grad_norm": 0.1045018807053566, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 262100 + }, + { + "epoch": 1.0132439578791113, + "grad_norm": 0.10585897415876389, + "learning_rate": 0.002, + "loss": 2.341, + "step": 262110 + }, + { + "epoch": 1.0132826150824945, + "grad_norm": 0.09949793666601181, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 262120 + }, + { + "epoch": 1.0133212722858778, + "grad_norm": 0.14061234891414642, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 262130 + }, + { + "epoch": 1.013359929489261, + "grad_norm": 0.0932084396481514, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 262140 + }, + { + "epoch": 1.0133985866926443, + "grad_norm": 0.12497919052839279, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 262150 + }, + { + "epoch": 1.0134372438960275, + "grad_norm": 0.11568643897771835, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 262160 + }, + { + "epoch": 1.0134759010994108, + "grad_norm": 0.10818599164485931, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 262170 + }, + { + "epoch": 1.0135145583027942, + "grad_norm": 0.09462061524391174, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 262180 + }, + { + "epoch": 1.0135532155061775, + "grad_norm": 0.1294683963060379, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 262190 + }, + { + "epoch": 1.0135918727095607, + "grad_norm": 0.0871722474694252, + "learning_rate": 0.002, + "loss": 2.328, + "step": 262200 + }, + { + "epoch": 1.013630529912944, + "grad_norm": 0.10529077798128128, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 262210 + }, + { + "epoch": 1.0136691871163273, + "grad_norm": 0.10764256864786148, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 262220 + }, + { + "epoch": 1.0137078443197105, + "grad_norm": 0.11791141331195831, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 262230 + }, + { + "epoch": 1.0137465015230938, + "grad_norm": 0.13781200349330902, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 262240 + }, + { + "epoch": 1.013785158726477, + "grad_norm": 0.13462291657924652, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 262250 + }, + { + "epoch": 1.0138238159298603, + "grad_norm": 0.0965033695101738, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 262260 + }, + { + "epoch": 1.0138624731332437, + "grad_norm": 0.08449462056159973, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 262270 + }, + { + "epoch": 1.013901130336627, + "grad_norm": 0.10011344403028488, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 262280 + }, + { + "epoch": 1.0139397875400102, + "grad_norm": 0.1139802634716034, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 262290 + }, + { + "epoch": 1.0139784447433935, + "grad_norm": 0.11183168739080429, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 262300 + }, + { + "epoch": 1.0140171019467767, + "grad_norm": 0.09366361796855927, + "learning_rate": 0.002, + "loss": 2.345, + "step": 262310 + }, + { + "epoch": 1.01405575915016, + "grad_norm": 0.11254223436117172, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 262320 + }, + { + "epoch": 1.0140944163535432, + "grad_norm": 0.10272639989852905, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 262330 + }, + { + "epoch": 1.0141330735569265, + "grad_norm": 0.11101660132408142, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 262340 + }, + { + "epoch": 1.01417173076031, + "grad_norm": 0.1064949706196785, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 262350 + }, + { + "epoch": 1.0142103879636932, + "grad_norm": 0.12453845888376236, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 262360 + }, + { + "epoch": 1.0142490451670765, + "grad_norm": 0.129965141415596, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 262370 + }, + { + "epoch": 1.0142877023704597, + "grad_norm": 0.10267557948827744, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 262380 + }, + { + "epoch": 1.014326359573843, + "grad_norm": 0.10027313977479935, + "learning_rate": 0.002, + "loss": 2.353, + "step": 262390 + }, + { + "epoch": 1.0143650167772262, + "grad_norm": 0.09122640639543533, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 262400 + }, + { + "epoch": 1.0144036739806095, + "grad_norm": 0.11867644637823105, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 262410 + }, + { + "epoch": 1.0144423311839927, + "grad_norm": 0.11216876655817032, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 262420 + }, + { + "epoch": 1.0144809883873762, + "grad_norm": 0.10928855091333389, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 262430 + }, + { + "epoch": 1.0145196455907595, + "grad_norm": 0.09858279675245285, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 262440 + }, + { + "epoch": 1.0145583027941427, + "grad_norm": 0.09995760768651962, + "learning_rate": 0.002, + "loss": 2.333, + "step": 262450 + }, + { + "epoch": 1.014596959997526, + "grad_norm": 0.12695086002349854, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 262460 + }, + { + "epoch": 1.0146356172009092, + "grad_norm": 0.08780010044574738, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 262470 + }, + { + "epoch": 1.0146742744042925, + "grad_norm": 0.10888690501451492, + "learning_rate": 0.002, + "loss": 2.335, + "step": 262480 + }, + { + "epoch": 1.0147129316076757, + "grad_norm": 0.0955919399857521, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 262490 + }, + { + "epoch": 1.014751588811059, + "grad_norm": 0.10459954291582108, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 262500 + }, + { + "epoch": 1.0147902460144422, + "grad_norm": 0.10299643874168396, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 262510 + }, + { + "epoch": 1.0148289032178257, + "grad_norm": 0.10566985607147217, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 262520 + }, + { + "epoch": 1.014867560421209, + "grad_norm": 0.10248414427042007, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 262530 + }, + { + "epoch": 1.0149062176245922, + "grad_norm": 0.09281566739082336, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 262540 + }, + { + "epoch": 1.0149448748279755, + "grad_norm": 0.10866890102624893, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 262550 + }, + { + "epoch": 1.0149835320313587, + "grad_norm": 0.10817757993936539, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 262560 + }, + { + "epoch": 1.015022189234742, + "grad_norm": 0.10576806217432022, + "learning_rate": 0.002, + "loss": 2.346, + "step": 262570 + }, + { + "epoch": 1.0150608464381252, + "grad_norm": 0.10843439400196075, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 262580 + }, + { + "epoch": 1.0150995036415085, + "grad_norm": 0.09107603132724762, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 262590 + }, + { + "epoch": 1.015138160844892, + "grad_norm": 0.10346271842718124, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 262600 + }, + { + "epoch": 1.0151768180482752, + "grad_norm": 0.10407814383506775, + "learning_rate": 0.002, + "loss": 2.337, + "step": 262610 + }, + { + "epoch": 1.0152154752516585, + "grad_norm": 0.09561081975698471, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 262620 + }, + { + "epoch": 1.0152541324550417, + "grad_norm": 0.09386394917964935, + "learning_rate": 0.002, + "loss": 2.341, + "step": 262630 + }, + { + "epoch": 1.015292789658425, + "grad_norm": 0.13108910620212555, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 262640 + }, + { + "epoch": 1.0153314468618082, + "grad_norm": 0.1079326868057251, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 262650 + }, + { + "epoch": 1.0153701040651915, + "grad_norm": 0.12318453937768936, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 262660 + }, + { + "epoch": 1.0154087612685747, + "grad_norm": 0.10267659276723862, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 262670 + }, + { + "epoch": 1.015447418471958, + "grad_norm": 0.10497715324163437, + "learning_rate": 0.002, + "loss": 2.358, + "step": 262680 + }, + { + "epoch": 1.0154860756753414, + "grad_norm": 0.1006128340959549, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 262690 + }, + { + "epoch": 1.0155247328787247, + "grad_norm": 0.10568059980869293, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 262700 + }, + { + "epoch": 1.015563390082108, + "grad_norm": 0.09955970197916031, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 262710 + }, + { + "epoch": 1.0156020472854912, + "grad_norm": 0.10910652577877045, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 262720 + }, + { + "epoch": 1.0156407044888744, + "grad_norm": 0.11949540674686432, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 262730 + }, + { + "epoch": 1.0156793616922577, + "grad_norm": 0.1253557801246643, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 262740 + }, + { + "epoch": 1.015718018895641, + "grad_norm": 0.11321611702442169, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 262750 + }, + { + "epoch": 1.0157566760990242, + "grad_norm": 0.0908082127571106, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 262760 + }, + { + "epoch": 1.0157953333024077, + "grad_norm": 0.1201312318444252, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 262770 + }, + { + "epoch": 1.015833990505791, + "grad_norm": 0.10379394143819809, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 262780 + }, + { + "epoch": 1.0158726477091742, + "grad_norm": 0.10312970727682114, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 262790 + }, + { + "epoch": 1.0159113049125574, + "grad_norm": 0.1026928648352623, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 262800 + }, + { + "epoch": 1.0159499621159407, + "grad_norm": 0.10575632005929947, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 262810 + }, + { + "epoch": 1.015988619319324, + "grad_norm": 0.11193332076072693, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 262820 + }, + { + "epoch": 1.0160272765227072, + "grad_norm": 0.11323326081037521, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 262830 + }, + { + "epoch": 1.0160659337260904, + "grad_norm": 0.13184423744678497, + "learning_rate": 0.002, + "loss": 2.3588, + "step": 262840 + }, + { + "epoch": 1.0161045909294737, + "grad_norm": 0.10372630506753922, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 262850 + }, + { + "epoch": 1.0161432481328572, + "grad_norm": 0.08913901448249817, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 262860 + }, + { + "epoch": 1.0161819053362404, + "grad_norm": 0.10794923454523087, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 262870 + }, + { + "epoch": 1.0162205625396237, + "grad_norm": 0.09986615180969238, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 262880 + }, + { + "epoch": 1.016259219743007, + "grad_norm": 0.10644876956939697, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 262890 + }, + { + "epoch": 1.0162978769463902, + "grad_norm": 0.11530308425426483, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 262900 + }, + { + "epoch": 1.0163365341497734, + "grad_norm": 0.1062793955206871, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 262910 + }, + { + "epoch": 1.0163751913531567, + "grad_norm": 0.10342157632112503, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 262920 + }, + { + "epoch": 1.01641384855654, + "grad_norm": 0.09291832149028778, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 262930 + }, + { + "epoch": 1.0164525057599234, + "grad_norm": 0.09074714034795761, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 262940 + }, + { + "epoch": 1.0164911629633067, + "grad_norm": 0.1163024976849556, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 262950 + }, + { + "epoch": 1.01652982016669, + "grad_norm": 0.12023130804300308, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 262960 + }, + { + "epoch": 1.0165684773700732, + "grad_norm": 0.12284992635250092, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 262970 + }, + { + "epoch": 1.0166071345734564, + "grad_norm": 0.09862682223320007, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 262980 + }, + { + "epoch": 1.0166457917768397, + "grad_norm": 0.09955253452062607, + "learning_rate": 0.002, + "loss": 2.341, + "step": 262990 + }, + { + "epoch": 1.016684448980223, + "grad_norm": 0.1211884468793869, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 263000 + }, + { + "epoch": 1.0167231061836062, + "grad_norm": 0.11086461693048477, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 263010 + }, + { + "epoch": 1.0167617633869894, + "grad_norm": 0.09472446143627167, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 263020 + }, + { + "epoch": 1.016800420590373, + "grad_norm": 0.11582635343074799, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 263030 + }, + { + "epoch": 1.0168390777937562, + "grad_norm": 0.09361955523490906, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 263040 + }, + { + "epoch": 1.0168777349971394, + "grad_norm": 0.11129572242498398, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 263050 + }, + { + "epoch": 1.0169163922005227, + "grad_norm": 0.10749483853578568, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 263060 + }, + { + "epoch": 1.016955049403906, + "grad_norm": 0.10947995632886887, + "learning_rate": 0.002, + "loss": 2.339, + "step": 263070 + }, + { + "epoch": 1.0169937066072892, + "grad_norm": 0.09116199612617493, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 263080 + }, + { + "epoch": 1.0170323638106724, + "grad_norm": 0.11516207456588745, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 263090 + }, + { + "epoch": 1.0170710210140557, + "grad_norm": 0.10246528685092926, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 263100 + }, + { + "epoch": 1.0171096782174391, + "grad_norm": 0.10460419207811356, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 263110 + }, + { + "epoch": 1.0171483354208224, + "grad_norm": 0.10269016772508621, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 263120 + }, + { + "epoch": 1.0171869926242056, + "grad_norm": 0.0924811139702797, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 263130 + }, + { + "epoch": 1.017225649827589, + "grad_norm": 0.10778999328613281, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 263140 + }, + { + "epoch": 1.0172643070309721, + "grad_norm": 0.10321035981178284, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 263150 + }, + { + "epoch": 1.0173029642343554, + "grad_norm": 0.10112766176462173, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 263160 + }, + { + "epoch": 1.0173416214377387, + "grad_norm": 0.08981072902679443, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 263170 + }, + { + "epoch": 1.017380278641122, + "grad_norm": 0.1108328253030777, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 263180 + }, + { + "epoch": 1.0174189358445052, + "grad_norm": 0.17304444313049316, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 263190 + }, + { + "epoch": 1.0174575930478886, + "grad_norm": 0.13185708224773407, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 263200 + }, + { + "epoch": 1.0174962502512719, + "grad_norm": 0.12104866653680801, + "learning_rate": 0.002, + "loss": 2.334, + "step": 263210 + }, + { + "epoch": 1.0175349074546551, + "grad_norm": 0.10302011668682098, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 263220 + }, + { + "epoch": 1.0175735646580384, + "grad_norm": 0.09544021636247635, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 263230 + }, + { + "epoch": 1.0176122218614216, + "grad_norm": 0.10438983142375946, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 263240 + }, + { + "epoch": 1.017650879064805, + "grad_norm": 0.341610312461853, + "learning_rate": 0.002, + "loss": 2.343, + "step": 263250 + }, + { + "epoch": 1.0176895362681881, + "grad_norm": 0.2851675748825073, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 263260 + }, + { + "epoch": 1.0177281934715714, + "grad_norm": 0.40568825602531433, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 263270 + }, + { + "epoch": 1.0177668506749549, + "grad_norm": 0.10089606791734695, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 263280 + }, + { + "epoch": 1.0178055078783381, + "grad_norm": 0.10405469685792923, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 263290 + }, + { + "epoch": 1.0178441650817214, + "grad_norm": 0.1027701199054718, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 263300 + }, + { + "epoch": 1.0178828222851046, + "grad_norm": 0.09409894049167633, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 263310 + }, + { + "epoch": 1.0179214794884879, + "grad_norm": 0.12788711488246918, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 263320 + }, + { + "epoch": 1.0179601366918711, + "grad_norm": 0.12433791160583496, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 263330 + }, + { + "epoch": 1.0179987938952544, + "grad_norm": 0.0981535017490387, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 263340 + }, + { + "epoch": 1.0180374510986376, + "grad_norm": 0.1006656065583229, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 263350 + }, + { + "epoch": 1.0180761083020209, + "grad_norm": 0.12272592633962631, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 263360 + }, + { + "epoch": 1.0181147655054044, + "grad_norm": 0.10499448329210281, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 263370 + }, + { + "epoch": 1.0181534227087876, + "grad_norm": 0.09188871830701828, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 263380 + }, + { + "epoch": 1.0181920799121709, + "grad_norm": 0.10030457377433777, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 263390 + }, + { + "epoch": 1.0182307371155541, + "grad_norm": 0.11879902333021164, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 263400 + }, + { + "epoch": 1.0182693943189374, + "grad_norm": 0.09785284847021103, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 263410 + }, + { + "epoch": 1.0183080515223206, + "grad_norm": 0.10970498621463776, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 263420 + }, + { + "epoch": 1.0183467087257039, + "grad_norm": 0.10424668341875076, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 263430 + }, + { + "epoch": 1.0183853659290871, + "grad_norm": 0.09910289198160172, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 263440 + }, + { + "epoch": 1.0184240231324706, + "grad_norm": 0.09556056559085846, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 263450 + }, + { + "epoch": 1.0184626803358539, + "grad_norm": 0.09163806587457657, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 263460 + }, + { + "epoch": 1.018501337539237, + "grad_norm": 0.11754380911588669, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 263470 + }, + { + "epoch": 1.0185399947426204, + "grad_norm": 0.10090960562229156, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 263480 + }, + { + "epoch": 1.0185786519460036, + "grad_norm": 0.10696564614772797, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 263490 + }, + { + "epoch": 1.0186173091493869, + "grad_norm": 0.11893752962350845, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 263500 + }, + { + "epoch": 1.0186559663527701, + "grad_norm": 0.12111511826515198, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 263510 + }, + { + "epoch": 1.0186946235561534, + "grad_norm": 0.10253813862800598, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 263520 + }, + { + "epoch": 1.0187332807595368, + "grad_norm": 0.09811771661043167, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 263530 + }, + { + "epoch": 1.01877193796292, + "grad_norm": 0.09001462906599045, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 263540 + }, + { + "epoch": 1.0188105951663033, + "grad_norm": 0.11998558044433594, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 263550 + }, + { + "epoch": 1.0188492523696866, + "grad_norm": 0.11388974636793137, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 263560 + }, + { + "epoch": 1.0188879095730698, + "grad_norm": 0.10066687315702438, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 263570 + }, + { + "epoch": 1.018926566776453, + "grad_norm": 0.09237883985042572, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 263580 + }, + { + "epoch": 1.0189652239798364, + "grad_norm": 0.1033594161272049, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 263590 + }, + { + "epoch": 1.0190038811832196, + "grad_norm": 0.12771809101104736, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 263600 + }, + { + "epoch": 1.0190425383866029, + "grad_norm": 0.10033108294010162, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 263610 + }, + { + "epoch": 1.0190811955899863, + "grad_norm": 0.09493099898099899, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 263620 + }, + { + "epoch": 1.0191198527933696, + "grad_norm": 0.10437005013227463, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 263630 + }, + { + "epoch": 1.0191585099967528, + "grad_norm": 0.10589440166950226, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 263640 + }, + { + "epoch": 1.019197167200136, + "grad_norm": 0.12936919927597046, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 263650 + }, + { + "epoch": 1.0192358244035193, + "grad_norm": 0.09363681823015213, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 263660 + }, + { + "epoch": 1.0192744816069026, + "grad_norm": 0.122968889772892, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 263670 + }, + { + "epoch": 1.0193131388102858, + "grad_norm": 0.09942426532506943, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 263680 + }, + { + "epoch": 1.019351796013669, + "grad_norm": 0.09376782178878784, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 263690 + }, + { + "epoch": 1.0193904532170526, + "grad_norm": 0.08810889720916748, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 263700 + }, + { + "epoch": 1.0194291104204358, + "grad_norm": 0.11189023405313492, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 263710 + }, + { + "epoch": 1.019467767623819, + "grad_norm": 0.10335662215948105, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 263720 + }, + { + "epoch": 1.0195064248272023, + "grad_norm": 0.10368465632200241, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 263730 + }, + { + "epoch": 1.0195450820305856, + "grad_norm": 0.10068806260824203, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 263740 + }, + { + "epoch": 1.0195837392339688, + "grad_norm": 0.10646512359380722, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 263750 + }, + { + "epoch": 1.019622396437352, + "grad_norm": 0.09691212326288223, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 263760 + }, + { + "epoch": 1.0196610536407353, + "grad_norm": 0.11735524982213974, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 263770 + }, + { + "epoch": 1.0196997108441186, + "grad_norm": 0.09249353408813477, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 263780 + }, + { + "epoch": 1.019738368047502, + "grad_norm": 0.10997557640075684, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 263790 + }, + { + "epoch": 1.0197770252508853, + "grad_norm": 0.11306982487440109, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 263800 + }, + { + "epoch": 1.0198156824542686, + "grad_norm": 0.11018241196870804, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 263810 + }, + { + "epoch": 1.0198543396576518, + "grad_norm": 0.12374841421842575, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 263820 + }, + { + "epoch": 1.019892996861035, + "grad_norm": 0.10849356651306152, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 263830 + }, + { + "epoch": 1.0199316540644183, + "grad_norm": 0.09676603972911835, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 263840 + }, + { + "epoch": 1.0199703112678016, + "grad_norm": 0.11422029882669449, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 263850 + }, + { + "epoch": 1.0200089684711848, + "grad_norm": 0.10637599229812622, + "learning_rate": 0.002, + "loss": 2.334, + "step": 263860 + }, + { + "epoch": 1.0200476256745683, + "grad_norm": 0.10892908275127411, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 263870 + }, + { + "epoch": 1.0200862828779516, + "grad_norm": 0.0975891724228859, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 263880 + }, + { + "epoch": 1.0201249400813348, + "grad_norm": 0.10563914477825165, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 263890 + }, + { + "epoch": 1.020163597284718, + "grad_norm": 0.13005097210407257, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 263900 + }, + { + "epoch": 1.0202022544881013, + "grad_norm": 0.10534238070249557, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 263910 + }, + { + "epoch": 1.0202409116914846, + "grad_norm": 0.11824945360422134, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 263920 + }, + { + "epoch": 1.0202795688948678, + "grad_norm": 0.1261029839515686, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 263930 + }, + { + "epoch": 1.020318226098251, + "grad_norm": 0.10610140115022659, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 263940 + }, + { + "epoch": 1.0203568833016343, + "grad_norm": 0.09728217124938965, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 263950 + }, + { + "epoch": 1.0203955405050178, + "grad_norm": 0.11157053709030151, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 263960 + }, + { + "epoch": 1.020434197708401, + "grad_norm": 0.10309412330389023, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 263970 + }, + { + "epoch": 1.0204728549117843, + "grad_norm": 0.1188223734498024, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 263980 + }, + { + "epoch": 1.0205115121151676, + "grad_norm": 0.10410062223672867, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 263990 + }, + { + "epoch": 1.0205501693185508, + "grad_norm": 0.12492463737726212, + "learning_rate": 0.002, + "loss": 2.345, + "step": 264000 + }, + { + "epoch": 1.020588826521934, + "grad_norm": 0.12025908380746841, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 264010 + }, + { + "epoch": 1.0206274837253173, + "grad_norm": 0.10721295326948166, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 264020 + }, + { + "epoch": 1.0206661409287006, + "grad_norm": 0.0982886552810669, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 264030 + }, + { + "epoch": 1.020704798132084, + "grad_norm": 0.091331847012043, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 264040 + }, + { + "epoch": 1.0207434553354673, + "grad_norm": 0.08860914409160614, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 264050 + }, + { + "epoch": 1.0207821125388505, + "grad_norm": 0.10097967088222504, + "learning_rate": 0.002, + "loss": 2.3142, + "step": 264060 + }, + { + "epoch": 1.0208207697422338, + "grad_norm": 0.09905195981264114, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 264070 + }, + { + "epoch": 1.020859426945617, + "grad_norm": 0.09901610761880875, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 264080 + }, + { + "epoch": 1.0208980841490003, + "grad_norm": 0.11977200210094452, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 264090 + }, + { + "epoch": 1.0209367413523835, + "grad_norm": 0.10924248397350311, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 264100 + }, + { + "epoch": 1.0209753985557668, + "grad_norm": 0.12424816191196442, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 264110 + }, + { + "epoch": 1.02101405575915, + "grad_norm": 0.10222361236810684, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 264120 + }, + { + "epoch": 1.0210527129625335, + "grad_norm": 0.09633691608905792, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 264130 + }, + { + "epoch": 1.0210913701659168, + "grad_norm": 0.1274564415216446, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 264140 + }, + { + "epoch": 1.0211300273693, + "grad_norm": 0.09376756846904755, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 264150 + }, + { + "epoch": 1.0211686845726833, + "grad_norm": 0.10471328347921371, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 264160 + }, + { + "epoch": 1.0212073417760665, + "grad_norm": 0.1049564778804779, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 264170 + }, + { + "epoch": 1.0212459989794498, + "grad_norm": 0.12720157206058502, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 264180 + }, + { + "epoch": 1.021284656182833, + "grad_norm": 0.10102929174900055, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 264190 + }, + { + "epoch": 1.0213233133862163, + "grad_norm": 0.09525764733552933, + "learning_rate": 0.002, + "loss": 2.33, + "step": 264200 + }, + { + "epoch": 1.0213619705895998, + "grad_norm": 0.10542795062065125, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 264210 + }, + { + "epoch": 1.021400627792983, + "grad_norm": 0.09321821480989456, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 264220 + }, + { + "epoch": 1.0214392849963663, + "grad_norm": 0.09394440799951553, + "learning_rate": 0.002, + "loss": 2.326, + "step": 264230 + }, + { + "epoch": 1.0214779421997495, + "grad_norm": 0.10297603905200958, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 264240 + }, + { + "epoch": 1.0215165994031328, + "grad_norm": 0.11685995757579803, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 264250 + }, + { + "epoch": 1.021555256606516, + "grad_norm": 0.09865585714578629, + "learning_rate": 0.002, + "loss": 2.342, + "step": 264260 + }, + { + "epoch": 1.0215939138098993, + "grad_norm": 0.09554792940616608, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 264270 + }, + { + "epoch": 1.0216325710132825, + "grad_norm": 0.10560063272714615, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 264280 + }, + { + "epoch": 1.021671228216666, + "grad_norm": 0.13222475349903107, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 264290 + }, + { + "epoch": 1.0217098854200493, + "grad_norm": 0.09948401898145676, + "learning_rate": 0.002, + "loss": 2.335, + "step": 264300 + }, + { + "epoch": 1.0217485426234325, + "grad_norm": 0.09130743891000748, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 264310 + }, + { + "epoch": 1.0217871998268158, + "grad_norm": 0.12201028317213058, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 264320 + }, + { + "epoch": 1.021825857030199, + "grad_norm": 0.0886576846241951, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 264330 + }, + { + "epoch": 1.0218645142335823, + "grad_norm": 0.10200691223144531, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 264340 + }, + { + "epoch": 1.0219031714369655, + "grad_norm": 0.10952626913785934, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 264350 + }, + { + "epoch": 1.0219418286403488, + "grad_norm": 0.12101206928491592, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 264360 + }, + { + "epoch": 1.021980485843732, + "grad_norm": 0.1118728369474411, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 264370 + }, + { + "epoch": 1.0220191430471155, + "grad_norm": 0.08517512679100037, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 264380 + }, + { + "epoch": 1.0220578002504987, + "grad_norm": 0.11177657544612885, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 264390 + }, + { + "epoch": 1.022096457453882, + "grad_norm": 0.09982944279909134, + "learning_rate": 0.002, + "loss": 2.346, + "step": 264400 + }, + { + "epoch": 1.0221351146572653, + "grad_norm": 0.09357168525457382, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 264410 + }, + { + "epoch": 1.0221737718606485, + "grad_norm": 0.11061622947454453, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 264420 + }, + { + "epoch": 1.0222124290640318, + "grad_norm": 0.09883379191160202, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 264430 + }, + { + "epoch": 1.022251086267415, + "grad_norm": 0.09657022356987, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 264440 + }, + { + "epoch": 1.0222897434707983, + "grad_norm": 0.08338421583175659, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 264450 + }, + { + "epoch": 1.0223284006741817, + "grad_norm": 0.15151934325695038, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 264460 + }, + { + "epoch": 1.022367057877565, + "grad_norm": 0.11291121691465378, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 264470 + }, + { + "epoch": 1.0224057150809482, + "grad_norm": 0.09912551939487457, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 264480 + }, + { + "epoch": 1.0224443722843315, + "grad_norm": 0.1053970605134964, + "learning_rate": 0.002, + "loss": 2.3126, + "step": 264490 + }, + { + "epoch": 1.0224830294877147, + "grad_norm": 0.11392966657876968, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 264500 + }, + { + "epoch": 1.022521686691098, + "grad_norm": 0.10310008376836777, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 264510 + }, + { + "epoch": 1.0225603438944812, + "grad_norm": 0.1299499273300171, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 264520 + }, + { + "epoch": 1.0225990010978645, + "grad_norm": 0.08739864081144333, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 264530 + }, + { + "epoch": 1.0226376583012478, + "grad_norm": 0.11028322577476501, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 264540 + }, + { + "epoch": 1.0226763155046312, + "grad_norm": 0.0937427282333374, + "learning_rate": 0.002, + "loss": 2.3652, + "step": 264550 + }, + { + "epoch": 1.0227149727080145, + "grad_norm": 0.11551933735609055, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 264560 + }, + { + "epoch": 1.0227536299113977, + "grad_norm": 0.12646719813346863, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 264570 + }, + { + "epoch": 1.022792287114781, + "grad_norm": 0.09259982407093048, + "learning_rate": 0.002, + "loss": 2.337, + "step": 264580 + }, + { + "epoch": 1.0228309443181642, + "grad_norm": 0.19581899046897888, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 264590 + }, + { + "epoch": 1.0228696015215475, + "grad_norm": 0.09492383897304535, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 264600 + }, + { + "epoch": 1.0229082587249307, + "grad_norm": 0.10611309111118317, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 264610 + }, + { + "epoch": 1.022946915928314, + "grad_norm": 0.09840042144060135, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 264620 + }, + { + "epoch": 1.0229855731316975, + "grad_norm": 0.10329700261354446, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 264630 + }, + { + "epoch": 1.0230242303350807, + "grad_norm": 0.12647895514965057, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 264640 + }, + { + "epoch": 1.023062887538464, + "grad_norm": 0.09441398084163666, + "learning_rate": 0.002, + "loss": 2.337, + "step": 264650 + }, + { + "epoch": 1.0231015447418472, + "grad_norm": 0.08766429871320724, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 264660 + }, + { + "epoch": 1.0231402019452305, + "grad_norm": 0.3329355716705322, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 264670 + }, + { + "epoch": 1.0231788591486137, + "grad_norm": 0.11475207656621933, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 264680 + }, + { + "epoch": 1.023217516351997, + "grad_norm": 0.09671181440353394, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 264690 + }, + { + "epoch": 1.0232561735553802, + "grad_norm": 0.09747885167598724, + "learning_rate": 0.002, + "loss": 2.329, + "step": 264700 + }, + { + "epoch": 1.0232948307587635, + "grad_norm": 0.10627662390470505, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 264710 + }, + { + "epoch": 1.023333487962147, + "grad_norm": 0.11692077666521072, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 264720 + }, + { + "epoch": 1.0233721451655302, + "grad_norm": 0.1147197037935257, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 264730 + }, + { + "epoch": 1.0234108023689135, + "grad_norm": 0.09349524974822998, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 264740 + }, + { + "epoch": 1.0234494595722967, + "grad_norm": 0.09220931679010391, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 264750 + }, + { + "epoch": 1.02348811677568, + "grad_norm": 0.11551348865032196, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 264760 + }, + { + "epoch": 1.0235267739790632, + "grad_norm": 0.09737013280391693, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 264770 + }, + { + "epoch": 1.0235654311824465, + "grad_norm": 0.1254809945821762, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 264780 + }, + { + "epoch": 1.0236040883858297, + "grad_norm": 0.10588192939758301, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 264790 + }, + { + "epoch": 1.0236427455892132, + "grad_norm": 0.11644317954778671, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 264800 + }, + { + "epoch": 1.0236814027925965, + "grad_norm": 0.16818253695964813, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 264810 + }, + { + "epoch": 1.0237200599959797, + "grad_norm": 0.10291523486375809, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 264820 + }, + { + "epoch": 1.023758717199363, + "grad_norm": 0.12808586657047272, + "learning_rate": 0.002, + "loss": 2.345, + "step": 264830 + }, + { + "epoch": 1.0237973744027462, + "grad_norm": 0.10281138122081757, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 264840 + }, + { + "epoch": 1.0238360316061295, + "grad_norm": 0.11928651481866837, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 264850 + }, + { + "epoch": 1.0238746888095127, + "grad_norm": 0.09416400641202927, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 264860 + }, + { + "epoch": 1.023913346012896, + "grad_norm": 0.10814538598060608, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 264870 + }, + { + "epoch": 1.0239520032162792, + "grad_norm": 0.10571702569723129, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 264880 + }, + { + "epoch": 1.0239906604196627, + "grad_norm": 0.1148458868265152, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 264890 + }, + { + "epoch": 1.024029317623046, + "grad_norm": 0.10211604833602905, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 264900 + }, + { + "epoch": 1.0240679748264292, + "grad_norm": 0.10494969040155411, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 264910 + }, + { + "epoch": 1.0241066320298124, + "grad_norm": 0.1249437928199768, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 264920 + }, + { + "epoch": 1.0241452892331957, + "grad_norm": 0.13109232485294342, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 264930 + }, + { + "epoch": 1.024183946436579, + "grad_norm": 0.0910966545343399, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 264940 + }, + { + "epoch": 1.0242226036399622, + "grad_norm": 0.10233844816684723, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 264950 + }, + { + "epoch": 1.0242612608433455, + "grad_norm": 0.10650918632745743, + "learning_rate": 0.002, + "loss": 2.329, + "step": 264960 + }, + { + "epoch": 1.024299918046729, + "grad_norm": 0.13375632464885712, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 264970 + }, + { + "epoch": 1.0243385752501122, + "grad_norm": 0.09612470120191574, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 264980 + }, + { + "epoch": 1.0243772324534954, + "grad_norm": 0.12482687085866928, + "learning_rate": 0.002, + "loss": 2.323, + "step": 264990 + }, + { + "epoch": 1.0244158896568787, + "grad_norm": 0.09300002455711365, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 265000 + }, + { + "epoch": 1.024454546860262, + "grad_norm": 0.10589489340782166, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 265010 + }, + { + "epoch": 1.0244932040636452, + "grad_norm": 0.10188213735818863, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 265020 + }, + { + "epoch": 1.0245318612670284, + "grad_norm": 0.1173492819070816, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 265030 + }, + { + "epoch": 1.0245705184704117, + "grad_norm": 0.09907522052526474, + "learning_rate": 0.002, + "loss": 2.346, + "step": 265040 + }, + { + "epoch": 1.024609175673795, + "grad_norm": 0.1274970918893814, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 265050 + }, + { + "epoch": 1.0246478328771784, + "grad_norm": 0.14310483634471893, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 265060 + }, + { + "epoch": 1.0246864900805617, + "grad_norm": 0.10303078591823578, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 265070 + }, + { + "epoch": 1.024725147283945, + "grad_norm": 0.122581347823143, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 265080 + }, + { + "epoch": 1.0247638044873282, + "grad_norm": 0.0957183688879013, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 265090 + }, + { + "epoch": 1.0248024616907114, + "grad_norm": 0.09310661256313324, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 265100 + }, + { + "epoch": 1.0248411188940947, + "grad_norm": 0.1101442500948906, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 265110 + }, + { + "epoch": 1.024879776097478, + "grad_norm": 0.09229353815317154, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 265120 + }, + { + "epoch": 1.0249184333008612, + "grad_norm": 0.12286297231912613, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 265130 + }, + { + "epoch": 1.0249570905042447, + "grad_norm": 0.11856617778539658, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 265140 + }, + { + "epoch": 1.024995747707628, + "grad_norm": 0.10073439031839371, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 265150 + }, + { + "epoch": 1.0250344049110112, + "grad_norm": 0.12509649991989136, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 265160 + }, + { + "epoch": 1.0250730621143944, + "grad_norm": 0.09993588179349899, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 265170 + }, + { + "epoch": 1.0251117193177777, + "grad_norm": 0.14093492925167084, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 265180 + }, + { + "epoch": 1.025150376521161, + "grad_norm": 0.10278402268886566, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 265190 + }, + { + "epoch": 1.0251890337245442, + "grad_norm": 0.12884950637817383, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 265200 + }, + { + "epoch": 1.0252276909279274, + "grad_norm": 0.09993290901184082, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 265210 + }, + { + "epoch": 1.0252663481313107, + "grad_norm": 0.1100497916340828, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 265220 + }, + { + "epoch": 1.0253050053346942, + "grad_norm": 0.11035642772912979, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 265230 + }, + { + "epoch": 1.0253436625380774, + "grad_norm": 0.10592029988765717, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 265240 + }, + { + "epoch": 1.0253823197414607, + "grad_norm": 0.10400436818599701, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 265250 + }, + { + "epoch": 1.025420976944844, + "grad_norm": 0.11346562951803207, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 265260 + }, + { + "epoch": 1.0254596341482272, + "grad_norm": 0.12035985291004181, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 265270 + }, + { + "epoch": 1.0254982913516104, + "grad_norm": 0.1068260669708252, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 265280 + }, + { + "epoch": 1.0255369485549937, + "grad_norm": 0.09974275529384613, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 265290 + }, + { + "epoch": 1.025575605758377, + "grad_norm": 0.14729996025562286, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 265300 + }, + { + "epoch": 1.0256142629617604, + "grad_norm": 0.11352241039276123, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 265310 + }, + { + "epoch": 1.0256529201651436, + "grad_norm": 0.09459713101387024, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 265320 + }, + { + "epoch": 1.025691577368527, + "grad_norm": 0.10980570316314697, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 265330 + }, + { + "epoch": 1.0257302345719101, + "grad_norm": 0.12916293740272522, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 265340 + }, + { + "epoch": 1.0257688917752934, + "grad_norm": 0.11712485551834106, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 265350 + }, + { + "epoch": 1.0258075489786767, + "grad_norm": 0.09614215046167374, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 265360 + }, + { + "epoch": 1.02584620618206, + "grad_norm": 0.1279195100069046, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 265370 + }, + { + "epoch": 1.0258848633854432, + "grad_norm": 0.10815688222646713, + "learning_rate": 0.002, + "loss": 2.339, + "step": 265380 + }, + { + "epoch": 1.0259235205888264, + "grad_norm": 0.10971692949533463, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 265390 + }, + { + "epoch": 1.0259621777922099, + "grad_norm": 0.09672383219003677, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 265400 + }, + { + "epoch": 1.0260008349955931, + "grad_norm": 0.21441972255706787, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 265410 + }, + { + "epoch": 1.0260394921989764, + "grad_norm": 0.10706193000078201, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 265420 + }, + { + "epoch": 1.0260781494023596, + "grad_norm": 0.09036446362733841, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 265430 + }, + { + "epoch": 1.026116806605743, + "grad_norm": 0.09791683405637741, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 265440 + }, + { + "epoch": 1.0261554638091261, + "grad_norm": 0.09223045408725739, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 265450 + }, + { + "epoch": 1.0261941210125094, + "grad_norm": 0.09813971072435379, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 265460 + }, + { + "epoch": 1.0262327782158926, + "grad_norm": 0.09205476194620132, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 265470 + }, + { + "epoch": 1.0262714354192761, + "grad_norm": 0.10239709168672562, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 265480 + }, + { + "epoch": 1.0263100926226594, + "grad_norm": 0.10483129322528839, + "learning_rate": 0.002, + "loss": 2.325, + "step": 265490 + }, + { + "epoch": 1.0263487498260426, + "grad_norm": 0.10401138663291931, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 265500 + }, + { + "epoch": 1.0263874070294259, + "grad_norm": 0.1368025243282318, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 265510 + }, + { + "epoch": 1.0264260642328091, + "grad_norm": 0.11704922467470169, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 265520 + }, + { + "epoch": 1.0264647214361924, + "grad_norm": 0.11174594610929489, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 265530 + }, + { + "epoch": 1.0265033786395756, + "grad_norm": 0.10312842577695847, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 265540 + }, + { + "epoch": 1.0265420358429589, + "grad_norm": 0.10455312579870224, + "learning_rate": 0.002, + "loss": 2.338, + "step": 265550 + }, + { + "epoch": 1.0265806930463424, + "grad_norm": 0.09767155349254608, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 265560 + }, + { + "epoch": 1.0266193502497256, + "grad_norm": 0.10167326033115387, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 265570 + }, + { + "epoch": 1.0266580074531089, + "grad_norm": 0.12255837023258209, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 265580 + }, + { + "epoch": 1.0266966646564921, + "grad_norm": 0.10716545581817627, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 265590 + }, + { + "epoch": 1.0267353218598754, + "grad_norm": 0.10378210246562958, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 265600 + }, + { + "epoch": 1.0267739790632586, + "grad_norm": 0.09660319238901138, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 265610 + }, + { + "epoch": 1.0268126362666419, + "grad_norm": 0.10769104957580566, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 265620 + }, + { + "epoch": 1.0268512934700251, + "grad_norm": 0.09757567197084427, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 265630 + }, + { + "epoch": 1.0268899506734084, + "grad_norm": 0.11647003144025803, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 265640 + }, + { + "epoch": 1.0269286078767919, + "grad_norm": 0.10532090067863464, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 265650 + }, + { + "epoch": 1.026967265080175, + "grad_norm": 0.11361627280712128, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 265660 + }, + { + "epoch": 1.0270059222835584, + "grad_norm": 0.09371964633464813, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 265670 + }, + { + "epoch": 1.0270445794869416, + "grad_norm": 0.09509408473968506, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 265680 + }, + { + "epoch": 1.0270832366903249, + "grad_norm": 0.11384610086679459, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 265690 + }, + { + "epoch": 1.0271218938937081, + "grad_norm": 0.10529225319623947, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 265700 + }, + { + "epoch": 1.0271605510970914, + "grad_norm": 0.1519152969121933, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 265710 + }, + { + "epoch": 1.0271992083004746, + "grad_norm": 0.10594785213470459, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 265720 + }, + { + "epoch": 1.027237865503858, + "grad_norm": 0.10499383509159088, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 265730 + }, + { + "epoch": 1.0272765227072413, + "grad_norm": 0.08883464336395264, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 265740 + }, + { + "epoch": 1.0273151799106246, + "grad_norm": 0.09568743407726288, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 265750 + }, + { + "epoch": 1.0273538371140079, + "grad_norm": 0.11473828554153442, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 265760 + }, + { + "epoch": 1.027392494317391, + "grad_norm": 0.11438367515802383, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 265770 + }, + { + "epoch": 1.0274311515207744, + "grad_norm": 0.12594550848007202, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 265780 + }, + { + "epoch": 1.0274698087241576, + "grad_norm": 0.16199305653572083, + "learning_rate": 0.002, + "loss": 2.334, + "step": 265790 + }, + { + "epoch": 1.0275084659275409, + "grad_norm": 0.10604202747344971, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 265800 + }, + { + "epoch": 1.0275471231309241, + "grad_norm": 0.12076123803853989, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 265810 + }, + { + "epoch": 1.0275857803343076, + "grad_norm": 0.10725189000368118, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 265820 + }, + { + "epoch": 1.0276244375376908, + "grad_norm": 0.10174400359392166, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 265830 + }, + { + "epoch": 1.027663094741074, + "grad_norm": 0.10425542294979095, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 265840 + }, + { + "epoch": 1.0277017519444573, + "grad_norm": 0.091111920773983, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 265850 + }, + { + "epoch": 1.0277404091478406, + "grad_norm": 0.11550716310739517, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 265860 + }, + { + "epoch": 1.0277790663512238, + "grad_norm": 0.12211589515209198, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 265870 + }, + { + "epoch": 1.027817723554607, + "grad_norm": 0.09479061514139175, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 265880 + }, + { + "epoch": 1.0278563807579904, + "grad_norm": 0.17094501852989197, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 265890 + }, + { + "epoch": 1.0278950379613738, + "grad_norm": 0.11208199709653854, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 265900 + }, + { + "epoch": 1.027933695164757, + "grad_norm": 0.10838782042264938, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 265910 + }, + { + "epoch": 1.0279723523681403, + "grad_norm": 0.11082717031240463, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 265920 + }, + { + "epoch": 1.0280110095715236, + "grad_norm": 0.10645514726638794, + "learning_rate": 0.002, + "loss": 2.344, + "step": 265930 + }, + { + "epoch": 1.0280496667749068, + "grad_norm": 0.0926586240530014, + "learning_rate": 0.002, + "loss": 2.328, + "step": 265940 + }, + { + "epoch": 1.02808832397829, + "grad_norm": 0.10408720374107361, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 265950 + }, + { + "epoch": 1.0281269811816733, + "grad_norm": 0.09145691990852356, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 265960 + }, + { + "epoch": 1.0281656383850566, + "grad_norm": 0.09304022043943405, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 265970 + }, + { + "epoch": 1.0282042955884398, + "grad_norm": 0.10211073607206345, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 265980 + }, + { + "epoch": 1.0282429527918233, + "grad_norm": 0.10932856053113937, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 265990 + }, + { + "epoch": 1.0282816099952066, + "grad_norm": 0.1237453743815422, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 266000 + }, + { + "epoch": 1.0283202671985898, + "grad_norm": 0.09511668980121613, + "learning_rate": 0.002, + "loss": 2.338, + "step": 266010 + }, + { + "epoch": 1.028358924401973, + "grad_norm": 0.10278719663619995, + "learning_rate": 0.002, + "loss": 2.323, + "step": 266020 + }, + { + "epoch": 1.0283975816053563, + "grad_norm": 0.1019146591424942, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 266030 + }, + { + "epoch": 1.0284362388087396, + "grad_norm": 0.10987775772809982, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 266040 + }, + { + "epoch": 1.0284748960121228, + "grad_norm": 0.11195337027311325, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 266050 + }, + { + "epoch": 1.028513553215506, + "grad_norm": 0.11950255185365677, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 266060 + }, + { + "epoch": 1.0285522104188896, + "grad_norm": 0.1296747326850891, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 266070 + }, + { + "epoch": 1.0285908676222728, + "grad_norm": 0.09848557412624359, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 266080 + }, + { + "epoch": 1.028629524825656, + "grad_norm": 0.09893085807561874, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 266090 + }, + { + "epoch": 1.0286681820290393, + "grad_norm": 0.09465838223695755, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 266100 + }, + { + "epoch": 1.0287068392324226, + "grad_norm": 0.09347520768642426, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 266110 + }, + { + "epoch": 1.0287454964358058, + "grad_norm": 0.11662112921476364, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 266120 + }, + { + "epoch": 1.028784153639189, + "grad_norm": 0.1154218465089798, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 266130 + }, + { + "epoch": 1.0288228108425723, + "grad_norm": 0.08964736014604568, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 266140 + }, + { + "epoch": 1.0288614680459558, + "grad_norm": 0.11205974221229553, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 266150 + }, + { + "epoch": 1.028900125249339, + "grad_norm": 0.10498930513858795, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 266160 + }, + { + "epoch": 1.0289387824527223, + "grad_norm": 0.1054898351430893, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 266170 + }, + { + "epoch": 1.0289774396561056, + "grad_norm": 0.11319541186094284, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 266180 + }, + { + "epoch": 1.0290160968594888, + "grad_norm": 0.10512629896402359, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 266190 + }, + { + "epoch": 1.029054754062872, + "grad_norm": 0.11395827680826187, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 266200 + }, + { + "epoch": 1.0290934112662553, + "grad_norm": 0.11131727695465088, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 266210 + }, + { + "epoch": 1.0291320684696386, + "grad_norm": 0.12667366862297058, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 266220 + }, + { + "epoch": 1.0291707256730218, + "grad_norm": 0.10688159614801407, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 266230 + }, + { + "epoch": 1.0292093828764053, + "grad_norm": 0.10538176447153091, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 266240 + }, + { + "epoch": 1.0292480400797885, + "grad_norm": 0.1173299103975296, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 266250 + }, + { + "epoch": 1.0292866972831718, + "grad_norm": 0.11167753487825394, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 266260 + }, + { + "epoch": 1.029325354486555, + "grad_norm": 0.09789314866065979, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 266270 + }, + { + "epoch": 1.0293640116899383, + "grad_norm": 0.11912738531827927, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 266280 + }, + { + "epoch": 1.0294026688933215, + "grad_norm": 0.10356725007295609, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 266290 + }, + { + "epoch": 1.0294413260967048, + "grad_norm": 0.11253173649311066, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 266300 + }, + { + "epoch": 1.029479983300088, + "grad_norm": 0.09901507943868637, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 266310 + }, + { + "epoch": 1.0295186405034715, + "grad_norm": 0.11068379133939743, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 266320 + }, + { + "epoch": 1.0295572977068548, + "grad_norm": 0.09040426462888718, + "learning_rate": 0.002, + "loss": 2.337, + "step": 266330 + }, + { + "epoch": 1.029595954910238, + "grad_norm": 0.10851092636585236, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 266340 + }, + { + "epoch": 1.0296346121136213, + "grad_norm": 0.139420285820961, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 266350 + }, + { + "epoch": 1.0296732693170045, + "grad_norm": 0.12576444447040558, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 266360 + }, + { + "epoch": 1.0297119265203878, + "grad_norm": 0.08843178302049637, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 266370 + }, + { + "epoch": 1.029750583723771, + "grad_norm": 0.10211913287639618, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 266380 + }, + { + "epoch": 1.0297892409271543, + "grad_norm": 0.11484231054782867, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 266390 + }, + { + "epoch": 1.0298278981305375, + "grad_norm": 0.13214653730392456, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 266400 + }, + { + "epoch": 1.029866555333921, + "grad_norm": 0.11222288012504578, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 266410 + }, + { + "epoch": 1.0299052125373043, + "grad_norm": 0.10335548967123032, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 266420 + }, + { + "epoch": 1.0299438697406875, + "grad_norm": 0.0991593673825264, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 266430 + }, + { + "epoch": 1.0299825269440708, + "grad_norm": 0.09273135662078857, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 266440 + }, + { + "epoch": 1.030021184147454, + "grad_norm": 0.1204661950469017, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 266450 + }, + { + "epoch": 1.0300598413508373, + "grad_norm": 0.10633353143930435, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 266460 + }, + { + "epoch": 1.0300984985542205, + "grad_norm": 0.09771783649921417, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 266470 + }, + { + "epoch": 1.0301371557576038, + "grad_norm": 0.11200734972953796, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 266480 + }, + { + "epoch": 1.0301758129609873, + "grad_norm": 0.08626644313335419, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 266490 + }, + { + "epoch": 1.0302144701643705, + "grad_norm": 0.11899591237306595, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 266500 + }, + { + "epoch": 1.0302531273677538, + "grad_norm": 0.10089579224586487, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 266510 + }, + { + "epoch": 1.030291784571137, + "grad_norm": 0.1076754629611969, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 266520 + }, + { + "epoch": 1.0303304417745203, + "grad_norm": 0.11318105459213257, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 266530 + }, + { + "epoch": 1.0303690989779035, + "grad_norm": 0.10871787369251251, + "learning_rate": 0.002, + "loss": 2.328, + "step": 266540 + }, + { + "epoch": 1.0304077561812868, + "grad_norm": 0.09567765146493912, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 266550 + }, + { + "epoch": 1.03044641338467, + "grad_norm": 0.13377155363559723, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 266560 + }, + { + "epoch": 1.0304850705880533, + "grad_norm": 0.10268377512693405, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 266570 + }, + { + "epoch": 1.0305237277914367, + "grad_norm": 0.12028588354587555, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 266580 + }, + { + "epoch": 1.03056238499482, + "grad_norm": 0.1063971221446991, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 266590 + }, + { + "epoch": 1.0306010421982033, + "grad_norm": 0.12419982999563217, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 266600 + }, + { + "epoch": 1.0306396994015865, + "grad_norm": 0.11007034033536911, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 266610 + }, + { + "epoch": 1.0306783566049698, + "grad_norm": 0.10347622632980347, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 266620 + }, + { + "epoch": 1.030717013808353, + "grad_norm": 0.10151407867670059, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 266630 + }, + { + "epoch": 1.0307556710117363, + "grad_norm": 0.1266144961118698, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 266640 + }, + { + "epoch": 1.0307943282151195, + "grad_norm": 0.10101339221000671, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 266650 + }, + { + "epoch": 1.030832985418503, + "grad_norm": 0.11579057574272156, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 266660 + }, + { + "epoch": 1.0308716426218862, + "grad_norm": 0.10573630034923553, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 266670 + }, + { + "epoch": 1.0309102998252695, + "grad_norm": 0.1263340413570404, + "learning_rate": 0.002, + "loss": 2.328, + "step": 266680 + }, + { + "epoch": 1.0309489570286527, + "grad_norm": 0.09813540428876877, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 266690 + }, + { + "epoch": 1.030987614232036, + "grad_norm": 0.10262829065322876, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 266700 + }, + { + "epoch": 1.0310262714354193, + "grad_norm": 0.1933591216802597, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 266710 + }, + { + "epoch": 1.0310649286388025, + "grad_norm": 0.11691686511039734, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 266720 + }, + { + "epoch": 1.0311035858421858, + "grad_norm": 0.10910283774137497, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 266730 + }, + { + "epoch": 1.031142243045569, + "grad_norm": 0.09778497368097305, + "learning_rate": 0.002, + "loss": 2.352, + "step": 266740 + }, + { + "epoch": 1.0311809002489525, + "grad_norm": 0.12451784312725067, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 266750 + }, + { + "epoch": 1.0312195574523357, + "grad_norm": 0.10196122527122498, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 266760 + }, + { + "epoch": 1.031258214655719, + "grad_norm": 0.10703619569540024, + "learning_rate": 0.002, + "loss": 2.336, + "step": 266770 + }, + { + "epoch": 1.0312968718591022, + "grad_norm": 0.1130760908126831, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 266780 + }, + { + "epoch": 1.0313355290624855, + "grad_norm": 0.12371481209993362, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 266790 + }, + { + "epoch": 1.0313741862658687, + "grad_norm": 0.11786756664514542, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 266800 + }, + { + "epoch": 1.031412843469252, + "grad_norm": 0.11346852779388428, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 266810 + }, + { + "epoch": 1.0314515006726352, + "grad_norm": 0.09816392511129379, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 266820 + }, + { + "epoch": 1.0314901578760187, + "grad_norm": 0.09450547397136688, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 266830 + }, + { + "epoch": 1.031528815079402, + "grad_norm": 0.1281440556049347, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 266840 + }, + { + "epoch": 1.0315674722827852, + "grad_norm": 0.10921774804592133, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 266850 + }, + { + "epoch": 1.0316061294861685, + "grad_norm": 0.12215537577867508, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 266860 + }, + { + "epoch": 1.0316447866895517, + "grad_norm": 0.09862463921308517, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 266870 + }, + { + "epoch": 1.031683443892935, + "grad_norm": 0.30553138256073, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 266880 + }, + { + "epoch": 1.0317221010963182, + "grad_norm": 0.10710947215557098, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 266890 + }, + { + "epoch": 1.0317607582997015, + "grad_norm": 0.09189295023679733, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 266900 + }, + { + "epoch": 1.0317994155030847, + "grad_norm": 0.10326312482357025, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 266910 + }, + { + "epoch": 1.0318380727064682, + "grad_norm": 0.10674124211072922, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 266920 + }, + { + "epoch": 1.0318767299098515, + "grad_norm": 0.11148896813392639, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 266930 + }, + { + "epoch": 1.0319153871132347, + "grad_norm": 0.10347460955381393, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 266940 + }, + { + "epoch": 1.031954044316618, + "grad_norm": 0.09875006228685379, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 266950 + }, + { + "epoch": 1.0319927015200012, + "grad_norm": 0.129219651222229, + "learning_rate": 0.002, + "loss": 2.319, + "step": 266960 + }, + { + "epoch": 1.0320313587233845, + "grad_norm": 0.11608944833278656, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 266970 + }, + { + "epoch": 1.0320700159267677, + "grad_norm": 0.1069372147321701, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 266980 + }, + { + "epoch": 1.032108673130151, + "grad_norm": 0.08900687843561172, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 266990 + }, + { + "epoch": 1.0321473303335345, + "grad_norm": 0.11886018514633179, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 267000 + }, + { + "epoch": 1.0321859875369177, + "grad_norm": 0.08972641080617905, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 267010 + }, + { + "epoch": 1.032224644740301, + "grad_norm": 0.10585705190896988, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 267020 + }, + { + "epoch": 1.0322633019436842, + "grad_norm": 0.10356702655553818, + "learning_rate": 0.002, + "loss": 2.332, + "step": 267030 + }, + { + "epoch": 1.0323019591470675, + "grad_norm": 0.09595673531293869, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 267040 + }, + { + "epoch": 1.0323406163504507, + "grad_norm": 0.10113181918859482, + "learning_rate": 0.002, + "loss": 2.326, + "step": 267050 + }, + { + "epoch": 1.032379273553834, + "grad_norm": 0.11091704666614532, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 267060 + }, + { + "epoch": 1.0324179307572172, + "grad_norm": 0.10351219028234482, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 267070 + }, + { + "epoch": 1.0324565879606005, + "grad_norm": 0.12491386383771896, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 267080 + }, + { + "epoch": 1.032495245163984, + "grad_norm": 0.10733221471309662, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 267090 + }, + { + "epoch": 1.0325339023673672, + "grad_norm": 0.10156510770320892, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 267100 + }, + { + "epoch": 1.0325725595707504, + "grad_norm": 0.10368265956640244, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 267110 + }, + { + "epoch": 1.0326112167741337, + "grad_norm": 0.10400926321744919, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 267120 + }, + { + "epoch": 1.032649873977517, + "grad_norm": 0.1019158810377121, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 267130 + }, + { + "epoch": 1.0326885311809002, + "grad_norm": 0.11867933720350266, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 267140 + }, + { + "epoch": 1.0327271883842835, + "grad_norm": 0.09400109946727753, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 267150 + }, + { + "epoch": 1.0327658455876667, + "grad_norm": 0.10336810350418091, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 267160 + }, + { + "epoch": 1.0328045027910502, + "grad_norm": 0.11142666637897491, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 267170 + }, + { + "epoch": 1.0328431599944334, + "grad_norm": 0.13898703455924988, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 267180 + }, + { + "epoch": 1.0328818171978167, + "grad_norm": 0.09304480254650116, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 267190 + }, + { + "epoch": 1.0329204744012, + "grad_norm": 0.11063506454229355, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 267200 + }, + { + "epoch": 1.0329591316045832, + "grad_norm": 0.09229174256324768, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 267210 + }, + { + "epoch": 1.0329977888079664, + "grad_norm": 0.09612909704446793, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 267220 + }, + { + "epoch": 1.0330364460113497, + "grad_norm": 0.11524896323680878, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 267230 + }, + { + "epoch": 1.033075103214733, + "grad_norm": 0.10607946664094925, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 267240 + }, + { + "epoch": 1.0331137604181162, + "grad_norm": 0.09210071712732315, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 267250 + }, + { + "epoch": 1.0331524176214997, + "grad_norm": 0.11773253232240677, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 267260 + }, + { + "epoch": 1.033191074824883, + "grad_norm": 0.10352618247270584, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 267270 + }, + { + "epoch": 1.0332297320282662, + "grad_norm": 0.1059403195977211, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 267280 + }, + { + "epoch": 1.0332683892316494, + "grad_norm": 0.09368275105953217, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 267290 + }, + { + "epoch": 1.0333070464350327, + "grad_norm": 0.1071181371808052, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 267300 + }, + { + "epoch": 1.033345703638416, + "grad_norm": 0.14391173422336578, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 267310 + }, + { + "epoch": 1.0333843608417992, + "grad_norm": 0.0961400493979454, + "learning_rate": 0.002, + "loss": 2.327, + "step": 267320 + }, + { + "epoch": 1.0334230180451824, + "grad_norm": 0.09671345353126526, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 267330 + }, + { + "epoch": 1.033461675248566, + "grad_norm": 0.13067911565303802, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 267340 + }, + { + "epoch": 1.0335003324519492, + "grad_norm": 0.11352329701185226, + "learning_rate": 0.002, + "loss": 2.342, + "step": 267350 + }, + { + "epoch": 1.0335389896553324, + "grad_norm": 0.12045929580926895, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 267360 + }, + { + "epoch": 1.0335776468587157, + "grad_norm": 0.09980279952287674, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 267370 + }, + { + "epoch": 1.033616304062099, + "grad_norm": 0.10935485363006592, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 267380 + }, + { + "epoch": 1.0336549612654822, + "grad_norm": 0.12776319682598114, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 267390 + }, + { + "epoch": 1.0336936184688654, + "grad_norm": 0.09210210293531418, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 267400 + }, + { + "epoch": 1.0337322756722487, + "grad_norm": 0.09530036896467209, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 267410 + }, + { + "epoch": 1.033770932875632, + "grad_norm": 0.12296126782894135, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 267420 + }, + { + "epoch": 1.0338095900790154, + "grad_norm": 0.11421650648117065, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 267430 + }, + { + "epoch": 1.0338482472823987, + "grad_norm": 0.10140615701675415, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 267440 + }, + { + "epoch": 1.033886904485782, + "grad_norm": 0.10139352083206177, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 267450 + }, + { + "epoch": 1.0339255616891652, + "grad_norm": 0.09594139456748962, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 267460 + }, + { + "epoch": 1.0339642188925484, + "grad_norm": 0.09860043227672577, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 267470 + }, + { + "epoch": 1.0340028760959317, + "grad_norm": 0.10134763270616531, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 267480 + }, + { + "epoch": 1.034041533299315, + "grad_norm": 0.1237870305776596, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 267490 + }, + { + "epoch": 1.0340801905026982, + "grad_norm": 0.08835762739181519, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 267500 + }, + { + "epoch": 1.0341188477060816, + "grad_norm": 0.09981214255094528, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 267510 + }, + { + "epoch": 1.034157504909465, + "grad_norm": 0.10252374410629272, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 267520 + }, + { + "epoch": 1.0341961621128481, + "grad_norm": 0.1146230399608612, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 267530 + }, + { + "epoch": 1.0342348193162314, + "grad_norm": 0.09919324517250061, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 267540 + }, + { + "epoch": 1.0342734765196147, + "grad_norm": 0.10665534436702728, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 267550 + }, + { + "epoch": 1.034312133722998, + "grad_norm": 0.11877695471048355, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 267560 + }, + { + "epoch": 1.0343507909263812, + "grad_norm": 0.11180389672517776, + "learning_rate": 0.002, + "loss": 2.327, + "step": 267570 + }, + { + "epoch": 1.0343894481297644, + "grad_norm": 0.10479209572076797, + "learning_rate": 0.002, + "loss": 2.351, + "step": 267580 + }, + { + "epoch": 1.0344281053331479, + "grad_norm": 0.12032832950353622, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 267590 + }, + { + "epoch": 1.0344667625365311, + "grad_norm": 0.10242997854948044, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 267600 + }, + { + "epoch": 1.0345054197399144, + "grad_norm": 0.12337980419397354, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 267610 + }, + { + "epoch": 1.0345440769432976, + "grad_norm": 0.1257912516593933, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 267620 + }, + { + "epoch": 1.034582734146681, + "grad_norm": 0.09377360343933105, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 267630 + }, + { + "epoch": 1.0346213913500641, + "grad_norm": 0.1034860759973526, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 267640 + }, + { + "epoch": 1.0346600485534474, + "grad_norm": 0.11057710647583008, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 267650 + }, + { + "epoch": 1.0346987057568307, + "grad_norm": 0.11116885393857956, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 267660 + }, + { + "epoch": 1.034737362960214, + "grad_norm": 0.1062694787979126, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 267670 + }, + { + "epoch": 1.0347760201635974, + "grad_norm": 0.10646285116672516, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 267680 + }, + { + "epoch": 1.0348146773669806, + "grad_norm": 0.0952630415558815, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 267690 + }, + { + "epoch": 1.0348533345703639, + "grad_norm": 0.12547989189624786, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 267700 + }, + { + "epoch": 1.0348919917737471, + "grad_norm": 0.09910428524017334, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 267710 + }, + { + "epoch": 1.0349306489771304, + "grad_norm": 0.09709003567695618, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 267720 + }, + { + "epoch": 1.0349693061805136, + "grad_norm": 0.08974325656890869, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 267730 + }, + { + "epoch": 1.035007963383897, + "grad_norm": 0.09742540866136551, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 267740 + }, + { + "epoch": 1.0350466205872801, + "grad_norm": 0.1201322078704834, + "learning_rate": 0.002, + "loss": 2.325, + "step": 267750 + }, + { + "epoch": 1.0350852777906636, + "grad_norm": 0.11878392845392227, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 267760 + }, + { + "epoch": 1.0351239349940469, + "grad_norm": 0.09439965337514877, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 267770 + }, + { + "epoch": 1.0351625921974301, + "grad_norm": 0.09864119440317154, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 267780 + }, + { + "epoch": 1.0352012494008134, + "grad_norm": 0.10157059133052826, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 267790 + }, + { + "epoch": 1.0352399066041966, + "grad_norm": 0.0926649197936058, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 267800 + }, + { + "epoch": 1.0352785638075799, + "grad_norm": 0.0927080512046814, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 267810 + }, + { + "epoch": 1.0353172210109631, + "grad_norm": 0.10915268212556839, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 267820 + }, + { + "epoch": 1.0353558782143464, + "grad_norm": 0.11072403192520142, + "learning_rate": 0.002, + "loss": 2.341, + "step": 267830 + }, + { + "epoch": 1.0353945354177296, + "grad_norm": 0.1306406855583191, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 267840 + }, + { + "epoch": 1.035433192621113, + "grad_norm": 0.09433692693710327, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 267850 + }, + { + "epoch": 1.0354718498244964, + "grad_norm": 0.1059279516339302, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 267860 + }, + { + "epoch": 1.0355105070278796, + "grad_norm": 0.10944026708602905, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 267870 + }, + { + "epoch": 1.0355491642312629, + "grad_norm": 0.12454365193843842, + "learning_rate": 0.002, + "loss": 2.331, + "step": 267880 + }, + { + "epoch": 1.0355878214346461, + "grad_norm": 0.11097417026758194, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 267890 + }, + { + "epoch": 1.0356264786380294, + "grad_norm": 0.13012464344501495, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 267900 + }, + { + "epoch": 1.0356651358414126, + "grad_norm": 0.10675875097513199, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 267910 + }, + { + "epoch": 1.0357037930447959, + "grad_norm": 0.10206145793199539, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 267920 + }, + { + "epoch": 1.0357424502481793, + "grad_norm": 0.11607825756072998, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 267930 + }, + { + "epoch": 1.0357811074515626, + "grad_norm": 0.11091554164886475, + "learning_rate": 0.002, + "loss": 2.335, + "step": 267940 + }, + { + "epoch": 1.0358197646549459, + "grad_norm": 0.09264187514781952, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 267950 + }, + { + "epoch": 1.035858421858329, + "grad_norm": 0.09681179374456406, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 267960 + }, + { + "epoch": 1.0358970790617124, + "grad_norm": 0.1079772561788559, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 267970 + }, + { + "epoch": 1.0359357362650956, + "grad_norm": 0.1027929037809372, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 267980 + }, + { + "epoch": 1.0359743934684789, + "grad_norm": 0.12011872231960297, + "learning_rate": 0.002, + "loss": 2.34, + "step": 267990 + }, + { + "epoch": 1.0360130506718621, + "grad_norm": 0.1037503257393837, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 268000 + }, + { + "epoch": 1.0360517078752456, + "grad_norm": 0.11351514607667923, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 268010 + }, + { + "epoch": 1.0360903650786288, + "grad_norm": 0.12285458296537399, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 268020 + }, + { + "epoch": 1.036129022282012, + "grad_norm": 0.11562503129243851, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 268030 + }, + { + "epoch": 1.0361676794853953, + "grad_norm": 0.10960686206817627, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 268040 + }, + { + "epoch": 1.0362063366887786, + "grad_norm": 0.0934433564543724, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 268050 + }, + { + "epoch": 1.0362449938921618, + "grad_norm": 0.11285009235143661, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 268060 + }, + { + "epoch": 1.036283651095545, + "grad_norm": 0.09301579743623734, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 268070 + }, + { + "epoch": 1.0363223082989284, + "grad_norm": 0.09829026460647583, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 268080 + }, + { + "epoch": 1.0363609655023116, + "grad_norm": 0.12021401524543762, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 268090 + }, + { + "epoch": 1.036399622705695, + "grad_norm": 0.12203650176525116, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 268100 + }, + { + "epoch": 1.0364382799090783, + "grad_norm": 0.09933951497077942, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 268110 + }, + { + "epoch": 1.0364769371124616, + "grad_norm": 0.1088651716709137, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 268120 + }, + { + "epoch": 1.0365155943158448, + "grad_norm": 0.09777121245861053, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 268130 + }, + { + "epoch": 1.036554251519228, + "grad_norm": 0.11730097979307175, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 268140 + }, + { + "epoch": 1.0365929087226113, + "grad_norm": 0.10772929340600967, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 268150 + }, + { + "epoch": 1.0366315659259946, + "grad_norm": 0.10943850874900818, + "learning_rate": 0.002, + "loss": 2.327, + "step": 268160 + }, + { + "epoch": 1.0366702231293778, + "grad_norm": 0.10009908676147461, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 268170 + }, + { + "epoch": 1.0367088803327613, + "grad_norm": 0.1684296876192093, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 268180 + }, + { + "epoch": 1.0367475375361446, + "grad_norm": 0.11017098277807236, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 268190 + }, + { + "epoch": 1.0367861947395278, + "grad_norm": 0.14785271883010864, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 268200 + }, + { + "epoch": 1.036824851942911, + "grad_norm": 0.12890072166919708, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 268210 + }, + { + "epoch": 1.0368635091462943, + "grad_norm": 0.12205611169338226, + "learning_rate": 0.002, + "loss": 2.342, + "step": 268220 + }, + { + "epoch": 1.0369021663496776, + "grad_norm": 0.10493602603673935, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 268230 + }, + { + "epoch": 1.0369408235530608, + "grad_norm": 0.09823311865329742, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 268240 + }, + { + "epoch": 1.036979480756444, + "grad_norm": 0.10882266610860825, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 268250 + }, + { + "epoch": 1.0370181379598273, + "grad_norm": 0.12620101869106293, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 268260 + }, + { + "epoch": 1.0370567951632108, + "grad_norm": 0.11481660604476929, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 268270 + }, + { + "epoch": 1.037095452366594, + "grad_norm": 0.12165917456150055, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 268280 + }, + { + "epoch": 1.0371341095699773, + "grad_norm": 0.09268329292535782, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 268290 + }, + { + "epoch": 1.0371727667733606, + "grad_norm": 0.11170244961977005, + "learning_rate": 0.002, + "loss": 2.333, + "step": 268300 + }, + { + "epoch": 1.0372114239767438, + "grad_norm": 0.09712453186511993, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 268310 + }, + { + "epoch": 1.037250081180127, + "grad_norm": 0.11297538876533508, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 268320 + }, + { + "epoch": 1.0372887383835103, + "grad_norm": 0.10871253907680511, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 268330 + }, + { + "epoch": 1.0373273955868936, + "grad_norm": 0.11911670118570328, + "learning_rate": 0.002, + "loss": 2.335, + "step": 268340 + }, + { + "epoch": 1.037366052790277, + "grad_norm": 0.09114481508731842, + "learning_rate": 0.002, + "loss": 2.3553, + "step": 268350 + }, + { + "epoch": 1.0374047099936603, + "grad_norm": 0.09637667238712311, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 268360 + }, + { + "epoch": 1.0374433671970436, + "grad_norm": 0.10665112733840942, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 268370 + }, + { + "epoch": 1.0374820244004268, + "grad_norm": 0.11053548753261566, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 268380 + }, + { + "epoch": 1.03752068160381, + "grad_norm": 0.12966634333133698, + "learning_rate": 0.002, + "loss": 2.339, + "step": 268390 + }, + { + "epoch": 1.0375593388071933, + "grad_norm": 0.12053807824850082, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 268400 + }, + { + "epoch": 1.0375979960105766, + "grad_norm": 0.09861911833286285, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 268410 + }, + { + "epoch": 1.0376366532139598, + "grad_norm": 0.14827530086040497, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 268420 + }, + { + "epoch": 1.037675310417343, + "grad_norm": 0.10329598933458328, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 268430 + }, + { + "epoch": 1.0377139676207265, + "grad_norm": 0.09511538594961166, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 268440 + }, + { + "epoch": 1.0377526248241098, + "grad_norm": 0.12171187996864319, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 268450 + }, + { + "epoch": 1.037791282027493, + "grad_norm": 0.09742235392332077, + "learning_rate": 0.002, + "loss": 2.341, + "step": 268460 + }, + { + "epoch": 1.0378299392308763, + "grad_norm": 0.1101989820599556, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 268470 + }, + { + "epoch": 1.0378685964342595, + "grad_norm": 0.11267191171646118, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 268480 + }, + { + "epoch": 1.0379072536376428, + "grad_norm": 0.09148108959197998, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 268490 + }, + { + "epoch": 1.037945910841026, + "grad_norm": 0.10221163928508759, + "learning_rate": 0.002, + "loss": 2.3141, + "step": 268500 + }, + { + "epoch": 1.0379845680444093, + "grad_norm": 0.1174292266368866, + "learning_rate": 0.002, + "loss": 2.333, + "step": 268510 + }, + { + "epoch": 1.0380232252477928, + "grad_norm": 0.09777496010065079, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 268520 + }, + { + "epoch": 1.038061882451176, + "grad_norm": 0.1312064230442047, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 268530 + }, + { + "epoch": 1.0381005396545593, + "grad_norm": 0.11722839623689651, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 268540 + }, + { + "epoch": 1.0381391968579425, + "grad_norm": 0.09800203889608383, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 268550 + }, + { + "epoch": 1.0381778540613258, + "grad_norm": 0.09853968769311905, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 268560 + }, + { + "epoch": 1.038216511264709, + "grad_norm": 0.1263112872838974, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 268570 + }, + { + "epoch": 1.0382551684680923, + "grad_norm": 0.1005529835820198, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 268580 + }, + { + "epoch": 1.0382938256714755, + "grad_norm": 0.11430198699235916, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 268590 + }, + { + "epoch": 1.0383324828748588, + "grad_norm": 0.12453008443117142, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 268600 + }, + { + "epoch": 1.0383711400782423, + "grad_norm": 0.09846989065408707, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 268610 + }, + { + "epoch": 1.0384097972816255, + "grad_norm": 0.11411017179489136, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 268620 + }, + { + "epoch": 1.0384484544850088, + "grad_norm": 0.10042417049407959, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 268630 + }, + { + "epoch": 1.038487111688392, + "grad_norm": 0.10524924844503403, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 268640 + }, + { + "epoch": 1.0385257688917753, + "grad_norm": 0.11033865064382553, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 268650 + }, + { + "epoch": 1.0385644260951585, + "grad_norm": 0.1116894781589508, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 268660 + }, + { + "epoch": 1.0386030832985418, + "grad_norm": 0.10501192510128021, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 268670 + }, + { + "epoch": 1.038641740501925, + "grad_norm": 0.10203655809164047, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 268680 + }, + { + "epoch": 1.0386803977053085, + "grad_norm": 0.09671195596456528, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 268690 + }, + { + "epoch": 1.0387190549086918, + "grad_norm": 0.11293695122003555, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 268700 + }, + { + "epoch": 1.038757712112075, + "grad_norm": 0.10207619518041611, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 268710 + }, + { + "epoch": 1.0387963693154583, + "grad_norm": 0.10498257726430893, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 268720 + }, + { + "epoch": 1.0388350265188415, + "grad_norm": 0.08968181908130646, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 268730 + }, + { + "epoch": 1.0388736837222248, + "grad_norm": 0.11083420366048813, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 268740 + }, + { + "epoch": 1.038912340925608, + "grad_norm": 0.09098499268293381, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 268750 + }, + { + "epoch": 1.0389509981289913, + "grad_norm": 0.14875374734401703, + "learning_rate": 0.002, + "loss": 2.3094, + "step": 268760 + }, + { + "epoch": 1.0389896553323745, + "grad_norm": 0.10430346429347992, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 268770 + }, + { + "epoch": 1.039028312535758, + "grad_norm": 0.09911978244781494, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 268780 + }, + { + "epoch": 1.0390669697391413, + "grad_norm": 0.10024331510066986, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 268790 + }, + { + "epoch": 1.0391056269425245, + "grad_norm": 0.0970604345202446, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 268800 + }, + { + "epoch": 1.0391442841459078, + "grad_norm": 0.09608924388885498, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 268810 + }, + { + "epoch": 1.039182941349291, + "grad_norm": 0.1133587583899498, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 268820 + }, + { + "epoch": 1.0392215985526743, + "grad_norm": 0.09642942249774933, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 268830 + }, + { + "epoch": 1.0392602557560575, + "grad_norm": 0.10437743365764618, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 268840 + }, + { + "epoch": 1.0392989129594408, + "grad_norm": 0.1008305475115776, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 268850 + }, + { + "epoch": 1.0393375701628242, + "grad_norm": 0.10197924822568893, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 268860 + }, + { + "epoch": 1.0393762273662075, + "grad_norm": 0.113011933863163, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 268870 + }, + { + "epoch": 1.0394148845695907, + "grad_norm": 0.10384301096200943, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 268880 + }, + { + "epoch": 1.039453541772974, + "grad_norm": 0.11199820786714554, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 268890 + }, + { + "epoch": 1.0394921989763573, + "grad_norm": 0.09927856922149658, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 268900 + }, + { + "epoch": 1.0395308561797405, + "grad_norm": 0.0970437228679657, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 268910 + }, + { + "epoch": 1.0395695133831238, + "grad_norm": 0.10486312955617905, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 268920 + }, + { + "epoch": 1.039608170586507, + "grad_norm": 0.10456685721874237, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 268930 + }, + { + "epoch": 1.0396468277898903, + "grad_norm": 0.10691561549901962, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 268940 + }, + { + "epoch": 1.0396854849932737, + "grad_norm": 0.09602787345647812, + "learning_rate": 0.002, + "loss": 2.353, + "step": 268950 + }, + { + "epoch": 1.039724142196657, + "grad_norm": 0.10299375653266907, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 268960 + }, + { + "epoch": 1.0397627994000402, + "grad_norm": 0.10255037248134613, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 268970 + }, + { + "epoch": 1.0398014566034235, + "grad_norm": 0.1187383309006691, + "learning_rate": 0.002, + "loss": 2.351, + "step": 268980 + }, + { + "epoch": 1.0398401138068067, + "grad_norm": 0.10880286991596222, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 268990 + }, + { + "epoch": 1.03987877101019, + "grad_norm": 0.11786052584648132, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 269000 + }, + { + "epoch": 1.0399174282135732, + "grad_norm": 0.11997605115175247, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 269010 + }, + { + "epoch": 1.0399560854169565, + "grad_norm": 0.1119430735707283, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 269020 + }, + { + "epoch": 1.03999474262034, + "grad_norm": 0.10212235152721405, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 269030 + }, + { + "epoch": 1.0400333998237232, + "grad_norm": 0.13267627358436584, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 269040 + }, + { + "epoch": 1.0400720570271065, + "grad_norm": 0.0917455404996872, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 269050 + }, + { + "epoch": 1.0401107142304897, + "grad_norm": 0.11168273538351059, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 269060 + }, + { + "epoch": 1.040149371433873, + "grad_norm": 0.10595767199993134, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 269070 + }, + { + "epoch": 1.0401880286372562, + "grad_norm": 0.15403233468532562, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 269080 + }, + { + "epoch": 1.0402266858406395, + "grad_norm": 0.10600115358829498, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 269090 + }, + { + "epoch": 1.0402653430440227, + "grad_norm": 0.11918964236974716, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 269100 + }, + { + "epoch": 1.040304000247406, + "grad_norm": 0.10387977957725525, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 269110 + }, + { + "epoch": 1.0403426574507895, + "grad_norm": 0.09495062381029129, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 269120 + }, + { + "epoch": 1.0403813146541727, + "grad_norm": 0.12769466638565063, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 269130 + }, + { + "epoch": 1.040419971857556, + "grad_norm": 0.08914276212453842, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 269140 + }, + { + "epoch": 1.0404586290609392, + "grad_norm": 0.11335669457912445, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 269150 + }, + { + "epoch": 1.0404972862643225, + "grad_norm": 0.10183510929346085, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 269160 + }, + { + "epoch": 1.0405359434677057, + "grad_norm": 0.12850168347358704, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 269170 + }, + { + "epoch": 1.040574600671089, + "grad_norm": 0.12186004221439362, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 269180 + }, + { + "epoch": 1.0406132578744722, + "grad_norm": 0.09551847726106644, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 269190 + }, + { + "epoch": 1.0406519150778557, + "grad_norm": 0.09610810875892639, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 269200 + }, + { + "epoch": 1.040690572281239, + "grad_norm": 0.10870260745286942, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 269210 + }, + { + "epoch": 1.0407292294846222, + "grad_norm": 0.08997610211372375, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 269220 + }, + { + "epoch": 1.0407678866880055, + "grad_norm": 0.12256580591201782, + "learning_rate": 0.002, + "loss": 2.327, + "step": 269230 + }, + { + "epoch": 1.0408065438913887, + "grad_norm": 0.09302297234535217, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 269240 + }, + { + "epoch": 1.040845201094772, + "grad_norm": 0.12043951451778412, + "learning_rate": 0.002, + "loss": 2.3623, + "step": 269250 + }, + { + "epoch": 1.0408838582981552, + "grad_norm": 0.10426823794841766, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 269260 + }, + { + "epoch": 1.0409225155015385, + "grad_norm": 0.13067109882831573, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 269270 + }, + { + "epoch": 1.0409611727049217, + "grad_norm": 0.08865130692720413, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 269280 + }, + { + "epoch": 1.0409998299083052, + "grad_norm": 0.16649721562862396, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 269290 + }, + { + "epoch": 1.0410384871116884, + "grad_norm": 0.09243342280387878, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 269300 + }, + { + "epoch": 1.0410771443150717, + "grad_norm": 0.14059138298034668, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 269310 + }, + { + "epoch": 1.041115801518455, + "grad_norm": 0.11420193314552307, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 269320 + }, + { + "epoch": 1.0411544587218382, + "grad_norm": 0.11908956617116928, + "learning_rate": 0.002, + "loss": 2.3141, + "step": 269330 + }, + { + "epoch": 1.0411931159252215, + "grad_norm": 0.11710888892412186, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 269340 + }, + { + "epoch": 1.0412317731286047, + "grad_norm": 0.10175123810768127, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 269350 + }, + { + "epoch": 1.041270430331988, + "grad_norm": 0.17233812808990479, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 269360 + }, + { + "epoch": 1.0413090875353714, + "grad_norm": 0.09387019276618958, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 269370 + }, + { + "epoch": 1.0413477447387547, + "grad_norm": 0.10078072547912598, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 269380 + }, + { + "epoch": 1.041386401942138, + "grad_norm": 0.09858350455760956, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 269390 + }, + { + "epoch": 1.0414250591455212, + "grad_norm": 0.09058808535337448, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 269400 + }, + { + "epoch": 1.0414637163489044, + "grad_norm": 0.10851434618234634, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 269410 + }, + { + "epoch": 1.0415023735522877, + "grad_norm": 0.13273876905441284, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 269420 + }, + { + "epoch": 1.041541030755671, + "grad_norm": 0.0915561392903328, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 269430 + }, + { + "epoch": 1.0415796879590542, + "grad_norm": 0.09703870862722397, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 269440 + }, + { + "epoch": 1.0416183451624377, + "grad_norm": 0.10767421126365662, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 269450 + }, + { + "epoch": 1.041657002365821, + "grad_norm": 0.09860119223594666, + "learning_rate": 0.002, + "loss": 2.324, + "step": 269460 + }, + { + "epoch": 1.0416956595692042, + "grad_norm": 0.10003165155649185, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 269470 + }, + { + "epoch": 1.0417343167725874, + "grad_norm": 0.152597114443779, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 269480 + }, + { + "epoch": 1.0417729739759707, + "grad_norm": 0.10329585522413254, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 269490 + }, + { + "epoch": 1.041811631179354, + "grad_norm": 0.1033838540315628, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 269500 + }, + { + "epoch": 1.0418502883827372, + "grad_norm": 0.1202467754483223, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 269510 + }, + { + "epoch": 1.0418889455861204, + "grad_norm": 0.0996757373213768, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 269520 + }, + { + "epoch": 1.0419276027895037, + "grad_norm": 0.09567472338676453, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 269530 + }, + { + "epoch": 1.0419662599928872, + "grad_norm": 0.1093759760260582, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 269540 + }, + { + "epoch": 1.0420049171962704, + "grad_norm": 0.101670041680336, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 269550 + }, + { + "epoch": 1.0420435743996537, + "grad_norm": 0.10272194445133209, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 269560 + }, + { + "epoch": 1.042082231603037, + "grad_norm": 0.11370494216680527, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 269570 + }, + { + "epoch": 1.0421208888064202, + "grad_norm": 0.11007405072450638, + "learning_rate": 0.002, + "loss": 2.335, + "step": 269580 + }, + { + "epoch": 1.0421595460098034, + "grad_norm": 0.11395785212516785, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 269590 + }, + { + "epoch": 1.0421982032131867, + "grad_norm": 0.09112124145030975, + "learning_rate": 0.002, + "loss": 2.3614, + "step": 269600 + }, + { + "epoch": 1.04223686041657, + "grad_norm": 0.09444142878055573, + "learning_rate": 0.002, + "loss": 2.338, + "step": 269610 + }, + { + "epoch": 1.0422755176199534, + "grad_norm": 0.11114094406366348, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 269620 + }, + { + "epoch": 1.0423141748233367, + "grad_norm": 0.10980421304702759, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 269630 + }, + { + "epoch": 1.04235283202672, + "grad_norm": 0.1392664611339569, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 269640 + }, + { + "epoch": 1.0423914892301032, + "grad_norm": 0.1121130958199501, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 269650 + }, + { + "epoch": 1.0424301464334864, + "grad_norm": 0.10922688990831375, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 269660 + }, + { + "epoch": 1.0424688036368697, + "grad_norm": 0.10173194855451584, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 269670 + }, + { + "epoch": 1.042507460840253, + "grad_norm": 0.10193554311990738, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 269680 + }, + { + "epoch": 1.0425461180436362, + "grad_norm": 0.12612669169902802, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 269690 + }, + { + "epoch": 1.0425847752470194, + "grad_norm": 0.0927710235118866, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 269700 + }, + { + "epoch": 1.042623432450403, + "grad_norm": 0.11005710810422897, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 269710 + }, + { + "epoch": 1.0426620896537862, + "grad_norm": 0.1214102953672409, + "learning_rate": 0.002, + "loss": 2.333, + "step": 269720 + }, + { + "epoch": 1.0427007468571694, + "grad_norm": 0.11738359928131104, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 269730 + }, + { + "epoch": 1.0427394040605527, + "grad_norm": 0.11865021288394928, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 269740 + }, + { + "epoch": 1.042778061263936, + "grad_norm": 0.0988292321562767, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 269750 + }, + { + "epoch": 1.0428167184673192, + "grad_norm": 0.09761074930429459, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 269760 + }, + { + "epoch": 1.0428553756707024, + "grad_norm": 0.12573882937431335, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 269770 + }, + { + "epoch": 1.0428940328740857, + "grad_norm": 0.11221905052661896, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 269780 + }, + { + "epoch": 1.0429326900774691, + "grad_norm": 0.12006543576717377, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 269790 + }, + { + "epoch": 1.0429713472808524, + "grad_norm": 0.09778149425983429, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 269800 + }, + { + "epoch": 1.0430100044842356, + "grad_norm": 0.08791869878768921, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 269810 + }, + { + "epoch": 1.043048661687619, + "grad_norm": 0.11982083320617676, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 269820 + }, + { + "epoch": 1.0430873188910021, + "grad_norm": 0.10641071200370789, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 269830 + }, + { + "epoch": 1.0431259760943854, + "grad_norm": 0.10933824628591537, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 269840 + }, + { + "epoch": 1.0431646332977687, + "grad_norm": 0.09866813570261002, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 269850 + }, + { + "epoch": 1.043203290501152, + "grad_norm": 0.09645739197731018, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 269860 + }, + { + "epoch": 1.0432419477045354, + "grad_norm": 0.10199181735515594, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 269870 + }, + { + "epoch": 1.0432806049079186, + "grad_norm": 0.10536587238311768, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 269880 + }, + { + "epoch": 1.0433192621113019, + "grad_norm": 0.11171141266822815, + "learning_rate": 0.002, + "loss": 2.341, + "step": 269890 + }, + { + "epoch": 1.0433579193146851, + "grad_norm": 0.09901822358369827, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 269900 + }, + { + "epoch": 1.0433965765180684, + "grad_norm": 0.09818220883607864, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 269910 + }, + { + "epoch": 1.0434352337214516, + "grad_norm": 0.10862912237644196, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 269920 + }, + { + "epoch": 1.043473890924835, + "grad_norm": 0.10346197336912155, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 269930 + }, + { + "epoch": 1.0435125481282181, + "grad_norm": 0.12310784310102463, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 269940 + }, + { + "epoch": 1.0435512053316014, + "grad_norm": 0.11298996955156326, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 269950 + }, + { + "epoch": 1.0435898625349849, + "grad_norm": 0.11274628341197968, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 269960 + }, + { + "epoch": 1.0436285197383681, + "grad_norm": 0.13122490048408508, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 269970 + }, + { + "epoch": 1.0436671769417514, + "grad_norm": 0.10723333805799484, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 269980 + }, + { + "epoch": 1.0437058341451346, + "grad_norm": 0.09092576801776886, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 269990 + }, + { + "epoch": 1.0437444913485179, + "grad_norm": 0.11523299664258957, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 270000 + }, + { + "epoch": 1.0437831485519011, + "grad_norm": 0.10284633189439774, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 270010 + }, + { + "epoch": 1.0438218057552844, + "grad_norm": 0.09350597113370895, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 270020 + }, + { + "epoch": 1.0438604629586676, + "grad_norm": 0.10208304971456528, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 270030 + }, + { + "epoch": 1.043899120162051, + "grad_norm": 0.1192924901843071, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 270040 + }, + { + "epoch": 1.0439377773654344, + "grad_norm": 0.10541343688964844, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 270050 + }, + { + "epoch": 1.0439764345688176, + "grad_norm": 0.10207168012857437, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 270060 + }, + { + "epoch": 1.0440150917722009, + "grad_norm": 0.09780795872211456, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 270070 + }, + { + "epoch": 1.0440537489755841, + "grad_norm": 0.10557345300912857, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 270080 + }, + { + "epoch": 1.0440924061789674, + "grad_norm": 0.11492825299501419, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 270090 + }, + { + "epoch": 1.0441310633823506, + "grad_norm": 0.1282857358455658, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 270100 + }, + { + "epoch": 1.0441697205857339, + "grad_norm": 0.09961901605129242, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 270110 + }, + { + "epoch": 1.0442083777891171, + "grad_norm": 0.09308364242315292, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 270120 + }, + { + "epoch": 1.0442470349925006, + "grad_norm": 0.08572640269994736, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 270130 + }, + { + "epoch": 1.0442856921958839, + "grad_norm": 0.08826728910207748, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 270140 + }, + { + "epoch": 1.044324349399267, + "grad_norm": 0.12766070663928986, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 270150 + }, + { + "epoch": 1.0443630066026504, + "grad_norm": 0.1239904910326004, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 270160 + }, + { + "epoch": 1.0444016638060336, + "grad_norm": 0.10418543219566345, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 270170 + }, + { + "epoch": 1.0444403210094169, + "grad_norm": 0.09708482027053833, + "learning_rate": 0.002, + "loss": 2.348, + "step": 270180 + }, + { + "epoch": 1.0444789782128001, + "grad_norm": 0.10282531380653381, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 270190 + }, + { + "epoch": 1.0445176354161834, + "grad_norm": 0.1008395329117775, + "learning_rate": 0.002, + "loss": 2.322, + "step": 270200 + }, + { + "epoch": 1.0445562926195668, + "grad_norm": 0.117550790309906, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 270210 + }, + { + "epoch": 1.04459494982295, + "grad_norm": 0.11662206053733826, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 270220 + }, + { + "epoch": 1.0446336070263333, + "grad_norm": 0.09839627146720886, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 270230 + }, + { + "epoch": 1.0446722642297166, + "grad_norm": 0.10696960985660553, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 270240 + }, + { + "epoch": 1.0447109214330998, + "grad_norm": 0.10411490499973297, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 270250 + }, + { + "epoch": 1.044749578636483, + "grad_norm": 0.10055858641862869, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 270260 + }, + { + "epoch": 1.0447882358398664, + "grad_norm": 0.10897232592105865, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 270270 + }, + { + "epoch": 1.0448268930432496, + "grad_norm": 0.12357335537672043, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 270280 + }, + { + "epoch": 1.0448655502466329, + "grad_norm": 0.10149303078651428, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 270290 + }, + { + "epoch": 1.0449042074500163, + "grad_norm": 0.09007223695516586, + "learning_rate": 0.002, + "loss": 2.343, + "step": 270300 + }, + { + "epoch": 1.0449428646533996, + "grad_norm": 0.09397382289171219, + "learning_rate": 0.002, + "loss": 2.33, + "step": 270310 + }, + { + "epoch": 1.0449815218567828, + "grad_norm": 0.11514809727668762, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 270320 + }, + { + "epoch": 1.045020179060166, + "grad_norm": 0.10506939142942429, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 270330 + }, + { + "epoch": 1.0450588362635493, + "grad_norm": 0.10700733959674835, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 270340 + }, + { + "epoch": 1.0450974934669326, + "grad_norm": 0.10480210930109024, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 270350 + }, + { + "epoch": 1.0451361506703158, + "grad_norm": 0.11787199229001999, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 270360 + }, + { + "epoch": 1.045174807873699, + "grad_norm": 0.09324786067008972, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 270370 + }, + { + "epoch": 1.0452134650770826, + "grad_norm": 0.10228384286165237, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 270380 + }, + { + "epoch": 1.0452521222804658, + "grad_norm": 0.11133359372615814, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 270390 + }, + { + "epoch": 1.045290779483849, + "grad_norm": 0.10327267646789551, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 270400 + }, + { + "epoch": 1.0453294366872323, + "grad_norm": 0.09917426854372025, + "learning_rate": 0.002, + "loss": 2.346, + "step": 270410 + }, + { + "epoch": 1.0453680938906156, + "grad_norm": 0.114399753510952, + "learning_rate": 0.002, + "loss": 2.329, + "step": 270420 + }, + { + "epoch": 1.0454067510939988, + "grad_norm": 0.10123814642429352, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 270430 + }, + { + "epoch": 1.045445408297382, + "grad_norm": 0.08698641508817673, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 270440 + }, + { + "epoch": 1.0454840655007653, + "grad_norm": 0.11683791130781174, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 270450 + }, + { + "epoch": 1.0455227227041486, + "grad_norm": 0.13808204233646393, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 270460 + }, + { + "epoch": 1.045561379907532, + "grad_norm": 0.11392877250909805, + "learning_rate": 0.002, + "loss": 2.3638, + "step": 270470 + }, + { + "epoch": 1.0456000371109153, + "grad_norm": 0.09933704882860184, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 270480 + }, + { + "epoch": 1.0456386943142986, + "grad_norm": 0.10919066518545151, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 270490 + }, + { + "epoch": 1.0456773515176818, + "grad_norm": 0.10833816230297089, + "learning_rate": 0.002, + "loss": 2.316, + "step": 270500 + }, + { + "epoch": 1.045716008721065, + "grad_norm": 0.1263931691646576, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 270510 + }, + { + "epoch": 1.0457546659244483, + "grad_norm": 0.11145421117544174, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 270520 + }, + { + "epoch": 1.0457933231278316, + "grad_norm": 0.10142406821250916, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 270530 + }, + { + "epoch": 1.0458319803312148, + "grad_norm": 0.11987996101379395, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 270540 + }, + { + "epoch": 1.0458706375345983, + "grad_norm": 0.10346390306949615, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 270550 + }, + { + "epoch": 1.0459092947379816, + "grad_norm": 0.11690594255924225, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 270560 + }, + { + "epoch": 1.0459479519413648, + "grad_norm": 0.10042134672403336, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 270570 + }, + { + "epoch": 1.045986609144748, + "grad_norm": 0.10842636227607727, + "learning_rate": 0.002, + "loss": 2.34, + "step": 270580 + }, + { + "epoch": 1.0460252663481313, + "grad_norm": 0.12476224452257156, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 270590 + }, + { + "epoch": 1.0460639235515146, + "grad_norm": 0.11158926784992218, + "learning_rate": 0.002, + "loss": 2.342, + "step": 270600 + }, + { + "epoch": 1.0461025807548978, + "grad_norm": 0.09732253104448318, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 270610 + }, + { + "epoch": 1.046141237958281, + "grad_norm": 0.10630382597446442, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 270620 + }, + { + "epoch": 1.0461798951616643, + "grad_norm": 0.10902847349643707, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 270630 + }, + { + "epoch": 1.0462185523650478, + "grad_norm": 0.09461641311645508, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 270640 + }, + { + "epoch": 1.046257209568431, + "grad_norm": 0.10040676593780518, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 270650 + }, + { + "epoch": 1.0462958667718143, + "grad_norm": 0.1279398798942566, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 270660 + }, + { + "epoch": 1.0463345239751976, + "grad_norm": 0.08386406302452087, + "learning_rate": 0.002, + "loss": 2.333, + "step": 270670 + }, + { + "epoch": 1.0463731811785808, + "grad_norm": 0.09664589166641235, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 270680 + }, + { + "epoch": 1.046411838381964, + "grad_norm": 0.09919747710227966, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 270690 + }, + { + "epoch": 1.0464504955853473, + "grad_norm": 0.11124635487794876, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 270700 + }, + { + "epoch": 1.0464891527887306, + "grad_norm": 0.09929902851581573, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 270710 + }, + { + "epoch": 1.046527809992114, + "grad_norm": 0.11733206361532211, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 270720 + }, + { + "epoch": 1.0465664671954973, + "grad_norm": 0.09919170290231705, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 270730 + }, + { + "epoch": 1.0466051243988805, + "grad_norm": 0.09976255893707275, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 270740 + }, + { + "epoch": 1.0466437816022638, + "grad_norm": 0.09964817017316818, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 270750 + }, + { + "epoch": 1.046682438805647, + "grad_norm": 0.08482794463634491, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 270760 + }, + { + "epoch": 1.0467210960090303, + "grad_norm": 0.12965020537376404, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 270770 + }, + { + "epoch": 1.0467597532124135, + "grad_norm": 0.09799892455339432, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 270780 + }, + { + "epoch": 1.0467984104157968, + "grad_norm": 0.12030211836099625, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 270790 + }, + { + "epoch": 1.04683706761918, + "grad_norm": 0.13467024266719818, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 270800 + }, + { + "epoch": 1.0468757248225635, + "grad_norm": 0.08914663642644882, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 270810 + }, + { + "epoch": 1.0469143820259468, + "grad_norm": 0.09452791512012482, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 270820 + }, + { + "epoch": 1.04695303922933, + "grad_norm": 0.11398884654045105, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 270830 + }, + { + "epoch": 1.0469916964327133, + "grad_norm": 0.9981908798217773, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 270840 + }, + { + "epoch": 1.0470303536360965, + "grad_norm": 0.11038073152303696, + "learning_rate": 0.002, + "loss": 2.328, + "step": 270850 + }, + { + "epoch": 1.0470690108394798, + "grad_norm": 0.10655945539474487, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 270860 + }, + { + "epoch": 1.047107668042863, + "grad_norm": 0.1006389707326889, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 270870 + }, + { + "epoch": 1.0471463252462463, + "grad_norm": 0.095772385597229, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 270880 + }, + { + "epoch": 1.0471849824496298, + "grad_norm": 0.1254042536020279, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 270890 + }, + { + "epoch": 1.047223639653013, + "grad_norm": 0.09856385737657547, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 270900 + }, + { + "epoch": 1.0472622968563963, + "grad_norm": 0.1072583720088005, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 270910 + }, + { + "epoch": 1.0473009540597795, + "grad_norm": 0.09270235896110535, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 270920 + }, + { + "epoch": 1.0473396112631628, + "grad_norm": 0.12417781352996826, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 270930 + }, + { + "epoch": 1.047378268466546, + "grad_norm": 0.11933035403490067, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 270940 + }, + { + "epoch": 1.0474169256699293, + "grad_norm": 0.09769248962402344, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 270950 + }, + { + "epoch": 1.0474555828733125, + "grad_norm": 0.10664346814155579, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 270960 + }, + { + "epoch": 1.0474942400766958, + "grad_norm": 0.12095224112272263, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 270970 + }, + { + "epoch": 1.0475328972800793, + "grad_norm": 0.10238323360681534, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 270980 + }, + { + "epoch": 1.0475715544834625, + "grad_norm": 0.11612109839916229, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 270990 + }, + { + "epoch": 1.0476102116868458, + "grad_norm": 0.10432593524456024, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 271000 + }, + { + "epoch": 1.047648868890229, + "grad_norm": 0.09392142295837402, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 271010 + }, + { + "epoch": 1.0476875260936123, + "grad_norm": 0.08814319223165512, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 271020 + }, + { + "epoch": 1.0477261832969955, + "grad_norm": 0.1037273108959198, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 271030 + }, + { + "epoch": 1.0477648405003788, + "grad_norm": 0.10812348872423172, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 271040 + }, + { + "epoch": 1.047803497703762, + "grad_norm": 0.1279609352350235, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 271050 + }, + { + "epoch": 1.0478421549071455, + "grad_norm": 0.0913081243634224, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 271060 + }, + { + "epoch": 1.0478808121105287, + "grad_norm": 0.11548645049333572, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 271070 + }, + { + "epoch": 1.047919469313912, + "grad_norm": 0.13509738445281982, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 271080 + }, + { + "epoch": 1.0479581265172953, + "grad_norm": 0.11165101826190948, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 271090 + }, + { + "epoch": 1.0479967837206785, + "grad_norm": 0.10891595482826233, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 271100 + }, + { + "epoch": 1.0480354409240618, + "grad_norm": 0.09405533224344254, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 271110 + }, + { + "epoch": 1.048074098127445, + "grad_norm": 0.09612392634153366, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 271120 + }, + { + "epoch": 1.0481127553308283, + "grad_norm": 0.0851694643497467, + "learning_rate": 0.002, + "loss": 2.339, + "step": 271130 + }, + { + "epoch": 1.0481514125342115, + "grad_norm": 0.1357465535402298, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 271140 + }, + { + "epoch": 1.048190069737595, + "grad_norm": 0.10865718126296997, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 271150 + }, + { + "epoch": 1.0482287269409782, + "grad_norm": 0.09514014422893524, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 271160 + }, + { + "epoch": 1.0482673841443615, + "grad_norm": 0.09729630500078201, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 271170 + }, + { + "epoch": 1.0483060413477447, + "grad_norm": 0.10377205163240433, + "learning_rate": 0.002, + "loss": 2.32, + "step": 271180 + }, + { + "epoch": 1.048344698551128, + "grad_norm": 0.11447707563638687, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 271190 + }, + { + "epoch": 1.0483833557545112, + "grad_norm": 0.09187958389520645, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 271200 + }, + { + "epoch": 1.0484220129578945, + "grad_norm": 0.11575819551944733, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 271210 + }, + { + "epoch": 1.0484606701612778, + "grad_norm": 0.14068150520324707, + "learning_rate": 0.002, + "loss": 2.328, + "step": 271220 + }, + { + "epoch": 1.0484993273646612, + "grad_norm": 0.10853346437215805, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 271230 + }, + { + "epoch": 1.0485379845680445, + "grad_norm": 0.09836877137422562, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 271240 + }, + { + "epoch": 1.0485766417714277, + "grad_norm": 0.11626017093658447, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 271250 + }, + { + "epoch": 1.048615298974811, + "grad_norm": 0.09962525218725204, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 271260 + }, + { + "epoch": 1.0486539561781942, + "grad_norm": 0.10745836049318314, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 271270 + }, + { + "epoch": 1.0486926133815775, + "grad_norm": 0.12288927286863327, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 271280 + }, + { + "epoch": 1.0487312705849607, + "grad_norm": 0.10615988075733185, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 271290 + }, + { + "epoch": 1.048769927788344, + "grad_norm": 0.11295727640390396, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 271300 + }, + { + "epoch": 1.0488085849917272, + "grad_norm": 0.09330426156520844, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 271310 + }, + { + "epoch": 1.0488472421951107, + "grad_norm": 0.1008860170841217, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 271320 + }, + { + "epoch": 1.048885899398494, + "grad_norm": 0.12525293231010437, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 271330 + }, + { + "epoch": 1.0489245566018772, + "grad_norm": 0.09700564295053482, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 271340 + }, + { + "epoch": 1.0489632138052605, + "grad_norm": 0.10422427207231522, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 271350 + }, + { + "epoch": 1.0490018710086437, + "grad_norm": 0.10832743346691132, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 271360 + }, + { + "epoch": 1.049040528212027, + "grad_norm": 0.1054845005273819, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 271370 + }, + { + "epoch": 1.0490791854154102, + "grad_norm": 0.09151072055101395, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 271380 + }, + { + "epoch": 1.0491178426187935, + "grad_norm": 0.10409058630466461, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 271390 + }, + { + "epoch": 1.049156499822177, + "grad_norm": 0.09544691443443298, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 271400 + }, + { + "epoch": 1.0491951570255602, + "grad_norm": 0.09302859753370285, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 271410 + }, + { + "epoch": 1.0492338142289435, + "grad_norm": 0.10048230737447739, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 271420 + }, + { + "epoch": 1.0492724714323267, + "grad_norm": 0.10445299744606018, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 271430 + }, + { + "epoch": 1.04931112863571, + "grad_norm": 0.12577003240585327, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 271440 + }, + { + "epoch": 1.0493497858390932, + "grad_norm": 0.11436501145362854, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 271450 + }, + { + "epoch": 1.0493884430424765, + "grad_norm": 0.1273229718208313, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 271460 + }, + { + "epoch": 1.0494271002458597, + "grad_norm": 0.11267320066690445, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 271470 + }, + { + "epoch": 1.0494657574492432, + "grad_norm": 0.09845497459173203, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 271480 + }, + { + "epoch": 1.0495044146526264, + "grad_norm": 0.10332329571247101, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 271490 + }, + { + "epoch": 1.0495430718560097, + "grad_norm": 0.09913578629493713, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 271500 + }, + { + "epoch": 1.049581729059393, + "grad_norm": 0.11222704499959946, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 271510 + }, + { + "epoch": 1.0496203862627762, + "grad_norm": 0.13589952886104584, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 271520 + }, + { + "epoch": 1.0496590434661595, + "grad_norm": 0.09837301075458527, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 271530 + }, + { + "epoch": 1.0496977006695427, + "grad_norm": 0.10624351352453232, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 271540 + }, + { + "epoch": 1.049736357872926, + "grad_norm": 0.11693647503852844, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 271550 + }, + { + "epoch": 1.0497750150763092, + "grad_norm": 0.09240395575761795, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 271560 + }, + { + "epoch": 1.0498136722796927, + "grad_norm": 0.09947966039180756, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 271570 + }, + { + "epoch": 1.049852329483076, + "grad_norm": 0.10705848783254623, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 271580 + }, + { + "epoch": 1.0498909866864592, + "grad_norm": 0.1160568818449974, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 271590 + }, + { + "epoch": 1.0499296438898424, + "grad_norm": 0.10577414929866791, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 271600 + }, + { + "epoch": 1.0499683010932257, + "grad_norm": 0.09413161128759384, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 271610 + }, + { + "epoch": 1.050006958296609, + "grad_norm": 0.10215846449136734, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 271620 + }, + { + "epoch": 1.0500456154999922, + "grad_norm": 0.1168697401881218, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 271630 + }, + { + "epoch": 1.0500842727033755, + "grad_norm": 0.09911565482616425, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 271640 + }, + { + "epoch": 1.050122929906759, + "grad_norm": 0.09280645102262497, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 271650 + }, + { + "epoch": 1.0501615871101422, + "grad_norm": 0.0974106714129448, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 271660 + }, + { + "epoch": 1.0502002443135254, + "grad_norm": 0.10810904204845428, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 271670 + }, + { + "epoch": 1.0502389015169087, + "grad_norm": 0.10441002994775772, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 271680 + }, + { + "epoch": 1.050277558720292, + "grad_norm": 0.11806993186473846, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 271690 + }, + { + "epoch": 1.0503162159236752, + "grad_norm": 0.10563837736845016, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 271700 + }, + { + "epoch": 1.0503548731270584, + "grad_norm": 0.10502533614635468, + "learning_rate": 0.002, + "loss": 2.349, + "step": 271710 + }, + { + "epoch": 1.0503935303304417, + "grad_norm": 0.10693574696779251, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 271720 + }, + { + "epoch": 1.050432187533825, + "grad_norm": 0.09122729301452637, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 271730 + }, + { + "epoch": 1.0504708447372084, + "grad_norm": 0.11178099364042282, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 271740 + }, + { + "epoch": 1.0505095019405917, + "grad_norm": 0.11062789708375931, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 271750 + }, + { + "epoch": 1.050548159143975, + "grad_norm": 0.09843680262565613, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 271760 + }, + { + "epoch": 1.0505868163473582, + "grad_norm": 0.1031612977385521, + "learning_rate": 0.002, + "loss": 2.348, + "step": 271770 + }, + { + "epoch": 1.0506254735507414, + "grad_norm": 0.12016744911670685, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 271780 + }, + { + "epoch": 1.0506641307541247, + "grad_norm": 0.11719794571399689, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 271790 + }, + { + "epoch": 1.050702787957508, + "grad_norm": 0.09663298726081848, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 271800 + }, + { + "epoch": 1.0507414451608912, + "grad_norm": 0.12161391973495483, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 271810 + }, + { + "epoch": 1.0507801023642747, + "grad_norm": 0.12131401151418686, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 271820 + }, + { + "epoch": 1.050818759567658, + "grad_norm": 0.0994456559419632, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 271830 + }, + { + "epoch": 1.0508574167710412, + "grad_norm": 0.09268373996019363, + "learning_rate": 0.002, + "loss": 2.33, + "step": 271840 + }, + { + "epoch": 1.0508960739744244, + "grad_norm": 0.09143264591693878, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 271850 + }, + { + "epoch": 1.0509347311778077, + "grad_norm": 0.10940791666507721, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 271860 + }, + { + "epoch": 1.050973388381191, + "grad_norm": 0.14378370344638824, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 271870 + }, + { + "epoch": 1.0510120455845742, + "grad_norm": 0.09318773448467255, + "learning_rate": 0.002, + "loss": 2.333, + "step": 271880 + }, + { + "epoch": 1.0510507027879574, + "grad_norm": 0.11682747304439545, + "learning_rate": 0.002, + "loss": 2.344, + "step": 271890 + }, + { + "epoch": 1.051089359991341, + "grad_norm": 0.10547391325235367, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 271900 + }, + { + "epoch": 1.0511280171947242, + "grad_norm": 0.09252913296222687, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 271910 + }, + { + "epoch": 1.0511666743981074, + "grad_norm": 0.12861141562461853, + "learning_rate": 0.002, + "loss": 2.333, + "step": 271920 + }, + { + "epoch": 1.0512053316014907, + "grad_norm": 0.09657534211874008, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 271930 + }, + { + "epoch": 1.051243988804874, + "grad_norm": 0.10140632092952728, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 271940 + }, + { + "epoch": 1.0512826460082572, + "grad_norm": 0.10031257569789886, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 271950 + }, + { + "epoch": 1.0513213032116404, + "grad_norm": 0.10987421125173569, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 271960 + }, + { + "epoch": 1.0513599604150237, + "grad_norm": 0.10414572060108185, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 271970 + }, + { + "epoch": 1.051398617618407, + "grad_norm": 0.12101298570632935, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 271980 + }, + { + "epoch": 1.0514372748217904, + "grad_norm": 0.09400589764118195, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 271990 + }, + { + "epoch": 1.0514759320251736, + "grad_norm": 0.10872562229633331, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 272000 + }, + { + "epoch": 1.051514589228557, + "grad_norm": 0.11514881998300552, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 272010 + }, + { + "epoch": 1.0515532464319401, + "grad_norm": 0.11787637323141098, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 272020 + }, + { + "epoch": 1.0515919036353234, + "grad_norm": 0.12149105221033096, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 272030 + }, + { + "epoch": 1.0516305608387067, + "grad_norm": 0.10233311355113983, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 272040 + }, + { + "epoch": 1.05166921804209, + "grad_norm": 0.11979241669178009, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 272050 + }, + { + "epoch": 1.0517078752454732, + "grad_norm": 0.10569998621940613, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 272060 + }, + { + "epoch": 1.0517465324488566, + "grad_norm": 0.09993526339530945, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 272070 + }, + { + "epoch": 1.0517851896522399, + "grad_norm": 0.11679543554782867, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 272080 + }, + { + "epoch": 1.0518238468556231, + "grad_norm": 0.09242987632751465, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 272090 + }, + { + "epoch": 1.0518625040590064, + "grad_norm": 0.0978369191288948, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 272100 + }, + { + "epoch": 1.0519011612623896, + "grad_norm": 0.10136271268129349, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 272110 + }, + { + "epoch": 1.051939818465773, + "grad_norm": 0.10128985345363617, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 272120 + }, + { + "epoch": 1.0519784756691561, + "grad_norm": 0.0996379554271698, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 272130 + }, + { + "epoch": 1.0520171328725394, + "grad_norm": 0.09816455841064453, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 272140 + }, + { + "epoch": 1.0520557900759226, + "grad_norm": 0.13871274888515472, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 272150 + }, + { + "epoch": 1.0520944472793061, + "grad_norm": 0.10068456828594208, + "learning_rate": 0.002, + "loss": 2.339, + "step": 272160 + }, + { + "epoch": 1.0521331044826894, + "grad_norm": 0.10515620559453964, + "learning_rate": 0.002, + "loss": 2.316, + "step": 272170 + }, + { + "epoch": 1.0521717616860726, + "grad_norm": 0.11310466378927231, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 272180 + }, + { + "epoch": 1.0522104188894559, + "grad_norm": 0.10717508941888809, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 272190 + }, + { + "epoch": 1.0522490760928391, + "grad_norm": 0.10464346408843994, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 272200 + }, + { + "epoch": 1.0522877332962224, + "grad_norm": 0.10172848403453827, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 272210 + }, + { + "epoch": 1.0523263904996056, + "grad_norm": 0.10452727228403091, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 272220 + }, + { + "epoch": 1.0523650477029889, + "grad_norm": 0.15429966151714325, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 272230 + }, + { + "epoch": 1.0524037049063724, + "grad_norm": 0.09801948070526123, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 272240 + }, + { + "epoch": 1.0524423621097556, + "grad_norm": 0.09865277260541916, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 272250 + }, + { + "epoch": 1.0524810193131389, + "grad_norm": 0.11756690591573715, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 272260 + }, + { + "epoch": 1.0525196765165221, + "grad_norm": 0.11615061014890671, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 272270 + }, + { + "epoch": 1.0525583337199054, + "grad_norm": 0.1278165876865387, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 272280 + }, + { + "epoch": 1.0525969909232886, + "grad_norm": 0.10149218887090683, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 272290 + }, + { + "epoch": 1.0526356481266719, + "grad_norm": 0.1761365681886673, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 272300 + }, + { + "epoch": 1.0526743053300551, + "grad_norm": 0.09013681858778, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 272310 + }, + { + "epoch": 1.0527129625334384, + "grad_norm": 0.10677545517683029, + "learning_rate": 0.002, + "loss": 2.335, + "step": 272320 + }, + { + "epoch": 1.0527516197368219, + "grad_norm": 0.11683090031147003, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 272330 + }, + { + "epoch": 1.052790276940205, + "grad_norm": 0.10520520806312561, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 272340 + }, + { + "epoch": 1.0528289341435884, + "grad_norm": 0.10680434852838516, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 272350 + }, + { + "epoch": 1.0528675913469716, + "grad_norm": 0.12004678696393967, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 272360 + }, + { + "epoch": 1.0529062485503549, + "grad_norm": 0.0969165787100792, + "learning_rate": 0.002, + "loss": 2.3594, + "step": 272370 + }, + { + "epoch": 1.0529449057537381, + "grad_norm": 0.17345726490020752, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 272380 + }, + { + "epoch": 1.0529835629571214, + "grad_norm": 0.10361625254154205, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 272390 + }, + { + "epoch": 1.0530222201605046, + "grad_norm": 0.1054096519947052, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 272400 + }, + { + "epoch": 1.053060877363888, + "grad_norm": 0.09320559352636337, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 272410 + }, + { + "epoch": 1.0530995345672713, + "grad_norm": 0.0882129818201065, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 272420 + }, + { + "epoch": 1.0531381917706546, + "grad_norm": 0.12000753730535507, + "learning_rate": 0.002, + "loss": 2.335, + "step": 272430 + }, + { + "epoch": 1.0531768489740378, + "grad_norm": 0.11470437794923782, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 272440 + }, + { + "epoch": 1.053215506177421, + "grad_norm": 0.11474449932575226, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 272450 + }, + { + "epoch": 1.0532541633808044, + "grad_norm": 0.10656072199344635, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 272460 + }, + { + "epoch": 1.0532928205841876, + "grad_norm": 0.10870281606912613, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 272470 + }, + { + "epoch": 1.0533314777875709, + "grad_norm": 0.09416912496089935, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 272480 + }, + { + "epoch": 1.053370134990954, + "grad_norm": 0.11438991129398346, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 272490 + }, + { + "epoch": 1.0534087921943376, + "grad_norm": 0.0996420681476593, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 272500 + }, + { + "epoch": 1.0534474493977208, + "grad_norm": 0.11108607798814774, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 272510 + }, + { + "epoch": 1.053486106601104, + "grad_norm": 0.09957139939069748, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 272520 + }, + { + "epoch": 1.0535247638044873, + "grad_norm": 0.11952586472034454, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 272530 + }, + { + "epoch": 1.0535634210078706, + "grad_norm": 0.22855331003665924, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 272540 + }, + { + "epoch": 1.0536020782112538, + "grad_norm": 0.10648853331804276, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 272550 + }, + { + "epoch": 1.053640735414637, + "grad_norm": 0.10383137315511703, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 272560 + }, + { + "epoch": 1.0536793926180203, + "grad_norm": 0.10470456629991531, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 272570 + }, + { + "epoch": 1.0537180498214038, + "grad_norm": 0.09900840371847153, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 272580 + }, + { + "epoch": 1.053756707024787, + "grad_norm": 0.09683623909950256, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 272590 + }, + { + "epoch": 1.0537953642281703, + "grad_norm": 0.09178218990564346, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 272600 + }, + { + "epoch": 1.0538340214315536, + "grad_norm": 0.10908043384552002, + "learning_rate": 0.002, + "loss": 2.324, + "step": 272610 + }, + { + "epoch": 1.0538726786349368, + "grad_norm": 0.1315956562757492, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 272620 + }, + { + "epoch": 1.05391133583832, + "grad_norm": 0.10674086958169937, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 272630 + }, + { + "epoch": 1.0539499930417033, + "grad_norm": 0.10858672112226486, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 272640 + }, + { + "epoch": 1.0539886502450866, + "grad_norm": 0.0922374576330185, + "learning_rate": 0.002, + "loss": 2.341, + "step": 272650 + }, + { + "epoch": 1.0540273074484698, + "grad_norm": 0.10420143604278564, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 272660 + }, + { + "epoch": 1.0540659646518533, + "grad_norm": 0.10324354469776154, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 272670 + }, + { + "epoch": 1.0541046218552366, + "grad_norm": 0.12462551146745682, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 272680 + }, + { + "epoch": 1.0541432790586198, + "grad_norm": 0.10152140259742737, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 272690 + }, + { + "epoch": 1.054181936262003, + "grad_norm": 0.1041497215628624, + "learning_rate": 0.002, + "loss": 2.333, + "step": 272700 + }, + { + "epoch": 1.0542205934653863, + "grad_norm": 0.10439198464155197, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 272710 + }, + { + "epoch": 1.0542592506687696, + "grad_norm": 0.09257284551858902, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 272720 + }, + { + "epoch": 1.0542979078721528, + "grad_norm": 0.11677654087543488, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 272730 + }, + { + "epoch": 1.054336565075536, + "grad_norm": 0.12653188407421112, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 272740 + }, + { + "epoch": 1.0543752222789196, + "grad_norm": 0.10007433593273163, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 272750 + }, + { + "epoch": 1.0544138794823028, + "grad_norm": 0.10211939364671707, + "learning_rate": 0.002, + "loss": 2.3586, + "step": 272760 + }, + { + "epoch": 1.054452536685686, + "grad_norm": 0.09609940648078918, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 272770 + }, + { + "epoch": 1.0544911938890693, + "grad_norm": 0.11618515849113464, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 272780 + }, + { + "epoch": 1.0545298510924526, + "grad_norm": 0.10952292382717133, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 272790 + }, + { + "epoch": 1.0545685082958358, + "grad_norm": 0.11988136917352676, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 272800 + }, + { + "epoch": 1.054607165499219, + "grad_norm": 0.09441050887107849, + "learning_rate": 0.002, + "loss": 2.338, + "step": 272810 + }, + { + "epoch": 1.0546458227026023, + "grad_norm": 0.11049690842628479, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 272820 + }, + { + "epoch": 1.0546844799059856, + "grad_norm": 0.10169093310832977, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 272830 + }, + { + "epoch": 1.054723137109369, + "grad_norm": 0.09456352889537811, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 272840 + }, + { + "epoch": 1.0547617943127523, + "grad_norm": 0.09587942808866501, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 272850 + }, + { + "epoch": 1.0548004515161356, + "grad_norm": 0.10359784215688705, + "learning_rate": 0.002, + "loss": 2.3143, + "step": 272860 + }, + { + "epoch": 1.0548391087195188, + "grad_norm": 0.11548492312431335, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 272870 + }, + { + "epoch": 1.054877765922902, + "grad_norm": 0.11817555874586105, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 272880 + }, + { + "epoch": 1.0549164231262853, + "grad_norm": 0.1032852828502655, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 272890 + }, + { + "epoch": 1.0549550803296686, + "grad_norm": 0.08856106549501419, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 272900 + }, + { + "epoch": 1.0549937375330518, + "grad_norm": 0.10905643552541733, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 272910 + }, + { + "epoch": 1.0550323947364353, + "grad_norm": 0.11078491061925888, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 272920 + }, + { + "epoch": 1.0550710519398185, + "grad_norm": 0.09490056335926056, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 272930 + }, + { + "epoch": 1.0551097091432018, + "grad_norm": 0.09796487540006638, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 272940 + }, + { + "epoch": 1.055148366346585, + "grad_norm": 0.10905220359563828, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 272950 + }, + { + "epoch": 1.0551870235499683, + "grad_norm": 0.09488983452320099, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 272960 + }, + { + "epoch": 1.0552256807533515, + "grad_norm": 0.09224867075681686, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 272970 + }, + { + "epoch": 1.0552643379567348, + "grad_norm": 0.1428574025630951, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 272980 + }, + { + "epoch": 1.055302995160118, + "grad_norm": 0.11071029305458069, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 272990 + }, + { + "epoch": 1.0553416523635013, + "grad_norm": 0.0985381081700325, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 273000 + }, + { + "epoch": 1.0553803095668848, + "grad_norm": 0.10195576399564743, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 273010 + }, + { + "epoch": 1.055418966770268, + "grad_norm": 0.12107205390930176, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 273020 + }, + { + "epoch": 1.0554576239736513, + "grad_norm": 0.0928567424416542, + "learning_rate": 0.002, + "loss": 2.327, + "step": 273030 + }, + { + "epoch": 1.0554962811770345, + "grad_norm": 0.10422313958406448, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 273040 + }, + { + "epoch": 1.0555349383804178, + "grad_norm": 0.1095958948135376, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 273050 + }, + { + "epoch": 1.055573595583801, + "grad_norm": 0.10636148601770401, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 273060 + }, + { + "epoch": 1.0556122527871843, + "grad_norm": 0.12696272134780884, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 273070 + }, + { + "epoch": 1.0556509099905675, + "grad_norm": 0.09934327751398087, + "learning_rate": 0.002, + "loss": 2.34, + "step": 273080 + }, + { + "epoch": 1.055689567193951, + "grad_norm": 0.09703146666288376, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 273090 + }, + { + "epoch": 1.0557282243973343, + "grad_norm": 0.11481180787086487, + "learning_rate": 0.002, + "loss": 2.325, + "step": 273100 + }, + { + "epoch": 1.0557668816007175, + "grad_norm": 0.11768441647291183, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 273110 + }, + { + "epoch": 1.0558055388041008, + "grad_norm": 0.1007218062877655, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 273120 + }, + { + "epoch": 1.055844196007484, + "grad_norm": 0.11573312431573868, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 273130 + }, + { + "epoch": 1.0558828532108673, + "grad_norm": 0.0907236784696579, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 273140 + }, + { + "epoch": 1.0559215104142505, + "grad_norm": 0.0962614119052887, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 273150 + }, + { + "epoch": 1.0559601676176338, + "grad_norm": 0.09105678647756577, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 273160 + }, + { + "epoch": 1.055998824821017, + "grad_norm": 0.09578508883714676, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 273170 + }, + { + "epoch": 1.0560374820244005, + "grad_norm": 0.09904925525188446, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 273180 + }, + { + "epoch": 1.0560761392277838, + "grad_norm": 0.09582720696926117, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 273190 + }, + { + "epoch": 1.056114796431167, + "grad_norm": 0.11303769797086716, + "learning_rate": 0.002, + "loss": 2.343, + "step": 273200 + }, + { + "epoch": 1.0561534536345503, + "grad_norm": 0.10091835260391235, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 273210 + }, + { + "epoch": 1.0561921108379335, + "grad_norm": 0.10374817997217178, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 273220 + }, + { + "epoch": 1.0562307680413168, + "grad_norm": 0.14822401106357574, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 273230 + }, + { + "epoch": 1.0562694252447, + "grad_norm": 0.08519788831472397, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 273240 + }, + { + "epoch": 1.0563080824480833, + "grad_norm": 0.10144971311092377, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 273250 + }, + { + "epoch": 1.0563467396514667, + "grad_norm": 0.13993439078330994, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 273260 + }, + { + "epoch": 1.05638539685485, + "grad_norm": 0.10113970935344696, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 273270 + }, + { + "epoch": 1.0564240540582333, + "grad_norm": 0.10195260494947433, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 273280 + }, + { + "epoch": 1.0564627112616165, + "grad_norm": 0.10974332690238953, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 273290 + }, + { + "epoch": 1.0565013684649998, + "grad_norm": 0.09610135853290558, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 273300 + }, + { + "epoch": 1.056540025668383, + "grad_norm": 0.10756216943264008, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 273310 + }, + { + "epoch": 1.0565786828717663, + "grad_norm": 0.09635698795318604, + "learning_rate": 0.002, + "loss": 2.341, + "step": 273320 + }, + { + "epoch": 1.0566173400751495, + "grad_norm": 0.11100976914167404, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 273330 + }, + { + "epoch": 1.056655997278533, + "grad_norm": 0.093922458589077, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 273340 + }, + { + "epoch": 1.0566946544819162, + "grad_norm": 0.0890863686800003, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 273350 + }, + { + "epoch": 1.0567333116852995, + "grad_norm": 0.11728120595216751, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 273360 + }, + { + "epoch": 1.0567719688886827, + "grad_norm": 0.12343502789735794, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 273370 + }, + { + "epoch": 1.056810626092066, + "grad_norm": 0.112137570977211, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 273380 + }, + { + "epoch": 1.0568492832954492, + "grad_norm": 0.10779105126857758, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 273390 + }, + { + "epoch": 1.0568879404988325, + "grad_norm": 0.0963764414191246, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 273400 + }, + { + "epoch": 1.0569265977022158, + "grad_norm": 0.09978457540273666, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 273410 + }, + { + "epoch": 1.056965254905599, + "grad_norm": 0.11221887916326523, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 273420 + }, + { + "epoch": 1.0570039121089825, + "grad_norm": 0.12512801587581635, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 273430 + }, + { + "epoch": 1.0570425693123657, + "grad_norm": 0.11520524322986603, + "learning_rate": 0.002, + "loss": 2.34, + "step": 273440 + }, + { + "epoch": 1.057081226515749, + "grad_norm": 0.09933692216873169, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 273450 + }, + { + "epoch": 1.0571198837191322, + "grad_norm": 0.09837357699871063, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 273460 + }, + { + "epoch": 1.0571585409225155, + "grad_norm": 0.09668579697608948, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 273470 + }, + { + "epoch": 1.0571971981258987, + "grad_norm": 0.10653632134199142, + "learning_rate": 0.002, + "loss": 2.319, + "step": 273480 + }, + { + "epoch": 1.057235855329282, + "grad_norm": 0.1118902936577797, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 273490 + }, + { + "epoch": 1.0572745125326652, + "grad_norm": 0.11374566704034805, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 273500 + }, + { + "epoch": 1.0573131697360487, + "grad_norm": 0.09796427935361862, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 273510 + }, + { + "epoch": 1.057351826939432, + "grad_norm": 0.10345358401536942, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 273520 + }, + { + "epoch": 1.0573904841428152, + "grad_norm": 0.12856706976890564, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 273530 + }, + { + "epoch": 1.0574291413461985, + "grad_norm": 0.10860460251569748, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 273540 + }, + { + "epoch": 1.0574677985495817, + "grad_norm": 0.09792128205299377, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 273550 + }, + { + "epoch": 1.057506455752965, + "grad_norm": 0.10655204206705093, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 273560 + }, + { + "epoch": 1.0575451129563482, + "grad_norm": 0.10076314955949783, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 273570 + }, + { + "epoch": 1.0575837701597315, + "grad_norm": 0.0985211580991745, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 273580 + }, + { + "epoch": 1.0576224273631147, + "grad_norm": 0.09232288599014282, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 273590 + }, + { + "epoch": 1.0576610845664982, + "grad_norm": 0.10147222131490707, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 273600 + }, + { + "epoch": 1.0576997417698815, + "grad_norm": 0.11548271775245667, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 273610 + }, + { + "epoch": 1.0577383989732647, + "grad_norm": 0.10642686486244202, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 273620 + }, + { + "epoch": 1.057777056176648, + "grad_norm": 0.10518043488264084, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 273630 + }, + { + "epoch": 1.0578157133800312, + "grad_norm": 0.10492758452892303, + "learning_rate": 0.002, + "loss": 2.3087, + "step": 273640 + }, + { + "epoch": 1.0578543705834145, + "grad_norm": 0.10234058648347855, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 273650 + }, + { + "epoch": 1.0578930277867977, + "grad_norm": 0.09478472173213959, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 273660 + }, + { + "epoch": 1.057931684990181, + "grad_norm": 0.10300685465335846, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 273670 + }, + { + "epoch": 1.0579703421935645, + "grad_norm": 0.09842425584793091, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 273680 + }, + { + "epoch": 1.0580089993969477, + "grad_norm": 0.11219187080860138, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 273690 + }, + { + "epoch": 1.058047656600331, + "grad_norm": 0.11639194190502167, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 273700 + }, + { + "epoch": 1.0580863138037142, + "grad_norm": 0.1035163626074791, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 273710 + }, + { + "epoch": 1.0581249710070975, + "grad_norm": 0.11216370761394501, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 273720 + }, + { + "epoch": 1.0581636282104807, + "grad_norm": 0.12635914981365204, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 273730 + }, + { + "epoch": 1.058202285413864, + "grad_norm": 0.10730802267789841, + "learning_rate": 0.002, + "loss": 2.328, + "step": 273740 + }, + { + "epoch": 1.0582409426172472, + "grad_norm": 0.13340860605239868, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 273750 + }, + { + "epoch": 1.0582795998206307, + "grad_norm": 0.11904604732990265, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 273760 + }, + { + "epoch": 1.058318257024014, + "grad_norm": 0.09762408584356308, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 273770 + }, + { + "epoch": 1.0583569142273972, + "grad_norm": 0.09792578965425491, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 273780 + }, + { + "epoch": 1.0583955714307804, + "grad_norm": 0.09640190005302429, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 273790 + }, + { + "epoch": 1.0584342286341637, + "grad_norm": 0.10028199106454849, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 273800 + }, + { + "epoch": 1.058472885837547, + "grad_norm": 0.09049591422080994, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 273810 + }, + { + "epoch": 1.0585115430409302, + "grad_norm": 0.10669384151697159, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 273820 + }, + { + "epoch": 1.0585502002443135, + "grad_norm": 0.11356060951948166, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 273830 + }, + { + "epoch": 1.0585888574476967, + "grad_norm": 0.09744501858949661, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 273840 + }, + { + "epoch": 1.0586275146510802, + "grad_norm": 0.13544464111328125, + "learning_rate": 0.002, + "loss": 2.322, + "step": 273850 + }, + { + "epoch": 1.0586661718544634, + "grad_norm": 0.11132447421550751, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 273860 + }, + { + "epoch": 1.0587048290578467, + "grad_norm": 0.11165434122085571, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 273870 + }, + { + "epoch": 1.05874348626123, + "grad_norm": 0.09332270920276642, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 273880 + }, + { + "epoch": 1.0587821434646132, + "grad_norm": 0.10789580643177032, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 273890 + }, + { + "epoch": 1.0588208006679964, + "grad_norm": 0.10863988101482391, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 273900 + }, + { + "epoch": 1.0588594578713797, + "grad_norm": 0.11976487934589386, + "learning_rate": 0.002, + "loss": 2.337, + "step": 273910 + }, + { + "epoch": 1.058898115074763, + "grad_norm": 0.1331743597984314, + "learning_rate": 0.002, + "loss": 2.344, + "step": 273920 + }, + { + "epoch": 1.0589367722781464, + "grad_norm": 0.09090913832187653, + "learning_rate": 0.002, + "loss": 2.341, + "step": 273930 + }, + { + "epoch": 1.0589754294815297, + "grad_norm": 0.10430900752544403, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 273940 + }, + { + "epoch": 1.059014086684913, + "grad_norm": 0.08921810984611511, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 273950 + }, + { + "epoch": 1.0590527438882962, + "grad_norm": 0.10172532498836517, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 273960 + }, + { + "epoch": 1.0590914010916794, + "grad_norm": 0.11895634233951569, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 273970 + }, + { + "epoch": 1.0591300582950627, + "grad_norm": 0.12765932083129883, + "learning_rate": 0.002, + "loss": 2.3675, + "step": 273980 + }, + { + "epoch": 1.059168715498446, + "grad_norm": 0.09840631484985352, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 273990 + }, + { + "epoch": 1.0592073727018292, + "grad_norm": 0.1438383013010025, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 274000 + }, + { + "epoch": 1.0592460299052124, + "grad_norm": 0.09646342694759369, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 274010 + }, + { + "epoch": 1.059284687108596, + "grad_norm": 0.10129228979349136, + "learning_rate": 0.002, + "loss": 2.341, + "step": 274020 + }, + { + "epoch": 1.0593233443119792, + "grad_norm": 0.13618502020835876, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 274030 + }, + { + "epoch": 1.0593620015153624, + "grad_norm": 0.11484713852405548, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 274040 + }, + { + "epoch": 1.0594006587187457, + "grad_norm": 0.09245438128709793, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 274050 + }, + { + "epoch": 1.059439315922129, + "grad_norm": 0.0981937125325203, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 274060 + }, + { + "epoch": 1.0594779731255122, + "grad_norm": 0.13430973887443542, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 274070 + }, + { + "epoch": 1.0595166303288954, + "grad_norm": 0.10338521003723145, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 274080 + }, + { + "epoch": 1.0595552875322787, + "grad_norm": 0.09139207750558853, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 274090 + }, + { + "epoch": 1.0595939447356622, + "grad_norm": 0.10357996821403503, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 274100 + }, + { + "epoch": 1.0596326019390454, + "grad_norm": 0.1256159543991089, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 274110 + }, + { + "epoch": 1.0596712591424287, + "grad_norm": 0.10506030172109604, + "learning_rate": 0.002, + "loss": 2.341, + "step": 274120 + }, + { + "epoch": 1.059709916345812, + "grad_norm": 0.16365769505500793, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 274130 + }, + { + "epoch": 1.0597485735491952, + "grad_norm": 0.10342303663492203, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 274140 + }, + { + "epoch": 1.0597872307525784, + "grad_norm": 0.10078897327184677, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 274150 + }, + { + "epoch": 1.0598258879559617, + "grad_norm": 0.14313046634197235, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 274160 + }, + { + "epoch": 1.059864545159345, + "grad_norm": 0.11433429270982742, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 274170 + }, + { + "epoch": 1.0599032023627282, + "grad_norm": 0.11010809987783432, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 274180 + }, + { + "epoch": 1.0599418595661116, + "grad_norm": 0.09769423305988312, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 274190 + }, + { + "epoch": 1.059980516769495, + "grad_norm": 0.13408128917217255, + "learning_rate": 0.002, + "loss": 2.351, + "step": 274200 + }, + { + "epoch": 1.0600191739728781, + "grad_norm": 0.10010013729333878, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 274210 + }, + { + "epoch": 1.0600578311762614, + "grad_norm": 0.09443888068199158, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 274220 + }, + { + "epoch": 1.0600964883796447, + "grad_norm": 0.10674253851175308, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 274230 + }, + { + "epoch": 1.060135145583028, + "grad_norm": 0.09683874249458313, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 274240 + }, + { + "epoch": 1.0601738027864112, + "grad_norm": 0.11076508462429047, + "learning_rate": 0.002, + "loss": 2.331, + "step": 274250 + }, + { + "epoch": 1.0602124599897944, + "grad_norm": 0.12387531995773315, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 274260 + }, + { + "epoch": 1.0602511171931779, + "grad_norm": 0.10102979093790054, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 274270 + }, + { + "epoch": 1.0602897743965611, + "grad_norm": 0.09049500524997711, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 274280 + }, + { + "epoch": 1.0603284315999444, + "grad_norm": 0.10448785871267319, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 274290 + }, + { + "epoch": 1.0603670888033276, + "grad_norm": 0.12147627770900726, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 274300 + }, + { + "epoch": 1.060405746006711, + "grad_norm": 0.11697643250226974, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 274310 + }, + { + "epoch": 1.0604444032100941, + "grad_norm": 0.09097470343112946, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 274320 + }, + { + "epoch": 1.0604830604134774, + "grad_norm": 0.11141699552536011, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 274330 + }, + { + "epoch": 1.0605217176168606, + "grad_norm": 0.11854037642478943, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 274340 + }, + { + "epoch": 1.060560374820244, + "grad_norm": 0.10185335576534271, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 274350 + }, + { + "epoch": 1.0605990320236274, + "grad_norm": 0.10460841655731201, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 274360 + }, + { + "epoch": 1.0606376892270106, + "grad_norm": 0.10948032140731812, + "learning_rate": 0.002, + "loss": 2.328, + "step": 274370 + }, + { + "epoch": 1.0606763464303939, + "grad_norm": 0.14033067226409912, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 274380 + }, + { + "epoch": 1.0607150036337771, + "grad_norm": 0.10076010227203369, + "learning_rate": 0.002, + "loss": 2.348, + "step": 274390 + }, + { + "epoch": 1.0607536608371604, + "grad_norm": 0.09487166255712509, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 274400 + }, + { + "epoch": 1.0607923180405436, + "grad_norm": 0.113301582634449, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 274410 + }, + { + "epoch": 1.0608309752439269, + "grad_norm": 0.2377205640077591, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 274420 + }, + { + "epoch": 1.0608696324473101, + "grad_norm": 0.11768513172864914, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 274430 + }, + { + "epoch": 1.0609082896506936, + "grad_norm": 0.09380123764276505, + "learning_rate": 0.002, + "loss": 2.325, + "step": 274440 + }, + { + "epoch": 1.0609469468540769, + "grad_norm": 0.10629244893789291, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 274450 + }, + { + "epoch": 1.0609856040574601, + "grad_norm": 0.11308005452156067, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 274460 + }, + { + "epoch": 1.0610242612608434, + "grad_norm": 0.11594302952289581, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 274470 + }, + { + "epoch": 1.0610629184642266, + "grad_norm": 0.11383994668722153, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 274480 + }, + { + "epoch": 1.0611015756676099, + "grad_norm": 0.10329388827085495, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 274490 + }, + { + "epoch": 1.0611402328709931, + "grad_norm": 0.11112868785858154, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 274500 + }, + { + "epoch": 1.0611788900743764, + "grad_norm": 0.10998545587062836, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 274510 + }, + { + "epoch": 1.0612175472777596, + "grad_norm": 0.09963172674179077, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 274520 + }, + { + "epoch": 1.061256204481143, + "grad_norm": 0.11295843869447708, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 274530 + }, + { + "epoch": 1.0612948616845264, + "grad_norm": 0.10796195268630981, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 274540 + }, + { + "epoch": 1.0613335188879096, + "grad_norm": 0.14060470461845398, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 274550 + }, + { + "epoch": 1.0613721760912929, + "grad_norm": 0.09685548394918442, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 274560 + }, + { + "epoch": 1.0614108332946761, + "grad_norm": 0.105289526283741, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 274570 + }, + { + "epoch": 1.0614494904980594, + "grad_norm": 0.11777745932340622, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 274580 + }, + { + "epoch": 1.0614881477014426, + "grad_norm": 0.11257560551166534, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 274590 + }, + { + "epoch": 1.0615268049048259, + "grad_norm": 0.11551818251609802, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 274600 + }, + { + "epoch": 1.0615654621082093, + "grad_norm": 0.10022301971912384, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 274610 + }, + { + "epoch": 1.0616041193115926, + "grad_norm": 0.10636663436889648, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 274620 + }, + { + "epoch": 1.0616427765149758, + "grad_norm": 0.09114213287830353, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 274630 + }, + { + "epoch": 1.061681433718359, + "grad_norm": 0.09075837582349777, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 274640 + }, + { + "epoch": 1.0617200909217424, + "grad_norm": 0.1268124133348465, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 274650 + }, + { + "epoch": 1.0617587481251256, + "grad_norm": 0.08955233544111252, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 274660 + }, + { + "epoch": 1.0617974053285089, + "grad_norm": 0.09312479943037033, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 274670 + }, + { + "epoch": 1.061836062531892, + "grad_norm": 0.10527783632278442, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 274680 + }, + { + "epoch": 1.0618747197352754, + "grad_norm": 0.09795436263084412, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 274690 + }, + { + "epoch": 1.0619133769386588, + "grad_norm": 0.11870022118091583, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 274700 + }, + { + "epoch": 1.061952034142042, + "grad_norm": 0.10405212640762329, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 274710 + }, + { + "epoch": 1.0619906913454253, + "grad_norm": 0.11261092126369476, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 274720 + }, + { + "epoch": 1.0620293485488086, + "grad_norm": 0.11202120780944824, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 274730 + }, + { + "epoch": 1.0620680057521918, + "grad_norm": 0.1228702962398529, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 274740 + }, + { + "epoch": 1.062106662955575, + "grad_norm": 0.10662976652383804, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 274750 + }, + { + "epoch": 1.0621453201589584, + "grad_norm": 0.10735514760017395, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 274760 + }, + { + "epoch": 1.0621839773623416, + "grad_norm": 0.09139461070299149, + "learning_rate": 0.002, + "loss": 2.335, + "step": 274770 + }, + { + "epoch": 1.062222634565725, + "grad_norm": 0.11393001675605774, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 274780 + }, + { + "epoch": 1.0622612917691083, + "grad_norm": 0.1101556345820427, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 274790 + }, + { + "epoch": 1.0622999489724916, + "grad_norm": 0.09900113195180893, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 274800 + }, + { + "epoch": 1.0623386061758748, + "grad_norm": 0.1438235342502594, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 274810 + }, + { + "epoch": 1.062377263379258, + "grad_norm": 0.10571251809597015, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 274820 + }, + { + "epoch": 1.0624159205826413, + "grad_norm": 0.10640508681535721, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 274830 + }, + { + "epoch": 1.0624545777860246, + "grad_norm": 0.09718748182058334, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 274840 + }, + { + "epoch": 1.0624932349894078, + "grad_norm": 0.11124593019485474, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 274850 + }, + { + "epoch": 1.062531892192791, + "grad_norm": 0.11500213295221329, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 274860 + }, + { + "epoch": 1.0625705493961746, + "grad_norm": 0.0943385511636734, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 274870 + }, + { + "epoch": 1.0626092065995578, + "grad_norm": 0.1425260305404663, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 274880 + }, + { + "epoch": 1.062647863802941, + "grad_norm": 0.10338190197944641, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 274890 + }, + { + "epoch": 1.0626865210063243, + "grad_norm": 0.10322872549295425, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 274900 + }, + { + "epoch": 1.0627251782097076, + "grad_norm": 0.1208379790186882, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 274910 + }, + { + "epoch": 1.0627638354130908, + "grad_norm": 0.1032503992319107, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 274920 + }, + { + "epoch": 1.062802492616474, + "grad_norm": 0.10165394842624664, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 274930 + }, + { + "epoch": 1.0628411498198573, + "grad_norm": 0.09076716005802155, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 274940 + }, + { + "epoch": 1.0628798070232408, + "grad_norm": 0.10745106637477875, + "learning_rate": 0.002, + "loss": 2.3127, + "step": 274950 + }, + { + "epoch": 1.062918464226624, + "grad_norm": 0.1258857101202011, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 274960 + }, + { + "epoch": 1.0629571214300073, + "grad_norm": 0.10054262727499008, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 274970 + }, + { + "epoch": 1.0629957786333906, + "grad_norm": 0.11825623363256454, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 274980 + }, + { + "epoch": 1.0630344358367738, + "grad_norm": 0.09283744543790817, + "learning_rate": 0.002, + "loss": 2.3143, + "step": 274990 + }, + { + "epoch": 1.063073093040157, + "grad_norm": 0.11862016469240189, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 275000 + }, + { + "epoch": 1.0631117502435403, + "grad_norm": 0.09345857799053192, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 275010 + }, + { + "epoch": 1.0631504074469236, + "grad_norm": 0.0908292606472969, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 275020 + }, + { + "epoch": 1.0631890646503068, + "grad_norm": 0.10507423430681229, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 275030 + }, + { + "epoch": 1.0632277218536903, + "grad_norm": 0.114536352455616, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 275040 + }, + { + "epoch": 1.0632663790570736, + "grad_norm": 0.09364758431911469, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 275050 + }, + { + "epoch": 1.0633050362604568, + "grad_norm": 0.11448070406913757, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 275060 + }, + { + "epoch": 1.06334369346384, + "grad_norm": 0.09450113028287888, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 275070 + }, + { + "epoch": 1.0633823506672233, + "grad_norm": 1.2738556861877441, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 275080 + }, + { + "epoch": 1.0634210078706066, + "grad_norm": 0.11922585964202881, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 275090 + }, + { + "epoch": 1.0634596650739898, + "grad_norm": 0.10912361741065979, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 275100 + }, + { + "epoch": 1.063498322277373, + "grad_norm": 0.10273371636867523, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 275110 + }, + { + "epoch": 1.0635369794807565, + "grad_norm": 0.10637617111206055, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 275120 + }, + { + "epoch": 1.0635756366841398, + "grad_norm": 0.11244622617959976, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 275130 + }, + { + "epoch": 1.063614293887523, + "grad_norm": 0.10040455311536789, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 275140 + }, + { + "epoch": 1.0636529510909063, + "grad_norm": 0.13314113020896912, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 275150 + }, + { + "epoch": 1.0636916082942895, + "grad_norm": 0.10646568983793259, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 275160 + }, + { + "epoch": 1.0637302654976728, + "grad_norm": 0.10095401108264923, + "learning_rate": 0.002, + "loss": 2.32, + "step": 275170 + }, + { + "epoch": 1.063768922701056, + "grad_norm": 0.0974881574511528, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 275180 + }, + { + "epoch": 1.0638075799044393, + "grad_norm": 0.0964067354798317, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 275190 + }, + { + "epoch": 1.0638462371078226, + "grad_norm": 0.1156691312789917, + "learning_rate": 0.002, + "loss": 2.359, + "step": 275200 + }, + { + "epoch": 1.063884894311206, + "grad_norm": 0.11896368861198425, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 275210 + }, + { + "epoch": 1.0639235515145893, + "grad_norm": 0.11752109229564667, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 275220 + }, + { + "epoch": 1.0639622087179725, + "grad_norm": 0.11022363603115082, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 275230 + }, + { + "epoch": 1.0640008659213558, + "grad_norm": 0.09471013396978378, + "learning_rate": 0.002, + "loss": 2.349, + "step": 275240 + }, + { + "epoch": 1.064039523124739, + "grad_norm": 0.10919588059186935, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 275250 + }, + { + "epoch": 1.0640781803281223, + "grad_norm": 0.11964140087366104, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 275260 + }, + { + "epoch": 1.0641168375315055, + "grad_norm": 0.11185113340616226, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 275270 + }, + { + "epoch": 1.0641554947348888, + "grad_norm": 0.14997375011444092, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 275280 + }, + { + "epoch": 1.0641941519382723, + "grad_norm": 0.11473798006772995, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 275290 + }, + { + "epoch": 1.0642328091416555, + "grad_norm": 0.1455710083246231, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 275300 + }, + { + "epoch": 1.0642714663450388, + "grad_norm": 0.10797169804573059, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 275310 + }, + { + "epoch": 1.064310123548422, + "grad_norm": 0.10607920587062836, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 275320 + }, + { + "epoch": 1.0643487807518053, + "grad_norm": 0.10385407507419586, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 275330 + }, + { + "epoch": 1.0643874379551885, + "grad_norm": 0.15248897671699524, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 275340 + }, + { + "epoch": 1.0644260951585718, + "grad_norm": 0.10435408353805542, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 275350 + }, + { + "epoch": 1.064464752361955, + "grad_norm": 0.1083153784275055, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 275360 + }, + { + "epoch": 1.0645034095653383, + "grad_norm": 0.10544626414775848, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 275370 + }, + { + "epoch": 1.0645420667687218, + "grad_norm": 0.11898812651634216, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 275380 + }, + { + "epoch": 1.064580723972105, + "grad_norm": 0.09902945905923843, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 275390 + }, + { + "epoch": 1.0646193811754883, + "grad_norm": 0.1146288812160492, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 275400 + }, + { + "epoch": 1.0646580383788715, + "grad_norm": 0.12026480585336685, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 275410 + }, + { + "epoch": 1.0646966955822548, + "grad_norm": 0.4733019471168518, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 275420 + }, + { + "epoch": 1.064735352785638, + "grad_norm": 0.11432819813489914, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 275430 + }, + { + "epoch": 1.0647740099890213, + "grad_norm": 0.11281923949718475, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 275440 + }, + { + "epoch": 1.0648126671924047, + "grad_norm": 0.11215028911828995, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 275450 + }, + { + "epoch": 1.064851324395788, + "grad_norm": 0.10150204598903656, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 275460 + }, + { + "epoch": 1.0648899815991713, + "grad_norm": 0.09803950786590576, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 275470 + }, + { + "epoch": 1.0649286388025545, + "grad_norm": 0.10600266605615616, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 275480 + }, + { + "epoch": 1.0649672960059378, + "grad_norm": 0.1063169315457344, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 275490 + }, + { + "epoch": 1.065005953209321, + "grad_norm": 0.08664971590042114, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 275500 + }, + { + "epoch": 1.0650446104127043, + "grad_norm": 0.1225690245628357, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 275510 + }, + { + "epoch": 1.0650832676160875, + "grad_norm": 0.11429854482412338, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 275520 + }, + { + "epoch": 1.0651219248194708, + "grad_norm": 0.11013131588697433, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 275530 + }, + { + "epoch": 1.0651605820228542, + "grad_norm": 0.09335508197546005, + "learning_rate": 0.002, + "loss": 2.3106, + "step": 275540 + }, + { + "epoch": 1.0651992392262375, + "grad_norm": 0.11192896962165833, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 275550 + }, + { + "epoch": 1.0652378964296207, + "grad_norm": 0.09795791655778885, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 275560 + }, + { + "epoch": 1.065276553633004, + "grad_norm": 0.11031638830900192, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 275570 + }, + { + "epoch": 1.0653152108363872, + "grad_norm": 0.11539456248283386, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 275580 + }, + { + "epoch": 1.0653538680397705, + "grad_norm": 0.130585178732872, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 275590 + }, + { + "epoch": 1.0653925252431538, + "grad_norm": 0.10120509564876556, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 275600 + }, + { + "epoch": 1.065431182446537, + "grad_norm": 0.09373080730438232, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 275610 + }, + { + "epoch": 1.0654698396499205, + "grad_norm": 0.09474208205938339, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 275620 + }, + { + "epoch": 1.0655084968533037, + "grad_norm": 0.12688371539115906, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 275630 + }, + { + "epoch": 1.065547154056687, + "grad_norm": 0.13162842392921448, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 275640 + }, + { + "epoch": 1.0655858112600702, + "grad_norm": 0.10550396889448166, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 275650 + }, + { + "epoch": 1.0656244684634535, + "grad_norm": 0.10006772726774216, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 275660 + }, + { + "epoch": 1.0656631256668367, + "grad_norm": 0.11802761256694794, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 275670 + }, + { + "epoch": 1.06570178287022, + "grad_norm": 0.11319725215435028, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 275680 + }, + { + "epoch": 1.0657404400736032, + "grad_norm": 0.10054709017276764, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 275690 + }, + { + "epoch": 1.0657790972769865, + "grad_norm": 0.11254961788654327, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 275700 + }, + { + "epoch": 1.06581775448037, + "grad_norm": 0.09680163115262985, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 275710 + }, + { + "epoch": 1.0658564116837532, + "grad_norm": 0.10819884389638901, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 275720 + }, + { + "epoch": 1.0658950688871365, + "grad_norm": 0.11223629862070084, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 275730 + }, + { + "epoch": 1.0659337260905197, + "grad_norm": 0.1306917816400528, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 275740 + }, + { + "epoch": 1.065972383293903, + "grad_norm": 0.08868222683668137, + "learning_rate": 0.002, + "loss": 2.349, + "step": 275750 + }, + { + "epoch": 1.0660110404972862, + "grad_norm": 0.10296325385570526, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 275760 + }, + { + "epoch": 1.0660496977006695, + "grad_norm": 0.10850217193365097, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 275770 + }, + { + "epoch": 1.0660883549040527, + "grad_norm": 0.11120878159999847, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 275780 + }, + { + "epoch": 1.0661270121074362, + "grad_norm": 0.093369260430336, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 275790 + }, + { + "epoch": 1.0661656693108195, + "grad_norm": 0.1033010482788086, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 275800 + }, + { + "epoch": 1.0662043265142027, + "grad_norm": 0.10789181292057037, + "learning_rate": 0.002, + "loss": 2.346, + "step": 275810 + }, + { + "epoch": 1.066242983717586, + "grad_norm": 0.09879382699728012, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 275820 + }, + { + "epoch": 1.0662816409209692, + "grad_norm": 0.10098763555288315, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 275830 + }, + { + "epoch": 1.0663202981243525, + "grad_norm": 0.10814845561981201, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 275840 + }, + { + "epoch": 1.0663589553277357, + "grad_norm": 0.11707577854394913, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 275850 + }, + { + "epoch": 1.066397612531119, + "grad_norm": 0.10472466051578522, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 275860 + }, + { + "epoch": 1.0664362697345022, + "grad_norm": 0.11059992760419846, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 275870 + }, + { + "epoch": 1.0664749269378857, + "grad_norm": 0.114679716527462, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 275880 + }, + { + "epoch": 1.066513584141269, + "grad_norm": 0.09872438758611679, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 275890 + }, + { + "epoch": 1.0665522413446522, + "grad_norm": 0.10187365114688873, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 275900 + }, + { + "epoch": 1.0665908985480355, + "grad_norm": 0.13637171685695648, + "learning_rate": 0.002, + "loss": 2.318, + "step": 275910 + }, + { + "epoch": 1.0666295557514187, + "grad_norm": 0.09725106507539749, + "learning_rate": 0.002, + "loss": 2.3159, + "step": 275920 + }, + { + "epoch": 1.066668212954802, + "grad_norm": 0.12772177159786224, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 275930 + }, + { + "epoch": 1.0667068701581852, + "grad_norm": 0.10485053062438965, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 275940 + }, + { + "epoch": 1.0667455273615685, + "grad_norm": 0.10371177643537521, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 275950 + }, + { + "epoch": 1.066784184564952, + "grad_norm": 0.09366276115179062, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 275960 + }, + { + "epoch": 1.0668228417683352, + "grad_norm": 0.09731849282979965, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 275970 + }, + { + "epoch": 1.0668614989717184, + "grad_norm": 0.09356456249952316, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 275980 + }, + { + "epoch": 1.0669001561751017, + "grad_norm": 0.1003258153796196, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 275990 + }, + { + "epoch": 1.066938813378485, + "grad_norm": 0.13170093297958374, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 276000 + }, + { + "epoch": 1.0669774705818682, + "grad_norm": 0.09846068918704987, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 276010 + }, + { + "epoch": 1.0670161277852515, + "grad_norm": 0.12985065579414368, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 276020 + }, + { + "epoch": 1.0670547849886347, + "grad_norm": 0.11032890528440475, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 276030 + }, + { + "epoch": 1.067093442192018, + "grad_norm": 0.10538095980882645, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 276040 + }, + { + "epoch": 1.0671320993954014, + "grad_norm": 0.10463589429855347, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 276050 + }, + { + "epoch": 1.0671707565987847, + "grad_norm": 0.11432314664125443, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 276060 + }, + { + "epoch": 1.067209413802168, + "grad_norm": 0.11011043936014175, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 276070 + }, + { + "epoch": 1.0672480710055512, + "grad_norm": 0.08819034695625305, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 276080 + }, + { + "epoch": 1.0672867282089344, + "grad_norm": 0.09931288659572601, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 276090 + }, + { + "epoch": 1.0673253854123177, + "grad_norm": 0.09332785755395889, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 276100 + }, + { + "epoch": 1.067364042615701, + "grad_norm": 0.10240274667739868, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 276110 + }, + { + "epoch": 1.0674026998190842, + "grad_norm": 0.09615787118673325, + "learning_rate": 0.002, + "loss": 2.338, + "step": 276120 + }, + { + "epoch": 1.0674413570224677, + "grad_norm": 0.11123973876237869, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 276130 + }, + { + "epoch": 1.067480014225851, + "grad_norm": 0.11293863505125046, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 276140 + }, + { + "epoch": 1.0675186714292342, + "grad_norm": 0.12291529029607773, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 276150 + }, + { + "epoch": 1.0675573286326174, + "grad_norm": 0.10829268395900726, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 276160 + }, + { + "epoch": 1.0675959858360007, + "grad_norm": 0.09903152287006378, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 276170 + }, + { + "epoch": 1.067634643039384, + "grad_norm": 0.10907009243965149, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 276180 + }, + { + "epoch": 1.0676733002427672, + "grad_norm": 0.09899713099002838, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 276190 + }, + { + "epoch": 1.0677119574461504, + "grad_norm": 0.12115491181612015, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 276200 + }, + { + "epoch": 1.0677506146495337, + "grad_norm": 0.090184785425663, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 276210 + }, + { + "epoch": 1.0677892718529172, + "grad_norm": 0.10801292955875397, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 276220 + }, + { + "epoch": 1.0678279290563004, + "grad_norm": 0.09343937039375305, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 276230 + }, + { + "epoch": 1.0678665862596837, + "grad_norm": 0.10986505448818207, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 276240 + }, + { + "epoch": 1.067905243463067, + "grad_norm": 0.11481022089719772, + "learning_rate": 0.002, + "loss": 2.328, + "step": 276250 + }, + { + "epoch": 1.0679439006664502, + "grad_norm": 0.12644775211811066, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 276260 + }, + { + "epoch": 1.0679825578698334, + "grad_norm": 0.11024738848209381, + "learning_rate": 0.002, + "loss": 2.346, + "step": 276270 + }, + { + "epoch": 1.0680212150732167, + "grad_norm": 0.0977511778473854, + "learning_rate": 0.002, + "loss": 2.349, + "step": 276280 + }, + { + "epoch": 1.0680598722766, + "grad_norm": 0.09417412430047989, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 276290 + }, + { + "epoch": 1.0680985294799834, + "grad_norm": 0.10336937010288239, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 276300 + }, + { + "epoch": 1.0681371866833667, + "grad_norm": 0.5236482620239258, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 276310 + }, + { + "epoch": 1.06817584388675, + "grad_norm": 0.0997222289443016, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 276320 + }, + { + "epoch": 1.0682145010901332, + "grad_norm": 0.10604297369718552, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 276330 + }, + { + "epoch": 1.0682531582935164, + "grad_norm": 0.10758452862501144, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 276340 + }, + { + "epoch": 1.0682918154968997, + "grad_norm": 0.11882123351097107, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 276350 + }, + { + "epoch": 1.068330472700283, + "grad_norm": 0.09060604125261307, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 276360 + }, + { + "epoch": 1.0683691299036662, + "grad_norm": 0.8458299040794373, + "learning_rate": 0.002, + "loss": 2.3711, + "step": 276370 + }, + { + "epoch": 1.0684077871070494, + "grad_norm": 0.1441306173801422, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 276380 + }, + { + "epoch": 1.068446444310433, + "grad_norm": 0.11246976256370544, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 276390 + }, + { + "epoch": 1.0684851015138161, + "grad_norm": 0.09019514173269272, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 276400 + }, + { + "epoch": 1.0685237587171994, + "grad_norm": 0.10581713914871216, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 276410 + }, + { + "epoch": 1.0685624159205827, + "grad_norm": 0.10104959458112717, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 276420 + }, + { + "epoch": 1.068601073123966, + "grad_norm": 0.12771807610988617, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 276430 + }, + { + "epoch": 1.0686397303273492, + "grad_norm": 0.13741634786128998, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 276440 + }, + { + "epoch": 1.0686783875307324, + "grad_norm": 0.09425902366638184, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 276450 + }, + { + "epoch": 1.0687170447341157, + "grad_norm": 0.11592981219291687, + "learning_rate": 0.002, + "loss": 2.322, + "step": 276460 + }, + { + "epoch": 1.0687557019374991, + "grad_norm": 0.09700343012809753, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 276470 + }, + { + "epoch": 1.0687943591408824, + "grad_norm": 0.11174482107162476, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 276480 + }, + { + "epoch": 1.0688330163442656, + "grad_norm": 0.11250240355730057, + "learning_rate": 0.002, + "loss": 2.337, + "step": 276490 + }, + { + "epoch": 1.068871673547649, + "grad_norm": 0.1096969023346901, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 276500 + }, + { + "epoch": 1.0689103307510321, + "grad_norm": 0.0991690382361412, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 276510 + }, + { + "epoch": 1.0689489879544154, + "grad_norm": 0.10980476438999176, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 276520 + }, + { + "epoch": 1.0689876451577986, + "grad_norm": 0.10511131584644318, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 276530 + }, + { + "epoch": 1.069026302361182, + "grad_norm": 0.10734134167432785, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 276540 + }, + { + "epoch": 1.0690649595645652, + "grad_norm": 0.10478593409061432, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 276550 + }, + { + "epoch": 1.0691036167679486, + "grad_norm": 0.10274649411439896, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 276560 + }, + { + "epoch": 1.0691422739713319, + "grad_norm": 0.09574639052152634, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 276570 + }, + { + "epoch": 1.0691809311747151, + "grad_norm": 0.10025924444198608, + "learning_rate": 0.002, + "loss": 2.347, + "step": 276580 + }, + { + "epoch": 1.0692195883780984, + "grad_norm": 0.09721271693706512, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 276590 + }, + { + "epoch": 1.0692582455814816, + "grad_norm": 0.10139498859643936, + "learning_rate": 0.002, + "loss": 2.336, + "step": 276600 + }, + { + "epoch": 1.0692969027848649, + "grad_norm": 0.09656251966953278, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 276610 + }, + { + "epoch": 1.0693355599882481, + "grad_norm": 0.12592603266239166, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 276620 + }, + { + "epoch": 1.0693742171916314, + "grad_norm": 0.1186826229095459, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 276630 + }, + { + "epoch": 1.0694128743950149, + "grad_norm": 0.12251210957765579, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 276640 + }, + { + "epoch": 1.0694515315983981, + "grad_norm": 0.08772478997707367, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 276650 + }, + { + "epoch": 1.0694901888017814, + "grad_norm": 0.11454391479492188, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 276660 + }, + { + "epoch": 1.0695288460051646, + "grad_norm": 0.10247767716646194, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 276670 + }, + { + "epoch": 1.0695675032085479, + "grad_norm": 0.11828066408634186, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 276680 + }, + { + "epoch": 1.0696061604119311, + "grad_norm": 0.11170656979084015, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 276690 + }, + { + "epoch": 1.0696448176153144, + "grad_norm": 0.11362114548683167, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 276700 + }, + { + "epoch": 1.0696834748186976, + "grad_norm": 0.10755255818367004, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 276710 + }, + { + "epoch": 1.0697221320220809, + "grad_norm": 0.08972054719924927, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 276720 + }, + { + "epoch": 1.0697607892254644, + "grad_norm": 0.1052330732345581, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 276730 + }, + { + "epoch": 1.0697994464288476, + "grad_norm": 0.09455091506242752, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 276740 + }, + { + "epoch": 1.0698381036322309, + "grad_norm": 0.09553559124469757, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 276750 + }, + { + "epoch": 1.0698767608356141, + "grad_norm": 0.11882331967353821, + "learning_rate": 0.002, + "loss": 2.346, + "step": 276760 + }, + { + "epoch": 1.0699154180389974, + "grad_norm": 0.09439017623662949, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 276770 + }, + { + "epoch": 1.0699540752423806, + "grad_norm": 0.10479924082756042, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 276780 + }, + { + "epoch": 1.0699927324457639, + "grad_norm": 0.10761447995901108, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 276790 + }, + { + "epoch": 1.0700313896491471, + "grad_norm": 0.10427835583686829, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 276800 + }, + { + "epoch": 1.0700700468525306, + "grad_norm": 0.10986112803220749, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 276810 + }, + { + "epoch": 1.0701087040559139, + "grad_norm": 0.11656492203474045, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 276820 + }, + { + "epoch": 1.070147361259297, + "grad_norm": 0.11677859723567963, + "learning_rate": 0.002, + "loss": 2.346, + "step": 276830 + }, + { + "epoch": 1.0701860184626804, + "grad_norm": 0.10161464661359787, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 276840 + }, + { + "epoch": 1.0702246756660636, + "grad_norm": 0.11238999664783478, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 276850 + }, + { + "epoch": 1.0702633328694469, + "grad_norm": 0.10180098563432693, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 276860 + }, + { + "epoch": 1.0703019900728301, + "grad_norm": 0.09605445712804794, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 276870 + }, + { + "epoch": 1.0703406472762134, + "grad_norm": 0.09548425674438477, + "learning_rate": 0.002, + "loss": 2.338, + "step": 276880 + }, + { + "epoch": 1.0703793044795966, + "grad_norm": 0.1020766943693161, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 276890 + }, + { + "epoch": 1.07041796168298, + "grad_norm": 0.11540831625461578, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 276900 + }, + { + "epoch": 1.0704566188863633, + "grad_norm": 0.10443674027919769, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 276910 + }, + { + "epoch": 1.0704952760897466, + "grad_norm": 0.1141781434416771, + "learning_rate": 0.002, + "loss": 2.334, + "step": 276920 + }, + { + "epoch": 1.0705339332931298, + "grad_norm": 0.11112864315509796, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 276930 + }, + { + "epoch": 1.070572590496513, + "grad_norm": 0.0906246155500412, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 276940 + }, + { + "epoch": 1.0706112476998964, + "grad_norm": 0.10664301365613937, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 276950 + }, + { + "epoch": 1.0706499049032796, + "grad_norm": 0.12030696123838425, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 276960 + }, + { + "epoch": 1.0706885621066629, + "grad_norm": 0.09777712821960449, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 276970 + }, + { + "epoch": 1.0707272193100463, + "grad_norm": 0.10626299679279327, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 276980 + }, + { + "epoch": 1.0707658765134296, + "grad_norm": 0.11611949652433395, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 276990 + }, + { + "epoch": 1.0708045337168128, + "grad_norm": 0.09844252467155457, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 277000 + }, + { + "epoch": 1.070843190920196, + "grad_norm": 0.10335033386945724, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 277010 + }, + { + "epoch": 1.0708818481235793, + "grad_norm": 0.10708364844322205, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 277020 + }, + { + "epoch": 1.0709205053269626, + "grad_norm": 0.10955175757408142, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 277030 + }, + { + "epoch": 1.0709591625303458, + "grad_norm": 0.10562895983457565, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 277040 + }, + { + "epoch": 1.070997819733729, + "grad_norm": 0.10760527104139328, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 277050 + }, + { + "epoch": 1.0710364769371123, + "grad_norm": 0.08946993947029114, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 277060 + }, + { + "epoch": 1.0710751341404958, + "grad_norm": 0.09455015510320663, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 277070 + }, + { + "epoch": 1.071113791343879, + "grad_norm": 0.1026642695069313, + "learning_rate": 0.002, + "loss": 2.34, + "step": 277080 + }, + { + "epoch": 1.0711524485472623, + "grad_norm": 0.09760484099388123, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 277090 + }, + { + "epoch": 1.0711911057506456, + "grad_norm": 0.12051312625408173, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 277100 + }, + { + "epoch": 1.0712297629540288, + "grad_norm": 0.10644880682229996, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 277110 + }, + { + "epoch": 1.071268420157412, + "grad_norm": 0.11814972013235092, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 277120 + }, + { + "epoch": 1.0713070773607953, + "grad_norm": 0.09355330467224121, + "learning_rate": 0.002, + "loss": 2.3592, + "step": 277130 + }, + { + "epoch": 1.0713457345641786, + "grad_norm": 0.09649022668600082, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 277140 + }, + { + "epoch": 1.071384391767562, + "grad_norm": 0.10202883183956146, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 277150 + }, + { + "epoch": 1.0714230489709453, + "grad_norm": 0.10278471559286118, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 277160 + }, + { + "epoch": 1.0714617061743286, + "grad_norm": 0.12783558666706085, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 277170 + }, + { + "epoch": 1.0715003633777118, + "grad_norm": 0.09995948523283005, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 277180 + }, + { + "epoch": 1.071539020581095, + "grad_norm": 0.11257115751504898, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 277190 + }, + { + "epoch": 1.0715776777844783, + "grad_norm": 0.11943518370389938, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 277200 + }, + { + "epoch": 1.0716163349878616, + "grad_norm": 0.09310497343540192, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 277210 + }, + { + "epoch": 1.0716549921912448, + "grad_norm": 0.10629613697528839, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 277220 + }, + { + "epoch": 1.071693649394628, + "grad_norm": 0.1634482592344284, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 277230 + }, + { + "epoch": 1.0717323065980116, + "grad_norm": 0.1426219344139099, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 277240 + }, + { + "epoch": 1.0717709638013948, + "grad_norm": 0.09856656938791275, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 277250 + }, + { + "epoch": 1.071809621004778, + "grad_norm": 0.11183300614356995, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 277260 + }, + { + "epoch": 1.0718482782081613, + "grad_norm": 0.16957798600196838, + "learning_rate": 0.002, + "loss": 2.334, + "step": 277270 + }, + { + "epoch": 1.0718869354115446, + "grad_norm": 0.2407703846693039, + "learning_rate": 0.002, + "loss": 2.342, + "step": 277280 + }, + { + "epoch": 1.0719255926149278, + "grad_norm": 0.11991997063159943, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 277290 + }, + { + "epoch": 1.071964249818311, + "grad_norm": 0.09134931117296219, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 277300 + }, + { + "epoch": 1.0720029070216945, + "grad_norm": 0.11510300636291504, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 277310 + }, + { + "epoch": 1.0720415642250778, + "grad_norm": 0.09177210181951523, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 277320 + }, + { + "epoch": 1.072080221428461, + "grad_norm": 0.10126060247421265, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 277330 + }, + { + "epoch": 1.0721188786318443, + "grad_norm": 0.11494698375463486, + "learning_rate": 0.002, + "loss": 2.339, + "step": 277340 + }, + { + "epoch": 1.0721575358352275, + "grad_norm": 0.11380572617053986, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 277350 + }, + { + "epoch": 1.0721961930386108, + "grad_norm": 0.10477299988269806, + "learning_rate": 0.002, + "loss": 2.333, + "step": 277360 + }, + { + "epoch": 1.072234850241994, + "grad_norm": 0.1232742890715599, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 277370 + }, + { + "epoch": 1.0722735074453773, + "grad_norm": 0.1035618707537651, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 277380 + }, + { + "epoch": 1.0723121646487606, + "grad_norm": 0.1024998351931572, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 277390 + }, + { + "epoch": 1.072350821852144, + "grad_norm": 0.09486802667379379, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 277400 + }, + { + "epoch": 1.0723894790555273, + "grad_norm": 0.11042434722185135, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 277410 + }, + { + "epoch": 1.0724281362589105, + "grad_norm": 0.09347493946552277, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 277420 + }, + { + "epoch": 1.0724667934622938, + "grad_norm": 0.09824241697788239, + "learning_rate": 0.002, + "loss": 2.336, + "step": 277430 + }, + { + "epoch": 1.072505450665677, + "grad_norm": 0.10214419662952423, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 277440 + }, + { + "epoch": 1.0725441078690603, + "grad_norm": 0.09179384261369705, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 277450 + }, + { + "epoch": 1.0725827650724435, + "grad_norm": 0.11253994703292847, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 277460 + }, + { + "epoch": 1.0726214222758268, + "grad_norm": 0.09997082501649857, + "learning_rate": 0.002, + "loss": 2.348, + "step": 277470 + }, + { + "epoch": 1.0726600794792103, + "grad_norm": 0.09391092509031296, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 277480 + }, + { + "epoch": 1.0726987366825935, + "grad_norm": 0.1391114741563797, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 277490 + }, + { + "epoch": 1.0727373938859768, + "grad_norm": 0.10522706061601639, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 277500 + }, + { + "epoch": 1.07277605108936, + "grad_norm": 0.1032128557562828, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 277510 + }, + { + "epoch": 1.0728147082927433, + "grad_norm": 0.12092791497707367, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 277520 + }, + { + "epoch": 1.0728533654961265, + "grad_norm": 0.1165420338511467, + "learning_rate": 0.002, + "loss": 2.348, + "step": 277530 + }, + { + "epoch": 1.0728920226995098, + "grad_norm": 0.10182865709066391, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 277540 + }, + { + "epoch": 1.072930679902893, + "grad_norm": 0.10662321746349335, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 277550 + }, + { + "epoch": 1.0729693371062763, + "grad_norm": 0.09967391192913055, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 277560 + }, + { + "epoch": 1.0730079943096598, + "grad_norm": 0.08776018023490906, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 277570 + }, + { + "epoch": 1.073046651513043, + "grad_norm": 0.09252668172121048, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 277580 + }, + { + "epoch": 1.0730853087164263, + "grad_norm": 0.09517596662044525, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 277590 + }, + { + "epoch": 1.0731239659198095, + "grad_norm": 0.12813012301921844, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 277600 + }, + { + "epoch": 1.0731626231231928, + "grad_norm": 0.09909941256046295, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 277610 + }, + { + "epoch": 1.073201280326576, + "grad_norm": 0.10072305053472519, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 277620 + }, + { + "epoch": 1.0732399375299593, + "grad_norm": 0.18498556315898895, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 277630 + }, + { + "epoch": 1.0732785947333425, + "grad_norm": 0.10503578931093216, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 277640 + }, + { + "epoch": 1.073317251936726, + "grad_norm": 0.10219155251979828, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 277650 + }, + { + "epoch": 1.0733559091401093, + "grad_norm": 0.1031673327088356, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 277660 + }, + { + "epoch": 1.0733945663434925, + "grad_norm": 0.10291045904159546, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 277670 + }, + { + "epoch": 1.0734332235468758, + "grad_norm": 0.10479222238063812, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 277680 + }, + { + "epoch": 1.073471880750259, + "grad_norm": 0.11077100038528442, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 277690 + }, + { + "epoch": 1.0735105379536423, + "grad_norm": 0.10235881060361862, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 277700 + }, + { + "epoch": 1.0735491951570255, + "grad_norm": 0.10528463870286942, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 277710 + }, + { + "epoch": 1.0735878523604088, + "grad_norm": 0.11103863269090652, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 277720 + }, + { + "epoch": 1.073626509563792, + "grad_norm": 0.11000587791204453, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 277730 + }, + { + "epoch": 1.0736651667671755, + "grad_norm": 0.1038767471909523, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 277740 + }, + { + "epoch": 1.0737038239705587, + "grad_norm": 0.10358269512653351, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 277750 + }, + { + "epoch": 1.073742481173942, + "grad_norm": 0.12331962585449219, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 277760 + }, + { + "epoch": 1.0737811383773253, + "grad_norm": 0.10131274908781052, + "learning_rate": 0.002, + "loss": 2.337, + "step": 277770 + }, + { + "epoch": 1.0738197955807085, + "grad_norm": 0.10317271202802658, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 277780 + }, + { + "epoch": 1.0738584527840918, + "grad_norm": 0.10653816163539886, + "learning_rate": 0.002, + "loss": 2.3571, + "step": 277790 + }, + { + "epoch": 1.073897109987475, + "grad_norm": 0.09283388406038284, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 277800 + }, + { + "epoch": 1.0739357671908583, + "grad_norm": 0.08693074434995651, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 277810 + }, + { + "epoch": 1.0739744243942417, + "grad_norm": 0.10669288039207458, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 277820 + }, + { + "epoch": 1.074013081597625, + "grad_norm": 0.11283266544342041, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 277830 + }, + { + "epoch": 1.0740517388010082, + "grad_norm": 0.10510426014661789, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 277840 + }, + { + "epoch": 1.0740903960043915, + "grad_norm": 0.09539424628019333, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 277850 + }, + { + "epoch": 1.0741290532077747, + "grad_norm": 0.10931488126516342, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 277860 + }, + { + "epoch": 1.074167710411158, + "grad_norm": 0.1264454573392868, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 277870 + }, + { + "epoch": 1.0742063676145412, + "grad_norm": 0.09714031964540482, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 277880 + }, + { + "epoch": 1.0742450248179245, + "grad_norm": 0.08943731337785721, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 277890 + }, + { + "epoch": 1.0742836820213078, + "grad_norm": 0.09937284141778946, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 277900 + }, + { + "epoch": 1.0743223392246912, + "grad_norm": 0.0941234901547432, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 277910 + }, + { + "epoch": 1.0743609964280745, + "grad_norm": 0.11058460175991058, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 277920 + }, + { + "epoch": 1.0743996536314577, + "grad_norm": 0.10181267559528351, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 277930 + }, + { + "epoch": 1.074438310834841, + "grad_norm": 0.11269213259220123, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 277940 + }, + { + "epoch": 1.0744769680382242, + "grad_norm": 0.1178668662905693, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 277950 + }, + { + "epoch": 1.0745156252416075, + "grad_norm": 0.10448413342237473, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 277960 + }, + { + "epoch": 1.0745542824449907, + "grad_norm": 0.0995522066950798, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 277970 + }, + { + "epoch": 1.074592939648374, + "grad_norm": 0.10711553692817688, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 277980 + }, + { + "epoch": 1.0746315968517575, + "grad_norm": 0.12523172795772552, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 277990 + }, + { + "epoch": 1.0746702540551407, + "grad_norm": 0.09922770410776138, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 278000 + }, + { + "epoch": 1.074708911258524, + "grad_norm": 0.09114769101142883, + "learning_rate": 0.002, + "loss": 2.322, + "step": 278010 + }, + { + "epoch": 1.0747475684619072, + "grad_norm": 0.11168397963047028, + "learning_rate": 0.002, + "loss": 2.355, + "step": 278020 + }, + { + "epoch": 1.0747862256652905, + "grad_norm": 0.12937746942043304, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 278030 + }, + { + "epoch": 1.0748248828686737, + "grad_norm": 0.10723833739757538, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 278040 + }, + { + "epoch": 1.074863540072057, + "grad_norm": 0.1325829029083252, + "learning_rate": 0.002, + "loss": 2.329, + "step": 278050 + }, + { + "epoch": 1.0749021972754402, + "grad_norm": 0.10723695904016495, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 278060 + }, + { + "epoch": 1.0749408544788235, + "grad_norm": 0.09187041223049164, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 278070 + }, + { + "epoch": 1.074979511682207, + "grad_norm": 0.1059562936425209, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 278080 + }, + { + "epoch": 1.0750181688855902, + "grad_norm": 0.10393907129764557, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 278090 + }, + { + "epoch": 1.0750568260889735, + "grad_norm": 0.11185088008642197, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 278100 + }, + { + "epoch": 1.0750954832923567, + "grad_norm": 0.10708445310592651, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 278110 + }, + { + "epoch": 1.07513414049574, + "grad_norm": 0.10166383534669876, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 278120 + }, + { + "epoch": 1.0751727976991232, + "grad_norm": 0.1215904951095581, + "learning_rate": 0.002, + "loss": 2.3113, + "step": 278130 + }, + { + "epoch": 1.0752114549025065, + "grad_norm": 0.11430537700653076, + "learning_rate": 0.002, + "loss": 2.352, + "step": 278140 + }, + { + "epoch": 1.0752501121058897, + "grad_norm": 0.09518372267484665, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 278150 + }, + { + "epoch": 1.0752887693092732, + "grad_norm": 0.11318797618150711, + "learning_rate": 0.002, + "loss": 2.322, + "step": 278160 + }, + { + "epoch": 1.0753274265126564, + "grad_norm": 0.0915597602725029, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 278170 + }, + { + "epoch": 1.0753660837160397, + "grad_norm": 0.1077663004398346, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 278180 + }, + { + "epoch": 1.075404740919423, + "grad_norm": 0.09900057315826416, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 278190 + }, + { + "epoch": 1.0754433981228062, + "grad_norm": 0.12153781205415726, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 278200 + }, + { + "epoch": 1.0754820553261895, + "grad_norm": 0.10533221065998077, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 278210 + }, + { + "epoch": 1.0755207125295727, + "grad_norm": 0.10090882331132889, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 278220 + }, + { + "epoch": 1.075559369732956, + "grad_norm": 0.09460879117250443, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 278230 + }, + { + "epoch": 1.0755980269363392, + "grad_norm": 0.11132773756980896, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 278240 + }, + { + "epoch": 1.0756366841397227, + "grad_norm": 0.1132529079914093, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 278250 + }, + { + "epoch": 1.075675341343106, + "grad_norm": 0.09819561243057251, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 278260 + }, + { + "epoch": 1.0757139985464892, + "grad_norm": 0.11191277205944061, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 278270 + }, + { + "epoch": 1.0757526557498724, + "grad_norm": 0.10459837317466736, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 278280 + }, + { + "epoch": 1.0757913129532557, + "grad_norm": 0.09608481824398041, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 278290 + }, + { + "epoch": 1.075829970156639, + "grad_norm": 0.10903522372245789, + "learning_rate": 0.002, + "loss": 2.355, + "step": 278300 + }, + { + "epoch": 1.0758686273600222, + "grad_norm": 0.11461076885461807, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 278310 + }, + { + "epoch": 1.0759072845634055, + "grad_norm": 0.12711842358112335, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 278320 + }, + { + "epoch": 1.075945941766789, + "grad_norm": 0.11210744082927704, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 278330 + }, + { + "epoch": 1.0759845989701722, + "grad_norm": 0.09837070852518082, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 278340 + }, + { + "epoch": 1.0760232561735554, + "grad_norm": 0.12900109589099884, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 278350 + }, + { + "epoch": 1.0760619133769387, + "grad_norm": 0.10696274787187576, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 278360 + }, + { + "epoch": 1.076100570580322, + "grad_norm": 0.09934203326702118, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 278370 + }, + { + "epoch": 1.0761392277837052, + "grad_norm": 0.10716903209686279, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 278380 + }, + { + "epoch": 1.0761778849870884, + "grad_norm": 0.09976894408464432, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 278390 + }, + { + "epoch": 1.0762165421904717, + "grad_norm": 0.1285540908575058, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 278400 + }, + { + "epoch": 1.076255199393855, + "grad_norm": 0.09722083061933517, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 278410 + }, + { + "epoch": 1.0762938565972384, + "grad_norm": 0.10156688094139099, + "learning_rate": 0.002, + "loss": 2.326, + "step": 278420 + }, + { + "epoch": 1.0763325138006217, + "grad_norm": 0.15742437541484833, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 278430 + }, + { + "epoch": 1.076371171004005, + "grad_norm": 0.11908959597349167, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 278440 + }, + { + "epoch": 1.0764098282073882, + "grad_norm": 0.17446118593215942, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 278450 + }, + { + "epoch": 1.0764484854107714, + "grad_norm": 0.10834288597106934, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 278460 + }, + { + "epoch": 1.0764871426141547, + "grad_norm": 0.09273096174001694, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 278470 + }, + { + "epoch": 1.076525799817538, + "grad_norm": 0.10428832471370697, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 278480 + }, + { + "epoch": 1.0765644570209212, + "grad_norm": 0.10714827477931976, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 278490 + }, + { + "epoch": 1.0766031142243047, + "grad_norm": 0.09862946718931198, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 278500 + }, + { + "epoch": 1.076641771427688, + "grad_norm": 0.0923583135008812, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 278510 + }, + { + "epoch": 1.0766804286310712, + "grad_norm": 0.12656864523887634, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 278520 + }, + { + "epoch": 1.0767190858344544, + "grad_norm": 0.09982597082853317, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 278530 + }, + { + "epoch": 1.0767577430378377, + "grad_norm": 0.09666809439659119, + "learning_rate": 0.002, + "loss": 2.3163, + "step": 278540 + }, + { + "epoch": 1.076796400241221, + "grad_norm": 0.10843471437692642, + "learning_rate": 0.002, + "loss": 2.3157, + "step": 278550 + }, + { + "epoch": 1.0768350574446042, + "grad_norm": 0.10855158418416977, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 278560 + }, + { + "epoch": 1.0768737146479874, + "grad_norm": 0.11012399196624756, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 278570 + }, + { + "epoch": 1.0769123718513707, + "grad_norm": 0.1115560233592987, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 278580 + }, + { + "epoch": 1.0769510290547541, + "grad_norm": 0.09982690215110779, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 278590 + }, + { + "epoch": 1.0769896862581374, + "grad_norm": 0.10116339474916458, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 278600 + }, + { + "epoch": 1.0770283434615207, + "grad_norm": 0.09681201726198196, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 278610 + }, + { + "epoch": 1.077067000664904, + "grad_norm": 0.15509819984436035, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 278620 + }, + { + "epoch": 1.0771056578682872, + "grad_norm": 0.10198183357715607, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 278630 + }, + { + "epoch": 1.0771443150716704, + "grad_norm": 0.09804889559745789, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 278640 + }, + { + "epoch": 1.0771829722750537, + "grad_norm": 0.10165011882781982, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 278650 + }, + { + "epoch": 1.077221629478437, + "grad_norm": 0.11190210282802582, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 278660 + }, + { + "epoch": 1.0772602866818204, + "grad_norm": 0.09899290651082993, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 278670 + }, + { + "epoch": 1.0772989438852036, + "grad_norm": 0.09202511608600616, + "learning_rate": 0.002, + "loss": 2.3087, + "step": 278680 + }, + { + "epoch": 1.077337601088587, + "grad_norm": 0.1107383444905281, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 278690 + }, + { + "epoch": 1.0773762582919701, + "grad_norm": 0.10847750306129456, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 278700 + }, + { + "epoch": 1.0774149154953534, + "grad_norm": 0.1263248175382614, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 278710 + }, + { + "epoch": 1.0774535726987367, + "grad_norm": 0.1095813661813736, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 278720 + }, + { + "epoch": 1.07749222990212, + "grad_norm": 0.10310844331979752, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 278730 + }, + { + "epoch": 1.0775308871055032, + "grad_norm": 0.11807134002447128, + "learning_rate": 0.002, + "loss": 2.327, + "step": 278740 + }, + { + "epoch": 1.0775695443088864, + "grad_norm": 0.09761965274810791, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 278750 + }, + { + "epoch": 1.0776082015122699, + "grad_norm": 0.118479885160923, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 278760 + }, + { + "epoch": 1.0776468587156531, + "grad_norm": 0.10395986586809158, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 278770 + }, + { + "epoch": 1.0776855159190364, + "grad_norm": 0.09768043458461761, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 278780 + }, + { + "epoch": 1.0777241731224196, + "grad_norm": 0.11547421663999557, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 278790 + }, + { + "epoch": 1.077762830325803, + "grad_norm": 0.13452501595020294, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 278800 + }, + { + "epoch": 1.0778014875291861, + "grad_norm": 0.1132790595293045, + "learning_rate": 0.002, + "loss": 2.3555, + "step": 278810 + }, + { + "epoch": 1.0778401447325694, + "grad_norm": 0.10164622217416763, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 278820 + }, + { + "epoch": 1.0778788019359526, + "grad_norm": 0.11321406066417694, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 278830 + }, + { + "epoch": 1.0779174591393361, + "grad_norm": 0.12079408764839172, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 278840 + }, + { + "epoch": 1.0779561163427194, + "grad_norm": 0.10696251690387726, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 278850 + }, + { + "epoch": 1.0779947735461026, + "grad_norm": 0.10298726707696915, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 278860 + }, + { + "epoch": 1.0780334307494859, + "grad_norm": 0.11193158477544785, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 278870 + }, + { + "epoch": 1.0780720879528691, + "grad_norm": 0.09726030379533768, + "learning_rate": 0.002, + "loss": 2.323, + "step": 278880 + }, + { + "epoch": 1.0781107451562524, + "grad_norm": 0.0914492979645729, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 278890 + }, + { + "epoch": 1.0781494023596356, + "grad_norm": 0.10119852423667908, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 278900 + }, + { + "epoch": 1.0781880595630189, + "grad_norm": 0.10391935706138611, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 278910 + }, + { + "epoch": 1.0782267167664021, + "grad_norm": 0.110045425593853, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 278920 + }, + { + "epoch": 1.0782653739697856, + "grad_norm": 0.10475011169910431, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 278930 + }, + { + "epoch": 1.0783040311731689, + "grad_norm": 0.09570727497339249, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 278940 + }, + { + "epoch": 1.0783426883765521, + "grad_norm": 0.10496969521045685, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 278950 + }, + { + "epoch": 1.0783813455799354, + "grad_norm": 0.10734902322292328, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 278960 + }, + { + "epoch": 1.0784200027833186, + "grad_norm": 0.11177080869674683, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 278970 + }, + { + "epoch": 1.0784586599867019, + "grad_norm": 0.12020063400268555, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 278980 + }, + { + "epoch": 1.0784973171900851, + "grad_norm": 0.09938409179449081, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 278990 + }, + { + "epoch": 1.0785359743934684, + "grad_norm": 0.10291053354740143, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 279000 + }, + { + "epoch": 1.0785746315968519, + "grad_norm": 0.10316527634859085, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 279010 + }, + { + "epoch": 1.078613288800235, + "grad_norm": 0.12356262654066086, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 279020 + }, + { + "epoch": 1.0786519460036184, + "grad_norm": 0.10434281080961227, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 279030 + }, + { + "epoch": 1.0786906032070016, + "grad_norm": 0.09368117153644562, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 279040 + }, + { + "epoch": 1.0787292604103849, + "grad_norm": 0.09900877624750137, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 279050 + }, + { + "epoch": 1.0787679176137681, + "grad_norm": 0.10736262798309326, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 279060 + }, + { + "epoch": 1.0788065748171514, + "grad_norm": 0.08920001238584518, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 279070 + }, + { + "epoch": 1.0788452320205346, + "grad_norm": 0.1025761291384697, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 279080 + }, + { + "epoch": 1.0788838892239179, + "grad_norm": 0.12492604553699493, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 279090 + }, + { + "epoch": 1.0789225464273013, + "grad_norm": 0.12873578071594238, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 279100 + }, + { + "epoch": 1.0789612036306846, + "grad_norm": 0.11375364661216736, + "learning_rate": 0.002, + "loss": 2.341, + "step": 279110 + }, + { + "epoch": 1.0789998608340678, + "grad_norm": 0.12128067016601562, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 279120 + }, + { + "epoch": 1.079038518037451, + "grad_norm": 0.10374868661165237, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 279130 + }, + { + "epoch": 1.0790771752408344, + "grad_norm": 0.10124023258686066, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 279140 + }, + { + "epoch": 1.0791158324442176, + "grad_norm": 0.13224327564239502, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 279150 + }, + { + "epoch": 1.0791544896476009, + "grad_norm": 0.10030703991651535, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 279160 + }, + { + "epoch": 1.079193146850984, + "grad_norm": 0.12761624157428741, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 279170 + }, + { + "epoch": 1.0792318040543676, + "grad_norm": 0.10995358228683472, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 279180 + }, + { + "epoch": 1.0792704612577508, + "grad_norm": 0.10545569658279419, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 279190 + }, + { + "epoch": 1.079309118461134, + "grad_norm": 0.09752767533063889, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 279200 + }, + { + "epoch": 1.0793477756645173, + "grad_norm": 0.09510950744152069, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 279210 + }, + { + "epoch": 1.0793864328679006, + "grad_norm": 0.10693711042404175, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 279220 + }, + { + "epoch": 1.0794250900712838, + "grad_norm": 0.09939834475517273, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 279230 + }, + { + "epoch": 1.079463747274667, + "grad_norm": 0.11319193243980408, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 279240 + }, + { + "epoch": 1.0795024044780503, + "grad_norm": 0.08854492753744125, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 279250 + }, + { + "epoch": 1.0795410616814338, + "grad_norm": 0.10071007162332535, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 279260 + }, + { + "epoch": 1.079579718884817, + "grad_norm": 0.10331925004720688, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 279270 + }, + { + "epoch": 1.0796183760882003, + "grad_norm": 0.09988714009523392, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 279280 + }, + { + "epoch": 1.0796570332915836, + "grad_norm": 0.11048047989606857, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 279290 + }, + { + "epoch": 1.0796956904949668, + "grad_norm": 0.0965648666024208, + "learning_rate": 0.002, + "loss": 2.337, + "step": 279300 + }, + { + "epoch": 1.07973434769835, + "grad_norm": 0.08937753736972809, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 279310 + }, + { + "epoch": 1.0797730049017333, + "grad_norm": 0.13992008566856384, + "learning_rate": 0.002, + "loss": 2.346, + "step": 279320 + }, + { + "epoch": 1.0798116621051166, + "grad_norm": 0.09776229411363602, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 279330 + }, + { + "epoch": 1.0798503193085, + "grad_norm": 0.11652453243732452, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 279340 + }, + { + "epoch": 1.0798889765118833, + "grad_norm": 0.3779715597629547, + "learning_rate": 0.002, + "loss": 2.351, + "step": 279350 + }, + { + "epoch": 1.0799276337152666, + "grad_norm": 0.24565216898918152, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 279360 + }, + { + "epoch": 1.0799662909186498, + "grad_norm": 0.1088612824678421, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 279370 + }, + { + "epoch": 1.080004948122033, + "grad_norm": 0.10705023258924484, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 279380 + }, + { + "epoch": 1.0800436053254163, + "grad_norm": 0.10938013345003128, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 279390 + }, + { + "epoch": 1.0800822625287996, + "grad_norm": 0.09505293518304825, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 279400 + }, + { + "epoch": 1.0801209197321828, + "grad_norm": 0.10040805488824844, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 279410 + }, + { + "epoch": 1.080159576935566, + "grad_norm": 0.10634848475456238, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 279420 + }, + { + "epoch": 1.0801982341389496, + "grad_norm": 0.1252964287996292, + "learning_rate": 0.002, + "loss": 2.322, + "step": 279430 + }, + { + "epoch": 1.0802368913423328, + "grad_norm": 0.08823366463184357, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 279440 + }, + { + "epoch": 1.080275548545716, + "grad_norm": 0.11154643446207047, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 279450 + }, + { + "epoch": 1.0803142057490993, + "grad_norm": 0.10544943064451218, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 279460 + }, + { + "epoch": 1.0803528629524826, + "grad_norm": 0.13477879762649536, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 279470 + }, + { + "epoch": 1.0803915201558658, + "grad_norm": 0.1104428619146347, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 279480 + }, + { + "epoch": 1.080430177359249, + "grad_norm": 0.0944984182715416, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 279490 + }, + { + "epoch": 1.0804688345626323, + "grad_norm": 0.1066855788230896, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 279500 + }, + { + "epoch": 1.0805074917660158, + "grad_norm": 0.10000107437372208, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 279510 + }, + { + "epoch": 1.080546148969399, + "grad_norm": 0.09573391079902649, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 279520 + }, + { + "epoch": 1.0805848061727823, + "grad_norm": 0.12505954504013062, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 279530 + }, + { + "epoch": 1.0806234633761655, + "grad_norm": 0.10334816575050354, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 279540 + }, + { + "epoch": 1.0806621205795488, + "grad_norm": 0.09821376204490662, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 279550 + }, + { + "epoch": 1.080700777782932, + "grad_norm": 0.09971542656421661, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 279560 + }, + { + "epoch": 1.0807394349863153, + "grad_norm": 0.11195044964551926, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 279570 + }, + { + "epoch": 1.0807780921896986, + "grad_norm": 0.11056491732597351, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 279580 + }, + { + "epoch": 1.0808167493930818, + "grad_norm": 0.09117507189512253, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 279590 + }, + { + "epoch": 1.0808554065964653, + "grad_norm": 0.10031966120004654, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 279600 + }, + { + "epoch": 1.0808940637998485, + "grad_norm": 0.09302040189504623, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 279610 + }, + { + "epoch": 1.0809327210032318, + "grad_norm": 0.1287396401166916, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 279620 + }, + { + "epoch": 1.080971378206615, + "grad_norm": 0.1282958984375, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 279630 + }, + { + "epoch": 1.0810100354099983, + "grad_norm": 0.09668225795030594, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 279640 + }, + { + "epoch": 1.0810486926133815, + "grad_norm": 0.12607425451278687, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 279650 + }, + { + "epoch": 1.0810873498167648, + "grad_norm": 0.10190489888191223, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 279660 + }, + { + "epoch": 1.081126007020148, + "grad_norm": 0.11293112486600876, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 279670 + }, + { + "epoch": 1.0811646642235315, + "grad_norm": 0.11115790903568268, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 279680 + }, + { + "epoch": 1.0812033214269148, + "grad_norm": 0.10267027467489243, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 279690 + }, + { + "epoch": 1.081241978630298, + "grad_norm": 0.10083723813295364, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 279700 + }, + { + "epoch": 1.0812806358336813, + "grad_norm": 0.09403855353593826, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 279710 + }, + { + "epoch": 1.0813192930370645, + "grad_norm": 0.0995630994439125, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 279720 + }, + { + "epoch": 1.0813579502404478, + "grad_norm": 0.11248226463794708, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 279730 + }, + { + "epoch": 1.081396607443831, + "grad_norm": 0.11201859265565872, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 279740 + }, + { + "epoch": 1.0814352646472143, + "grad_norm": 0.08905567973852158, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 279750 + }, + { + "epoch": 1.0814739218505975, + "grad_norm": 0.09506616741418839, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 279760 + }, + { + "epoch": 1.081512579053981, + "grad_norm": 0.09160199016332626, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 279770 + }, + { + "epoch": 1.0815512362573643, + "grad_norm": 0.12843123078346252, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 279780 + }, + { + "epoch": 1.0815898934607475, + "grad_norm": 0.12316249310970306, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 279790 + }, + { + "epoch": 1.0816285506641308, + "grad_norm": 0.11130484938621521, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 279800 + }, + { + "epoch": 1.081667207867514, + "grad_norm": 0.10220065712928772, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 279810 + }, + { + "epoch": 1.0817058650708973, + "grad_norm": 0.11963673681020737, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 279820 + }, + { + "epoch": 1.0817445222742805, + "grad_norm": 0.1078323945403099, + "learning_rate": 0.002, + "loss": 2.352, + "step": 279830 + }, + { + "epoch": 1.0817831794776638, + "grad_norm": 0.10187908262014389, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 279840 + }, + { + "epoch": 1.0818218366810473, + "grad_norm": 0.10245156288146973, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 279850 + }, + { + "epoch": 1.0818604938844305, + "grad_norm": 0.10593272000551224, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 279860 + }, + { + "epoch": 1.0818991510878138, + "grad_norm": 0.11321359872817993, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 279870 + }, + { + "epoch": 1.081937808291197, + "grad_norm": 0.11144161224365234, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 279880 + }, + { + "epoch": 1.0819764654945803, + "grad_norm": 0.10398919135332108, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 279890 + }, + { + "epoch": 1.0820151226979635, + "grad_norm": 0.10995649546384811, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 279900 + }, + { + "epoch": 1.0820537799013468, + "grad_norm": 0.1734006255865097, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 279910 + }, + { + "epoch": 1.08209243710473, + "grad_norm": 0.09477046877145767, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 279920 + }, + { + "epoch": 1.0821310943081133, + "grad_norm": 0.11546581983566284, + "learning_rate": 0.002, + "loss": 2.338, + "step": 279930 + }, + { + "epoch": 1.0821697515114967, + "grad_norm": 0.10040906816720963, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 279940 + }, + { + "epoch": 1.08220840871488, + "grad_norm": 0.09605206549167633, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 279950 + }, + { + "epoch": 1.0822470659182633, + "grad_norm": 0.12556235492229462, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 279960 + }, + { + "epoch": 1.0822857231216465, + "grad_norm": 0.10956462472677231, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 279970 + }, + { + "epoch": 1.0823243803250298, + "grad_norm": 0.10256555676460266, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 279980 + }, + { + "epoch": 1.082363037528413, + "grad_norm": 0.10385078191757202, + "learning_rate": 0.002, + "loss": 2.325, + "step": 279990 + }, + { + "epoch": 1.0824016947317963, + "grad_norm": 0.11647751182317734, + "learning_rate": 0.002, + "loss": 2.33, + "step": 280000 + }, + { + "epoch": 1.0824403519351795, + "grad_norm": 0.10998006910085678, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 280010 + }, + { + "epoch": 1.082479009138563, + "grad_norm": 0.09821558743715286, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 280020 + }, + { + "epoch": 1.0825176663419462, + "grad_norm": 0.10078676044940948, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 280030 + }, + { + "epoch": 1.0825563235453295, + "grad_norm": 0.10271522402763367, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 280040 + }, + { + "epoch": 1.0825949807487127, + "grad_norm": 0.09726682305335999, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 280050 + }, + { + "epoch": 1.082633637952096, + "grad_norm": 0.09116078913211823, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 280060 + }, + { + "epoch": 1.0826722951554792, + "grad_norm": 0.10180582851171494, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 280070 + }, + { + "epoch": 1.0827109523588625, + "grad_norm": 0.10629677027463913, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 280080 + }, + { + "epoch": 1.0827496095622458, + "grad_norm": 0.1072465181350708, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 280090 + }, + { + "epoch": 1.082788266765629, + "grad_norm": 0.10737710446119308, + "learning_rate": 0.002, + "loss": 2.338, + "step": 280100 + }, + { + "epoch": 1.0828269239690125, + "grad_norm": 0.10657298564910889, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 280110 + }, + { + "epoch": 1.0828655811723957, + "grad_norm": 0.12205453217029572, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 280120 + }, + { + "epoch": 1.082904238375779, + "grad_norm": 0.11014603823423386, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 280130 + }, + { + "epoch": 1.0829428955791622, + "grad_norm": 0.261272132396698, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 280140 + }, + { + "epoch": 1.0829815527825455, + "grad_norm": 0.10859159380197525, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 280150 + }, + { + "epoch": 1.0830202099859287, + "grad_norm": 0.11122594028711319, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 280160 + }, + { + "epoch": 1.083058867189312, + "grad_norm": 0.11559804528951645, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 280170 + }, + { + "epoch": 1.0830975243926952, + "grad_norm": 0.0992172583937645, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 280180 + }, + { + "epoch": 1.0831361815960787, + "grad_norm": 0.12179453670978546, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 280190 + }, + { + "epoch": 1.083174838799462, + "grad_norm": 0.10722031444311142, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 280200 + }, + { + "epoch": 1.0832134960028452, + "grad_norm": 0.09454517811536789, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 280210 + }, + { + "epoch": 1.0832521532062285, + "grad_norm": 0.11187735199928284, + "learning_rate": 0.002, + "loss": 2.341, + "step": 280220 + }, + { + "epoch": 1.0832908104096117, + "grad_norm": 0.13172794878482819, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 280230 + }, + { + "epoch": 1.083329467612995, + "grad_norm": 0.10378477722406387, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 280240 + }, + { + "epoch": 1.0833681248163782, + "grad_norm": 0.10433642566204071, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 280250 + }, + { + "epoch": 1.0834067820197615, + "grad_norm": 0.10065358877182007, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 280260 + }, + { + "epoch": 1.0834454392231447, + "grad_norm": 0.09691182523965836, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 280270 + }, + { + "epoch": 1.0834840964265282, + "grad_norm": 0.10980410128831863, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 280280 + }, + { + "epoch": 1.0835227536299115, + "grad_norm": 0.11095762252807617, + "learning_rate": 0.002, + "loss": 2.34, + "step": 280290 + }, + { + "epoch": 1.0835614108332947, + "grad_norm": 0.09890416264533997, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 280300 + }, + { + "epoch": 1.083600068036678, + "grad_norm": 0.13483504951000214, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 280310 + }, + { + "epoch": 1.0836387252400612, + "grad_norm": 0.10112215578556061, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 280320 + }, + { + "epoch": 1.0836773824434445, + "grad_norm": 0.1662277728319168, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 280330 + }, + { + "epoch": 1.0837160396468277, + "grad_norm": 0.1037745475769043, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 280340 + }, + { + "epoch": 1.083754696850211, + "grad_norm": 0.11243646591901779, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 280350 + }, + { + "epoch": 1.0837933540535944, + "grad_norm": 0.1472591906785965, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 280360 + }, + { + "epoch": 1.0838320112569777, + "grad_norm": 0.092743881046772, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 280370 + }, + { + "epoch": 1.083870668460361, + "grad_norm": 0.10754603147506714, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 280380 + }, + { + "epoch": 1.0839093256637442, + "grad_norm": 0.160426527261734, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 280390 + }, + { + "epoch": 1.0839479828671275, + "grad_norm": 0.11159920692443848, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 280400 + }, + { + "epoch": 1.0839866400705107, + "grad_norm": 0.10037899762392044, + "learning_rate": 0.002, + "loss": 2.349, + "step": 280410 + }, + { + "epoch": 1.084025297273894, + "grad_norm": 0.1004137396812439, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 280420 + }, + { + "epoch": 1.0840639544772772, + "grad_norm": 0.10265239328145981, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 280430 + }, + { + "epoch": 1.0841026116806605, + "grad_norm": 0.10469073057174683, + "learning_rate": 0.002, + "loss": 2.344, + "step": 280440 + }, + { + "epoch": 1.084141268884044, + "grad_norm": 0.10935996472835541, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 280450 + }, + { + "epoch": 1.0841799260874272, + "grad_norm": 0.09472064673900604, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 280460 + }, + { + "epoch": 1.0842185832908104, + "grad_norm": 0.16456358134746552, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 280470 + }, + { + "epoch": 1.0842572404941937, + "grad_norm": 0.10961253941059113, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 280480 + }, + { + "epoch": 1.084295897697577, + "grad_norm": 0.12051593512296677, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 280490 + }, + { + "epoch": 1.0843345549009602, + "grad_norm": 0.10590102523565292, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 280500 + }, + { + "epoch": 1.0843732121043435, + "grad_norm": 0.08928204327821732, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 280510 + }, + { + "epoch": 1.0844118693077267, + "grad_norm": 0.11208263784646988, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 280520 + }, + { + "epoch": 1.0844505265111102, + "grad_norm": 0.115069180727005, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 280530 + }, + { + "epoch": 1.0844891837144934, + "grad_norm": 0.10561076551675797, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 280540 + }, + { + "epoch": 1.0845278409178767, + "grad_norm": 0.12431753426790237, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 280550 + }, + { + "epoch": 1.08456649812126, + "grad_norm": 0.09250196069478989, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 280560 + }, + { + "epoch": 1.0846051553246432, + "grad_norm": 0.10803579539060593, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 280570 + }, + { + "epoch": 1.0846438125280264, + "grad_norm": 0.10429500788450241, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 280580 + }, + { + "epoch": 1.0846824697314097, + "grad_norm": 0.09534043818712234, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 280590 + }, + { + "epoch": 1.084721126934793, + "grad_norm": 0.10288545489311218, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 280600 + }, + { + "epoch": 1.0847597841381762, + "grad_norm": 0.12578828632831573, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 280610 + }, + { + "epoch": 1.0847984413415597, + "grad_norm": 0.1013014167547226, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 280620 + }, + { + "epoch": 1.084837098544943, + "grad_norm": 0.10470990836620331, + "learning_rate": 0.002, + "loss": 2.341, + "step": 280630 + }, + { + "epoch": 1.0848757557483262, + "grad_norm": 0.10608610510826111, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 280640 + }, + { + "epoch": 1.0849144129517094, + "grad_norm": 0.1138029396533966, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 280650 + }, + { + "epoch": 1.0849530701550927, + "grad_norm": 0.13247482478618622, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 280660 + }, + { + "epoch": 1.084991727358476, + "grad_norm": 0.09688462316989899, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 280670 + }, + { + "epoch": 1.0850303845618592, + "grad_norm": 0.10027146339416504, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 280680 + }, + { + "epoch": 1.0850690417652424, + "grad_norm": 0.09682323783636093, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 280690 + }, + { + "epoch": 1.085107698968626, + "grad_norm": 0.0977088138461113, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 280700 + }, + { + "epoch": 1.0851463561720092, + "grad_norm": 0.11356858164072037, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 280710 + }, + { + "epoch": 1.0851850133753924, + "grad_norm": 0.10534996539354324, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 280720 + }, + { + "epoch": 1.0852236705787757, + "grad_norm": 0.14064523577690125, + "learning_rate": 0.002, + "loss": 2.334, + "step": 280730 + }, + { + "epoch": 1.085262327782159, + "grad_norm": 0.108973428606987, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 280740 + }, + { + "epoch": 1.0853009849855422, + "grad_norm": 0.10470664501190186, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 280750 + }, + { + "epoch": 1.0853396421889254, + "grad_norm": 0.10709752142429352, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 280760 + }, + { + "epoch": 1.0853782993923087, + "grad_norm": 0.1092754602432251, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 280770 + }, + { + "epoch": 1.085416956595692, + "grad_norm": 0.11230559647083282, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 280780 + }, + { + "epoch": 1.0854556137990754, + "grad_norm": 0.09335718303918839, + "learning_rate": 0.002, + "loss": 2.347, + "step": 280790 + }, + { + "epoch": 1.0854942710024587, + "grad_norm": 0.1012687161564827, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 280800 + }, + { + "epoch": 1.085532928205842, + "grad_norm": 0.09886537492275238, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 280810 + }, + { + "epoch": 1.0855715854092252, + "grad_norm": 0.10019920766353607, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 280820 + }, + { + "epoch": 1.0856102426126084, + "grad_norm": 0.12431039661169052, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 280830 + }, + { + "epoch": 1.0856488998159917, + "grad_norm": 0.1461450308561325, + "learning_rate": 0.002, + "loss": 2.34, + "step": 280840 + }, + { + "epoch": 1.085687557019375, + "grad_norm": 0.0981675386428833, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 280850 + }, + { + "epoch": 1.0857262142227582, + "grad_norm": 0.09918622672557831, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 280860 + }, + { + "epoch": 1.0857648714261416, + "grad_norm": 0.0869884192943573, + "learning_rate": 0.002, + "loss": 2.329, + "step": 280870 + }, + { + "epoch": 1.085803528629525, + "grad_norm": 0.10177048295736313, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 280880 + }, + { + "epoch": 1.0858421858329081, + "grad_norm": 0.12214681506156921, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 280890 + }, + { + "epoch": 1.0858808430362914, + "grad_norm": 0.10338126868009567, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 280900 + }, + { + "epoch": 1.0859195002396747, + "grad_norm": 0.09947532415390015, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 280910 + }, + { + "epoch": 1.085958157443058, + "grad_norm": 0.11874710768461227, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 280920 + }, + { + "epoch": 1.0859968146464412, + "grad_norm": 0.10474177449941635, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 280930 + }, + { + "epoch": 1.0860354718498244, + "grad_norm": 0.10702405124902725, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 280940 + }, + { + "epoch": 1.0860741290532077, + "grad_norm": 0.11294613033533096, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 280950 + }, + { + "epoch": 1.0861127862565911, + "grad_norm": 0.11382812261581421, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 280960 + }, + { + "epoch": 1.0861514434599744, + "grad_norm": 0.10009848326444626, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 280970 + }, + { + "epoch": 1.0861901006633576, + "grad_norm": 0.09343680739402771, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 280980 + }, + { + "epoch": 1.086228757866741, + "grad_norm": 0.11350204795598984, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 280990 + }, + { + "epoch": 1.0862674150701241, + "grad_norm": 0.11360235512256622, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 281000 + }, + { + "epoch": 1.0863060722735074, + "grad_norm": 0.0953127071261406, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 281010 + }, + { + "epoch": 1.0863447294768906, + "grad_norm": 0.10144411772489548, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 281020 + }, + { + "epoch": 1.086383386680274, + "grad_norm": 0.1047971174120903, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 281030 + }, + { + "epoch": 1.0864220438836574, + "grad_norm": 0.11271246522665024, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 281040 + }, + { + "epoch": 1.0864607010870406, + "grad_norm": 0.1031818762421608, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 281050 + }, + { + "epoch": 1.0864993582904239, + "grad_norm": 0.10849371552467346, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 281060 + }, + { + "epoch": 1.0865380154938071, + "grad_norm": 0.10233020782470703, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 281070 + }, + { + "epoch": 1.0865766726971904, + "grad_norm": 0.10694386065006256, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 281080 + }, + { + "epoch": 1.0866153299005736, + "grad_norm": 0.11533127725124359, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 281090 + }, + { + "epoch": 1.0866539871039569, + "grad_norm": 0.11579183489084244, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 281100 + }, + { + "epoch": 1.0866926443073401, + "grad_norm": 0.12135932594537735, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 281110 + }, + { + "epoch": 1.0867313015107234, + "grad_norm": 0.08877555280923843, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 281120 + }, + { + "epoch": 1.0867699587141069, + "grad_norm": 0.12630535662174225, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 281130 + }, + { + "epoch": 1.0868086159174901, + "grad_norm": 0.09699002653360367, + "learning_rate": 0.002, + "loss": 2.321, + "step": 281140 + }, + { + "epoch": 1.0868472731208734, + "grad_norm": 0.1044619083404541, + "learning_rate": 0.002, + "loss": 2.3625, + "step": 281150 + }, + { + "epoch": 1.0868859303242566, + "grad_norm": 0.10375456511974335, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 281160 + }, + { + "epoch": 1.0869245875276399, + "grad_norm": 0.10800100117921829, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 281170 + }, + { + "epoch": 1.0869632447310231, + "grad_norm": 0.11725430935621262, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 281180 + }, + { + "epoch": 1.0870019019344064, + "grad_norm": 0.11250857263803482, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 281190 + }, + { + "epoch": 1.0870405591377899, + "grad_norm": 0.11485522985458374, + "learning_rate": 0.002, + "loss": 2.333, + "step": 281200 + }, + { + "epoch": 1.087079216341173, + "grad_norm": 0.09864205867052078, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 281210 + }, + { + "epoch": 1.0871178735445564, + "grad_norm": 0.12152174115180969, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 281220 + }, + { + "epoch": 1.0871565307479396, + "grad_norm": 0.1147390827536583, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 281230 + }, + { + "epoch": 1.0871951879513229, + "grad_norm": 0.11349363625049591, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 281240 + }, + { + "epoch": 1.0872338451547061, + "grad_norm": 0.09549278020858765, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 281250 + }, + { + "epoch": 1.0872725023580894, + "grad_norm": 0.10776299983263016, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 281260 + }, + { + "epoch": 1.0873111595614726, + "grad_norm": 0.09684637188911438, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 281270 + }, + { + "epoch": 1.0873498167648559, + "grad_norm": 0.10151070356369019, + "learning_rate": 0.002, + "loss": 2.356, + "step": 281280 + }, + { + "epoch": 1.0873884739682393, + "grad_norm": 0.10506577789783478, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 281290 + }, + { + "epoch": 1.0874271311716226, + "grad_norm": 0.10208103805780411, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 281300 + }, + { + "epoch": 1.0874657883750058, + "grad_norm": 0.1129027009010315, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 281310 + }, + { + "epoch": 1.087504445578389, + "grad_norm": 0.09576416015625, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 281320 + }, + { + "epoch": 1.0875431027817724, + "grad_norm": 0.10639072954654694, + "learning_rate": 0.002, + "loss": 2.3149, + "step": 281330 + }, + { + "epoch": 1.0875817599851556, + "grad_norm": 0.10176964104175568, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 281340 + }, + { + "epoch": 1.0876204171885389, + "grad_norm": 0.09694419801235199, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 281350 + }, + { + "epoch": 1.087659074391922, + "grad_norm": 0.1049187034368515, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 281360 + }, + { + "epoch": 1.0876977315953056, + "grad_norm": 0.10904958844184875, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 281370 + }, + { + "epoch": 1.0877363887986888, + "grad_norm": 0.10921395570039749, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 281380 + }, + { + "epoch": 1.087775046002072, + "grad_norm": 0.11888540536165237, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 281390 + }, + { + "epoch": 1.0878137032054553, + "grad_norm": 0.1021900326013565, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 281400 + }, + { + "epoch": 1.0878523604088386, + "grad_norm": 0.13177058100700378, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 281410 + }, + { + "epoch": 1.0878910176122218, + "grad_norm": 0.10043606162071228, + "learning_rate": 0.002, + "loss": 2.343, + "step": 281420 + }, + { + "epoch": 1.087929674815605, + "grad_norm": 0.11822836846113205, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 281430 + }, + { + "epoch": 1.0879683320189883, + "grad_norm": 0.12574489414691925, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 281440 + }, + { + "epoch": 1.0880069892223716, + "grad_norm": 0.1164129227399826, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 281450 + }, + { + "epoch": 1.088045646425755, + "grad_norm": 0.10353794693946838, + "learning_rate": 0.002, + "loss": 2.352, + "step": 281460 + }, + { + "epoch": 1.0880843036291383, + "grad_norm": 0.10000605136156082, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 281470 + }, + { + "epoch": 1.0881229608325216, + "grad_norm": 0.12134052067995071, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 281480 + }, + { + "epoch": 1.0881616180359048, + "grad_norm": 0.11231108754873276, + "learning_rate": 0.002, + "loss": 2.333, + "step": 281490 + }, + { + "epoch": 1.088200275239288, + "grad_norm": 0.10282287746667862, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 281500 + }, + { + "epoch": 1.0882389324426713, + "grad_norm": 0.09950326383113861, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 281510 + }, + { + "epoch": 1.0882775896460546, + "grad_norm": 0.09664419293403625, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 281520 + }, + { + "epoch": 1.0883162468494378, + "grad_norm": 0.10877744108438492, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 281530 + }, + { + "epoch": 1.0883549040528213, + "grad_norm": 0.1068381667137146, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 281540 + }, + { + "epoch": 1.0883935612562046, + "grad_norm": 0.12714095413684845, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 281550 + }, + { + "epoch": 1.0884322184595878, + "grad_norm": 0.11153408885002136, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 281560 + }, + { + "epoch": 1.088470875662971, + "grad_norm": 0.0963999554514885, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 281570 + }, + { + "epoch": 1.0885095328663543, + "grad_norm": 0.09225306659936905, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 281580 + }, + { + "epoch": 1.0885481900697376, + "grad_norm": 0.08588103950023651, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 281590 + }, + { + "epoch": 1.0885868472731208, + "grad_norm": 0.1292261779308319, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 281600 + }, + { + "epoch": 1.088625504476504, + "grad_norm": 0.09912700951099396, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 281610 + }, + { + "epoch": 1.0886641616798873, + "grad_norm": 0.08940158784389496, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 281620 + }, + { + "epoch": 1.0887028188832708, + "grad_norm": 0.12613654136657715, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 281630 + }, + { + "epoch": 1.088741476086654, + "grad_norm": 0.10662415623664856, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 281640 + }, + { + "epoch": 1.0887801332900373, + "grad_norm": 0.10559440404176712, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 281650 + }, + { + "epoch": 1.0888187904934206, + "grad_norm": 0.10085096955299377, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 281660 + }, + { + "epoch": 1.0888574476968038, + "grad_norm": 0.10817304253578186, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 281670 + }, + { + "epoch": 1.088896104900187, + "grad_norm": 0.09820788353681564, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 281680 + }, + { + "epoch": 1.0889347621035703, + "grad_norm": 0.1037544384598732, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 281690 + }, + { + "epoch": 1.0889734193069536, + "grad_norm": 0.11510848253965378, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 281700 + }, + { + "epoch": 1.089012076510337, + "grad_norm": 0.0945771336555481, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 281710 + }, + { + "epoch": 1.0890507337137203, + "grad_norm": 0.08993828296661377, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 281720 + }, + { + "epoch": 1.0890893909171036, + "grad_norm": 0.11030997335910797, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 281730 + }, + { + "epoch": 1.0891280481204868, + "grad_norm": 0.0994657352566719, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 281740 + }, + { + "epoch": 1.08916670532387, + "grad_norm": 0.10831445455551147, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 281750 + }, + { + "epoch": 1.0892053625272533, + "grad_norm": 0.11834604293107986, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 281760 + }, + { + "epoch": 1.0892440197306366, + "grad_norm": 0.09943471103906631, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 281770 + }, + { + "epoch": 1.0892826769340198, + "grad_norm": 0.09234745800495148, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 281780 + }, + { + "epoch": 1.089321334137403, + "grad_norm": 0.10414262861013412, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 281790 + }, + { + "epoch": 1.0893599913407865, + "grad_norm": 0.11926162987947464, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 281800 + }, + { + "epoch": 1.0893986485441698, + "grad_norm": 0.10056176781654358, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 281810 + }, + { + "epoch": 1.089437305747553, + "grad_norm": 0.10224136710166931, + "learning_rate": 0.002, + "loss": 2.336, + "step": 281820 + }, + { + "epoch": 1.0894759629509363, + "grad_norm": 0.09794122725725174, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 281830 + }, + { + "epoch": 1.0895146201543195, + "grad_norm": 0.1126994714140892, + "learning_rate": 0.002, + "loss": 2.332, + "step": 281840 + }, + { + "epoch": 1.0895532773577028, + "grad_norm": 0.09958003461360931, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 281850 + }, + { + "epoch": 1.089591934561086, + "grad_norm": 0.10423572361469269, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 281860 + }, + { + "epoch": 1.0896305917644693, + "grad_norm": 0.10851602256298065, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 281870 + }, + { + "epoch": 1.0896692489678528, + "grad_norm": 0.1614455133676529, + "learning_rate": 0.002, + "loss": 2.336, + "step": 281880 + }, + { + "epoch": 1.089707906171236, + "grad_norm": 0.10868556052446365, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 281890 + }, + { + "epoch": 1.0897465633746193, + "grad_norm": 0.1033228188753128, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 281900 + }, + { + "epoch": 1.0897852205780025, + "grad_norm": 0.10506334900856018, + "learning_rate": 0.002, + "loss": 2.35, + "step": 281910 + }, + { + "epoch": 1.0898238777813858, + "grad_norm": 0.10521656274795532, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 281920 + }, + { + "epoch": 1.089862534984769, + "grad_norm": 0.10547047853469849, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 281930 + }, + { + "epoch": 1.0899011921881523, + "grad_norm": 0.11806923151016235, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 281940 + }, + { + "epoch": 1.0899398493915355, + "grad_norm": 0.10370279103517532, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 281950 + }, + { + "epoch": 1.0899785065949188, + "grad_norm": 0.1059202328324318, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 281960 + }, + { + "epoch": 1.0900171637983023, + "grad_norm": 0.09515325725078583, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 281970 + }, + { + "epoch": 1.0900558210016855, + "grad_norm": 0.13132987916469574, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 281980 + }, + { + "epoch": 1.0900944782050688, + "grad_norm": 0.14355821907520294, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 281990 + }, + { + "epoch": 1.090133135408452, + "grad_norm": 0.10220211744308472, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 282000 + }, + { + "epoch": 1.0901717926118353, + "grad_norm": 0.09334767609834671, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 282010 + }, + { + "epoch": 1.0902104498152185, + "grad_norm": 0.12202389538288116, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 282020 + }, + { + "epoch": 1.0902491070186018, + "grad_norm": 0.12086988985538483, + "learning_rate": 0.002, + "loss": 2.337, + "step": 282030 + }, + { + "epoch": 1.090287764221985, + "grad_norm": 0.11102332174777985, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 282040 + }, + { + "epoch": 1.0903264214253685, + "grad_norm": 0.1081869900226593, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 282050 + }, + { + "epoch": 1.0903650786287518, + "grad_norm": 0.09771132469177246, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 282060 + }, + { + "epoch": 1.090403735832135, + "grad_norm": 0.1110697090625763, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 282070 + }, + { + "epoch": 1.0904423930355183, + "grad_norm": 0.11461424827575684, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 282080 + }, + { + "epoch": 1.0904810502389015, + "grad_norm": 0.1096828505396843, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 282090 + }, + { + "epoch": 1.0905197074422848, + "grad_norm": 0.10947354882955551, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 282100 + }, + { + "epoch": 1.090558364645668, + "grad_norm": 0.09177210927009583, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 282110 + }, + { + "epoch": 1.0905970218490513, + "grad_norm": 0.10618321597576141, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 282120 + }, + { + "epoch": 1.0906356790524345, + "grad_norm": 0.10912807285785675, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 282130 + }, + { + "epoch": 1.090674336255818, + "grad_norm": 0.12251587957143784, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 282140 + }, + { + "epoch": 1.0907129934592013, + "grad_norm": 0.09408677369356155, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 282150 + }, + { + "epoch": 1.0907516506625845, + "grad_norm": 0.11050599068403244, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 282160 + }, + { + "epoch": 1.0907903078659678, + "grad_norm": 0.11192735284566879, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 282170 + }, + { + "epoch": 1.090828965069351, + "grad_norm": 0.09625013172626495, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 282180 + }, + { + "epoch": 1.0908676222727343, + "grad_norm": 0.12688453495502472, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 282190 + }, + { + "epoch": 1.0909062794761175, + "grad_norm": 0.09461814165115356, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 282200 + }, + { + "epoch": 1.0909449366795008, + "grad_norm": 0.08834907412528992, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 282210 + }, + { + "epoch": 1.0909835938828842, + "grad_norm": 0.10458917915821075, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 282220 + }, + { + "epoch": 1.0910222510862675, + "grad_norm": 0.1085110753774643, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 282230 + }, + { + "epoch": 1.0910609082896507, + "grad_norm": 0.0959525927901268, + "learning_rate": 0.002, + "loss": 2.332, + "step": 282240 + }, + { + "epoch": 1.091099565493034, + "grad_norm": 0.09756126999855042, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 282250 + }, + { + "epoch": 1.0911382226964172, + "grad_norm": 0.11785954982042313, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 282260 + }, + { + "epoch": 1.0911768798998005, + "grad_norm": 0.09997006505727768, + "learning_rate": 0.002, + "loss": 2.333, + "step": 282270 + }, + { + "epoch": 1.0912155371031838, + "grad_norm": 0.10260170698165894, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 282280 + }, + { + "epoch": 1.091254194306567, + "grad_norm": 0.11056840419769287, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 282290 + }, + { + "epoch": 1.0912928515099503, + "grad_norm": 0.08586467802524567, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 282300 + }, + { + "epoch": 1.0913315087133337, + "grad_norm": 0.09769351780414581, + "learning_rate": 0.002, + "loss": 2.3149, + "step": 282310 + }, + { + "epoch": 1.091370165916717, + "grad_norm": 0.09916350245475769, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 282320 + }, + { + "epoch": 1.0914088231201002, + "grad_norm": 0.14137159287929535, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 282330 + }, + { + "epoch": 1.0914474803234835, + "grad_norm": 0.10548698902130127, + "learning_rate": 0.002, + "loss": 2.326, + "step": 282340 + }, + { + "epoch": 1.0914861375268667, + "grad_norm": 0.1067422479391098, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 282350 + }, + { + "epoch": 1.09152479473025, + "grad_norm": 0.09344740211963654, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 282360 + }, + { + "epoch": 1.0915634519336332, + "grad_norm": 0.1000688299536705, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 282370 + }, + { + "epoch": 1.0916021091370165, + "grad_norm": 0.10962257534265518, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 282380 + }, + { + "epoch": 1.0916407663404, + "grad_norm": 0.10238444805145264, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 282390 + }, + { + "epoch": 1.0916794235437832, + "grad_norm": 0.09976116567850113, + "learning_rate": 0.002, + "loss": 2.341, + "step": 282400 + }, + { + "epoch": 1.0917180807471665, + "grad_norm": 0.10841300338506699, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 282410 + }, + { + "epoch": 1.0917567379505497, + "grad_norm": 0.13784433901309967, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 282420 + }, + { + "epoch": 1.091795395153933, + "grad_norm": 0.10153056681156158, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 282430 + }, + { + "epoch": 1.0918340523573162, + "grad_norm": 0.0959775373339653, + "learning_rate": 0.002, + "loss": 2.324, + "step": 282440 + }, + { + "epoch": 1.0918727095606995, + "grad_norm": 0.11032044887542725, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 282450 + }, + { + "epoch": 1.0919113667640827, + "grad_norm": 0.08857570588588715, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 282460 + }, + { + "epoch": 1.091950023967466, + "grad_norm": 0.10108328610658646, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 282470 + }, + { + "epoch": 1.0919886811708495, + "grad_norm": 0.10817533731460571, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 282480 + }, + { + "epoch": 1.0920273383742327, + "grad_norm": 0.10717236250638962, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 282490 + }, + { + "epoch": 1.092065995577616, + "grad_norm": 0.10586033016443253, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 282500 + }, + { + "epoch": 1.0921046527809992, + "grad_norm": 0.10583069175481796, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 282510 + }, + { + "epoch": 1.0921433099843825, + "grad_norm": 0.09219476580619812, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 282520 + }, + { + "epoch": 1.0921819671877657, + "grad_norm": 0.10465552657842636, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 282530 + }, + { + "epoch": 1.092220624391149, + "grad_norm": 0.10524141043424606, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 282540 + }, + { + "epoch": 1.0922592815945322, + "grad_norm": 0.09466832876205444, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 282550 + }, + { + "epoch": 1.0922979387979157, + "grad_norm": 0.11899077147245407, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 282560 + }, + { + "epoch": 1.092336596001299, + "grad_norm": 0.11153442412614822, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 282570 + }, + { + "epoch": 1.0923752532046822, + "grad_norm": 0.1049443930387497, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 282580 + }, + { + "epoch": 1.0924139104080655, + "grad_norm": 0.11074576526880264, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 282590 + }, + { + "epoch": 1.0924525676114487, + "grad_norm": 0.10997039824724197, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 282600 + }, + { + "epoch": 1.092491224814832, + "grad_norm": 0.11313261091709137, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 282610 + }, + { + "epoch": 1.0925298820182152, + "grad_norm": 0.09633355587720871, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 282620 + }, + { + "epoch": 1.0925685392215985, + "grad_norm": 0.09769116342067719, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 282630 + }, + { + "epoch": 1.0926071964249817, + "grad_norm": 0.10594171285629272, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 282640 + }, + { + "epoch": 1.0926458536283652, + "grad_norm": 0.09079165756702423, + "learning_rate": 0.002, + "loss": 2.327, + "step": 282650 + }, + { + "epoch": 1.0926845108317484, + "grad_norm": 0.10845083743333817, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 282660 + }, + { + "epoch": 1.0927231680351317, + "grad_norm": 0.09342625737190247, + "learning_rate": 0.002, + "loss": 2.3152, + "step": 282670 + }, + { + "epoch": 1.092761825238515, + "grad_norm": 0.09645923227071762, + "learning_rate": 0.002, + "loss": 2.351, + "step": 282680 + }, + { + "epoch": 1.0928004824418982, + "grad_norm": 0.10519500821828842, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 282690 + }, + { + "epoch": 1.0928391396452815, + "grad_norm": 0.10223992168903351, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 282700 + }, + { + "epoch": 1.0928777968486647, + "grad_norm": 0.10256168246269226, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 282710 + }, + { + "epoch": 1.092916454052048, + "grad_norm": 0.11982840299606323, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 282720 + }, + { + "epoch": 1.0929551112554314, + "grad_norm": 0.1197512075304985, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 282730 + }, + { + "epoch": 1.0929937684588147, + "grad_norm": 0.1088634803891182, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 282740 + }, + { + "epoch": 1.093032425662198, + "grad_norm": 0.1055222898721695, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 282750 + }, + { + "epoch": 1.0930710828655812, + "grad_norm": 0.10009009391069412, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 282760 + }, + { + "epoch": 1.0931097400689644, + "grad_norm": 0.09391296654939651, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 282770 + }, + { + "epoch": 1.0931483972723477, + "grad_norm": 0.11217600107192993, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 282780 + }, + { + "epoch": 1.093187054475731, + "grad_norm": 0.13397294282913208, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 282790 + }, + { + "epoch": 1.0932257116791142, + "grad_norm": 0.09961531311273575, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 282800 + }, + { + "epoch": 1.0932643688824975, + "grad_norm": 0.09814653545618057, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 282810 + }, + { + "epoch": 1.093303026085881, + "grad_norm": 0.12853848934173584, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 282820 + }, + { + "epoch": 1.0933416832892642, + "grad_norm": 0.10931167006492615, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 282830 + }, + { + "epoch": 1.0933803404926474, + "grad_norm": 0.11069802194833755, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 282840 + }, + { + "epoch": 1.0934189976960307, + "grad_norm": 0.09355204552412033, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 282850 + }, + { + "epoch": 1.093457654899414, + "grad_norm": 0.09880665689706802, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 282860 + }, + { + "epoch": 1.0934963121027972, + "grad_norm": 0.09627640247344971, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 282870 + }, + { + "epoch": 1.0935349693061804, + "grad_norm": 0.11856389045715332, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 282880 + }, + { + "epoch": 1.0935736265095637, + "grad_norm": 0.09591594338417053, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 282890 + }, + { + "epoch": 1.0936122837129472, + "grad_norm": 0.10306254774332047, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 282900 + }, + { + "epoch": 1.0936509409163304, + "grad_norm": 0.11224587261676788, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 282910 + }, + { + "epoch": 1.0936895981197137, + "grad_norm": 0.10437082499265671, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 282920 + }, + { + "epoch": 1.093728255323097, + "grad_norm": 0.09855062514543533, + "learning_rate": 0.002, + "loss": 2.336, + "step": 282930 + }, + { + "epoch": 1.0937669125264802, + "grad_norm": 0.09843055158853531, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 282940 + }, + { + "epoch": 1.0938055697298634, + "grad_norm": 0.09790752083063126, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 282950 + }, + { + "epoch": 1.0938442269332467, + "grad_norm": 0.10575347393751144, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 282960 + }, + { + "epoch": 1.09388288413663, + "grad_norm": 0.12459485232830048, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 282970 + }, + { + "epoch": 1.0939215413400132, + "grad_norm": 0.11834020912647247, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 282980 + }, + { + "epoch": 1.0939601985433967, + "grad_norm": 0.11081720888614655, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 282990 + }, + { + "epoch": 1.09399885574678, + "grad_norm": 0.11029912531375885, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 283000 + }, + { + "epoch": 1.0940375129501632, + "grad_norm": 0.12420996278524399, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 283010 + }, + { + "epoch": 1.0940761701535464, + "grad_norm": 0.09996948391199112, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 283020 + }, + { + "epoch": 1.0941148273569297, + "grad_norm": 0.11960510909557343, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 283030 + }, + { + "epoch": 1.094153484560313, + "grad_norm": 0.10635130107402802, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 283040 + }, + { + "epoch": 1.0941921417636962, + "grad_norm": 0.14550523459911346, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 283050 + }, + { + "epoch": 1.0942307989670796, + "grad_norm": 0.09189361333847046, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 283060 + }, + { + "epoch": 1.094269456170463, + "grad_norm": 0.12127846479415894, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 283070 + }, + { + "epoch": 1.0943081133738461, + "grad_norm": 0.10634903609752655, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 283080 + }, + { + "epoch": 1.0943467705772294, + "grad_norm": 0.0875033587217331, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 283090 + }, + { + "epoch": 1.0943854277806127, + "grad_norm": 0.10889355093240738, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 283100 + }, + { + "epoch": 1.094424084983996, + "grad_norm": 0.11864157766103745, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 283110 + }, + { + "epoch": 1.0944627421873792, + "grad_norm": 0.09440959244966507, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 283120 + }, + { + "epoch": 1.0945013993907624, + "grad_norm": 0.10142505168914795, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 283130 + }, + { + "epoch": 1.0945400565941457, + "grad_norm": 0.095737025141716, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 283140 + }, + { + "epoch": 1.0945787137975291, + "grad_norm": 0.10328061878681183, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 283150 + }, + { + "epoch": 1.0946173710009124, + "grad_norm": 0.11649847775697708, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 283160 + }, + { + "epoch": 1.0946560282042956, + "grad_norm": 0.10745200514793396, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 283170 + }, + { + "epoch": 1.094694685407679, + "grad_norm": 0.10119928419589996, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 283180 + }, + { + "epoch": 1.0947333426110621, + "grad_norm": 0.102198526263237, + "learning_rate": 0.002, + "loss": 2.338, + "step": 283190 + }, + { + "epoch": 1.0947719998144454, + "grad_norm": 0.10957236588001251, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 283200 + }, + { + "epoch": 1.0948106570178286, + "grad_norm": 0.09801110625267029, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 283210 + }, + { + "epoch": 1.094849314221212, + "grad_norm": 0.1274397224187851, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 283220 + }, + { + "epoch": 1.0948879714245954, + "grad_norm": 0.08464301377534866, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 283230 + }, + { + "epoch": 1.0949266286279786, + "grad_norm": 0.095365509390831, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 283240 + }, + { + "epoch": 1.0949652858313619, + "grad_norm": 0.09696754068136215, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 283250 + }, + { + "epoch": 1.0950039430347451, + "grad_norm": 0.10455650091171265, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 283260 + }, + { + "epoch": 1.0950426002381284, + "grad_norm": 0.08448006212711334, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 283270 + }, + { + "epoch": 1.0950812574415116, + "grad_norm": 0.10643161088228226, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 283280 + }, + { + "epoch": 1.0951199146448949, + "grad_norm": 0.1184401661157608, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 283290 + }, + { + "epoch": 1.0951585718482781, + "grad_norm": 0.10242758691310883, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 283300 + }, + { + "epoch": 1.0951972290516614, + "grad_norm": 0.09504953771829605, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 283310 + }, + { + "epoch": 1.0952358862550449, + "grad_norm": 0.0992271676659584, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 283320 + }, + { + "epoch": 1.0952745434584281, + "grad_norm": 0.0895295962691307, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 283330 + }, + { + "epoch": 1.0953132006618114, + "grad_norm": 0.09662551432847977, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 283340 + }, + { + "epoch": 1.0953518578651946, + "grad_norm": 0.10198549926280975, + "learning_rate": 0.002, + "loss": 2.329, + "step": 283350 + }, + { + "epoch": 1.0953905150685779, + "grad_norm": 0.11558642238378525, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 283360 + }, + { + "epoch": 1.0954291722719611, + "grad_norm": 0.10037699341773987, + "learning_rate": 0.002, + "loss": 2.347, + "step": 283370 + }, + { + "epoch": 1.0954678294753444, + "grad_norm": 0.10268152505159378, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 283380 + }, + { + "epoch": 1.0955064866787276, + "grad_norm": 0.08938860893249512, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 283390 + }, + { + "epoch": 1.095545143882111, + "grad_norm": 0.10257000476121902, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 283400 + }, + { + "epoch": 1.0955838010854944, + "grad_norm": 0.10406453162431717, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 283410 + }, + { + "epoch": 1.0956224582888776, + "grad_norm": 0.09739171713590622, + "learning_rate": 0.002, + "loss": 2.3128, + "step": 283420 + }, + { + "epoch": 1.0956611154922609, + "grad_norm": 0.10173606127500534, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 283430 + }, + { + "epoch": 1.0956997726956441, + "grad_norm": 0.10406725853681564, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 283440 + }, + { + "epoch": 1.0957384298990274, + "grad_norm": 0.11148425191640854, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 283450 + }, + { + "epoch": 1.0957770871024106, + "grad_norm": 0.13934779167175293, + "learning_rate": 0.002, + "loss": 2.336, + "step": 283460 + }, + { + "epoch": 1.0958157443057939, + "grad_norm": 0.11460437625646591, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 283470 + }, + { + "epoch": 1.0958544015091771, + "grad_norm": 0.10291483998298645, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 283480 + }, + { + "epoch": 1.0958930587125606, + "grad_norm": 0.1973194181919098, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 283490 + }, + { + "epoch": 1.0959317159159438, + "grad_norm": 0.09999255836009979, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 283500 + }, + { + "epoch": 1.095970373119327, + "grad_norm": 0.09152089804410934, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 283510 + }, + { + "epoch": 1.0960090303227104, + "grad_norm": 0.10186208039522171, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 283520 + }, + { + "epoch": 1.0960476875260936, + "grad_norm": 0.09846815466880798, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 283530 + }, + { + "epoch": 1.0960863447294769, + "grad_norm": 0.1407434195280075, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 283540 + }, + { + "epoch": 1.09612500193286, + "grad_norm": 0.10553040355443954, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 283550 + }, + { + "epoch": 1.0961636591362434, + "grad_norm": 0.10117257386445999, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 283560 + }, + { + "epoch": 1.0962023163396268, + "grad_norm": 0.10986392199993134, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 283570 + }, + { + "epoch": 1.09624097354301, + "grad_norm": 0.09731899946928024, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 283580 + }, + { + "epoch": 1.0962796307463933, + "grad_norm": 0.10193801671266556, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 283590 + }, + { + "epoch": 1.0963182879497766, + "grad_norm": 0.10751131922006607, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 283600 + }, + { + "epoch": 1.0963569451531598, + "grad_norm": 0.10056516528129578, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 283610 + }, + { + "epoch": 1.096395602356543, + "grad_norm": 0.08993778377771378, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 283620 + }, + { + "epoch": 1.0964342595599263, + "grad_norm": 0.1073274165391922, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 283630 + }, + { + "epoch": 1.0964729167633096, + "grad_norm": 0.13375040888786316, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 283640 + }, + { + "epoch": 1.0965115739666929, + "grad_norm": 0.11124551296234131, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 283650 + }, + { + "epoch": 1.0965502311700763, + "grad_norm": 0.09685186296701431, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 283660 + }, + { + "epoch": 1.0965888883734596, + "grad_norm": 0.10944927483797073, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 283670 + }, + { + "epoch": 1.0966275455768428, + "grad_norm": 0.12088717520236969, + "learning_rate": 0.002, + "loss": 2.33, + "step": 283680 + }, + { + "epoch": 1.096666202780226, + "grad_norm": 0.09783034771680832, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 283690 + }, + { + "epoch": 1.0967048599836093, + "grad_norm": 0.09728636592626572, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 283700 + }, + { + "epoch": 1.0967435171869926, + "grad_norm": 0.11335409432649612, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 283710 + }, + { + "epoch": 1.0967821743903758, + "grad_norm": 0.13359227776527405, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 283720 + }, + { + "epoch": 1.096820831593759, + "grad_norm": 0.12501873075962067, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 283730 + }, + { + "epoch": 1.0968594887971426, + "grad_norm": 0.10961022973060608, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 283740 + }, + { + "epoch": 1.0968981460005258, + "grad_norm": 0.10510764271020889, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 283750 + }, + { + "epoch": 1.096936803203909, + "grad_norm": 0.26109975576400757, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 283760 + }, + { + "epoch": 1.0969754604072923, + "grad_norm": 0.1035788282752037, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 283770 + }, + { + "epoch": 1.0970141176106756, + "grad_norm": 0.14167214930057526, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 283780 + }, + { + "epoch": 1.0970527748140588, + "grad_norm": 0.09841074794530869, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 283790 + }, + { + "epoch": 1.097091432017442, + "grad_norm": 0.10851003229618073, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 283800 + }, + { + "epoch": 1.0971300892208253, + "grad_norm": 0.09611783176660538, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 283810 + }, + { + "epoch": 1.0971687464242086, + "grad_norm": 0.09295455366373062, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 283820 + }, + { + "epoch": 1.097207403627592, + "grad_norm": 0.12964671850204468, + "learning_rate": 0.002, + "loss": 2.336, + "step": 283830 + }, + { + "epoch": 1.0972460608309753, + "grad_norm": 0.09662358462810516, + "learning_rate": 0.002, + "loss": 2.338, + "step": 283840 + }, + { + "epoch": 1.0972847180343586, + "grad_norm": 0.10117034614086151, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 283850 + }, + { + "epoch": 1.0973233752377418, + "grad_norm": 0.10756699740886688, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 283860 + }, + { + "epoch": 1.097362032441125, + "grad_norm": 0.1286441534757614, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 283870 + }, + { + "epoch": 1.0974006896445083, + "grad_norm": 0.1278308480978012, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 283880 + }, + { + "epoch": 1.0974393468478916, + "grad_norm": 0.09650461375713348, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 283890 + }, + { + "epoch": 1.0974780040512748, + "grad_norm": 0.10499653965234756, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 283900 + }, + { + "epoch": 1.0975166612546583, + "grad_norm": 0.12353569269180298, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 283910 + }, + { + "epoch": 1.0975553184580416, + "grad_norm": 0.11407862603664398, + "learning_rate": 0.002, + "loss": 2.332, + "step": 283920 + }, + { + "epoch": 1.0975939756614248, + "grad_norm": 0.11378373950719833, + "learning_rate": 0.002, + "loss": 2.337, + "step": 283930 + }, + { + "epoch": 1.097632632864808, + "grad_norm": 0.09341233223676682, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 283940 + }, + { + "epoch": 1.0976712900681913, + "grad_norm": 0.1038387194275856, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 283950 + }, + { + "epoch": 1.0977099472715746, + "grad_norm": 0.10045725107192993, + "learning_rate": 0.002, + "loss": 2.34, + "step": 283960 + }, + { + "epoch": 1.0977486044749578, + "grad_norm": 0.11936244368553162, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 283970 + }, + { + "epoch": 1.097787261678341, + "grad_norm": 0.12233182042837143, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 283980 + }, + { + "epoch": 1.0978259188817243, + "grad_norm": 0.10054139047861099, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 283990 + }, + { + "epoch": 1.0978645760851078, + "grad_norm": 0.11717696487903595, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 284000 + }, + { + "epoch": 1.097903233288491, + "grad_norm": 0.11514868587255478, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 284010 + }, + { + "epoch": 1.0979418904918743, + "grad_norm": 0.10135221481323242, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 284020 + }, + { + "epoch": 1.0979805476952575, + "grad_norm": 0.11172813177108765, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 284030 + }, + { + "epoch": 1.0980192048986408, + "grad_norm": 0.13704566657543182, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 284040 + }, + { + "epoch": 1.098057862102024, + "grad_norm": 0.1290348768234253, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 284050 + }, + { + "epoch": 1.0980965193054073, + "grad_norm": 0.10477565228939056, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 284060 + }, + { + "epoch": 1.0981351765087906, + "grad_norm": 0.0943203717470169, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 284070 + }, + { + "epoch": 1.098173833712174, + "grad_norm": 0.11145610362291336, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 284080 + }, + { + "epoch": 1.0982124909155573, + "grad_norm": 0.09936654567718506, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 284090 + }, + { + "epoch": 1.0982511481189405, + "grad_norm": 0.09357469528913498, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 284100 + }, + { + "epoch": 1.0982898053223238, + "grad_norm": 0.11341989040374756, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 284110 + }, + { + "epoch": 1.098328462525707, + "grad_norm": 0.11302851140499115, + "learning_rate": 0.002, + "loss": 2.338, + "step": 284120 + }, + { + "epoch": 1.0983671197290903, + "grad_norm": 0.11179227381944656, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 284130 + }, + { + "epoch": 1.0984057769324735, + "grad_norm": 0.10397867113351822, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 284140 + }, + { + "epoch": 1.0984444341358568, + "grad_norm": 0.10592572391033173, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 284150 + }, + { + "epoch": 1.09848309133924, + "grad_norm": 0.1135161817073822, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 284160 + }, + { + "epoch": 1.0985217485426235, + "grad_norm": 0.11015531420707703, + "learning_rate": 0.002, + "loss": 2.332, + "step": 284170 + }, + { + "epoch": 1.0985604057460068, + "grad_norm": 0.1006748378276825, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 284180 + }, + { + "epoch": 1.09859906294939, + "grad_norm": 0.10288757085800171, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 284190 + }, + { + "epoch": 1.0986377201527733, + "grad_norm": 0.11791105568408966, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 284200 + }, + { + "epoch": 1.0986763773561565, + "grad_norm": 0.1182391345500946, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 284210 + }, + { + "epoch": 1.0987150345595398, + "grad_norm": 0.10134794563055038, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 284220 + }, + { + "epoch": 1.098753691762923, + "grad_norm": 0.10538896173238754, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 284230 + }, + { + "epoch": 1.0987923489663063, + "grad_norm": 0.11533650755882263, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 284240 + }, + { + "epoch": 1.0988310061696898, + "grad_norm": 0.10452184826135635, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 284250 + }, + { + "epoch": 1.098869663373073, + "grad_norm": 0.10536464303731918, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 284260 + }, + { + "epoch": 1.0989083205764563, + "grad_norm": 0.09663750976324081, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 284270 + }, + { + "epoch": 1.0989469777798395, + "grad_norm": 0.10678671300411224, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 284280 + }, + { + "epoch": 1.0989856349832228, + "grad_norm": 0.11602350324392319, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 284290 + }, + { + "epoch": 1.099024292186606, + "grad_norm": 0.09180068224668503, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 284300 + }, + { + "epoch": 1.0990629493899893, + "grad_norm": 0.1271265298128128, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 284310 + }, + { + "epoch": 1.0991016065933725, + "grad_norm": 0.12973609566688538, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 284320 + }, + { + "epoch": 1.0991402637967558, + "grad_norm": 0.10404635965824127, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 284330 + }, + { + "epoch": 1.0991789210001393, + "grad_norm": 0.10287823528051376, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 284340 + }, + { + "epoch": 1.0992175782035225, + "grad_norm": 0.11792373657226562, + "learning_rate": 0.002, + "loss": 2.3142, + "step": 284350 + }, + { + "epoch": 1.0992562354069058, + "grad_norm": 0.10006173700094223, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 284360 + }, + { + "epoch": 1.099294892610289, + "grad_norm": 0.10750042647123337, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 284370 + }, + { + "epoch": 1.0993335498136723, + "grad_norm": 0.10897422581911087, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 284380 + }, + { + "epoch": 1.0993722070170555, + "grad_norm": 0.11446550488471985, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 284390 + }, + { + "epoch": 1.0994108642204388, + "grad_norm": 0.10375721007585526, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 284400 + }, + { + "epoch": 1.099449521423822, + "grad_norm": 0.10286027938127518, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 284410 + }, + { + "epoch": 1.0994881786272055, + "grad_norm": 0.09958858788013458, + "learning_rate": 0.002, + "loss": 2.3166, + "step": 284420 + }, + { + "epoch": 1.0995268358305887, + "grad_norm": 0.11231593787670135, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 284430 + }, + { + "epoch": 1.099565493033972, + "grad_norm": 0.0965258777141571, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 284440 + }, + { + "epoch": 1.0996041502373552, + "grad_norm": 0.12888574600219727, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 284450 + }, + { + "epoch": 1.0996428074407385, + "grad_norm": 0.10070165991783142, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 284460 + }, + { + "epoch": 1.0996814646441218, + "grad_norm": 0.10629080981016159, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 284470 + }, + { + "epoch": 1.099720121847505, + "grad_norm": 0.10859460383653641, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 284480 + }, + { + "epoch": 1.0997587790508883, + "grad_norm": 0.1041555181145668, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 284490 + }, + { + "epoch": 1.0997974362542715, + "grad_norm": 0.1404908150434494, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 284500 + }, + { + "epoch": 1.099836093457655, + "grad_norm": 0.10661852359771729, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 284510 + }, + { + "epoch": 1.0998747506610382, + "grad_norm": 0.09594159573316574, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 284520 + }, + { + "epoch": 1.0999134078644215, + "grad_norm": 0.10165949165821075, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 284530 + }, + { + "epoch": 1.0999520650678047, + "grad_norm": 0.10713981091976166, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 284540 + }, + { + "epoch": 1.099990722271188, + "grad_norm": 0.1109931617975235, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 284550 + }, + { + "epoch": 1.1000293794745712, + "grad_norm": 0.09798242896795273, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 284560 + }, + { + "epoch": 1.1000680366779545, + "grad_norm": 0.11462962627410889, + "learning_rate": 0.002, + "loss": 2.328, + "step": 284570 + }, + { + "epoch": 1.1001066938813377, + "grad_norm": 0.10036194324493408, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 284580 + }, + { + "epoch": 1.1001453510847212, + "grad_norm": 0.0940588042140007, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 284590 + }, + { + "epoch": 1.1001840082881045, + "grad_norm": 0.10205405950546265, + "learning_rate": 0.002, + "loss": 2.327, + "step": 284600 + }, + { + "epoch": 1.1002226654914877, + "grad_norm": 0.09708960354328156, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 284610 + }, + { + "epoch": 1.100261322694871, + "grad_norm": 0.09543051570653915, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 284620 + }, + { + "epoch": 1.1002999798982542, + "grad_norm": 0.11247290670871735, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 284630 + }, + { + "epoch": 1.1003386371016375, + "grad_norm": 0.10105713456869125, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 284640 + }, + { + "epoch": 1.1003772943050207, + "grad_norm": 0.09401721507310867, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 284650 + }, + { + "epoch": 1.100415951508404, + "grad_norm": 0.11368951946496964, + "learning_rate": 0.002, + "loss": 2.336, + "step": 284660 + }, + { + "epoch": 1.1004546087117872, + "grad_norm": 0.09477970004081726, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 284670 + }, + { + "epoch": 1.1004932659151707, + "grad_norm": 0.10316511243581772, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 284680 + }, + { + "epoch": 1.100531923118554, + "grad_norm": 0.09212981164455414, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 284690 + }, + { + "epoch": 1.1005705803219372, + "grad_norm": 0.106972336769104, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 284700 + }, + { + "epoch": 1.1006092375253205, + "grad_norm": 0.093800850212574, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 284710 + }, + { + "epoch": 1.1006478947287037, + "grad_norm": 0.1049603745341301, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 284720 + }, + { + "epoch": 1.100686551932087, + "grad_norm": 0.09647928178310394, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 284730 + }, + { + "epoch": 1.1007252091354702, + "grad_norm": 0.10955628752708435, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 284740 + }, + { + "epoch": 1.1007638663388535, + "grad_norm": 0.10220403969287872, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 284750 + }, + { + "epoch": 1.100802523542237, + "grad_norm": 0.09642922133207321, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 284760 + }, + { + "epoch": 1.1008411807456202, + "grad_norm": 0.10072393715381622, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 284770 + }, + { + "epoch": 1.1008798379490035, + "grad_norm": 0.11426430195569992, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 284780 + }, + { + "epoch": 1.1009184951523867, + "grad_norm": 0.10772670805454254, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 284790 + }, + { + "epoch": 1.10095715235577, + "grad_norm": 0.14087091386318207, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 284800 + }, + { + "epoch": 1.1009958095591532, + "grad_norm": 0.10744642466306686, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 284810 + }, + { + "epoch": 1.1010344667625365, + "grad_norm": 0.09577398002147675, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 284820 + }, + { + "epoch": 1.1010731239659197, + "grad_norm": 0.09843907505273819, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 284830 + }, + { + "epoch": 1.101111781169303, + "grad_norm": 0.10781969875097275, + "learning_rate": 0.002, + "loss": 2.339, + "step": 284840 + }, + { + "epoch": 1.1011504383726864, + "grad_norm": 0.10822036117315292, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 284850 + }, + { + "epoch": 1.1011890955760697, + "grad_norm": 0.11118780076503754, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 284860 + }, + { + "epoch": 1.101227752779453, + "grad_norm": 0.13004349172115326, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 284870 + }, + { + "epoch": 1.1012664099828362, + "grad_norm": 0.1018972173333168, + "learning_rate": 0.002, + "loss": 2.327, + "step": 284880 + }, + { + "epoch": 1.1013050671862195, + "grad_norm": 0.1026332899928093, + "learning_rate": 0.002, + "loss": 2.34, + "step": 284890 + }, + { + "epoch": 1.1013437243896027, + "grad_norm": 0.09754814952611923, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 284900 + }, + { + "epoch": 1.101382381592986, + "grad_norm": 0.11591901630163193, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 284910 + }, + { + "epoch": 1.1014210387963692, + "grad_norm": 0.1105770617723465, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 284920 + }, + { + "epoch": 1.1014596959997527, + "grad_norm": 0.11102866381406784, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 284930 + }, + { + "epoch": 1.101498353203136, + "grad_norm": 0.10752061009407043, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 284940 + }, + { + "epoch": 1.1015370104065192, + "grad_norm": 0.11592989414930344, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 284950 + }, + { + "epoch": 1.1015756676099024, + "grad_norm": 0.09817852824926376, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 284960 + }, + { + "epoch": 1.1016143248132857, + "grad_norm": 0.1352873593568802, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 284970 + }, + { + "epoch": 1.101652982016669, + "grad_norm": 0.10397396981716156, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 284980 + }, + { + "epoch": 1.1016916392200522, + "grad_norm": 0.1036774218082428, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 284990 + }, + { + "epoch": 1.1017302964234355, + "grad_norm": 0.10826187580823898, + "learning_rate": 0.002, + "loss": 2.338, + "step": 285000 + }, + { + "epoch": 1.101768953626819, + "grad_norm": 0.11108260601758957, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 285010 + }, + { + "epoch": 1.1018076108302022, + "grad_norm": 0.10326624661684036, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 285020 + }, + { + "epoch": 1.1018462680335854, + "grad_norm": 0.11157377809286118, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 285030 + }, + { + "epoch": 1.1018849252369687, + "grad_norm": 0.10896344482898712, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 285040 + }, + { + "epoch": 1.101923582440352, + "grad_norm": 0.11324446648359299, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 285050 + }, + { + "epoch": 1.1019622396437352, + "grad_norm": 0.11675186455249786, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 285060 + }, + { + "epoch": 1.1020008968471184, + "grad_norm": 0.40514060854911804, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 285070 + }, + { + "epoch": 1.1020395540505017, + "grad_norm": 0.10736236721277237, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 285080 + }, + { + "epoch": 1.1020782112538852, + "grad_norm": 0.1119396984577179, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 285090 + }, + { + "epoch": 1.1021168684572684, + "grad_norm": 0.10223685950040817, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 285100 + }, + { + "epoch": 1.1021555256606517, + "grad_norm": 0.11727441847324371, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 285110 + }, + { + "epoch": 1.102194182864035, + "grad_norm": 0.10487577319145203, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 285120 + }, + { + "epoch": 1.1022328400674182, + "grad_norm": 0.10361069440841675, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 285130 + }, + { + "epoch": 1.1022714972708014, + "grad_norm": 0.1039617732167244, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 285140 + }, + { + "epoch": 1.1023101544741847, + "grad_norm": 0.1025695726275444, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 285150 + }, + { + "epoch": 1.102348811677568, + "grad_norm": 0.13202959299087524, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 285160 + }, + { + "epoch": 1.1023874688809512, + "grad_norm": 0.10830264538526535, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 285170 + }, + { + "epoch": 1.1024261260843347, + "grad_norm": 0.08623657375574112, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 285180 + }, + { + "epoch": 1.102464783287718, + "grad_norm": 0.1080300509929657, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 285190 + }, + { + "epoch": 1.1025034404911012, + "grad_norm": 0.10596869140863419, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 285200 + }, + { + "epoch": 1.1025420976944844, + "grad_norm": 0.1066625639796257, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 285210 + }, + { + "epoch": 1.1025807548978677, + "grad_norm": 0.11383505910634995, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 285220 + }, + { + "epoch": 1.102619412101251, + "grad_norm": 0.148569256067276, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 285230 + }, + { + "epoch": 1.1026580693046342, + "grad_norm": 0.1070183590054512, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 285240 + }, + { + "epoch": 1.1026967265080174, + "grad_norm": 0.1264624297618866, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 285250 + }, + { + "epoch": 1.102735383711401, + "grad_norm": 0.10144893825054169, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 285260 + }, + { + "epoch": 1.1027740409147841, + "grad_norm": 0.11170484870672226, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 285270 + }, + { + "epoch": 1.1028126981181674, + "grad_norm": 0.10286292433738708, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 285280 + }, + { + "epoch": 1.1028513553215507, + "grad_norm": 0.10856574028730392, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 285290 + }, + { + "epoch": 1.102890012524934, + "grad_norm": 0.09397318214178085, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 285300 + }, + { + "epoch": 1.1029286697283172, + "grad_norm": 0.133082315325737, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 285310 + }, + { + "epoch": 1.1029673269317004, + "grad_norm": 0.1034841313958168, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 285320 + }, + { + "epoch": 1.1030059841350837, + "grad_norm": 0.10122717171907425, + "learning_rate": 0.002, + "loss": 2.333, + "step": 285330 + }, + { + "epoch": 1.103044641338467, + "grad_norm": 0.09922084212303162, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 285340 + }, + { + "epoch": 1.1030832985418504, + "grad_norm": 0.10910443961620331, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 285350 + }, + { + "epoch": 1.1031219557452336, + "grad_norm": 0.13533568382263184, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 285360 + }, + { + "epoch": 1.103160612948617, + "grad_norm": 0.09651536494493484, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 285370 + }, + { + "epoch": 1.1031992701520001, + "grad_norm": 0.0913858413696289, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 285380 + }, + { + "epoch": 1.1032379273553834, + "grad_norm": 0.13411878049373627, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 285390 + }, + { + "epoch": 1.1032765845587666, + "grad_norm": 0.10474178194999695, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 285400 + }, + { + "epoch": 1.10331524176215, + "grad_norm": 0.11313706636428833, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 285410 + }, + { + "epoch": 1.1033538989655332, + "grad_norm": 0.11625546962022781, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 285420 + }, + { + "epoch": 1.1033925561689166, + "grad_norm": 0.10761766880750656, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 285430 + }, + { + "epoch": 1.1034312133722999, + "grad_norm": 0.1149444431066513, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 285440 + }, + { + "epoch": 1.1034698705756831, + "grad_norm": 0.10345727950334549, + "learning_rate": 0.002, + "loss": 2.33, + "step": 285450 + }, + { + "epoch": 1.1035085277790664, + "grad_norm": 0.10542980581521988, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 285460 + }, + { + "epoch": 1.1035471849824496, + "grad_norm": 0.12415088713169098, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 285470 + }, + { + "epoch": 1.1035858421858329, + "grad_norm": 0.09921295195817947, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 285480 + }, + { + "epoch": 1.1036244993892161, + "grad_norm": 0.10105586051940918, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 285490 + }, + { + "epoch": 1.1036631565925994, + "grad_norm": 0.13473346829414368, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 285500 + }, + { + "epoch": 1.1037018137959826, + "grad_norm": 0.11584562808275223, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 285510 + }, + { + "epoch": 1.1037404709993661, + "grad_norm": 0.09862792491912842, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 285520 + }, + { + "epoch": 1.1037791282027494, + "grad_norm": 0.08858393132686615, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 285530 + }, + { + "epoch": 1.1038177854061326, + "grad_norm": 0.1095161959528923, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 285540 + }, + { + "epoch": 1.1038564426095159, + "grad_norm": 0.10374027490615845, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 285550 + }, + { + "epoch": 1.1038950998128991, + "grad_norm": 0.10438241809606552, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 285560 + }, + { + "epoch": 1.1039337570162824, + "grad_norm": 0.09749176353216171, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 285570 + }, + { + "epoch": 1.1039724142196656, + "grad_norm": 0.11155575513839722, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 285580 + }, + { + "epoch": 1.1040110714230489, + "grad_norm": 0.13335822522640228, + "learning_rate": 0.002, + "loss": 2.344, + "step": 285590 + }, + { + "epoch": 1.1040497286264324, + "grad_norm": 0.08708234131336212, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 285600 + }, + { + "epoch": 1.1040883858298156, + "grad_norm": 0.09702277183532715, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 285610 + }, + { + "epoch": 1.1041270430331989, + "grad_norm": 0.10341376811265945, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 285620 + }, + { + "epoch": 1.1041657002365821, + "grad_norm": 0.09655317664146423, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 285630 + }, + { + "epoch": 1.1042043574399654, + "grad_norm": 0.09124155342578888, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 285640 + }, + { + "epoch": 1.1042430146433486, + "grad_norm": 0.10956746339797974, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 285650 + }, + { + "epoch": 1.1042816718467319, + "grad_norm": 0.10166729986667633, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 285660 + }, + { + "epoch": 1.1043203290501151, + "grad_norm": 0.14822007715702057, + "learning_rate": 0.002, + "loss": 2.323, + "step": 285670 + }, + { + "epoch": 1.1043589862534984, + "grad_norm": 0.09753800928592682, + "learning_rate": 0.002, + "loss": 2.342, + "step": 285680 + }, + { + "epoch": 1.1043976434568819, + "grad_norm": 0.10967541486024857, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 285690 + }, + { + "epoch": 1.104436300660265, + "grad_norm": 0.10217584669589996, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 285700 + }, + { + "epoch": 1.1044749578636484, + "grad_norm": 0.10899412631988525, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 285710 + }, + { + "epoch": 1.1045136150670316, + "grad_norm": 0.0949438065290451, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 285720 + }, + { + "epoch": 1.1045522722704149, + "grad_norm": 0.10611841827630997, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 285730 + }, + { + "epoch": 1.104590929473798, + "grad_norm": 0.1734570562839508, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 285740 + }, + { + "epoch": 1.1046295866771814, + "grad_norm": 0.12010052055120468, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 285750 + }, + { + "epoch": 1.1046682438805646, + "grad_norm": 0.10046463459730148, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 285760 + }, + { + "epoch": 1.104706901083948, + "grad_norm": 0.14872701466083527, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 285770 + }, + { + "epoch": 1.1047455582873313, + "grad_norm": 0.09705018252134323, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 285780 + }, + { + "epoch": 1.1047842154907146, + "grad_norm": 0.11013457924127579, + "learning_rate": 0.002, + "loss": 2.329, + "step": 285790 + }, + { + "epoch": 1.1048228726940978, + "grad_norm": 0.09360580146312714, + "learning_rate": 0.002, + "loss": 2.329, + "step": 285800 + }, + { + "epoch": 1.104861529897481, + "grad_norm": 0.09901615977287292, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 285810 + }, + { + "epoch": 1.1049001871008644, + "grad_norm": 0.09901581704616547, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 285820 + }, + { + "epoch": 1.1049388443042476, + "grad_norm": 0.11049401015043259, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 285830 + }, + { + "epoch": 1.1049775015076309, + "grad_norm": 0.09339258819818497, + "learning_rate": 0.002, + "loss": 2.334, + "step": 285840 + }, + { + "epoch": 1.105016158711014, + "grad_norm": 0.08979971706867218, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 285850 + }, + { + "epoch": 1.1050548159143976, + "grad_norm": 0.0979980006814003, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 285860 + }, + { + "epoch": 1.1050934731177808, + "grad_norm": 0.11645156890153885, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 285870 + }, + { + "epoch": 1.105132130321164, + "grad_norm": 0.10296425223350525, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 285880 + }, + { + "epoch": 1.1051707875245473, + "grad_norm": 0.10710039734840393, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 285890 + }, + { + "epoch": 1.1052094447279306, + "grad_norm": 0.10021751374006271, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 285900 + }, + { + "epoch": 1.1052481019313138, + "grad_norm": 0.12587647140026093, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 285910 + }, + { + "epoch": 1.105286759134697, + "grad_norm": 0.10164447128772736, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 285920 + }, + { + "epoch": 1.1053254163380803, + "grad_norm": 0.094952292740345, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 285930 + }, + { + "epoch": 1.1053640735414638, + "grad_norm": 0.12019840627908707, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 285940 + }, + { + "epoch": 1.105402730744847, + "grad_norm": 0.12803016602993011, + "learning_rate": 0.002, + "loss": 2.33, + "step": 285950 + }, + { + "epoch": 1.1054413879482303, + "grad_norm": 0.11855556070804596, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 285960 + }, + { + "epoch": 1.1054800451516136, + "grad_norm": 0.0914088562130928, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 285970 + }, + { + "epoch": 1.1055187023549968, + "grad_norm": 0.10145406424999237, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 285980 + }, + { + "epoch": 1.10555735955838, + "grad_norm": 0.1333356648683548, + "learning_rate": 0.002, + "loss": 2.32, + "step": 285990 + }, + { + "epoch": 1.1055960167617633, + "grad_norm": 0.12203472852706909, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 286000 + }, + { + "epoch": 1.1056346739651466, + "grad_norm": 0.11965616047382355, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 286010 + }, + { + "epoch": 1.1056733311685298, + "grad_norm": 0.09209474176168442, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 286020 + }, + { + "epoch": 1.1057119883719133, + "grad_norm": 0.0947568342089653, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 286030 + }, + { + "epoch": 1.1057506455752966, + "grad_norm": 0.11331100016832352, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 286040 + }, + { + "epoch": 1.1057893027786798, + "grad_norm": 0.09801577776670456, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 286050 + }, + { + "epoch": 1.105827959982063, + "grad_norm": 0.09460748732089996, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 286060 + }, + { + "epoch": 1.1058666171854463, + "grad_norm": 0.10627475380897522, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 286070 + }, + { + "epoch": 1.1059052743888296, + "grad_norm": 0.10834797471761703, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 286080 + }, + { + "epoch": 1.1059439315922128, + "grad_norm": 0.1028374433517456, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 286090 + }, + { + "epoch": 1.105982588795596, + "grad_norm": 0.130677729845047, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 286100 + }, + { + "epoch": 1.1060212459989796, + "grad_norm": 0.09175800532102585, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 286110 + }, + { + "epoch": 1.1060599032023628, + "grad_norm": 0.1281922161579132, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 286120 + }, + { + "epoch": 1.106098560405746, + "grad_norm": 0.10538359731435776, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 286130 + }, + { + "epoch": 1.1061372176091293, + "grad_norm": 0.1097087562084198, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 286140 + }, + { + "epoch": 1.1061758748125126, + "grad_norm": 0.09895108640193939, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 286150 + }, + { + "epoch": 1.1062145320158958, + "grad_norm": 0.11682061851024628, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 286160 + }, + { + "epoch": 1.106253189219279, + "grad_norm": 0.09236068278551102, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 286170 + }, + { + "epoch": 1.1062918464226623, + "grad_norm": 0.1044696792960167, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 286180 + }, + { + "epoch": 1.1063305036260456, + "grad_norm": 0.10135937482118607, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 286190 + }, + { + "epoch": 1.106369160829429, + "grad_norm": 0.09955133497714996, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 286200 + }, + { + "epoch": 1.1064078180328123, + "grad_norm": 0.11692699790000916, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 286210 + }, + { + "epoch": 1.1064464752361955, + "grad_norm": 0.0946819931268692, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 286220 + }, + { + "epoch": 1.1064851324395788, + "grad_norm": 0.10226627439260483, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 286230 + }, + { + "epoch": 1.106523789642962, + "grad_norm": 0.10247496515512466, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 286240 + }, + { + "epoch": 1.1065624468463453, + "grad_norm": 0.10509783029556274, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 286250 + }, + { + "epoch": 1.1066011040497286, + "grad_norm": 0.11241637170314789, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 286260 + }, + { + "epoch": 1.1066397612531118, + "grad_norm": 0.16422449052333832, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 286270 + }, + { + "epoch": 1.1066784184564953, + "grad_norm": 0.11735934764146805, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 286280 + }, + { + "epoch": 1.1067170756598785, + "grad_norm": 0.09991582483053207, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 286290 + }, + { + "epoch": 1.1067557328632618, + "grad_norm": 0.09954442828893661, + "learning_rate": 0.002, + "loss": 2.326, + "step": 286300 + }, + { + "epoch": 1.106794390066645, + "grad_norm": 0.10756408423185349, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 286310 + }, + { + "epoch": 1.1068330472700283, + "grad_norm": 0.10762269049882889, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 286320 + }, + { + "epoch": 1.1068717044734115, + "grad_norm": 0.1275537610054016, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 286330 + }, + { + "epoch": 1.1069103616767948, + "grad_norm": 0.10587187111377716, + "learning_rate": 0.002, + "loss": 2.3102, + "step": 286340 + }, + { + "epoch": 1.106949018880178, + "grad_norm": 0.10666114091873169, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 286350 + }, + { + "epoch": 1.1069876760835613, + "grad_norm": 0.11461252719163895, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 286360 + }, + { + "epoch": 1.1070263332869448, + "grad_norm": 0.13306303322315216, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 286370 + }, + { + "epoch": 1.107064990490328, + "grad_norm": 0.10906513780355453, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 286380 + }, + { + "epoch": 1.1071036476937113, + "grad_norm": 0.10084366053342819, + "learning_rate": 0.002, + "loss": 2.339, + "step": 286390 + }, + { + "epoch": 1.1071423048970945, + "grad_norm": 0.1198674812912941, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 286400 + }, + { + "epoch": 1.1071809621004778, + "grad_norm": 0.09597591310739517, + "learning_rate": 0.002, + "loss": 2.334, + "step": 286410 + }, + { + "epoch": 1.107219619303861, + "grad_norm": 0.12501464784145355, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 286420 + }, + { + "epoch": 1.1072582765072443, + "grad_norm": 0.10793590545654297, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 286430 + }, + { + "epoch": 1.1072969337106275, + "grad_norm": 0.09429333359003067, + "learning_rate": 0.002, + "loss": 2.328, + "step": 286440 + }, + { + "epoch": 1.107335590914011, + "grad_norm": 0.09729223698377609, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 286450 + }, + { + "epoch": 1.1073742481173943, + "grad_norm": 0.09174039214849472, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 286460 + }, + { + "epoch": 1.1074129053207775, + "grad_norm": 0.10584468394517899, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 286470 + }, + { + "epoch": 1.1074515625241608, + "grad_norm": 0.10441362112760544, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 286480 + }, + { + "epoch": 1.107490219727544, + "grad_norm": 0.09315589815378189, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 286490 + }, + { + "epoch": 1.1075288769309273, + "grad_norm": 0.08834546059370041, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 286500 + }, + { + "epoch": 1.1075675341343105, + "grad_norm": 0.10272207856178284, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 286510 + }, + { + "epoch": 1.1076061913376938, + "grad_norm": 0.15574143826961517, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 286520 + }, + { + "epoch": 1.107644848541077, + "grad_norm": 0.09827958792448044, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 286530 + }, + { + "epoch": 1.1076835057444605, + "grad_norm": 0.10926888883113861, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 286540 + }, + { + "epoch": 1.1077221629478438, + "grad_norm": 0.10553573817014694, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 286550 + }, + { + "epoch": 1.107760820151227, + "grad_norm": 0.1055358499288559, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 286560 + }, + { + "epoch": 1.1077994773546103, + "grad_norm": 0.11329042911529541, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 286570 + }, + { + "epoch": 1.1078381345579935, + "grad_norm": 0.09704306721687317, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 286580 + }, + { + "epoch": 1.1078767917613768, + "grad_norm": 0.10216046124696732, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 286590 + }, + { + "epoch": 1.10791544896476, + "grad_norm": 0.10295818001031876, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 286600 + }, + { + "epoch": 1.1079541061681433, + "grad_norm": 0.14099784195423126, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 286610 + }, + { + "epoch": 1.1079927633715267, + "grad_norm": 0.10787280648946762, + "learning_rate": 0.002, + "loss": 2.324, + "step": 286620 + }, + { + "epoch": 1.10803142057491, + "grad_norm": 0.10273903608322144, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 286630 + }, + { + "epoch": 1.1080700777782932, + "grad_norm": 0.09775515645742416, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 286640 + }, + { + "epoch": 1.1081087349816765, + "grad_norm": 0.11061781644821167, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 286650 + }, + { + "epoch": 1.1081473921850598, + "grad_norm": 0.12733496725559235, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 286660 + }, + { + "epoch": 1.108186049388443, + "grad_norm": 0.1342155486345291, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 286670 + }, + { + "epoch": 1.1082247065918263, + "grad_norm": 0.09409618377685547, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 286680 + }, + { + "epoch": 1.1082633637952095, + "grad_norm": 0.10852969437837601, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 286690 + }, + { + "epoch": 1.1083020209985928, + "grad_norm": 0.09696200489997864, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 286700 + }, + { + "epoch": 1.1083406782019762, + "grad_norm": 0.09392782300710678, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 286710 + }, + { + "epoch": 1.1083793354053595, + "grad_norm": 0.11639557033777237, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 286720 + }, + { + "epoch": 1.1084179926087427, + "grad_norm": 0.11509211361408234, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 286730 + }, + { + "epoch": 1.108456649812126, + "grad_norm": 0.09146511554718018, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 286740 + }, + { + "epoch": 1.1084953070155092, + "grad_norm": 0.11421307176351547, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 286750 + }, + { + "epoch": 1.1085339642188925, + "grad_norm": 0.10285865515470505, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 286760 + }, + { + "epoch": 1.1085726214222758, + "grad_norm": 0.10659022629261017, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 286770 + }, + { + "epoch": 1.108611278625659, + "grad_norm": 0.0977044478058815, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 286780 + }, + { + "epoch": 1.1086499358290425, + "grad_norm": 0.11840414255857468, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 286790 + }, + { + "epoch": 1.1086885930324257, + "grad_norm": 0.09854122251272202, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 286800 + }, + { + "epoch": 1.108727250235809, + "grad_norm": 0.10114125907421112, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 286810 + }, + { + "epoch": 1.1087659074391922, + "grad_norm": 0.10757792741060257, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 286820 + }, + { + "epoch": 1.1088045646425755, + "grad_norm": 0.1165936067700386, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 286830 + }, + { + "epoch": 1.1088432218459587, + "grad_norm": 0.10642943531274796, + "learning_rate": 0.002, + "loss": 2.3559, + "step": 286840 + }, + { + "epoch": 1.108881879049342, + "grad_norm": 0.19730453193187714, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 286850 + }, + { + "epoch": 1.1089205362527252, + "grad_norm": 0.10663972049951553, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 286860 + }, + { + "epoch": 1.1089591934561085, + "grad_norm": 0.09699961543083191, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 286870 + }, + { + "epoch": 1.108997850659492, + "grad_norm": 0.09914124757051468, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 286880 + }, + { + "epoch": 1.1090365078628752, + "grad_norm": 0.09312078356742859, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 286890 + }, + { + "epoch": 1.1090751650662585, + "grad_norm": 0.11414426565170288, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 286900 + }, + { + "epoch": 1.1091138222696417, + "grad_norm": 0.1020946130156517, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 286910 + }, + { + "epoch": 1.109152479473025, + "grad_norm": 0.10086391866207123, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 286920 + }, + { + "epoch": 1.1091911366764082, + "grad_norm": 0.10533702373504639, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 286930 + }, + { + "epoch": 1.1092297938797915, + "grad_norm": 0.09942932426929474, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 286940 + }, + { + "epoch": 1.109268451083175, + "grad_norm": 0.0986490547657013, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 286950 + }, + { + "epoch": 1.1093071082865582, + "grad_norm": 0.10131828486919403, + "learning_rate": 0.002, + "loss": 2.338, + "step": 286960 + }, + { + "epoch": 1.1093457654899415, + "grad_norm": 0.10056748241186142, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 286970 + }, + { + "epoch": 1.1093844226933247, + "grad_norm": 0.12299492210149765, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 286980 + }, + { + "epoch": 1.109423079896708, + "grad_norm": 0.09333730489015579, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 286990 + }, + { + "epoch": 1.1094617371000912, + "grad_norm": 0.10080965608358383, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 287000 + }, + { + "epoch": 1.1095003943034745, + "grad_norm": 0.09615743905305862, + "learning_rate": 0.002, + "loss": 2.331, + "step": 287010 + }, + { + "epoch": 1.1095390515068577, + "grad_norm": 0.10049605369567871, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 287020 + }, + { + "epoch": 1.109577708710241, + "grad_norm": 0.10170258581638336, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 287030 + }, + { + "epoch": 1.1096163659136244, + "grad_norm": 0.09737537056207657, + "learning_rate": 0.002, + "loss": 2.337, + "step": 287040 + }, + { + "epoch": 1.1096550231170077, + "grad_norm": 0.12915514409542084, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 287050 + }, + { + "epoch": 1.109693680320391, + "grad_norm": 0.12582509219646454, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 287060 + }, + { + "epoch": 1.1097323375237742, + "grad_norm": 0.6325317025184631, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 287070 + }, + { + "epoch": 1.1097709947271575, + "grad_norm": 0.09826202690601349, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 287080 + }, + { + "epoch": 1.1098096519305407, + "grad_norm": 0.10656698793172836, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 287090 + }, + { + "epoch": 1.109848309133924, + "grad_norm": 0.10194915533065796, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 287100 + }, + { + "epoch": 1.1098869663373072, + "grad_norm": 0.10883122682571411, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 287110 + }, + { + "epoch": 1.1099256235406907, + "grad_norm": 0.09845606237649918, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 287120 + }, + { + "epoch": 1.109964280744074, + "grad_norm": 0.11428073793649673, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 287130 + }, + { + "epoch": 1.1100029379474572, + "grad_norm": 0.1102704256772995, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 287140 + }, + { + "epoch": 1.1100415951508404, + "grad_norm": 0.11757481098175049, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 287150 + }, + { + "epoch": 1.1100802523542237, + "grad_norm": 0.1087174043059349, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 287160 + }, + { + "epoch": 1.110118909557607, + "grad_norm": 0.09554096311330795, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 287170 + }, + { + "epoch": 1.1101575667609902, + "grad_norm": 0.13887332379817963, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 287180 + }, + { + "epoch": 1.1101962239643735, + "grad_norm": 0.11097030341625214, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 287190 + }, + { + "epoch": 1.1102348811677567, + "grad_norm": 0.09805213660001755, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 287200 + }, + { + "epoch": 1.1102735383711402, + "grad_norm": 0.10877746343612671, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 287210 + }, + { + "epoch": 1.1103121955745234, + "grad_norm": 0.09850569069385529, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 287220 + }, + { + "epoch": 1.1103508527779067, + "grad_norm": 0.10622706264257431, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 287230 + }, + { + "epoch": 1.11038950998129, + "grad_norm": 0.10253924876451492, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 287240 + }, + { + "epoch": 1.1104281671846732, + "grad_norm": 0.10554666072130203, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 287250 + }, + { + "epoch": 1.1104668243880564, + "grad_norm": 0.1024741381406784, + "learning_rate": 0.002, + "loss": 2.325, + "step": 287260 + }, + { + "epoch": 1.1105054815914397, + "grad_norm": 0.12398812174797058, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 287270 + }, + { + "epoch": 1.110544138794823, + "grad_norm": 0.09940765053033829, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 287280 + }, + { + "epoch": 1.1105827959982064, + "grad_norm": 0.09944300353527069, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 287290 + }, + { + "epoch": 1.1106214532015897, + "grad_norm": 0.10488318651914597, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 287300 + }, + { + "epoch": 1.110660110404973, + "grad_norm": 0.11721440404653549, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 287310 + }, + { + "epoch": 1.1106987676083562, + "grad_norm": 0.10939953476190567, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 287320 + }, + { + "epoch": 1.1107374248117394, + "grad_norm": 0.0925823301076889, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 287330 + }, + { + "epoch": 1.1107760820151227, + "grad_norm": 0.08701872080564499, + "learning_rate": 0.002, + "loss": 2.34, + "step": 287340 + }, + { + "epoch": 1.110814739218506, + "grad_norm": 0.10976503044366837, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 287350 + }, + { + "epoch": 1.1108533964218892, + "grad_norm": 0.1309625506401062, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 287360 + }, + { + "epoch": 1.1108920536252724, + "grad_norm": 0.09811020642518997, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 287370 + }, + { + "epoch": 1.110930710828656, + "grad_norm": 0.10170017927885056, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 287380 + }, + { + "epoch": 1.1109693680320392, + "grad_norm": 0.09560930728912354, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 287390 + }, + { + "epoch": 1.1110080252354224, + "grad_norm": 0.11622206121683121, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 287400 + }, + { + "epoch": 1.1110466824388057, + "grad_norm": 0.10059899091720581, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 287410 + }, + { + "epoch": 1.111085339642189, + "grad_norm": 0.0940239429473877, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 287420 + }, + { + "epoch": 1.1111239968455722, + "grad_norm": 0.13035322725772858, + "learning_rate": 0.002, + "loss": 2.341, + "step": 287430 + }, + { + "epoch": 1.1111626540489554, + "grad_norm": 0.11438962817192078, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 287440 + }, + { + "epoch": 1.1112013112523387, + "grad_norm": 0.08248197287321091, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 287450 + }, + { + "epoch": 1.1112399684557221, + "grad_norm": 0.10871239006519318, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 287460 + }, + { + "epoch": 1.1112786256591054, + "grad_norm": 0.1135963648557663, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 287470 + }, + { + "epoch": 1.1113172828624887, + "grad_norm": 0.12674102187156677, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 287480 + }, + { + "epoch": 1.111355940065872, + "grad_norm": 0.10381345450878143, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 287490 + }, + { + "epoch": 1.1113945972692552, + "grad_norm": 0.10797566920518875, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 287500 + }, + { + "epoch": 1.1114332544726384, + "grad_norm": 0.09637517482042313, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 287510 + }, + { + "epoch": 1.1114719116760217, + "grad_norm": 0.09481417387723923, + "learning_rate": 0.002, + "loss": 2.35, + "step": 287520 + }, + { + "epoch": 1.111510568879405, + "grad_norm": 0.11646459251642227, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 287530 + }, + { + "epoch": 1.1115492260827882, + "grad_norm": 0.10901686549186707, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 287540 + }, + { + "epoch": 1.1115878832861716, + "grad_norm": 0.0974387377500534, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 287550 + }, + { + "epoch": 1.111626540489555, + "grad_norm": 0.12838047742843628, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 287560 + }, + { + "epoch": 1.1116651976929381, + "grad_norm": 0.2370108962059021, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 287570 + }, + { + "epoch": 1.1117038548963214, + "grad_norm": 0.27904149889945984, + "learning_rate": 0.002, + "loss": 2.333, + "step": 287580 + }, + { + "epoch": 1.1117425120997046, + "grad_norm": 0.13837091624736786, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 287590 + }, + { + "epoch": 1.111781169303088, + "grad_norm": 0.10299310088157654, + "learning_rate": 0.002, + "loss": 2.338, + "step": 287600 + }, + { + "epoch": 1.1118198265064712, + "grad_norm": 0.10843832790851593, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 287610 + }, + { + "epoch": 1.1118584837098544, + "grad_norm": 0.11448937654495239, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 287620 + }, + { + "epoch": 1.1118971409132379, + "grad_norm": 0.10186512768268585, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 287630 + }, + { + "epoch": 1.1119357981166211, + "grad_norm": 0.12241533398628235, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 287640 + }, + { + "epoch": 1.1119744553200044, + "grad_norm": 0.11468174308538437, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 287650 + }, + { + "epoch": 1.1120131125233876, + "grad_norm": 0.11049031466245651, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 287660 + }, + { + "epoch": 1.112051769726771, + "grad_norm": 0.10353521257638931, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 287670 + }, + { + "epoch": 1.1120904269301541, + "grad_norm": 0.11140059679746628, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 287680 + }, + { + "epoch": 1.1121290841335374, + "grad_norm": 0.09088698029518127, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 287690 + }, + { + "epoch": 1.1121677413369206, + "grad_norm": 0.12379549443721771, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 287700 + }, + { + "epoch": 1.112206398540304, + "grad_norm": 0.11208084970712662, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 287710 + }, + { + "epoch": 1.1122450557436874, + "grad_norm": 0.10558558255434036, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 287720 + }, + { + "epoch": 1.1122837129470706, + "grad_norm": 0.10928310453891754, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 287730 + }, + { + "epoch": 1.1123223701504539, + "grad_norm": 0.12338408082723618, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 287740 + }, + { + "epoch": 1.1123610273538371, + "grad_norm": 0.16096697747707367, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 287750 + }, + { + "epoch": 1.1123996845572204, + "grad_norm": 0.10622543096542358, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 287760 + }, + { + "epoch": 1.1124383417606036, + "grad_norm": 0.12482261657714844, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 287770 + }, + { + "epoch": 1.1124769989639869, + "grad_norm": 0.09466690570116043, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 287780 + }, + { + "epoch": 1.1125156561673701, + "grad_norm": 0.1149539053440094, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 287790 + }, + { + "epoch": 1.1125543133707536, + "grad_norm": 0.10789894312620163, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 287800 + }, + { + "epoch": 1.1125929705741369, + "grad_norm": 0.11976473778486252, + "learning_rate": 0.002, + "loss": 2.305, + "step": 287810 + }, + { + "epoch": 1.1126316277775201, + "grad_norm": 0.09605703502893448, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 287820 + }, + { + "epoch": 1.1126702849809034, + "grad_norm": 0.0961158275604248, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 287830 + }, + { + "epoch": 1.1127089421842866, + "grad_norm": 0.11573757231235504, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 287840 + }, + { + "epoch": 1.1127475993876699, + "grad_norm": 0.10903476923704147, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 287850 + }, + { + "epoch": 1.1127862565910531, + "grad_norm": 0.14516030251979828, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 287860 + }, + { + "epoch": 1.1128249137944364, + "grad_norm": 0.10482911020517349, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 287870 + }, + { + "epoch": 1.1128635709978196, + "grad_norm": 0.09693562984466553, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 287880 + }, + { + "epoch": 1.112902228201203, + "grad_norm": 0.10743766278028488, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 287890 + }, + { + "epoch": 1.1129408854045864, + "grad_norm": 0.11316178739070892, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 287900 + }, + { + "epoch": 1.1129795426079696, + "grad_norm": 0.11615628749132156, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 287910 + }, + { + "epoch": 1.1130181998113529, + "grad_norm": 0.09100455045700073, + "learning_rate": 0.002, + "loss": 2.351, + "step": 287920 + }, + { + "epoch": 1.1130568570147361, + "grad_norm": 0.10525690019130707, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 287930 + }, + { + "epoch": 1.1130955142181194, + "grad_norm": 0.10136166214942932, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 287940 + }, + { + "epoch": 1.1131341714215026, + "grad_norm": 0.1073971837759018, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 287950 + }, + { + "epoch": 1.1131728286248859, + "grad_norm": 0.11636967957019806, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 287960 + }, + { + "epoch": 1.1132114858282693, + "grad_norm": 0.10656551271677017, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 287970 + }, + { + "epoch": 1.1132501430316526, + "grad_norm": 0.11081990599632263, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 287980 + }, + { + "epoch": 1.1132888002350358, + "grad_norm": 0.11542163044214249, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 287990 + }, + { + "epoch": 1.113327457438419, + "grad_norm": 0.09533045440912247, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 288000 + }, + { + "epoch": 1.1133661146418024, + "grad_norm": 0.09483199566602707, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 288010 + }, + { + "epoch": 1.1134047718451856, + "grad_norm": 0.114728644490242, + "learning_rate": 0.002, + "loss": 2.324, + "step": 288020 + }, + { + "epoch": 1.1134434290485689, + "grad_norm": 0.11476054787635803, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 288030 + }, + { + "epoch": 1.113482086251952, + "grad_norm": 0.10087236016988754, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 288040 + }, + { + "epoch": 1.1135207434553354, + "grad_norm": 0.10353070497512817, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 288050 + }, + { + "epoch": 1.1135594006587188, + "grad_norm": 0.1241433322429657, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 288060 + }, + { + "epoch": 1.113598057862102, + "grad_norm": 0.09710206091403961, + "learning_rate": 0.002, + "loss": 2.342, + "step": 288070 + }, + { + "epoch": 1.1136367150654853, + "grad_norm": 0.11962775886058807, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 288080 + }, + { + "epoch": 1.1136753722688686, + "grad_norm": 0.1040009930729866, + "learning_rate": 0.002, + "loss": 2.344, + "step": 288090 + }, + { + "epoch": 1.1137140294722518, + "grad_norm": 0.13171091675758362, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 288100 + }, + { + "epoch": 1.113752686675635, + "grad_norm": 0.09785674512386322, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 288110 + }, + { + "epoch": 1.1137913438790183, + "grad_norm": 0.10102818161249161, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 288120 + }, + { + "epoch": 1.1138300010824016, + "grad_norm": 0.10420387238264084, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 288130 + }, + { + "epoch": 1.113868658285785, + "grad_norm": 0.10302894562482834, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 288140 + }, + { + "epoch": 1.1139073154891683, + "grad_norm": 0.1063724160194397, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 288150 + }, + { + "epoch": 1.1139459726925516, + "grad_norm": 0.09987162053585052, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 288160 + }, + { + "epoch": 1.1139846298959348, + "grad_norm": 0.10557962954044342, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 288170 + }, + { + "epoch": 1.114023287099318, + "grad_norm": 0.10160104185342789, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 288180 + }, + { + "epoch": 1.1140619443027013, + "grad_norm": 0.0960058942437172, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 288190 + }, + { + "epoch": 1.1141006015060846, + "grad_norm": 0.09926880151033401, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 288200 + }, + { + "epoch": 1.1141392587094678, + "grad_norm": 0.10220940411090851, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 288210 + }, + { + "epoch": 1.114177915912851, + "grad_norm": 0.11922407895326614, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 288220 + }, + { + "epoch": 1.1142165731162346, + "grad_norm": 0.10637892037630081, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 288230 + }, + { + "epoch": 1.1142552303196178, + "grad_norm": 0.0881463959813118, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 288240 + }, + { + "epoch": 1.114293887523001, + "grad_norm": 0.10221447050571442, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 288250 + }, + { + "epoch": 1.1143325447263843, + "grad_norm": 0.12899447977542877, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 288260 + }, + { + "epoch": 1.1143712019297676, + "grad_norm": 0.12827207148075104, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 288270 + }, + { + "epoch": 1.1144098591331508, + "grad_norm": 0.10625769942998886, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 288280 + }, + { + "epoch": 1.114448516336534, + "grad_norm": 0.09953627735376358, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 288290 + }, + { + "epoch": 1.1144871735399173, + "grad_norm": 0.11532321572303772, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 288300 + }, + { + "epoch": 1.1145258307433008, + "grad_norm": 0.10999681055545807, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 288310 + }, + { + "epoch": 1.114564487946684, + "grad_norm": 0.13596197962760925, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 288320 + }, + { + "epoch": 1.1146031451500673, + "grad_norm": 0.10923830419778824, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 288330 + }, + { + "epoch": 1.1146418023534506, + "grad_norm": 0.10871526598930359, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 288340 + }, + { + "epoch": 1.1146804595568338, + "grad_norm": 0.12883463501930237, + "learning_rate": 0.002, + "loss": 2.339, + "step": 288350 + }, + { + "epoch": 1.114719116760217, + "grad_norm": 0.11219342797994614, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 288360 + }, + { + "epoch": 1.1147577739636003, + "grad_norm": 0.10813488811254501, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 288370 + }, + { + "epoch": 1.1147964311669836, + "grad_norm": 0.12277258187532425, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 288380 + }, + { + "epoch": 1.1148350883703668, + "grad_norm": 0.09526970237493515, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 288390 + }, + { + "epoch": 1.1148737455737503, + "grad_norm": 0.09309110045433044, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 288400 + }, + { + "epoch": 1.1149124027771335, + "grad_norm": 0.1104973778128624, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 288410 + }, + { + "epoch": 1.1149510599805168, + "grad_norm": 0.09410668164491653, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 288420 + }, + { + "epoch": 1.1149897171839, + "grad_norm": 0.0946255549788475, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 288430 + }, + { + "epoch": 1.1150283743872833, + "grad_norm": 0.10899773985147476, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 288440 + }, + { + "epoch": 1.1150670315906666, + "grad_norm": 0.1305028200149536, + "learning_rate": 0.002, + "loss": 2.342, + "step": 288450 + }, + { + "epoch": 1.1151056887940498, + "grad_norm": 0.11407119035720825, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 288460 + }, + { + "epoch": 1.115144345997433, + "grad_norm": 0.10499758273363113, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 288470 + }, + { + "epoch": 1.1151830032008165, + "grad_norm": 0.11272277683019638, + "learning_rate": 0.002, + "loss": 2.339, + "step": 288480 + }, + { + "epoch": 1.1152216604041998, + "grad_norm": 0.10035009682178497, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 288490 + }, + { + "epoch": 1.115260317607583, + "grad_norm": 0.10695807635784149, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 288500 + }, + { + "epoch": 1.1152989748109663, + "grad_norm": 0.10339631140232086, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 288510 + }, + { + "epoch": 1.1153376320143495, + "grad_norm": 0.09223701804876328, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 288520 + }, + { + "epoch": 1.1153762892177328, + "grad_norm": 0.1100502610206604, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 288530 + }, + { + "epoch": 1.115414946421116, + "grad_norm": 0.1022772565484047, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 288540 + }, + { + "epoch": 1.1154536036244993, + "grad_norm": 0.09872733801603317, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 288550 + }, + { + "epoch": 1.1154922608278826, + "grad_norm": 0.10790759325027466, + "learning_rate": 0.002, + "loss": 2.32, + "step": 288560 + }, + { + "epoch": 1.115530918031266, + "grad_norm": 0.11124914139509201, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 288570 + }, + { + "epoch": 1.1155695752346493, + "grad_norm": 0.11812636256217957, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 288580 + }, + { + "epoch": 1.1156082324380325, + "grad_norm": 0.12386882305145264, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 288590 + }, + { + "epoch": 1.1156468896414158, + "grad_norm": 0.10007230192422867, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 288600 + }, + { + "epoch": 1.115685546844799, + "grad_norm": 0.11115007102489471, + "learning_rate": 0.002, + "loss": 2.338, + "step": 288610 + }, + { + "epoch": 1.1157242040481823, + "grad_norm": 0.11161840707063675, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 288620 + }, + { + "epoch": 1.1157628612515655, + "grad_norm": 0.10194922238588333, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 288630 + }, + { + "epoch": 1.1158015184549488, + "grad_norm": 0.11764828115701675, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 288640 + }, + { + "epoch": 1.1158401756583323, + "grad_norm": 0.1009848415851593, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 288650 + }, + { + "epoch": 1.1158788328617155, + "grad_norm": 0.14312507212162018, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 288660 + }, + { + "epoch": 1.1159174900650988, + "grad_norm": 0.1186060905456543, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 288670 + }, + { + "epoch": 1.115956147268482, + "grad_norm": 0.10075625777244568, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 288680 + }, + { + "epoch": 1.1159948044718653, + "grad_norm": 0.5035440325737, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 288690 + }, + { + "epoch": 1.1160334616752485, + "grad_norm": 0.11094953864812851, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 288700 + }, + { + "epoch": 1.1160721188786318, + "grad_norm": 0.08963083475828171, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 288710 + }, + { + "epoch": 1.116110776082015, + "grad_norm": 0.11137071996927261, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 288720 + }, + { + "epoch": 1.1161494332853983, + "grad_norm": 0.09859320521354675, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 288730 + }, + { + "epoch": 1.1161880904887818, + "grad_norm": 0.10735145956277847, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 288740 + }, + { + "epoch": 1.116226747692165, + "grad_norm": 0.10665154457092285, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 288750 + }, + { + "epoch": 1.1162654048955483, + "grad_norm": 0.09725417196750641, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 288760 + }, + { + "epoch": 1.1163040620989315, + "grad_norm": 0.11257147789001465, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 288770 + }, + { + "epoch": 1.1163427193023148, + "grad_norm": 0.12395235151052475, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 288780 + }, + { + "epoch": 1.116381376505698, + "grad_norm": 0.09967972338199615, + "learning_rate": 0.002, + "loss": 2.338, + "step": 288790 + }, + { + "epoch": 1.1164200337090813, + "grad_norm": 0.10118284821510315, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 288800 + }, + { + "epoch": 1.1164586909124647, + "grad_norm": 0.10583081096410751, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 288810 + }, + { + "epoch": 1.116497348115848, + "grad_norm": 0.1105639636516571, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 288820 + }, + { + "epoch": 1.1165360053192313, + "grad_norm": 0.10586509108543396, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 288830 + }, + { + "epoch": 1.1165746625226145, + "grad_norm": 0.10803575068712234, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 288840 + }, + { + "epoch": 1.1166133197259978, + "grad_norm": 0.0964178740978241, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 288850 + }, + { + "epoch": 1.116651976929381, + "grad_norm": 0.11801418662071228, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 288860 + }, + { + "epoch": 1.1166906341327643, + "grad_norm": 0.10151393711566925, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 288870 + }, + { + "epoch": 1.1167292913361475, + "grad_norm": 0.0936862900853157, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 288880 + }, + { + "epoch": 1.1167679485395308, + "grad_norm": 0.11406335979700089, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 288890 + }, + { + "epoch": 1.1168066057429142, + "grad_norm": 0.1116715669631958, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 288900 + }, + { + "epoch": 1.1168452629462975, + "grad_norm": 0.10459590703248978, + "learning_rate": 0.002, + "loss": 2.352, + "step": 288910 + }, + { + "epoch": 1.1168839201496807, + "grad_norm": 0.11433672904968262, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 288920 + }, + { + "epoch": 1.116922577353064, + "grad_norm": 0.09518077969551086, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 288930 + }, + { + "epoch": 1.1169612345564472, + "grad_norm": 0.11603035777807236, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 288940 + }, + { + "epoch": 1.1169998917598305, + "grad_norm": 0.11251766979694366, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 288950 + }, + { + "epoch": 1.1170385489632138, + "grad_norm": 0.09062660485506058, + "learning_rate": 0.002, + "loss": 2.329, + "step": 288960 + }, + { + "epoch": 1.117077206166597, + "grad_norm": 0.1127210482954979, + "learning_rate": 0.002, + "loss": 2.325, + "step": 288970 + }, + { + "epoch": 1.1171158633699805, + "grad_norm": 0.1010056659579277, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 288980 + }, + { + "epoch": 1.1171545205733637, + "grad_norm": 0.09560646116733551, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 288990 + }, + { + "epoch": 1.117193177776747, + "grad_norm": 0.12088503688573837, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 289000 + }, + { + "epoch": 1.1172318349801302, + "grad_norm": 0.1045701801776886, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 289010 + }, + { + "epoch": 1.1172704921835135, + "grad_norm": 0.10648337751626968, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 289020 + }, + { + "epoch": 1.1173091493868967, + "grad_norm": 0.10930231958627701, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 289030 + }, + { + "epoch": 1.11734780659028, + "grad_norm": 0.11172724515199661, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 289040 + }, + { + "epoch": 1.1173864637936632, + "grad_norm": 0.10152576118707657, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 289050 + }, + { + "epoch": 1.1174251209970465, + "grad_norm": 0.14760203659534454, + "learning_rate": 0.002, + "loss": 2.358, + "step": 289060 + }, + { + "epoch": 1.11746377820043, + "grad_norm": 0.08888938277959824, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 289070 + }, + { + "epoch": 1.1175024354038132, + "grad_norm": 0.09549479931592941, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 289080 + }, + { + "epoch": 1.1175410926071965, + "grad_norm": 0.11979442834854126, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 289090 + }, + { + "epoch": 1.1175797498105797, + "grad_norm": 0.1341460645198822, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 289100 + }, + { + "epoch": 1.117618407013963, + "grad_norm": 0.11074167490005493, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 289110 + }, + { + "epoch": 1.1176570642173462, + "grad_norm": 0.10645493119955063, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 289120 + }, + { + "epoch": 1.1176957214207295, + "grad_norm": 0.10313550382852554, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 289130 + }, + { + "epoch": 1.1177343786241127, + "grad_norm": 0.1280437409877777, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 289140 + }, + { + "epoch": 1.1177730358274962, + "grad_norm": 0.10288901627063751, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 289150 + }, + { + "epoch": 1.1178116930308795, + "grad_norm": 0.12505581974983215, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 289160 + }, + { + "epoch": 1.1178503502342627, + "grad_norm": 0.09792499244213104, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 289170 + }, + { + "epoch": 1.117889007437646, + "grad_norm": 0.10279227793216705, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 289180 + }, + { + "epoch": 1.1179276646410292, + "grad_norm": 0.10783105343580246, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 289190 + }, + { + "epoch": 1.1179663218444125, + "grad_norm": 0.09635636955499649, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 289200 + }, + { + "epoch": 1.1180049790477957, + "grad_norm": 0.10871084779500961, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 289210 + }, + { + "epoch": 1.118043636251179, + "grad_norm": 0.11938074976205826, + "learning_rate": 0.002, + "loss": 2.344, + "step": 289220 + }, + { + "epoch": 1.1180822934545622, + "grad_norm": 0.12168192863464355, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 289230 + }, + { + "epoch": 1.1181209506579457, + "grad_norm": 0.09364253282546997, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 289240 + }, + { + "epoch": 1.118159607861329, + "grad_norm": 0.0911431536078453, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 289250 + }, + { + "epoch": 1.1181982650647122, + "grad_norm": 0.10865811258554459, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 289260 + }, + { + "epoch": 1.1182369222680955, + "grad_norm": 0.13914142549037933, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 289270 + }, + { + "epoch": 1.1182755794714787, + "grad_norm": 0.11465007066726685, + "learning_rate": 0.002, + "loss": 2.3631, + "step": 289280 + }, + { + "epoch": 1.118314236674862, + "grad_norm": 0.12156081199645996, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 289290 + }, + { + "epoch": 1.1183528938782452, + "grad_norm": 0.10695220530033112, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 289300 + }, + { + "epoch": 1.1183915510816285, + "grad_norm": 0.10531539469957352, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 289310 + }, + { + "epoch": 1.118430208285012, + "grad_norm": 0.09656589478254318, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 289320 + }, + { + "epoch": 1.1184688654883952, + "grad_norm": 0.10795163363218307, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 289330 + }, + { + "epoch": 1.1185075226917784, + "grad_norm": 0.10545086860656738, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 289340 + }, + { + "epoch": 1.1185461798951617, + "grad_norm": 0.10557561367750168, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 289350 + }, + { + "epoch": 1.118584837098545, + "grad_norm": 0.11986632645130157, + "learning_rate": 0.002, + "loss": 2.325, + "step": 289360 + }, + { + "epoch": 1.1186234943019282, + "grad_norm": 0.12289483100175858, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 289370 + }, + { + "epoch": 1.1186621515053115, + "grad_norm": 0.1210024505853653, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 289380 + }, + { + "epoch": 1.1187008087086947, + "grad_norm": 0.10017368942499161, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 289390 + }, + { + "epoch": 1.118739465912078, + "grad_norm": 0.09998578578233719, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 289400 + }, + { + "epoch": 1.1187781231154614, + "grad_norm": 0.09792304039001465, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 289410 + }, + { + "epoch": 1.1188167803188447, + "grad_norm": 0.09898721426725388, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 289420 + }, + { + "epoch": 1.118855437522228, + "grad_norm": 0.15388897061347961, + "learning_rate": 0.002, + "loss": 2.336, + "step": 289430 + }, + { + "epoch": 1.1188940947256112, + "grad_norm": 0.11085615307092667, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 289440 + }, + { + "epoch": 1.1189327519289944, + "grad_norm": 0.11091116815805435, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 289450 + }, + { + "epoch": 1.1189714091323777, + "grad_norm": 0.11337563395500183, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 289460 + }, + { + "epoch": 1.119010066335761, + "grad_norm": 0.12272960692644119, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 289470 + }, + { + "epoch": 1.1190487235391442, + "grad_norm": 0.09862526506185532, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 289480 + }, + { + "epoch": 1.1190873807425277, + "grad_norm": 0.11311447620391846, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 289490 + }, + { + "epoch": 1.119126037945911, + "grad_norm": 0.10457554459571838, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 289500 + }, + { + "epoch": 1.1191646951492942, + "grad_norm": 0.11485693603754044, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 289510 + }, + { + "epoch": 1.1192033523526774, + "grad_norm": 0.11195486038923264, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 289520 + }, + { + "epoch": 1.1192420095560607, + "grad_norm": 0.10413413494825363, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 289530 + }, + { + "epoch": 1.119280666759444, + "grad_norm": 0.11456961184740067, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 289540 + }, + { + "epoch": 1.1193193239628272, + "grad_norm": 0.1050366684794426, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 289550 + }, + { + "epoch": 1.1193579811662104, + "grad_norm": 0.08729544281959534, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 289560 + }, + { + "epoch": 1.1193966383695937, + "grad_norm": 0.15180206298828125, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 289570 + }, + { + "epoch": 1.1194352955729772, + "grad_norm": 0.11148621886968613, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 289580 + }, + { + "epoch": 1.1194739527763604, + "grad_norm": 0.11813264340162277, + "learning_rate": 0.002, + "loss": 2.352, + "step": 289590 + }, + { + "epoch": 1.1195126099797437, + "grad_norm": 0.11504266411066055, + "learning_rate": 0.002, + "loss": 2.3664, + "step": 289600 + }, + { + "epoch": 1.119551267183127, + "grad_norm": 0.10125447064638138, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 289610 + }, + { + "epoch": 1.1195899243865102, + "grad_norm": 0.11368642747402191, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 289620 + }, + { + "epoch": 1.1196285815898934, + "grad_norm": 0.10616200417280197, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 289630 + }, + { + "epoch": 1.1196672387932767, + "grad_norm": 0.08843506127595901, + "learning_rate": 0.002, + "loss": 2.3143, + "step": 289640 + }, + { + "epoch": 1.11970589599666, + "grad_norm": 0.12057457864284515, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 289650 + }, + { + "epoch": 1.1197445532000434, + "grad_norm": 0.1454624980688095, + "learning_rate": 0.002, + "loss": 2.331, + "step": 289660 + }, + { + "epoch": 1.1197832104034267, + "grad_norm": 0.11299949139356613, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 289670 + }, + { + "epoch": 1.11982186760681, + "grad_norm": 0.11968392133712769, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 289680 + }, + { + "epoch": 1.1198605248101932, + "grad_norm": 0.12136146426200867, + "learning_rate": 0.002, + "loss": 2.345, + "step": 289690 + }, + { + "epoch": 1.1198991820135764, + "grad_norm": 0.09192265570163727, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 289700 + }, + { + "epoch": 1.1199378392169597, + "grad_norm": 0.12679505348205566, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 289710 + }, + { + "epoch": 1.119976496420343, + "grad_norm": 0.1034976989030838, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 289720 + }, + { + "epoch": 1.1200151536237262, + "grad_norm": 0.08856305480003357, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 289730 + }, + { + "epoch": 1.1200538108271094, + "grad_norm": 0.10530061274766922, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 289740 + }, + { + "epoch": 1.120092468030493, + "grad_norm": 0.09523380547761917, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 289750 + }, + { + "epoch": 1.1201311252338761, + "grad_norm": 0.13746485114097595, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 289760 + }, + { + "epoch": 1.1201697824372594, + "grad_norm": 0.09969547390937805, + "learning_rate": 0.002, + "loss": 2.349, + "step": 289770 + }, + { + "epoch": 1.1202084396406427, + "grad_norm": 0.09542639553546906, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 289780 + }, + { + "epoch": 1.120247096844026, + "grad_norm": 0.13123148679733276, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 289790 + }, + { + "epoch": 1.1202857540474092, + "grad_norm": 0.09224528819322586, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 289800 + }, + { + "epoch": 1.1203244112507924, + "grad_norm": 0.10775623470544815, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 289810 + }, + { + "epoch": 1.1203630684541757, + "grad_norm": 0.11774013191461563, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 289820 + }, + { + "epoch": 1.1204017256575591, + "grad_norm": 0.16113704442977905, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 289830 + }, + { + "epoch": 1.1204403828609424, + "grad_norm": 0.08824397623538971, + "learning_rate": 0.002, + "loss": 2.3129, + "step": 289840 + }, + { + "epoch": 1.1204790400643256, + "grad_norm": 0.09775109589099884, + "learning_rate": 0.002, + "loss": 2.343, + "step": 289850 + }, + { + "epoch": 1.120517697267709, + "grad_norm": 0.12996326386928558, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 289860 + }, + { + "epoch": 1.1205563544710921, + "grad_norm": 0.0891638770699501, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 289870 + }, + { + "epoch": 1.1205950116744754, + "grad_norm": 0.10942413657903671, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 289880 + }, + { + "epoch": 1.1206336688778586, + "grad_norm": 0.14186042547225952, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 289890 + }, + { + "epoch": 1.120672326081242, + "grad_norm": 0.09692166745662689, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 289900 + }, + { + "epoch": 1.1207109832846252, + "grad_norm": 0.1097760796546936, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 289910 + }, + { + "epoch": 1.1207496404880086, + "grad_norm": 0.1349506974220276, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 289920 + }, + { + "epoch": 1.1207882976913919, + "grad_norm": 0.10145654529333115, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 289930 + }, + { + "epoch": 1.1208269548947751, + "grad_norm": 0.10704918950796127, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 289940 + }, + { + "epoch": 1.1208656120981584, + "grad_norm": 0.10984811186790466, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 289950 + }, + { + "epoch": 1.1209042693015416, + "grad_norm": 0.09716042876243591, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 289960 + }, + { + "epoch": 1.1209429265049249, + "grad_norm": 0.10437486320734024, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 289970 + }, + { + "epoch": 1.1209815837083081, + "grad_norm": 0.11297482252120972, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 289980 + }, + { + "epoch": 1.1210202409116914, + "grad_norm": 0.09970725327730179, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 289990 + }, + { + "epoch": 1.1210588981150749, + "grad_norm": 0.09292304515838623, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 290000 + }, + { + "epoch": 1.1210975553184581, + "grad_norm": 0.09936968982219696, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 290010 + }, + { + "epoch": 1.1211362125218414, + "grad_norm": 0.145578533411026, + "learning_rate": 0.002, + "loss": 2.327, + "step": 290020 + }, + { + "epoch": 1.1211748697252246, + "grad_norm": 0.11210444569587708, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 290030 + }, + { + "epoch": 1.1212135269286079, + "grad_norm": 0.09582260251045227, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 290040 + }, + { + "epoch": 1.1212521841319911, + "grad_norm": 0.10336029529571533, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 290050 + }, + { + "epoch": 1.1212908413353744, + "grad_norm": 0.17279626429080963, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 290060 + }, + { + "epoch": 1.1213294985387576, + "grad_norm": 0.10698701441287994, + "learning_rate": 0.002, + "loss": 2.346, + "step": 290070 + }, + { + "epoch": 1.1213681557421409, + "grad_norm": 0.10927004367113113, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 290080 + }, + { + "epoch": 1.1214068129455244, + "grad_norm": 0.09581339359283447, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 290090 + }, + { + "epoch": 1.1214454701489076, + "grad_norm": 0.10079756379127502, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 290100 + }, + { + "epoch": 1.1214841273522909, + "grad_norm": 0.11049097031354904, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 290110 + }, + { + "epoch": 1.1215227845556741, + "grad_norm": 0.09853102266788483, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 290120 + }, + { + "epoch": 1.1215614417590574, + "grad_norm": 0.0913471132516861, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 290130 + }, + { + "epoch": 1.1216000989624406, + "grad_norm": 0.09522217512130737, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 290140 + }, + { + "epoch": 1.1216387561658239, + "grad_norm": 0.1073422059416771, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 290150 + }, + { + "epoch": 1.1216774133692071, + "grad_norm": 0.13044096529483795, + "learning_rate": 0.002, + "loss": 2.343, + "step": 290160 + }, + { + "epoch": 1.1217160705725906, + "grad_norm": 0.10940760374069214, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 290170 + }, + { + "epoch": 1.1217547277759738, + "grad_norm": 0.09089474380016327, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 290180 + }, + { + "epoch": 1.121793384979357, + "grad_norm": 0.09686536341905594, + "learning_rate": 0.002, + "loss": 2.332, + "step": 290190 + }, + { + "epoch": 1.1218320421827404, + "grad_norm": 0.10884518176317215, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 290200 + }, + { + "epoch": 1.1218706993861236, + "grad_norm": 0.1591804176568985, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 290210 + }, + { + "epoch": 1.1219093565895069, + "grad_norm": 0.10680185258388519, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 290220 + }, + { + "epoch": 1.12194801379289, + "grad_norm": 0.10056709498167038, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 290230 + }, + { + "epoch": 1.1219866709962734, + "grad_norm": 0.0994231328368187, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 290240 + }, + { + "epoch": 1.1220253281996566, + "grad_norm": 0.09971922636032104, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 290250 + }, + { + "epoch": 1.12206398540304, + "grad_norm": 0.10332693159580231, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 290260 + }, + { + "epoch": 1.1221026426064233, + "grad_norm": 0.11420253664255142, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 290270 + }, + { + "epoch": 1.1221412998098066, + "grad_norm": 0.09633500128984451, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 290280 + }, + { + "epoch": 1.1221799570131898, + "grad_norm": 0.0984421819448471, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 290290 + }, + { + "epoch": 1.122218614216573, + "grad_norm": 0.11268351227045059, + "learning_rate": 0.002, + "loss": 2.3597, + "step": 290300 + }, + { + "epoch": 1.1222572714199563, + "grad_norm": 0.1098770722746849, + "learning_rate": 0.002, + "loss": 2.327, + "step": 290310 + }, + { + "epoch": 1.1222959286233396, + "grad_norm": 0.10104798525571823, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 290320 + }, + { + "epoch": 1.1223345858267229, + "grad_norm": 0.1057971939444542, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 290330 + }, + { + "epoch": 1.1223732430301063, + "grad_norm": 0.09403365850448608, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 290340 + }, + { + "epoch": 1.1224119002334896, + "grad_norm": 0.10322824120521545, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 290350 + }, + { + "epoch": 1.1224505574368728, + "grad_norm": 0.10056138783693314, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 290360 + }, + { + "epoch": 1.122489214640256, + "grad_norm": 0.10949905216693878, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 290370 + }, + { + "epoch": 1.1225278718436393, + "grad_norm": 0.10780070722103119, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 290380 + }, + { + "epoch": 1.1225665290470226, + "grad_norm": 0.12997755408287048, + "learning_rate": 0.002, + "loss": 2.341, + "step": 290390 + }, + { + "epoch": 1.1226051862504058, + "grad_norm": 0.09493321180343628, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 290400 + }, + { + "epoch": 1.122643843453789, + "grad_norm": 0.09830449521541595, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 290410 + }, + { + "epoch": 1.1226825006571723, + "grad_norm": 0.1011434867978096, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 290420 + }, + { + "epoch": 1.1227211578605558, + "grad_norm": 0.09244860708713531, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 290430 + }, + { + "epoch": 1.122759815063939, + "grad_norm": 0.09907836467027664, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 290440 + }, + { + "epoch": 1.1227984722673223, + "grad_norm": 0.1255566030740738, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 290450 + }, + { + "epoch": 1.1228371294707056, + "grad_norm": 0.08803509920835495, + "learning_rate": 0.002, + "loss": 2.323, + "step": 290460 + }, + { + "epoch": 1.1228757866740888, + "grad_norm": 0.1252477765083313, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 290470 + }, + { + "epoch": 1.122914443877472, + "grad_norm": 0.09476697444915771, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 290480 + }, + { + "epoch": 1.1229531010808553, + "grad_norm": 0.11161638051271439, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 290490 + }, + { + "epoch": 1.1229917582842386, + "grad_norm": 0.1074063628911972, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 290500 + }, + { + "epoch": 1.123030415487622, + "grad_norm": 0.10495628416538239, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 290510 + }, + { + "epoch": 1.1230690726910053, + "grad_norm": 0.12283427268266678, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 290520 + }, + { + "epoch": 1.1231077298943886, + "grad_norm": 0.11145826429128647, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 290530 + }, + { + "epoch": 1.1231463870977718, + "grad_norm": 0.13109144568443298, + "learning_rate": 0.002, + "loss": 2.3569, + "step": 290540 + }, + { + "epoch": 1.123185044301155, + "grad_norm": 0.1077161654829979, + "learning_rate": 0.002, + "loss": 2.331, + "step": 290550 + }, + { + "epoch": 1.1232237015045383, + "grad_norm": 0.09395164251327515, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 290560 + }, + { + "epoch": 1.1232623587079216, + "grad_norm": 0.10779568552970886, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 290570 + }, + { + "epoch": 1.1233010159113048, + "grad_norm": 0.10034944862127304, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 290580 + }, + { + "epoch": 1.123339673114688, + "grad_norm": 0.09337441623210907, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 290590 + }, + { + "epoch": 1.1233783303180715, + "grad_norm": 0.0911400094628334, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 290600 + }, + { + "epoch": 1.1234169875214548, + "grad_norm": 0.12469466775655746, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 290610 + }, + { + "epoch": 1.123455644724838, + "grad_norm": 0.11057650297880173, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 290620 + }, + { + "epoch": 1.1234943019282213, + "grad_norm": 0.09163448214530945, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 290630 + }, + { + "epoch": 1.1235329591316046, + "grad_norm": 0.10629388689994812, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 290640 + }, + { + "epoch": 1.1235716163349878, + "grad_norm": 0.13407862186431885, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 290650 + }, + { + "epoch": 1.123610273538371, + "grad_norm": 0.11412046104669571, + "learning_rate": 0.002, + "loss": 2.337, + "step": 290660 + }, + { + "epoch": 1.1236489307417543, + "grad_norm": 0.1208498477935791, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 290670 + }, + { + "epoch": 1.1236875879451378, + "grad_norm": 0.1076798141002655, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 290680 + }, + { + "epoch": 1.123726245148521, + "grad_norm": 0.12939608097076416, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 290690 + }, + { + "epoch": 1.1237649023519043, + "grad_norm": 0.10172143578529358, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 290700 + }, + { + "epoch": 1.1238035595552875, + "grad_norm": 0.1000906303524971, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 290710 + }, + { + "epoch": 1.1238422167586708, + "grad_norm": 0.09072905033826828, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 290720 + }, + { + "epoch": 1.123880873962054, + "grad_norm": 0.10595636069774628, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 290730 + }, + { + "epoch": 1.1239195311654373, + "grad_norm": 0.11296257376670837, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 290740 + }, + { + "epoch": 1.1239581883688206, + "grad_norm": 0.11922469735145569, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 290750 + }, + { + "epoch": 1.1239968455722038, + "grad_norm": 0.10842528939247131, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 290760 + }, + { + "epoch": 1.1240355027755873, + "grad_norm": 0.11093436181545258, + "learning_rate": 0.002, + "loss": 2.346, + "step": 290770 + }, + { + "epoch": 1.1240741599789705, + "grad_norm": 0.10337400436401367, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 290780 + }, + { + "epoch": 1.1241128171823538, + "grad_norm": 0.11542078852653503, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 290790 + }, + { + "epoch": 1.124151474385737, + "grad_norm": 0.10995732247829437, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 290800 + }, + { + "epoch": 1.1241901315891203, + "grad_norm": 0.09430845826864243, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 290810 + }, + { + "epoch": 1.1242287887925035, + "grad_norm": 0.1120004877448082, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 290820 + }, + { + "epoch": 1.1242674459958868, + "grad_norm": 0.107681505382061, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 290830 + }, + { + "epoch": 1.1243061031992703, + "grad_norm": 0.11644504219293594, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 290840 + }, + { + "epoch": 1.1243447604026535, + "grad_norm": 0.101902537047863, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 290850 + }, + { + "epoch": 1.1243834176060368, + "grad_norm": 0.10107220709323883, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 290860 + }, + { + "epoch": 1.12442207480942, + "grad_norm": 0.11538407951593399, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 290870 + }, + { + "epoch": 1.1244607320128033, + "grad_norm": 0.1046702191233635, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 290880 + }, + { + "epoch": 1.1244993892161865, + "grad_norm": 0.13055744767189026, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 290890 + }, + { + "epoch": 1.1245380464195698, + "grad_norm": 0.11719472706317902, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 290900 + }, + { + "epoch": 1.124576703622953, + "grad_norm": 0.10073716193437576, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 290910 + }, + { + "epoch": 1.1246153608263363, + "grad_norm": 0.12202101200819016, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 290920 + }, + { + "epoch": 1.1246540180297198, + "grad_norm": 0.10007891803979874, + "learning_rate": 0.002, + "loss": 2.34, + "step": 290930 + }, + { + "epoch": 1.124692675233103, + "grad_norm": 0.09889797866344452, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 290940 + }, + { + "epoch": 1.1247313324364863, + "grad_norm": 0.10457437485456467, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 290950 + }, + { + "epoch": 1.1247699896398695, + "grad_norm": 0.10528910905122757, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 290960 + }, + { + "epoch": 1.1248086468432528, + "grad_norm": 0.9624165296554565, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 290970 + }, + { + "epoch": 1.124847304046636, + "grad_norm": 0.14060012996196747, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 290980 + }, + { + "epoch": 1.1248859612500193, + "grad_norm": 0.10160759091377258, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 290990 + }, + { + "epoch": 1.1249246184534025, + "grad_norm": 0.09019457548856735, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 291000 + }, + { + "epoch": 1.124963275656786, + "grad_norm": 0.09293685853481293, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 291010 + }, + { + "epoch": 1.1250019328601693, + "grad_norm": 0.09362807124853134, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 291020 + }, + { + "epoch": 1.1250405900635525, + "grad_norm": 0.09911637008190155, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 291030 + }, + { + "epoch": 1.1250792472669358, + "grad_norm": 0.09766800701618195, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 291040 + }, + { + "epoch": 1.125117904470319, + "grad_norm": 0.09793352335691452, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 291050 + }, + { + "epoch": 1.1251565616737023, + "grad_norm": 0.11317580938339233, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 291060 + }, + { + "epoch": 1.1251952188770855, + "grad_norm": 0.0960904210805893, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 291070 + }, + { + "epoch": 1.1252338760804688, + "grad_norm": 0.1038578525185585, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 291080 + }, + { + "epoch": 1.125272533283852, + "grad_norm": 0.10069358348846436, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 291090 + }, + { + "epoch": 1.1253111904872353, + "grad_norm": 0.10367664694786072, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 291100 + }, + { + "epoch": 1.1253498476906187, + "grad_norm": 0.1242256611585617, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 291110 + }, + { + "epoch": 1.125388504894002, + "grad_norm": 0.09930729866027832, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 291120 + }, + { + "epoch": 1.1254271620973852, + "grad_norm": 0.12044630199670792, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 291130 + }, + { + "epoch": 1.1254658193007685, + "grad_norm": 0.12741614878177643, + "learning_rate": 0.002, + "loss": 2.3102, + "step": 291140 + }, + { + "epoch": 1.1255044765041518, + "grad_norm": 0.0974070355296135, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 291150 + }, + { + "epoch": 1.125543133707535, + "grad_norm": 0.1045672744512558, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 291160 + }, + { + "epoch": 1.1255817909109183, + "grad_norm": 0.11492281407117844, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 291170 + }, + { + "epoch": 1.1256204481143017, + "grad_norm": 0.09922520816326141, + "learning_rate": 0.002, + "loss": 2.36, + "step": 291180 + }, + { + "epoch": 1.125659105317685, + "grad_norm": 0.09569668769836426, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 291190 + }, + { + "epoch": 1.1256977625210682, + "grad_norm": 0.10939368605613708, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 291200 + }, + { + "epoch": 1.1257364197244515, + "grad_norm": 0.1325932741165161, + "learning_rate": 0.002, + "loss": 2.339, + "step": 291210 + }, + { + "epoch": 1.1257750769278347, + "grad_norm": 0.0986105427145958, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 291220 + }, + { + "epoch": 1.125813734131218, + "grad_norm": 0.08473266661167145, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 291230 + }, + { + "epoch": 1.1258523913346012, + "grad_norm": 0.12451715022325516, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 291240 + }, + { + "epoch": 1.1258910485379845, + "grad_norm": 0.10106877237558365, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 291250 + }, + { + "epoch": 1.1259297057413677, + "grad_norm": 0.112881600856781, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 291260 + }, + { + "epoch": 1.125968362944751, + "grad_norm": 0.10713592171669006, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 291270 + }, + { + "epoch": 1.1260070201481345, + "grad_norm": 0.11898888647556305, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 291280 + }, + { + "epoch": 1.1260456773515177, + "grad_norm": 0.11185794323682785, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 291290 + }, + { + "epoch": 1.126084334554901, + "grad_norm": 0.10397381335496902, + "learning_rate": 0.002, + "loss": 2.339, + "step": 291300 + }, + { + "epoch": 1.1261229917582842, + "grad_norm": 0.1448792964220047, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 291310 + }, + { + "epoch": 1.1261616489616675, + "grad_norm": 0.10253781825304031, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 291320 + }, + { + "epoch": 1.1262003061650507, + "grad_norm": 0.1232328712940216, + "learning_rate": 0.002, + "loss": 2.32, + "step": 291330 + }, + { + "epoch": 1.126238963368434, + "grad_norm": 0.10629157721996307, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 291340 + }, + { + "epoch": 1.1262776205718175, + "grad_norm": 0.09541931748390198, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 291350 + }, + { + "epoch": 1.1263162777752007, + "grad_norm": 0.1253042221069336, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 291360 + }, + { + "epoch": 1.126354934978584, + "grad_norm": 0.10665005445480347, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 291370 + }, + { + "epoch": 1.1263935921819672, + "grad_norm": 0.11243661493062973, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 291380 + }, + { + "epoch": 1.1264322493853505, + "grad_norm": 0.11163606494665146, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 291390 + }, + { + "epoch": 1.1264709065887337, + "grad_norm": 0.13230501115322113, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 291400 + }, + { + "epoch": 1.126509563792117, + "grad_norm": 0.09509193152189255, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 291410 + }, + { + "epoch": 1.1265482209955002, + "grad_norm": 0.08772409707307816, + "learning_rate": 0.002, + "loss": 2.342, + "step": 291420 + }, + { + "epoch": 1.1265868781988835, + "grad_norm": 0.11761298775672913, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 291430 + }, + { + "epoch": 1.126625535402267, + "grad_norm": 0.11829674988985062, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 291440 + }, + { + "epoch": 1.1266641926056502, + "grad_norm": 0.11085585504770279, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 291450 + }, + { + "epoch": 1.1267028498090335, + "grad_norm": 0.11346784234046936, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 291460 + }, + { + "epoch": 1.1267415070124167, + "grad_norm": 0.09546907246112823, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 291470 + }, + { + "epoch": 1.1267801642158, + "grad_norm": 0.09734796732664108, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 291480 + }, + { + "epoch": 1.1268188214191832, + "grad_norm": 0.11880217492580414, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 291490 + }, + { + "epoch": 1.1268574786225665, + "grad_norm": 0.09316064417362213, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 291500 + }, + { + "epoch": 1.1268961358259497, + "grad_norm": 0.1146712377667427, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 291510 + }, + { + "epoch": 1.1269347930293332, + "grad_norm": 0.10328290611505508, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 291520 + }, + { + "epoch": 1.1269734502327164, + "grad_norm": 0.10888945311307907, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 291530 + }, + { + "epoch": 1.1270121074360997, + "grad_norm": 0.10232578217983246, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 291540 + }, + { + "epoch": 1.127050764639483, + "grad_norm": 0.1258421689271927, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 291550 + }, + { + "epoch": 1.1270894218428662, + "grad_norm": 0.11606115847826004, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 291560 + }, + { + "epoch": 1.1271280790462495, + "grad_norm": 0.10178054124116898, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 291570 + }, + { + "epoch": 1.1271667362496327, + "grad_norm": 0.11443013697862625, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 291580 + }, + { + "epoch": 1.127205393453016, + "grad_norm": 0.10536891967058182, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 291590 + }, + { + "epoch": 1.1272440506563992, + "grad_norm": 0.10703325271606445, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 291600 + }, + { + "epoch": 1.1272827078597827, + "grad_norm": 0.10174395889043808, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 291610 + }, + { + "epoch": 1.127321365063166, + "grad_norm": 0.09756621718406677, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 291620 + }, + { + "epoch": 1.1273600222665492, + "grad_norm": 0.1081177219748497, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 291630 + }, + { + "epoch": 1.1273986794699324, + "grad_norm": 0.09706825017929077, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 291640 + }, + { + "epoch": 1.1274373366733157, + "grad_norm": 0.09932103008031845, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 291650 + }, + { + "epoch": 1.127475993876699, + "grad_norm": 0.11982505768537521, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 291660 + }, + { + "epoch": 1.1275146510800822, + "grad_norm": 0.10125798732042313, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 291670 + }, + { + "epoch": 1.1275533082834654, + "grad_norm": 0.10993444174528122, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 291680 + }, + { + "epoch": 1.127591965486849, + "grad_norm": 0.10214158892631531, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 291690 + }, + { + "epoch": 1.1276306226902322, + "grad_norm": 0.09644022583961487, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 291700 + }, + { + "epoch": 1.1276692798936154, + "grad_norm": 0.09081563353538513, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 291710 + }, + { + "epoch": 1.1277079370969987, + "grad_norm": 0.09693627059459686, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 291720 + }, + { + "epoch": 1.127746594300382, + "grad_norm": 0.11326735466718674, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 291730 + }, + { + "epoch": 1.1277852515037652, + "grad_norm": 0.09546099603176117, + "learning_rate": 0.002, + "loss": 2.342, + "step": 291740 + }, + { + "epoch": 1.1278239087071484, + "grad_norm": 0.11614954471588135, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 291750 + }, + { + "epoch": 1.1278625659105317, + "grad_norm": 0.11270710825920105, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 291760 + }, + { + "epoch": 1.127901223113915, + "grad_norm": 0.10175687819719315, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 291770 + }, + { + "epoch": 1.1279398803172984, + "grad_norm": 0.10057220607995987, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 291780 + }, + { + "epoch": 1.1279785375206817, + "grad_norm": 0.14043523371219635, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 291790 + }, + { + "epoch": 1.128017194724065, + "grad_norm": 0.11015049368143082, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 291800 + }, + { + "epoch": 1.1280558519274482, + "grad_norm": 0.10724800825119019, + "learning_rate": 0.002, + "loss": 2.341, + "step": 291810 + }, + { + "epoch": 1.1280945091308314, + "grad_norm": 0.489633709192276, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 291820 + }, + { + "epoch": 1.1281331663342147, + "grad_norm": 0.10834350436925888, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 291830 + }, + { + "epoch": 1.128171823537598, + "grad_norm": 0.09968345612287521, + "learning_rate": 0.002, + "loss": 2.324, + "step": 291840 + }, + { + "epoch": 1.1282104807409812, + "grad_norm": 0.10822063684463501, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 291850 + }, + { + "epoch": 1.1282491379443647, + "grad_norm": 0.10708516091108322, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 291860 + }, + { + "epoch": 1.128287795147748, + "grad_norm": 0.09746289253234863, + "learning_rate": 0.002, + "loss": 2.328, + "step": 291870 + }, + { + "epoch": 1.1283264523511312, + "grad_norm": 0.10932601988315582, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 291880 + }, + { + "epoch": 1.1283651095545144, + "grad_norm": 1.7511025667190552, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 291890 + }, + { + "epoch": 1.1284037667578977, + "grad_norm": 0.10689739137887955, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 291900 + }, + { + "epoch": 1.128442423961281, + "grad_norm": 0.09519015997648239, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 291910 + }, + { + "epoch": 1.1284810811646642, + "grad_norm": 0.10214191675186157, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 291920 + }, + { + "epoch": 1.1285197383680474, + "grad_norm": 0.09518104791641235, + "learning_rate": 0.002, + "loss": 2.308, + "step": 291930 + }, + { + "epoch": 1.1285583955714307, + "grad_norm": 0.10087645053863525, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 291940 + }, + { + "epoch": 1.1285970527748141, + "grad_norm": 0.12696702778339386, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 291950 + }, + { + "epoch": 1.1286357099781974, + "grad_norm": 0.09882255643606186, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 291960 + }, + { + "epoch": 1.1286743671815807, + "grad_norm": 0.11024424433708191, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 291970 + }, + { + "epoch": 1.128713024384964, + "grad_norm": 0.09846694767475128, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 291980 + }, + { + "epoch": 1.1287516815883472, + "grad_norm": 0.10065692663192749, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 291990 + }, + { + "epoch": 1.1287903387917304, + "grad_norm": 0.09468679875135422, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 292000 + }, + { + "epoch": 1.1288289959951137, + "grad_norm": 0.1300666779279709, + "learning_rate": 0.002, + "loss": 2.33, + "step": 292010 + }, + { + "epoch": 1.128867653198497, + "grad_norm": 0.10579036921262741, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 292020 + }, + { + "epoch": 1.1289063104018804, + "grad_norm": 0.10410472005605698, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 292030 + }, + { + "epoch": 1.1289449676052636, + "grad_norm": 0.11222629994153976, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 292040 + }, + { + "epoch": 1.128983624808647, + "grad_norm": 0.09962747246026993, + "learning_rate": 0.002, + "loss": 2.346, + "step": 292050 + }, + { + "epoch": 1.1290222820120301, + "grad_norm": 0.11796198785305023, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 292060 + }, + { + "epoch": 1.1290609392154134, + "grad_norm": 0.10702431201934814, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 292070 + }, + { + "epoch": 1.1290995964187966, + "grad_norm": 0.08998136967420578, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 292080 + }, + { + "epoch": 1.12913825362218, + "grad_norm": 0.09148484468460083, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 292090 + }, + { + "epoch": 1.1291769108255632, + "grad_norm": 0.09372007101774216, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 292100 + }, + { + "epoch": 1.1292155680289464, + "grad_norm": 0.11189445853233337, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 292110 + }, + { + "epoch": 1.1292542252323299, + "grad_norm": 0.09926964342594147, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 292120 + }, + { + "epoch": 1.1292928824357131, + "grad_norm": 0.08940834552049637, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 292130 + }, + { + "epoch": 1.1293315396390964, + "grad_norm": 0.12416701018810272, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 292140 + }, + { + "epoch": 1.1293701968424796, + "grad_norm": 0.10040237009525299, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 292150 + }, + { + "epoch": 1.1294088540458629, + "grad_norm": 0.11909114569425583, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 292160 + }, + { + "epoch": 1.1294475112492461, + "grad_norm": 0.30783000588417053, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 292170 + }, + { + "epoch": 1.1294861684526294, + "grad_norm": 0.1081785336136818, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 292180 + }, + { + "epoch": 1.1295248256560129, + "grad_norm": 0.10198457539081573, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 292190 + }, + { + "epoch": 1.1295634828593961, + "grad_norm": 0.09565374255180359, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 292200 + }, + { + "epoch": 1.1296021400627794, + "grad_norm": 0.09713221341371536, + "learning_rate": 0.002, + "loss": 2.336, + "step": 292210 + }, + { + "epoch": 1.1296407972661626, + "grad_norm": 0.09174959361553192, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 292220 + }, + { + "epoch": 1.1296794544695459, + "grad_norm": 0.11424946784973145, + "learning_rate": 0.002, + "loss": 2.339, + "step": 292230 + }, + { + "epoch": 1.1297181116729291, + "grad_norm": 0.10965334624052048, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 292240 + }, + { + "epoch": 1.1297567688763124, + "grad_norm": 0.10149752348661423, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 292250 + }, + { + "epoch": 1.1297954260796956, + "grad_norm": 0.13889697194099426, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 292260 + }, + { + "epoch": 1.1298340832830789, + "grad_norm": 0.10565648227930069, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 292270 + }, + { + "epoch": 1.1298727404864621, + "grad_norm": 0.10034672915935516, + "learning_rate": 0.002, + "loss": 2.336, + "step": 292280 + }, + { + "epoch": 1.1299113976898456, + "grad_norm": 0.10778245329856873, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 292290 + }, + { + "epoch": 1.1299500548932289, + "grad_norm": 0.11974408477544785, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 292300 + }, + { + "epoch": 1.1299887120966121, + "grad_norm": 0.09993696212768555, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 292310 + }, + { + "epoch": 1.1300273692999954, + "grad_norm": 0.09732171893119812, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 292320 + }, + { + "epoch": 1.1300660265033786, + "grad_norm": 0.11085110902786255, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 292330 + }, + { + "epoch": 1.1301046837067619, + "grad_norm": 0.10859154909849167, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 292340 + }, + { + "epoch": 1.1301433409101451, + "grad_norm": 0.08565965294837952, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 292350 + }, + { + "epoch": 1.1301819981135286, + "grad_norm": 0.10185325145721436, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 292360 + }, + { + "epoch": 1.1302206553169118, + "grad_norm": 0.09620902687311172, + "learning_rate": 0.002, + "loss": 2.329, + "step": 292370 + }, + { + "epoch": 1.130259312520295, + "grad_norm": 0.10088915377855301, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 292380 + }, + { + "epoch": 1.1302979697236784, + "grad_norm": 0.12231367826461792, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 292390 + }, + { + "epoch": 1.1303366269270616, + "grad_norm": 0.11737716943025589, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 292400 + }, + { + "epoch": 1.1303752841304449, + "grad_norm": 0.12546178698539734, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 292410 + }, + { + "epoch": 1.130413941333828, + "grad_norm": 0.09996787458658218, + "learning_rate": 0.002, + "loss": 2.347, + "step": 292420 + }, + { + "epoch": 1.1304525985372114, + "grad_norm": 0.10157563537359238, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 292430 + }, + { + "epoch": 1.1304912557405946, + "grad_norm": 0.09826963394880295, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 292440 + }, + { + "epoch": 1.1305299129439779, + "grad_norm": 0.10715880990028381, + "learning_rate": 0.002, + "loss": 2.332, + "step": 292450 + }, + { + "epoch": 1.1305685701473613, + "grad_norm": 0.1121690645813942, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 292460 + }, + { + "epoch": 1.1306072273507446, + "grad_norm": 0.10457079857587814, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 292470 + }, + { + "epoch": 1.1306458845541278, + "grad_norm": 0.10679171979427338, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 292480 + }, + { + "epoch": 1.130684541757511, + "grad_norm": 0.0944804698228836, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 292490 + }, + { + "epoch": 1.1307231989608943, + "grad_norm": 0.0977306142449379, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 292500 + }, + { + "epoch": 1.1307618561642776, + "grad_norm": 0.10510288178920746, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 292510 + }, + { + "epoch": 1.1308005133676609, + "grad_norm": 0.12686295807361603, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 292520 + }, + { + "epoch": 1.1308391705710443, + "grad_norm": 0.11069813370704651, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 292530 + }, + { + "epoch": 1.1308778277744276, + "grad_norm": 0.11886830627918243, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 292540 + }, + { + "epoch": 1.1309164849778108, + "grad_norm": 0.09395457059144974, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 292550 + }, + { + "epoch": 1.130955142181194, + "grad_norm": 0.09358373284339905, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 292560 + }, + { + "epoch": 1.1309937993845773, + "grad_norm": 0.10663779079914093, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 292570 + }, + { + "epoch": 1.1310324565879606, + "grad_norm": 0.10025444626808167, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 292580 + }, + { + "epoch": 1.1310711137913438, + "grad_norm": 0.10433932393789291, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 292590 + }, + { + "epoch": 1.131109770994727, + "grad_norm": 0.1081257089972496, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 292600 + }, + { + "epoch": 1.1311484281981103, + "grad_norm": 0.09624449908733368, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 292610 + }, + { + "epoch": 1.1311870854014936, + "grad_norm": 0.09565295279026031, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 292620 + }, + { + "epoch": 1.131225742604877, + "grad_norm": 0.11357071995735168, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 292630 + }, + { + "epoch": 1.1312643998082603, + "grad_norm": 0.09906624257564545, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 292640 + }, + { + "epoch": 1.1313030570116436, + "grad_norm": 0.11691075563430786, + "learning_rate": 0.002, + "loss": 2.334, + "step": 292650 + }, + { + "epoch": 1.1313417142150268, + "grad_norm": 0.10988210141658783, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 292660 + }, + { + "epoch": 1.13138037141841, + "grad_norm": 0.08801021426916122, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 292670 + }, + { + "epoch": 1.1314190286217933, + "grad_norm": 0.11762183904647827, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 292680 + }, + { + "epoch": 1.1314576858251766, + "grad_norm": 0.10153577476739883, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 292690 + }, + { + "epoch": 1.13149634302856, + "grad_norm": 0.11427845060825348, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 292700 + }, + { + "epoch": 1.1315350002319433, + "grad_norm": 0.12625622749328613, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 292710 + }, + { + "epoch": 1.1315736574353266, + "grad_norm": 0.10108830034732819, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 292720 + }, + { + "epoch": 1.1316123146387098, + "grad_norm": 0.09629391878843307, + "learning_rate": 0.002, + "loss": 2.346, + "step": 292730 + }, + { + "epoch": 1.131650971842093, + "grad_norm": 0.09989041835069656, + "learning_rate": 0.002, + "loss": 2.348, + "step": 292740 + }, + { + "epoch": 1.1316896290454763, + "grad_norm": 0.10820818692445755, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 292750 + }, + { + "epoch": 1.1317282862488596, + "grad_norm": 0.10253668576478958, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 292760 + }, + { + "epoch": 1.1317669434522428, + "grad_norm": 0.09603530168533325, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 292770 + }, + { + "epoch": 1.131805600655626, + "grad_norm": 0.1039159968495369, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 292780 + }, + { + "epoch": 1.1318442578590093, + "grad_norm": 0.10775238275527954, + "learning_rate": 0.002, + "loss": 2.331, + "step": 292790 + }, + { + "epoch": 1.1318829150623928, + "grad_norm": 0.09700698405504227, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 292800 + }, + { + "epoch": 1.131921572265776, + "grad_norm": 0.09702009707689285, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 292810 + }, + { + "epoch": 1.1319602294691593, + "grad_norm": 0.09258618950843811, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 292820 + }, + { + "epoch": 1.1319988866725426, + "grad_norm": 0.1123087927699089, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 292830 + }, + { + "epoch": 1.1320375438759258, + "grad_norm": 0.0948568657040596, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 292840 + }, + { + "epoch": 1.132076201079309, + "grad_norm": 0.1217011883854866, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 292850 + }, + { + "epoch": 1.1321148582826923, + "grad_norm": 0.10090646147727966, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 292860 + }, + { + "epoch": 1.1321535154860758, + "grad_norm": 0.10404011607170105, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 292870 + }, + { + "epoch": 1.132192172689459, + "grad_norm": 0.1336078643798828, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 292880 + }, + { + "epoch": 1.1322308298928423, + "grad_norm": 0.12429642677307129, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 292890 + }, + { + "epoch": 1.1322694870962255, + "grad_norm": 0.1276293247938156, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 292900 + }, + { + "epoch": 1.1323081442996088, + "grad_norm": 0.09542718529701233, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 292910 + }, + { + "epoch": 1.132346801502992, + "grad_norm": 0.1164063885807991, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 292920 + }, + { + "epoch": 1.1323854587063753, + "grad_norm": 0.10636220127344131, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 292930 + }, + { + "epoch": 1.1324241159097586, + "grad_norm": 0.09025134891271591, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 292940 + }, + { + "epoch": 1.1324627731131418, + "grad_norm": 0.11995851248502731, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 292950 + }, + { + "epoch": 1.132501430316525, + "grad_norm": 0.1027119979262352, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 292960 + }, + { + "epoch": 1.1325400875199085, + "grad_norm": 0.10810244828462601, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 292970 + }, + { + "epoch": 1.1325787447232918, + "grad_norm": 0.10233033448457718, + "learning_rate": 0.002, + "loss": 2.342, + "step": 292980 + }, + { + "epoch": 1.132617401926675, + "grad_norm": 0.10700412839651108, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 292990 + }, + { + "epoch": 1.1326560591300583, + "grad_norm": 0.11041572690010071, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 293000 + }, + { + "epoch": 1.1326947163334415, + "grad_norm": 0.10478068143129349, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 293010 + }, + { + "epoch": 1.1327333735368248, + "grad_norm": 0.10832428187131882, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 293020 + }, + { + "epoch": 1.132772030740208, + "grad_norm": 0.12409119307994843, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 293030 + }, + { + "epoch": 1.1328106879435915, + "grad_norm": 0.0951986163854599, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 293040 + }, + { + "epoch": 1.1328493451469748, + "grad_norm": 0.09725385904312134, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 293050 + }, + { + "epoch": 1.132888002350358, + "grad_norm": 0.12577664852142334, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 293060 + }, + { + "epoch": 1.1329266595537413, + "grad_norm": 0.12517467141151428, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 293070 + }, + { + "epoch": 1.1329653167571245, + "grad_norm": 0.10554498434066772, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 293080 + }, + { + "epoch": 1.1330039739605078, + "grad_norm": 0.09695900976657867, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 293090 + }, + { + "epoch": 1.133042631163891, + "grad_norm": 0.1009320542216301, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 293100 + }, + { + "epoch": 1.1330812883672743, + "grad_norm": 0.09868354350328445, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 293110 + }, + { + "epoch": 1.1331199455706575, + "grad_norm": 0.10926561802625656, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 293120 + }, + { + "epoch": 1.1331586027740408, + "grad_norm": 0.11301583796739578, + "learning_rate": 0.002, + "loss": 2.337, + "step": 293130 + }, + { + "epoch": 1.1331972599774243, + "grad_norm": 0.08914438635110855, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 293140 + }, + { + "epoch": 1.1332359171808075, + "grad_norm": 0.10104355961084366, + "learning_rate": 0.002, + "loss": 2.344, + "step": 293150 + }, + { + "epoch": 1.1332745743841908, + "grad_norm": 0.10801784694194794, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 293160 + }, + { + "epoch": 1.133313231587574, + "grad_norm": 0.11123783141374588, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 293170 + }, + { + "epoch": 1.1333518887909573, + "grad_norm": 0.09961264580488205, + "learning_rate": 0.002, + "loss": 2.331, + "step": 293180 + }, + { + "epoch": 1.1333905459943405, + "grad_norm": 0.09766477346420288, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 293190 + }, + { + "epoch": 1.1334292031977238, + "grad_norm": 0.11179253458976746, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 293200 + }, + { + "epoch": 1.1334678604011073, + "grad_norm": 0.14136448502540588, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 293210 + }, + { + "epoch": 1.1335065176044905, + "grad_norm": 0.100206658244133, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 293220 + }, + { + "epoch": 1.1335451748078738, + "grad_norm": 0.09034978598356247, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 293230 + }, + { + "epoch": 1.133583832011257, + "grad_norm": 0.10155314952135086, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 293240 + }, + { + "epoch": 1.1336224892146403, + "grad_norm": 0.09789247810840607, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 293250 + }, + { + "epoch": 1.1336611464180235, + "grad_norm": 0.12007251381874084, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 293260 + }, + { + "epoch": 1.1336998036214068, + "grad_norm": 0.0898607075214386, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 293270 + }, + { + "epoch": 1.13373846082479, + "grad_norm": 0.09242238104343414, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 293280 + }, + { + "epoch": 1.1337771180281733, + "grad_norm": 0.13049384951591492, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 293290 + }, + { + "epoch": 1.1338157752315567, + "grad_norm": 0.10886628925800323, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 293300 + }, + { + "epoch": 1.13385443243494, + "grad_norm": 0.10137798637151718, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 293310 + }, + { + "epoch": 1.1338930896383232, + "grad_norm": 0.09787803143262863, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 293320 + }, + { + "epoch": 1.1339317468417065, + "grad_norm": 0.11174280196428299, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 293330 + }, + { + "epoch": 1.1339704040450898, + "grad_norm": 0.11473164707422256, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 293340 + }, + { + "epoch": 1.134009061248473, + "grad_norm": 0.10111039876937866, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 293350 + }, + { + "epoch": 1.1340477184518563, + "grad_norm": 0.1075422465801239, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 293360 + }, + { + "epoch": 1.1340863756552395, + "grad_norm": 0.11484236270189285, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 293370 + }, + { + "epoch": 1.134125032858623, + "grad_norm": 0.09201221913099289, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 293380 + }, + { + "epoch": 1.1341636900620062, + "grad_norm": 0.12520328164100647, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 293390 + }, + { + "epoch": 1.1342023472653895, + "grad_norm": 0.09377523511648178, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 293400 + }, + { + "epoch": 1.1342410044687727, + "grad_norm": 0.11587250977754593, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 293410 + }, + { + "epoch": 1.134279661672156, + "grad_norm": 0.10029492527246475, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 293420 + }, + { + "epoch": 1.1343183188755392, + "grad_norm": 0.09807023406028748, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 293430 + }, + { + "epoch": 1.1343569760789225, + "grad_norm": 0.10845301300287247, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 293440 + }, + { + "epoch": 1.1343956332823057, + "grad_norm": 0.11254344135522842, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 293450 + }, + { + "epoch": 1.134434290485689, + "grad_norm": 0.09739566594362259, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 293460 + }, + { + "epoch": 1.1344729476890725, + "grad_norm": 0.11345074325799942, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 293470 + }, + { + "epoch": 1.1345116048924557, + "grad_norm": 0.11020570993423462, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 293480 + }, + { + "epoch": 1.134550262095839, + "grad_norm": 0.095384381711483, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 293490 + }, + { + "epoch": 1.1345889192992222, + "grad_norm": 0.11963685601949692, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 293500 + }, + { + "epoch": 1.1346275765026055, + "grad_norm": 0.11871231347322464, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 293510 + }, + { + "epoch": 1.1346662337059887, + "grad_norm": 0.10440956801176071, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 293520 + }, + { + "epoch": 1.134704890909372, + "grad_norm": 0.12999387085437775, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 293530 + }, + { + "epoch": 1.1347435481127552, + "grad_norm": 0.1252586394548416, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 293540 + }, + { + "epoch": 1.1347822053161387, + "grad_norm": 0.09814410656690598, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 293550 + }, + { + "epoch": 1.134820862519522, + "grad_norm": 0.10317713022232056, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 293560 + }, + { + "epoch": 1.1348595197229052, + "grad_norm": 0.09887121617794037, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 293570 + }, + { + "epoch": 1.1348981769262885, + "grad_norm": 0.09725435823202133, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 293580 + }, + { + "epoch": 1.1349368341296717, + "grad_norm": 0.11813397705554962, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 293590 + }, + { + "epoch": 1.134975491333055, + "grad_norm": 0.10130724310874939, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 293600 + }, + { + "epoch": 1.1350141485364382, + "grad_norm": 0.10235709697008133, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 293610 + }, + { + "epoch": 1.1350528057398215, + "grad_norm": 0.11193295568227768, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 293620 + }, + { + "epoch": 1.1350914629432047, + "grad_norm": 0.11110543459653854, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 293630 + }, + { + "epoch": 1.1351301201465882, + "grad_norm": 0.0979127287864685, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 293640 + }, + { + "epoch": 1.1351687773499715, + "grad_norm": 0.09800735116004944, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 293650 + }, + { + "epoch": 1.1352074345533547, + "grad_norm": 0.13615325093269348, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 293660 + }, + { + "epoch": 1.135246091756738, + "grad_norm": 0.09840956330299377, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 293670 + }, + { + "epoch": 1.1352847489601212, + "grad_norm": 0.09474692493677139, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 293680 + }, + { + "epoch": 1.1353234061635045, + "grad_norm": 0.12110727280378342, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 293690 + }, + { + "epoch": 1.1353620633668877, + "grad_norm": 0.10657018423080444, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 293700 + }, + { + "epoch": 1.135400720570271, + "grad_norm": 0.10020346194505692, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 293710 + }, + { + "epoch": 1.1354393777736544, + "grad_norm": 0.09850382059812546, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 293720 + }, + { + "epoch": 1.1354780349770377, + "grad_norm": 0.10717561841011047, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 293730 + }, + { + "epoch": 1.135516692180421, + "grad_norm": 0.1006290391087532, + "learning_rate": 0.002, + "loss": 2.324, + "step": 293740 + }, + { + "epoch": 1.1355553493838042, + "grad_norm": 0.10350465029478073, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 293750 + }, + { + "epoch": 1.1355940065871875, + "grad_norm": 0.1130458265542984, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 293760 + }, + { + "epoch": 1.1356326637905707, + "grad_norm": 0.11281120777130127, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 293770 + }, + { + "epoch": 1.135671320993954, + "grad_norm": 0.10524564981460571, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 293780 + }, + { + "epoch": 1.1357099781973372, + "grad_norm": 0.11162056028842926, + "learning_rate": 0.002, + "loss": 2.349, + "step": 293790 + }, + { + "epoch": 1.1357486354007205, + "grad_norm": 0.10000617057085037, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 293800 + }, + { + "epoch": 1.135787292604104, + "grad_norm": 0.12025784701108932, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 293810 + }, + { + "epoch": 1.1358259498074872, + "grad_norm": 0.09196845442056656, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 293820 + }, + { + "epoch": 1.1358646070108704, + "grad_norm": 0.1028822734951973, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 293830 + }, + { + "epoch": 1.1359032642142537, + "grad_norm": 0.11393461376428604, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 293840 + }, + { + "epoch": 1.135941921417637, + "grad_norm": 0.11156242340803146, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 293850 + }, + { + "epoch": 1.1359805786210202, + "grad_norm": 0.127973273396492, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 293860 + }, + { + "epoch": 1.1360192358244035, + "grad_norm": 0.11327163130044937, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 293870 + }, + { + "epoch": 1.1360578930277867, + "grad_norm": 0.0905095785856247, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 293880 + }, + { + "epoch": 1.1360965502311702, + "grad_norm": 0.09544403851032257, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 293890 + }, + { + "epoch": 1.1361352074345534, + "grad_norm": 0.11277344822883606, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 293900 + }, + { + "epoch": 1.1361738646379367, + "grad_norm": 0.12489354610443115, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 293910 + }, + { + "epoch": 1.13621252184132, + "grad_norm": 0.11960267275571823, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 293920 + }, + { + "epoch": 1.1362511790447032, + "grad_norm": 0.11858020722866058, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 293930 + }, + { + "epoch": 1.1362898362480864, + "grad_norm": 0.10492021590471268, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 293940 + }, + { + "epoch": 1.1363284934514697, + "grad_norm": 0.11255805194377899, + "learning_rate": 0.002, + "loss": 2.334, + "step": 293950 + }, + { + "epoch": 1.136367150654853, + "grad_norm": 0.09281715750694275, + "learning_rate": 0.002, + "loss": 2.349, + "step": 293960 + }, + { + "epoch": 1.1364058078582362, + "grad_norm": 0.11357801407575607, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 293970 + }, + { + "epoch": 1.1364444650616197, + "grad_norm": 0.1001327857375145, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 293980 + }, + { + "epoch": 1.136483122265003, + "grad_norm": 0.09641039371490479, + "learning_rate": 0.002, + "loss": 2.334, + "step": 293990 + }, + { + "epoch": 1.1365217794683862, + "grad_norm": 0.13236850500106812, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 294000 + }, + { + "epoch": 1.1365604366717694, + "grad_norm": 0.10867318511009216, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 294010 + }, + { + "epoch": 1.1365990938751527, + "grad_norm": 0.10816536098718643, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 294020 + }, + { + "epoch": 1.136637751078536, + "grad_norm": 0.11130563914775848, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 294030 + }, + { + "epoch": 1.1366764082819192, + "grad_norm": 0.11933492124080658, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 294040 + }, + { + "epoch": 1.1367150654853027, + "grad_norm": 0.10061101615428925, + "learning_rate": 0.002, + "loss": 2.355, + "step": 294050 + }, + { + "epoch": 1.136753722688686, + "grad_norm": 0.11716160923242569, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 294060 + }, + { + "epoch": 1.1367923798920692, + "grad_norm": 0.10237450897693634, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 294070 + }, + { + "epoch": 1.1368310370954524, + "grad_norm": 0.10022053867578506, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 294080 + }, + { + "epoch": 1.1368696942988357, + "grad_norm": 0.10771946609020233, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 294090 + }, + { + "epoch": 1.136908351502219, + "grad_norm": 0.11842872202396393, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 294100 + }, + { + "epoch": 1.1369470087056022, + "grad_norm": 0.11161211878061295, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 294110 + }, + { + "epoch": 1.1369856659089854, + "grad_norm": 0.09512294828891754, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 294120 + }, + { + "epoch": 1.1370243231123687, + "grad_norm": 0.0964144691824913, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 294130 + }, + { + "epoch": 1.137062980315752, + "grad_norm": 0.09910161048173904, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 294140 + }, + { + "epoch": 1.1371016375191354, + "grad_norm": 0.09445329755544662, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 294150 + }, + { + "epoch": 1.1371402947225187, + "grad_norm": 0.11625587195158005, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 294160 + }, + { + "epoch": 1.137178951925902, + "grad_norm": 0.1302066296339035, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 294170 + }, + { + "epoch": 1.1372176091292852, + "grad_norm": 0.11113950610160828, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 294180 + }, + { + "epoch": 1.1372562663326684, + "grad_norm": 0.09936251491308212, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 294190 + }, + { + "epoch": 1.1372949235360517, + "grad_norm": 0.09935726970434189, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 294200 + }, + { + "epoch": 1.137333580739435, + "grad_norm": 0.10621356219053268, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 294210 + }, + { + "epoch": 1.1373722379428184, + "grad_norm": 0.11173443496227264, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 294220 + }, + { + "epoch": 1.1374108951462016, + "grad_norm": 0.10126882791519165, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 294230 + }, + { + "epoch": 1.137449552349585, + "grad_norm": 0.10050792247056961, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 294240 + }, + { + "epoch": 1.1374882095529681, + "grad_norm": 0.10208075493574142, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 294250 + }, + { + "epoch": 1.1375268667563514, + "grad_norm": 0.11423249542713165, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 294260 + }, + { + "epoch": 1.1375655239597346, + "grad_norm": 0.0945185050368309, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 294270 + }, + { + "epoch": 1.137604181163118, + "grad_norm": 0.09072756767272949, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 294280 + }, + { + "epoch": 1.1376428383665012, + "grad_norm": 0.10278897732496262, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 294290 + }, + { + "epoch": 1.1376814955698844, + "grad_norm": 0.12045719474554062, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 294300 + }, + { + "epoch": 1.1377201527732677, + "grad_norm": 0.1090647280216217, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 294310 + }, + { + "epoch": 1.1377588099766511, + "grad_norm": 0.0932658389210701, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 294320 + }, + { + "epoch": 1.1377974671800344, + "grad_norm": 0.11652059853076935, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 294330 + }, + { + "epoch": 1.1378361243834176, + "grad_norm": 0.13362693786621094, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 294340 + }, + { + "epoch": 1.1378747815868009, + "grad_norm": 0.09870358556509018, + "learning_rate": 0.002, + "loss": 2.336, + "step": 294350 + }, + { + "epoch": 1.1379134387901841, + "grad_norm": 0.14037330448627472, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 294360 + }, + { + "epoch": 1.1379520959935674, + "grad_norm": 0.10037653893232346, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 294370 + }, + { + "epoch": 1.1379907531969506, + "grad_norm": 0.0925574079155922, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 294380 + }, + { + "epoch": 1.1380294104003341, + "grad_norm": 0.09062647819519043, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 294390 + }, + { + "epoch": 1.1380680676037174, + "grad_norm": 0.09871121495962143, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 294400 + }, + { + "epoch": 1.1381067248071006, + "grad_norm": 0.10257646441459656, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 294410 + }, + { + "epoch": 1.1381453820104839, + "grad_norm": 0.12807287275791168, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 294420 + }, + { + "epoch": 1.1381840392138671, + "grad_norm": 0.14499905705451965, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 294430 + }, + { + "epoch": 1.1382226964172504, + "grad_norm": 0.10224590450525284, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 294440 + }, + { + "epoch": 1.1382613536206336, + "grad_norm": 0.08267837762832642, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 294450 + }, + { + "epoch": 1.1383000108240169, + "grad_norm": 0.10924314707517624, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 294460 + }, + { + "epoch": 1.1383386680274001, + "grad_norm": 0.10160164535045624, + "learning_rate": 0.002, + "loss": 2.32, + "step": 294470 + }, + { + "epoch": 1.1383773252307834, + "grad_norm": 0.10632597655057907, + "learning_rate": 0.002, + "loss": 2.341, + "step": 294480 + }, + { + "epoch": 1.1384159824341669, + "grad_norm": 0.13016681373119354, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 294490 + }, + { + "epoch": 1.1384546396375501, + "grad_norm": 0.1169414222240448, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 294500 + }, + { + "epoch": 1.1384932968409334, + "grad_norm": 0.09101717919111252, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 294510 + }, + { + "epoch": 1.1385319540443166, + "grad_norm": 0.09878969192504883, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 294520 + }, + { + "epoch": 1.1385706112476999, + "grad_norm": 0.09379914402961731, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 294530 + }, + { + "epoch": 1.1386092684510831, + "grad_norm": 0.10676123946905136, + "learning_rate": 0.002, + "loss": 2.345, + "step": 294540 + }, + { + "epoch": 1.1386479256544664, + "grad_norm": 0.10265262424945831, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 294550 + }, + { + "epoch": 1.1386865828578498, + "grad_norm": 0.11115993559360504, + "learning_rate": 0.002, + "loss": 2.349, + "step": 294560 + }, + { + "epoch": 1.138725240061233, + "grad_norm": 0.1104055792093277, + "learning_rate": 0.002, + "loss": 2.34, + "step": 294570 + }, + { + "epoch": 1.1387638972646164, + "grad_norm": 0.10094203054904938, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 294580 + }, + { + "epoch": 1.1388025544679996, + "grad_norm": 0.09028852730989456, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 294590 + }, + { + "epoch": 1.1388412116713829, + "grad_norm": 0.09736517071723938, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 294600 + }, + { + "epoch": 1.138879868874766, + "grad_norm": 0.12073744833469391, + "learning_rate": 0.002, + "loss": 2.315, + "step": 294610 + }, + { + "epoch": 1.1389185260781494, + "grad_norm": 0.10712353140115738, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 294620 + }, + { + "epoch": 1.1389571832815326, + "grad_norm": 0.11008912324905396, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 294630 + }, + { + "epoch": 1.1389958404849159, + "grad_norm": 0.0936817154288292, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 294640 + }, + { + "epoch": 1.1390344976882991, + "grad_norm": 0.11594562977552414, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 294650 + }, + { + "epoch": 1.1390731548916826, + "grad_norm": 0.10609795898199081, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 294660 + }, + { + "epoch": 1.1391118120950658, + "grad_norm": 0.12127210199832916, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 294670 + }, + { + "epoch": 1.139150469298449, + "grad_norm": 0.08909579366445541, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 294680 + }, + { + "epoch": 1.1391891265018323, + "grad_norm": 0.12188173830509186, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 294690 + }, + { + "epoch": 1.1392277837052156, + "grad_norm": 0.10340917110443115, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 294700 + }, + { + "epoch": 1.1392664409085989, + "grad_norm": 0.10992109775543213, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 294710 + }, + { + "epoch": 1.139305098111982, + "grad_norm": 0.0943150594830513, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 294720 + }, + { + "epoch": 1.1393437553153656, + "grad_norm": 0.10311193764209747, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 294730 + }, + { + "epoch": 1.1393824125187488, + "grad_norm": 0.09406863898038864, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 294740 + }, + { + "epoch": 1.139421069722132, + "grad_norm": 0.10221472382545471, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 294750 + }, + { + "epoch": 1.1394597269255153, + "grad_norm": 0.13521109521389008, + "learning_rate": 0.002, + "loss": 2.333, + "step": 294760 + }, + { + "epoch": 1.1394983841288986, + "grad_norm": 0.09720505774021149, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 294770 + }, + { + "epoch": 1.1395370413322818, + "grad_norm": 0.1073853075504303, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 294780 + }, + { + "epoch": 1.139575698535665, + "grad_norm": 0.11814963817596436, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 294790 + }, + { + "epoch": 1.1396143557390483, + "grad_norm": 0.10372687876224518, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 294800 + }, + { + "epoch": 1.1396530129424316, + "grad_norm": 0.11656109243631363, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 294810 + }, + { + "epoch": 1.1396916701458149, + "grad_norm": 0.08834312111139297, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 294820 + }, + { + "epoch": 1.1397303273491983, + "grad_norm": 0.1092064306139946, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 294830 + }, + { + "epoch": 1.1397689845525816, + "grad_norm": 0.10597414523363113, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 294840 + }, + { + "epoch": 1.1398076417559648, + "grad_norm": 0.09589572250843048, + "learning_rate": 0.002, + "loss": 2.322, + "step": 294850 + }, + { + "epoch": 1.139846298959348, + "grad_norm": 0.12765908241271973, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 294860 + }, + { + "epoch": 1.1398849561627313, + "grad_norm": 0.11571625620126724, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 294870 + }, + { + "epoch": 1.1399236133661146, + "grad_norm": 0.09366437047719955, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 294880 + }, + { + "epoch": 1.1399622705694978, + "grad_norm": 0.08736945688724518, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 294890 + }, + { + "epoch": 1.1400009277728813, + "grad_norm": 0.0926312580704689, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 294900 + }, + { + "epoch": 1.1400395849762646, + "grad_norm": 0.12359138578176498, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 294910 + }, + { + "epoch": 1.1400782421796478, + "grad_norm": 0.09568643569946289, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 294920 + }, + { + "epoch": 1.140116899383031, + "grad_norm": 0.09694010764360428, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 294930 + }, + { + "epoch": 1.1401555565864143, + "grad_norm": 0.10709650069475174, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 294940 + }, + { + "epoch": 1.1401942137897976, + "grad_norm": 0.09399951994419098, + "learning_rate": 0.002, + "loss": 2.338, + "step": 294950 + }, + { + "epoch": 1.1402328709931808, + "grad_norm": 0.1101020947098732, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 294960 + }, + { + "epoch": 1.140271528196564, + "grad_norm": 0.10791051387786865, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 294970 + }, + { + "epoch": 1.1403101853999473, + "grad_norm": 0.09474851936101913, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 294980 + }, + { + "epoch": 1.1403488426033306, + "grad_norm": 0.1254984587430954, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 294990 + }, + { + "epoch": 1.140387499806714, + "grad_norm": 0.11828399449586868, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 295000 + }, + { + "epoch": 1.1404261570100973, + "grad_norm": 0.13409629464149475, + "learning_rate": 0.002, + "loss": 2.323, + "step": 295010 + }, + { + "epoch": 1.1404648142134806, + "grad_norm": 0.09929110109806061, + "learning_rate": 0.002, + "loss": 2.334, + "step": 295020 + }, + { + "epoch": 1.1405034714168638, + "grad_norm": 0.08490047603845596, + "learning_rate": 0.002, + "loss": 2.323, + "step": 295030 + }, + { + "epoch": 1.140542128620247, + "grad_norm": 0.1046157255768776, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 295040 + }, + { + "epoch": 1.1405807858236303, + "grad_norm": 0.10370510071516037, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 295050 + }, + { + "epoch": 1.1406194430270136, + "grad_norm": 0.09449782967567444, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 295060 + }, + { + "epoch": 1.140658100230397, + "grad_norm": 0.10334462672472, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 295070 + }, + { + "epoch": 1.1406967574337803, + "grad_norm": 0.11590588837862015, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 295080 + }, + { + "epoch": 1.1407354146371635, + "grad_norm": 0.10096020996570587, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 295090 + }, + { + "epoch": 1.1407740718405468, + "grad_norm": 0.09558035433292389, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 295100 + }, + { + "epoch": 1.14081272904393, + "grad_norm": 0.09858120232820511, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 295110 + }, + { + "epoch": 1.1408513862473133, + "grad_norm": 0.09994387626647949, + "learning_rate": 0.002, + "loss": 2.323, + "step": 295120 + }, + { + "epoch": 1.1408900434506966, + "grad_norm": 0.08849941194057465, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 295130 + }, + { + "epoch": 1.1409287006540798, + "grad_norm": 0.123213991522789, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 295140 + }, + { + "epoch": 1.140967357857463, + "grad_norm": 0.11973105370998383, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 295150 + }, + { + "epoch": 1.1410060150608465, + "grad_norm": 0.10440967977046967, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 295160 + }, + { + "epoch": 1.1410446722642298, + "grad_norm": 0.0943630114197731, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 295170 + }, + { + "epoch": 1.141083329467613, + "grad_norm": 0.10222385823726654, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 295180 + }, + { + "epoch": 1.1411219866709963, + "grad_norm": 0.12588584423065186, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 295190 + }, + { + "epoch": 1.1411606438743795, + "grad_norm": 0.0961066260933876, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 295200 + }, + { + "epoch": 1.1411993010777628, + "grad_norm": 0.13759472966194153, + "learning_rate": 0.002, + "loss": 2.3166, + "step": 295210 + }, + { + "epoch": 1.141237958281146, + "grad_norm": 0.1053820252418518, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 295220 + }, + { + "epoch": 1.1412766154845293, + "grad_norm": 0.11537367105484009, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 295230 + }, + { + "epoch": 1.1413152726879128, + "grad_norm": 0.10057997703552246, + "learning_rate": 0.002, + "loss": 2.34, + "step": 295240 + }, + { + "epoch": 1.141353929891296, + "grad_norm": 0.13626791536808014, + "learning_rate": 0.002, + "loss": 2.348, + "step": 295250 + }, + { + "epoch": 1.1413925870946793, + "grad_norm": 0.10184498876333237, + "learning_rate": 0.002, + "loss": 2.34, + "step": 295260 + }, + { + "epoch": 1.1414312442980625, + "grad_norm": 0.11100531369447708, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 295270 + }, + { + "epoch": 1.1414699015014458, + "grad_norm": 0.09096559882164001, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 295280 + }, + { + "epoch": 1.141508558704829, + "grad_norm": 0.09893319010734558, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 295290 + }, + { + "epoch": 1.1415472159082123, + "grad_norm": 0.09062279015779495, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 295300 + }, + { + "epoch": 1.1415858731115955, + "grad_norm": 0.10352206230163574, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 295310 + }, + { + "epoch": 1.1416245303149788, + "grad_norm": 0.7122738361358643, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 295320 + }, + { + "epoch": 1.1416631875183623, + "grad_norm": 0.11961215734481812, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 295330 + }, + { + "epoch": 1.1417018447217455, + "grad_norm": 0.15325365960597992, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 295340 + }, + { + "epoch": 1.1417405019251288, + "grad_norm": 0.10539396852254868, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 295350 + }, + { + "epoch": 1.141779159128512, + "grad_norm": 0.09125971049070358, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 295360 + }, + { + "epoch": 1.1418178163318953, + "grad_norm": 0.11987537890672684, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 295370 + }, + { + "epoch": 1.1418564735352785, + "grad_norm": 0.11500542610883713, + "learning_rate": 0.002, + "loss": 2.331, + "step": 295380 + }, + { + "epoch": 1.1418951307386618, + "grad_norm": 0.15385165810585022, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 295390 + }, + { + "epoch": 1.141933787942045, + "grad_norm": 0.09489123523235321, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 295400 + }, + { + "epoch": 1.1419724451454285, + "grad_norm": 0.10632327198982239, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 295410 + }, + { + "epoch": 1.1420111023488118, + "grad_norm": 0.09892427176237106, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 295420 + }, + { + "epoch": 1.142049759552195, + "grad_norm": 0.1019333004951477, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 295430 + }, + { + "epoch": 1.1420884167555783, + "grad_norm": 0.10518952459096909, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 295440 + }, + { + "epoch": 1.1421270739589615, + "grad_norm": 0.09612604230642319, + "learning_rate": 0.002, + "loss": 2.338, + "step": 295450 + }, + { + "epoch": 1.1421657311623448, + "grad_norm": 0.11424669623374939, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 295460 + }, + { + "epoch": 1.142204388365728, + "grad_norm": 0.10660576075315475, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 295470 + }, + { + "epoch": 1.1422430455691113, + "grad_norm": 0.09305384755134583, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 295480 + }, + { + "epoch": 1.1422817027724945, + "grad_norm": 0.12523122131824493, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 295490 + }, + { + "epoch": 1.142320359975878, + "grad_norm": 0.10106904804706573, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 295500 + }, + { + "epoch": 1.1423590171792612, + "grad_norm": 0.10420478880405426, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 295510 + }, + { + "epoch": 1.1423976743826445, + "grad_norm": 0.11749018728733063, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 295520 + }, + { + "epoch": 1.1424363315860278, + "grad_norm": 0.10532240569591522, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 295530 + }, + { + "epoch": 1.142474988789411, + "grad_norm": 0.10158240050077438, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 295540 + }, + { + "epoch": 1.1425136459927943, + "grad_norm": 0.10830393433570862, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 295550 + }, + { + "epoch": 1.1425523031961775, + "grad_norm": 0.11087879538536072, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 295560 + }, + { + "epoch": 1.1425909603995608, + "grad_norm": 0.1073136031627655, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 295570 + }, + { + "epoch": 1.1426296176029442, + "grad_norm": 0.09680242836475372, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 295580 + }, + { + "epoch": 1.1426682748063275, + "grad_norm": 0.12555821239948273, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 295590 + }, + { + "epoch": 1.1427069320097107, + "grad_norm": 0.11523910611867905, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 295600 + }, + { + "epoch": 1.142745589213094, + "grad_norm": 0.1234579086303711, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 295610 + }, + { + "epoch": 1.1427842464164772, + "grad_norm": 0.08822614699602127, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 295620 + }, + { + "epoch": 1.1428229036198605, + "grad_norm": 0.0989428460597992, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 295630 + }, + { + "epoch": 1.1428615608232437, + "grad_norm": 0.09494653344154358, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 295640 + }, + { + "epoch": 1.142900218026627, + "grad_norm": 0.09739215672016144, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 295650 + }, + { + "epoch": 1.1429388752300103, + "grad_norm": 0.1287994533777237, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 295660 + }, + { + "epoch": 1.1429775324333937, + "grad_norm": 0.08983709663152695, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 295670 + }, + { + "epoch": 1.143016189636777, + "grad_norm": 0.10030065476894379, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 295680 + }, + { + "epoch": 1.1430548468401602, + "grad_norm": 0.10107168555259705, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 295690 + }, + { + "epoch": 1.1430935040435435, + "grad_norm": 0.10440707206726074, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 295700 + }, + { + "epoch": 1.1431321612469267, + "grad_norm": 0.11294665187597275, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 295710 + }, + { + "epoch": 1.14317081845031, + "grad_norm": 0.10706926882266998, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 295720 + }, + { + "epoch": 1.1432094756536932, + "grad_norm": 0.17112603783607483, + "learning_rate": 0.002, + "loss": 2.334, + "step": 295730 + }, + { + "epoch": 1.1432481328570765, + "grad_norm": 0.10437934100627899, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 295740 + }, + { + "epoch": 1.14328679006046, + "grad_norm": 0.10511988401412964, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 295750 + }, + { + "epoch": 1.1433254472638432, + "grad_norm": 0.10877574980258942, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 295760 + }, + { + "epoch": 1.1433641044672265, + "grad_norm": 0.11442890018224716, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 295770 + }, + { + "epoch": 1.1434027616706097, + "grad_norm": 0.08420255035161972, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 295780 + }, + { + "epoch": 1.143441418873993, + "grad_norm": 0.11311393231153488, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 295790 + }, + { + "epoch": 1.1434800760773762, + "grad_norm": 0.09536857157945633, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 295800 + }, + { + "epoch": 1.1435187332807595, + "grad_norm": 0.1102205440402031, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 295810 + }, + { + "epoch": 1.1435573904841427, + "grad_norm": 0.10846175998449326, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 295820 + }, + { + "epoch": 1.143596047687526, + "grad_norm": 0.12481531500816345, + "learning_rate": 0.002, + "loss": 2.342, + "step": 295830 + }, + { + "epoch": 1.1436347048909095, + "grad_norm": 0.10244545340538025, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 295840 + }, + { + "epoch": 1.1436733620942927, + "grad_norm": 0.09871228784322739, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 295850 + }, + { + "epoch": 1.143712019297676, + "grad_norm": 0.0947028174996376, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 295860 + }, + { + "epoch": 1.1437506765010592, + "grad_norm": 0.10122165083885193, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 295870 + }, + { + "epoch": 1.1437893337044425, + "grad_norm": 0.11432994157075882, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 295880 + }, + { + "epoch": 1.1438279909078257, + "grad_norm": 0.10737597197294235, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 295890 + }, + { + "epoch": 1.143866648111209, + "grad_norm": 0.18943317234516144, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 295900 + }, + { + "epoch": 1.1439053053145922, + "grad_norm": 0.09859231859445572, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 295910 + }, + { + "epoch": 1.1439439625179757, + "grad_norm": 0.09982761740684509, + "learning_rate": 0.002, + "loss": 2.324, + "step": 295920 + }, + { + "epoch": 1.143982619721359, + "grad_norm": 0.11415493488311768, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 295930 + }, + { + "epoch": 1.1440212769247422, + "grad_norm": 0.10270734131336212, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 295940 + }, + { + "epoch": 1.1440599341281255, + "grad_norm": 0.10434996336698532, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 295950 + }, + { + "epoch": 1.1440985913315087, + "grad_norm": 0.09721355885267258, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 295960 + }, + { + "epoch": 1.144137248534892, + "grad_norm": 0.11060193181037903, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 295970 + }, + { + "epoch": 1.1441759057382752, + "grad_norm": 0.10203656554222107, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 295980 + }, + { + "epoch": 1.1442145629416585, + "grad_norm": 0.10176026076078415, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 295990 + }, + { + "epoch": 1.1442532201450417, + "grad_norm": 0.09184641391038895, + "learning_rate": 0.002, + "loss": 2.336, + "step": 296000 + }, + { + "epoch": 1.1442918773484252, + "grad_norm": 0.10682286322116852, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 296010 + }, + { + "epoch": 1.1443305345518084, + "grad_norm": 0.10524826496839523, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 296020 + }, + { + "epoch": 1.1443691917551917, + "grad_norm": 0.08950001001358032, + "learning_rate": 0.002, + "loss": 2.331, + "step": 296030 + }, + { + "epoch": 1.144407848958575, + "grad_norm": 0.14437347650527954, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 296040 + }, + { + "epoch": 1.1444465061619582, + "grad_norm": 0.09879963099956512, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 296050 + }, + { + "epoch": 1.1444851633653415, + "grad_norm": 0.10273423790931702, + "learning_rate": 0.002, + "loss": 2.34, + "step": 296060 + }, + { + "epoch": 1.1445238205687247, + "grad_norm": 0.09035829454660416, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 296070 + }, + { + "epoch": 1.1445624777721082, + "grad_norm": 0.1273248940706253, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 296080 + }, + { + "epoch": 1.1446011349754914, + "grad_norm": 0.10491590201854706, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 296090 + }, + { + "epoch": 1.1446397921788747, + "grad_norm": 0.13738693296909332, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 296100 + }, + { + "epoch": 1.144678449382258, + "grad_norm": 0.5196298360824585, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 296110 + }, + { + "epoch": 1.1447171065856412, + "grad_norm": 0.12203863263130188, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 296120 + }, + { + "epoch": 1.1447557637890244, + "grad_norm": 0.10520727187395096, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 296130 + }, + { + "epoch": 1.1447944209924077, + "grad_norm": 0.12339538335800171, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 296140 + }, + { + "epoch": 1.144833078195791, + "grad_norm": 0.10765001177787781, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 296150 + }, + { + "epoch": 1.1448717353991742, + "grad_norm": 0.11031709611415863, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 296160 + }, + { + "epoch": 1.1449103926025574, + "grad_norm": 0.09553369134664536, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 296170 + }, + { + "epoch": 1.144949049805941, + "grad_norm": 0.11168837547302246, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 296180 + }, + { + "epoch": 1.1449877070093242, + "grad_norm": 0.11190593242645264, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 296190 + }, + { + "epoch": 1.1450263642127074, + "grad_norm": 0.09106079488992691, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 296200 + }, + { + "epoch": 1.1450650214160907, + "grad_norm": 0.10923727601766586, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 296210 + }, + { + "epoch": 1.145103678619474, + "grad_norm": 0.10502249002456665, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 296220 + }, + { + "epoch": 1.1451423358228572, + "grad_norm": 0.109004445374012, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 296230 + }, + { + "epoch": 1.1451809930262404, + "grad_norm": 0.10266054421663284, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 296240 + }, + { + "epoch": 1.145219650229624, + "grad_norm": 0.1098453477025032, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 296250 + }, + { + "epoch": 1.1452583074330072, + "grad_norm": 0.0950651541352272, + "learning_rate": 0.002, + "loss": 2.344, + "step": 296260 + }, + { + "epoch": 1.1452969646363904, + "grad_norm": 0.0950298085808754, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 296270 + }, + { + "epoch": 1.1453356218397737, + "grad_norm": 0.109294094145298, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 296280 + }, + { + "epoch": 1.145374279043157, + "grad_norm": 0.13532811403274536, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 296290 + }, + { + "epoch": 1.1454129362465402, + "grad_norm": 0.10342597961425781, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 296300 + }, + { + "epoch": 1.1454515934499234, + "grad_norm": 0.11257849633693695, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 296310 + }, + { + "epoch": 1.1454902506533067, + "grad_norm": 0.09743151813745499, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 296320 + }, + { + "epoch": 1.14552890785669, + "grad_norm": 0.10734330117702484, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 296330 + }, + { + "epoch": 1.1455675650600732, + "grad_norm": 0.10549934953451157, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 296340 + }, + { + "epoch": 1.1456062222634567, + "grad_norm": 0.10295829176902771, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 296350 + }, + { + "epoch": 1.14564487946684, + "grad_norm": 0.10301025211811066, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 296360 + }, + { + "epoch": 1.1456835366702232, + "grad_norm": 0.12947851419448853, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 296370 + }, + { + "epoch": 1.1457221938736064, + "grad_norm": 0.10246923565864563, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 296380 + }, + { + "epoch": 1.1457608510769897, + "grad_norm": 0.10931044816970825, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 296390 + }, + { + "epoch": 1.145799508280373, + "grad_norm": 0.11043696850538254, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 296400 + }, + { + "epoch": 1.1458381654837562, + "grad_norm": 0.10995186120271683, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 296410 + }, + { + "epoch": 1.1458768226871396, + "grad_norm": 0.11364120244979858, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 296420 + }, + { + "epoch": 1.145915479890523, + "grad_norm": 0.1192992627620697, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 296430 + }, + { + "epoch": 1.1459541370939061, + "grad_norm": 0.09699320048093796, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 296440 + }, + { + "epoch": 1.1459927942972894, + "grad_norm": 0.1066770851612091, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 296450 + }, + { + "epoch": 1.1460314515006726, + "grad_norm": 0.09629204869270325, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 296460 + }, + { + "epoch": 1.146070108704056, + "grad_norm": 0.10704781115055084, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 296470 + }, + { + "epoch": 1.1461087659074392, + "grad_norm": 0.10781607776880264, + "learning_rate": 0.002, + "loss": 2.33, + "step": 296480 + }, + { + "epoch": 1.1461474231108224, + "grad_norm": 0.11625618487596512, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 296490 + }, + { + "epoch": 1.1461860803142057, + "grad_norm": 0.10001617670059204, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 296500 + }, + { + "epoch": 1.146224737517589, + "grad_norm": 0.1142706573009491, + "learning_rate": 0.002, + "loss": 2.354, + "step": 296510 + }, + { + "epoch": 1.1462633947209724, + "grad_norm": 0.11631233245134354, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 296520 + }, + { + "epoch": 1.1463020519243556, + "grad_norm": 0.0933648869395256, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 296530 + }, + { + "epoch": 1.1463407091277389, + "grad_norm": 0.10287400335073471, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 296540 + }, + { + "epoch": 1.1463793663311221, + "grad_norm": 0.09764724969863892, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 296550 + }, + { + "epoch": 1.1464180235345054, + "grad_norm": 0.0917297750711441, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 296560 + }, + { + "epoch": 1.1464566807378886, + "grad_norm": 0.11046093702316284, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 296570 + }, + { + "epoch": 1.146495337941272, + "grad_norm": 0.1363193839788437, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 296580 + }, + { + "epoch": 1.1465339951446554, + "grad_norm": 0.09778366982936859, + "learning_rate": 0.002, + "loss": 2.3144, + "step": 296590 + }, + { + "epoch": 1.1465726523480386, + "grad_norm": 0.11382003128528595, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 296600 + }, + { + "epoch": 1.1466113095514219, + "grad_norm": 0.11143088340759277, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 296610 + }, + { + "epoch": 1.1466499667548051, + "grad_norm": 0.10715234279632568, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 296620 + }, + { + "epoch": 1.1466886239581884, + "grad_norm": 0.19777058064937592, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 296630 + }, + { + "epoch": 1.1467272811615716, + "grad_norm": 0.10907673835754395, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 296640 + }, + { + "epoch": 1.1467659383649549, + "grad_norm": 0.11077481508255005, + "learning_rate": 0.002, + "loss": 2.3068, + "step": 296650 + }, + { + "epoch": 1.1468045955683381, + "grad_norm": 0.09810581058263779, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 296660 + }, + { + "epoch": 1.1468432527717214, + "grad_norm": 0.09280452132225037, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 296670 + }, + { + "epoch": 1.1468819099751046, + "grad_norm": 0.18051892518997192, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 296680 + }, + { + "epoch": 1.1469205671784881, + "grad_norm": 0.10337542742490768, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 296690 + }, + { + "epoch": 1.1469592243818714, + "grad_norm": 0.11727637052536011, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 296700 + }, + { + "epoch": 1.1469978815852546, + "grad_norm": 0.10399829596281052, + "learning_rate": 0.002, + "loss": 2.341, + "step": 296710 + }, + { + "epoch": 1.1470365387886379, + "grad_norm": 0.10587452352046967, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 296720 + }, + { + "epoch": 1.1470751959920211, + "grad_norm": 0.10825172066688538, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 296730 + }, + { + "epoch": 1.1471138531954044, + "grad_norm": 0.10754440724849701, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 296740 + }, + { + "epoch": 1.1471525103987876, + "grad_norm": 0.10634081065654755, + "learning_rate": 0.002, + "loss": 2.333, + "step": 296750 + }, + { + "epoch": 1.147191167602171, + "grad_norm": 0.13813266158103943, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 296760 + }, + { + "epoch": 1.1472298248055544, + "grad_norm": 0.14105969667434692, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 296770 + }, + { + "epoch": 1.1472684820089376, + "grad_norm": 0.1194007471203804, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 296780 + }, + { + "epoch": 1.1473071392123209, + "grad_norm": 0.09992986172437668, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 296790 + }, + { + "epoch": 1.1473457964157041, + "grad_norm": 0.09494622051715851, + "learning_rate": 0.002, + "loss": 2.339, + "step": 296800 + }, + { + "epoch": 1.1473844536190874, + "grad_norm": 0.13352660834789276, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 296810 + }, + { + "epoch": 1.1474231108224706, + "grad_norm": 0.10252249985933304, + "learning_rate": 0.002, + "loss": 2.345, + "step": 296820 + }, + { + "epoch": 1.1474617680258539, + "grad_norm": 0.08959279209375381, + "learning_rate": 0.002, + "loss": 2.333, + "step": 296830 + }, + { + "epoch": 1.1475004252292371, + "grad_norm": 0.1278964728116989, + "learning_rate": 0.002, + "loss": 2.334, + "step": 296840 + }, + { + "epoch": 1.1475390824326204, + "grad_norm": 0.11255158483982086, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 296850 + }, + { + "epoch": 1.1475777396360038, + "grad_norm": 0.09615004062652588, + "learning_rate": 0.002, + "loss": 2.334, + "step": 296860 + }, + { + "epoch": 1.147616396839387, + "grad_norm": 0.12653405964374542, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 296870 + }, + { + "epoch": 1.1476550540427704, + "grad_norm": 0.096727654337883, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 296880 + }, + { + "epoch": 1.1476937112461536, + "grad_norm": 0.11681623011827469, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 296890 + }, + { + "epoch": 1.1477323684495369, + "grad_norm": 0.09364824742078781, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 296900 + }, + { + "epoch": 1.14777102565292, + "grad_norm": 0.10341514647006989, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 296910 + }, + { + "epoch": 1.1478096828563034, + "grad_norm": 0.09559976309537888, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 296920 + }, + { + "epoch": 1.1478483400596868, + "grad_norm": 0.1283910572528839, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 296930 + }, + { + "epoch": 1.14788699726307, + "grad_norm": 0.1130613386631012, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 296940 + }, + { + "epoch": 1.1479256544664533, + "grad_norm": 0.1011149063706398, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 296950 + }, + { + "epoch": 1.1479643116698366, + "grad_norm": 0.14449918270111084, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 296960 + }, + { + "epoch": 1.1480029688732198, + "grad_norm": 0.11203070729970932, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 296970 + }, + { + "epoch": 1.148041626076603, + "grad_norm": 0.11074665188789368, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 296980 + }, + { + "epoch": 1.1480802832799863, + "grad_norm": 0.10194501280784607, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 296990 + }, + { + "epoch": 1.1481189404833696, + "grad_norm": 0.10869333148002625, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 297000 + }, + { + "epoch": 1.1481575976867529, + "grad_norm": 0.1241835206747055, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 297010 + }, + { + "epoch": 1.148196254890136, + "grad_norm": 0.10600640624761581, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 297020 + }, + { + "epoch": 1.1482349120935196, + "grad_norm": 0.18213452398777008, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 297030 + }, + { + "epoch": 1.1482735692969028, + "grad_norm": 0.11283764988183975, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 297040 + }, + { + "epoch": 1.148312226500286, + "grad_norm": 0.10974473506212234, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 297050 + }, + { + "epoch": 1.1483508837036693, + "grad_norm": 0.09493612498044968, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 297060 + }, + { + "epoch": 1.1483895409070526, + "grad_norm": 0.09714702516794205, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 297070 + }, + { + "epoch": 1.1484281981104358, + "grad_norm": 0.1268359273672104, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 297080 + }, + { + "epoch": 1.148466855313819, + "grad_norm": 0.10395190864801407, + "learning_rate": 0.002, + "loss": 2.333, + "step": 297090 + }, + { + "epoch": 1.1485055125172026, + "grad_norm": 0.13091175258159637, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 297100 + }, + { + "epoch": 1.1485441697205858, + "grad_norm": 0.09874138981103897, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 297110 + }, + { + "epoch": 1.148582826923969, + "grad_norm": 0.10048501193523407, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 297120 + }, + { + "epoch": 1.1486214841273523, + "grad_norm": 0.11106102168560028, + "learning_rate": 0.002, + "loss": 2.348, + "step": 297130 + }, + { + "epoch": 1.1486601413307356, + "grad_norm": 0.10966502875089645, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 297140 + }, + { + "epoch": 1.1486987985341188, + "grad_norm": 0.09882976114749908, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 297150 + }, + { + "epoch": 1.148737455737502, + "grad_norm": 0.1049468070268631, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 297160 + }, + { + "epoch": 1.1487761129408853, + "grad_norm": 0.10627221316099167, + "learning_rate": 0.002, + "loss": 2.327, + "step": 297170 + }, + { + "epoch": 1.1488147701442686, + "grad_norm": 0.2593117952346802, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 297180 + }, + { + "epoch": 1.148853427347652, + "grad_norm": 0.1032712534070015, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 297190 + }, + { + "epoch": 1.1488920845510353, + "grad_norm": 0.09483695030212402, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 297200 + }, + { + "epoch": 1.1489307417544186, + "grad_norm": 0.10538081079721451, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 297210 + }, + { + "epoch": 1.1489693989578018, + "grad_norm": 0.10002459585666656, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 297220 + }, + { + "epoch": 1.149008056161185, + "grad_norm": 0.11455190181732178, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 297230 + }, + { + "epoch": 1.1490467133645683, + "grad_norm": 0.10687296092510223, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 297240 + }, + { + "epoch": 1.1490853705679516, + "grad_norm": 0.1209367960691452, + "learning_rate": 0.002, + "loss": 2.334, + "step": 297250 + }, + { + "epoch": 1.1491240277713348, + "grad_norm": 0.11175933480262756, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 297260 + }, + { + "epoch": 1.1491626849747183, + "grad_norm": 0.09743501991033554, + "learning_rate": 0.002, + "loss": 2.34, + "step": 297270 + }, + { + "epoch": 1.1492013421781015, + "grad_norm": 0.11677234619855881, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 297280 + }, + { + "epoch": 1.1492399993814848, + "grad_norm": 0.10145784914493561, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 297290 + }, + { + "epoch": 1.149278656584868, + "grad_norm": 0.0993756502866745, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 297300 + }, + { + "epoch": 1.1493173137882513, + "grad_norm": 0.11287908256053925, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 297310 + }, + { + "epoch": 1.1493559709916346, + "grad_norm": 0.10820694267749786, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 297320 + }, + { + "epoch": 1.1493946281950178, + "grad_norm": 0.12097244709730148, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 297330 + }, + { + "epoch": 1.149433285398401, + "grad_norm": 0.10064926743507385, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 297340 + }, + { + "epoch": 1.1494719426017843, + "grad_norm": 0.10099725425243378, + "learning_rate": 0.002, + "loss": 2.333, + "step": 297350 + }, + { + "epoch": 1.1495105998051678, + "grad_norm": 0.11119300872087479, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 297360 + }, + { + "epoch": 1.149549257008551, + "grad_norm": 0.10817861557006836, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 297370 + }, + { + "epoch": 1.1495879142119343, + "grad_norm": 0.16150224208831787, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 297380 + }, + { + "epoch": 1.1496265714153175, + "grad_norm": 0.09874442219734192, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 297390 + }, + { + "epoch": 1.1496652286187008, + "grad_norm": 0.1252291351556778, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 297400 + }, + { + "epoch": 1.149703885822084, + "grad_norm": 0.10039383918046951, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 297410 + }, + { + "epoch": 1.1497425430254673, + "grad_norm": 0.09669938683509827, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 297420 + }, + { + "epoch": 1.1497812002288506, + "grad_norm": 0.1003023162484169, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 297430 + }, + { + "epoch": 1.149819857432234, + "grad_norm": 0.11819092184305191, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 297440 + }, + { + "epoch": 1.1498585146356173, + "grad_norm": 0.11495328694581985, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 297450 + }, + { + "epoch": 1.1498971718390005, + "grad_norm": 0.10414993762969971, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 297460 + }, + { + "epoch": 1.1499358290423838, + "grad_norm": 0.10090905427932739, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 297470 + }, + { + "epoch": 1.149974486245767, + "grad_norm": 0.11985575407743454, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 297480 + }, + { + "epoch": 1.1500131434491503, + "grad_norm": 0.09595164656639099, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 297490 + }, + { + "epoch": 1.1500518006525335, + "grad_norm": 0.13302060961723328, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 297500 + }, + { + "epoch": 1.1500904578559168, + "grad_norm": 0.10358621180057526, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 297510 + }, + { + "epoch": 1.1501291150593, + "grad_norm": 0.11713553965091705, + "learning_rate": 0.002, + "loss": 2.3123, + "step": 297520 + }, + { + "epoch": 1.1501677722626835, + "grad_norm": 0.09859627485275269, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 297530 + }, + { + "epoch": 1.1502064294660668, + "grad_norm": 0.10029926151037216, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 297540 + }, + { + "epoch": 1.15024508666945, + "grad_norm": 0.10118751972913742, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 297550 + }, + { + "epoch": 1.1502837438728333, + "grad_norm": 0.11143433302640915, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 297560 + }, + { + "epoch": 1.1503224010762165, + "grad_norm": 0.08883854746818542, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 297570 + }, + { + "epoch": 1.1503610582795998, + "grad_norm": 0.12167437374591827, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 297580 + }, + { + "epoch": 1.150399715482983, + "grad_norm": 0.10091345012187958, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 297590 + }, + { + "epoch": 1.1504383726863663, + "grad_norm": 0.09581989049911499, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 297600 + }, + { + "epoch": 1.1504770298897498, + "grad_norm": 0.10923956334590912, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 297610 + }, + { + "epoch": 1.150515687093133, + "grad_norm": 0.11006530374288559, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 297620 + }, + { + "epoch": 1.1505543442965163, + "grad_norm": 0.1279909759759903, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 297630 + }, + { + "epoch": 1.1505930014998995, + "grad_norm": 0.09534849226474762, + "learning_rate": 0.002, + "loss": 2.3149, + "step": 297640 + }, + { + "epoch": 1.1506316587032828, + "grad_norm": 0.10234690457582474, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 297650 + }, + { + "epoch": 1.150670315906666, + "grad_norm": 0.10225453972816467, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 297660 + }, + { + "epoch": 1.1507089731100493, + "grad_norm": 0.10582304745912552, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 297670 + }, + { + "epoch": 1.1507476303134325, + "grad_norm": 0.10547558963298798, + "learning_rate": 0.002, + "loss": 2.328, + "step": 297680 + }, + { + "epoch": 1.1507862875168158, + "grad_norm": 0.10145711153745651, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 297690 + }, + { + "epoch": 1.1508249447201992, + "grad_norm": 0.1168600469827652, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 297700 + }, + { + "epoch": 1.1508636019235825, + "grad_norm": 0.1104208454489708, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 297710 + }, + { + "epoch": 1.1509022591269658, + "grad_norm": 0.10342969745397568, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 297720 + }, + { + "epoch": 1.150940916330349, + "grad_norm": 0.09394558519124985, + "learning_rate": 0.002, + "loss": 2.336, + "step": 297730 + }, + { + "epoch": 1.1509795735337323, + "grad_norm": 0.10640209168195724, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 297740 + }, + { + "epoch": 1.1510182307371155, + "grad_norm": 0.15803790092468262, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 297750 + }, + { + "epoch": 1.1510568879404988, + "grad_norm": 0.10676860064268112, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 297760 + }, + { + "epoch": 1.151095545143882, + "grad_norm": 0.09660085290670395, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 297770 + }, + { + "epoch": 1.1511342023472655, + "grad_norm": 0.10708586126565933, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 297780 + }, + { + "epoch": 1.1511728595506487, + "grad_norm": 0.10311704128980637, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 297790 + }, + { + "epoch": 1.151211516754032, + "grad_norm": 0.11200468242168427, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 297800 + }, + { + "epoch": 1.1512501739574152, + "grad_norm": 0.11802764981985092, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 297810 + }, + { + "epoch": 1.1512888311607985, + "grad_norm": 0.09990455955266953, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 297820 + }, + { + "epoch": 1.1513274883641818, + "grad_norm": 0.13548719882965088, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 297830 + }, + { + "epoch": 1.151366145567565, + "grad_norm": 0.13101348280906677, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 297840 + }, + { + "epoch": 1.1514048027709483, + "grad_norm": 0.11312423646450043, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 297850 + }, + { + "epoch": 1.1514434599743315, + "grad_norm": 0.09926366806030273, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 297860 + }, + { + "epoch": 1.151482117177715, + "grad_norm": 0.09945975244045258, + "learning_rate": 0.002, + "loss": 2.345, + "step": 297870 + }, + { + "epoch": 1.1515207743810982, + "grad_norm": 0.10929170995950699, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 297880 + }, + { + "epoch": 1.1515594315844815, + "grad_norm": 0.12738560140132904, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 297890 + }, + { + "epoch": 1.1515980887878647, + "grad_norm": 0.10594571381807327, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 297900 + }, + { + "epoch": 1.151636745991248, + "grad_norm": 0.0898451879620552, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 297910 + }, + { + "epoch": 1.1516754031946312, + "grad_norm": 0.10429723560810089, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 297920 + }, + { + "epoch": 1.1517140603980145, + "grad_norm": 0.0913781002163887, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 297930 + }, + { + "epoch": 1.151752717601398, + "grad_norm": 0.10696902871131897, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 297940 + }, + { + "epoch": 1.1517913748047812, + "grad_norm": 0.1067197248339653, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 297950 + }, + { + "epoch": 1.1518300320081645, + "grad_norm": 0.11086300760507584, + "learning_rate": 0.002, + "loss": 2.334, + "step": 297960 + }, + { + "epoch": 1.1518686892115477, + "grad_norm": 0.10444667935371399, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 297970 + }, + { + "epoch": 1.151907346414931, + "grad_norm": 0.12773331999778748, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 297980 + }, + { + "epoch": 1.1519460036183142, + "grad_norm": 0.11314024776220322, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 297990 + }, + { + "epoch": 1.1519846608216975, + "grad_norm": 0.09332095086574554, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 298000 + }, + { + "epoch": 1.1520233180250807, + "grad_norm": 0.12006017565727234, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 298010 + }, + { + "epoch": 1.152061975228464, + "grad_norm": 0.09459801763296127, + "learning_rate": 0.002, + "loss": 2.314, + "step": 298020 + }, + { + "epoch": 1.1521006324318472, + "grad_norm": 0.1063527911901474, + "learning_rate": 0.002, + "loss": 2.34, + "step": 298030 + }, + { + "epoch": 1.1521392896352307, + "grad_norm": 0.11665216088294983, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 298040 + }, + { + "epoch": 1.152177946838614, + "grad_norm": 0.11165454238653183, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 298050 + }, + { + "epoch": 1.1522166040419972, + "grad_norm": 0.11676241457462311, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 298060 + }, + { + "epoch": 1.1522552612453805, + "grad_norm": 0.1029791459441185, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 298070 + }, + { + "epoch": 1.1522939184487637, + "grad_norm": 0.10949081182479858, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 298080 + }, + { + "epoch": 1.152332575652147, + "grad_norm": 0.12753087282180786, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 298090 + }, + { + "epoch": 1.1523712328555302, + "grad_norm": 0.11897064745426178, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 298100 + }, + { + "epoch": 1.1524098900589137, + "grad_norm": 0.10330282896757126, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 298110 + }, + { + "epoch": 1.152448547262297, + "grad_norm": 0.12037758529186249, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 298120 + }, + { + "epoch": 1.1524872044656802, + "grad_norm": 0.1015135869383812, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 298130 + }, + { + "epoch": 1.1525258616690635, + "grad_norm": 0.11100692301988602, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 298140 + }, + { + "epoch": 1.1525645188724467, + "grad_norm": 0.09652989357709885, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 298150 + }, + { + "epoch": 1.15260317607583, + "grad_norm": 0.11029095202684402, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 298160 + }, + { + "epoch": 1.1526418332792132, + "grad_norm": 0.09196136891841888, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 298170 + }, + { + "epoch": 1.1526804904825965, + "grad_norm": 0.1197347342967987, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 298180 + }, + { + "epoch": 1.1527191476859797, + "grad_norm": 0.11640334874391556, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 298190 + }, + { + "epoch": 1.152757804889363, + "grad_norm": 0.10275018960237503, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 298200 + }, + { + "epoch": 1.1527964620927464, + "grad_norm": 0.10140158981084824, + "learning_rate": 0.002, + "loss": 2.331, + "step": 298210 + }, + { + "epoch": 1.1528351192961297, + "grad_norm": 0.11424034833908081, + "learning_rate": 0.002, + "loss": 2.329, + "step": 298220 + }, + { + "epoch": 1.152873776499513, + "grad_norm": 0.11030445247888565, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 298230 + }, + { + "epoch": 1.1529124337028962, + "grad_norm": 0.10368213057518005, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 298240 + }, + { + "epoch": 1.1529510909062795, + "grad_norm": 0.10507994890213013, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 298250 + }, + { + "epoch": 1.1529897481096627, + "grad_norm": 0.11051535606384277, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 298260 + }, + { + "epoch": 1.153028405313046, + "grad_norm": 0.11708662658929825, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 298270 + }, + { + "epoch": 1.1530670625164294, + "grad_norm": 0.10672136396169662, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 298280 + }, + { + "epoch": 1.1531057197198127, + "grad_norm": 0.10426277667284012, + "learning_rate": 0.002, + "loss": 2.336, + "step": 298290 + }, + { + "epoch": 1.153144376923196, + "grad_norm": 0.11060561239719391, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 298300 + }, + { + "epoch": 1.1531830341265792, + "grad_norm": 0.10655226558446884, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 298310 + }, + { + "epoch": 1.1532216913299624, + "grad_norm": 0.0946177989244461, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 298320 + }, + { + "epoch": 1.1532603485333457, + "grad_norm": 0.0947253406047821, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 298330 + }, + { + "epoch": 1.153299005736729, + "grad_norm": 0.09470956772565842, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 298340 + }, + { + "epoch": 1.1533376629401122, + "grad_norm": 0.09391769021749496, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 298350 + }, + { + "epoch": 1.1533763201434954, + "grad_norm": 0.10826486349105835, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 298360 + }, + { + "epoch": 1.1534149773468787, + "grad_norm": 0.102037712931633, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 298370 + }, + { + "epoch": 1.1534536345502622, + "grad_norm": 0.12063083797693253, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 298380 + }, + { + "epoch": 1.1534922917536454, + "grad_norm": 0.09736413508653641, + "learning_rate": 0.002, + "loss": 2.3583, + "step": 298390 + }, + { + "epoch": 1.1535309489570287, + "grad_norm": 0.12331432849168777, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 298400 + }, + { + "epoch": 1.153569606160412, + "grad_norm": 0.0907859355211258, + "learning_rate": 0.002, + "loss": 2.346, + "step": 298410 + }, + { + "epoch": 1.1536082633637952, + "grad_norm": 0.09962484240531921, + "learning_rate": 0.002, + "loss": 2.338, + "step": 298420 + }, + { + "epoch": 1.1536469205671784, + "grad_norm": 0.10940424352884293, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 298430 + }, + { + "epoch": 1.1536855777705617, + "grad_norm": 0.10202945023775101, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 298440 + }, + { + "epoch": 1.1537242349739452, + "grad_norm": 0.10457737743854523, + "learning_rate": 0.002, + "loss": 2.3107, + "step": 298450 + }, + { + "epoch": 1.1537628921773284, + "grad_norm": 0.1488359272480011, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 298460 + }, + { + "epoch": 1.1538015493807117, + "grad_norm": 0.08914506435394287, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 298470 + }, + { + "epoch": 1.153840206584095, + "grad_norm": 0.10192002356052399, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 298480 + }, + { + "epoch": 1.1538788637874782, + "grad_norm": 0.10051634907722473, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 298490 + }, + { + "epoch": 1.1539175209908614, + "grad_norm": 0.10865295678377151, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 298500 + }, + { + "epoch": 1.1539561781942447, + "grad_norm": 0.11688423156738281, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 298510 + }, + { + "epoch": 1.153994835397628, + "grad_norm": 0.09238366782665253, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 298520 + }, + { + "epoch": 1.1540334926010112, + "grad_norm": 0.12976820766925812, + "learning_rate": 0.002, + "loss": 2.318, + "step": 298530 + }, + { + "epoch": 1.1540721498043944, + "grad_norm": 0.09833407402038574, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 298540 + }, + { + "epoch": 1.154110807007778, + "grad_norm": 0.1135307028889656, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 298550 + }, + { + "epoch": 1.1541494642111612, + "grad_norm": 0.10334569215774536, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 298560 + }, + { + "epoch": 1.1541881214145444, + "grad_norm": 0.10028822720050812, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 298570 + }, + { + "epoch": 1.1542267786179277, + "grad_norm": 0.12211079150438309, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 298580 + }, + { + "epoch": 1.154265435821311, + "grad_norm": 0.10825755447149277, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 298590 + }, + { + "epoch": 1.1543040930246942, + "grad_norm": 0.10587592422962189, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 298600 + }, + { + "epoch": 1.1543427502280774, + "grad_norm": 0.11863046139478683, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 298610 + }, + { + "epoch": 1.154381407431461, + "grad_norm": 0.10384626686573029, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 298620 + }, + { + "epoch": 1.1544200646348441, + "grad_norm": 0.10779169946908951, + "learning_rate": 0.002, + "loss": 2.345, + "step": 298630 + }, + { + "epoch": 1.1544587218382274, + "grad_norm": 0.10602803528308868, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 298640 + }, + { + "epoch": 1.1544973790416106, + "grad_norm": 0.10449524223804474, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 298650 + }, + { + "epoch": 1.154536036244994, + "grad_norm": 0.0889180600643158, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 298660 + }, + { + "epoch": 1.1545746934483772, + "grad_norm": 0.10732623934745789, + "learning_rate": 0.002, + "loss": 2.343, + "step": 298670 + }, + { + "epoch": 1.1546133506517604, + "grad_norm": 0.09704501181840897, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 298680 + }, + { + "epoch": 1.1546520078551437, + "grad_norm": 0.09987418353557587, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 298690 + }, + { + "epoch": 1.154690665058527, + "grad_norm": 0.12397871911525726, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 298700 + }, + { + "epoch": 1.1547293222619102, + "grad_norm": 0.10326369851827621, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 298710 + }, + { + "epoch": 1.1547679794652936, + "grad_norm": 0.10426446050405502, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 298720 + }, + { + "epoch": 1.154806636668677, + "grad_norm": 0.11490487307310104, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 298730 + }, + { + "epoch": 1.1548452938720601, + "grad_norm": 0.11625088006258011, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 298740 + }, + { + "epoch": 1.1548839510754434, + "grad_norm": 0.0878482460975647, + "learning_rate": 0.002, + "loss": 2.337, + "step": 298750 + }, + { + "epoch": 1.1549226082788266, + "grad_norm": 0.1331155151128769, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 298760 + }, + { + "epoch": 1.15496126548221, + "grad_norm": 0.09513157606124878, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 298770 + }, + { + "epoch": 1.1549999226855932, + "grad_norm": 0.136835515499115, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 298780 + }, + { + "epoch": 1.1550385798889766, + "grad_norm": 0.09034319967031479, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 298790 + }, + { + "epoch": 1.1550772370923599, + "grad_norm": 0.10461827367544174, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 298800 + }, + { + "epoch": 1.1551158942957431, + "grad_norm": 0.10814294219017029, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 298810 + }, + { + "epoch": 1.1551545514991264, + "grad_norm": 0.10786023736000061, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 298820 + }, + { + "epoch": 1.1551932087025096, + "grad_norm": 0.1013743132352829, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 298830 + }, + { + "epoch": 1.1552318659058929, + "grad_norm": 0.11625484377145767, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 298840 + }, + { + "epoch": 1.1552705231092761, + "grad_norm": 0.10375026613473892, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 298850 + }, + { + "epoch": 1.1553091803126594, + "grad_norm": 0.10422974824905396, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 298860 + }, + { + "epoch": 1.1553478375160426, + "grad_norm": 0.11368890106678009, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 298870 + }, + { + "epoch": 1.155386494719426, + "grad_norm": 0.1095462292432785, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 298880 + }, + { + "epoch": 1.1554251519228094, + "grad_norm": 0.11351530253887177, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 298890 + }, + { + "epoch": 1.1554638091261926, + "grad_norm": 0.11732436716556549, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 298900 + }, + { + "epoch": 1.1555024663295759, + "grad_norm": 0.115448959171772, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 298910 + }, + { + "epoch": 1.1555411235329591, + "grad_norm": 0.1171172708272934, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 298920 + }, + { + "epoch": 1.1555797807363424, + "grad_norm": 0.15639221668243408, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 298930 + }, + { + "epoch": 1.1556184379397256, + "grad_norm": 0.10376555472612381, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 298940 + }, + { + "epoch": 1.1556570951431089, + "grad_norm": 0.10352832078933716, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 298950 + }, + { + "epoch": 1.1556957523464924, + "grad_norm": 0.10156875848770142, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 298960 + }, + { + "epoch": 1.1557344095498756, + "grad_norm": 0.08658002316951752, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 298970 + }, + { + "epoch": 1.1557730667532589, + "grad_norm": 0.1257506012916565, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 298980 + }, + { + "epoch": 1.1558117239566421, + "grad_norm": 0.11586952954530716, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 298990 + }, + { + "epoch": 1.1558503811600254, + "grad_norm": 0.10772280395030975, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 299000 + }, + { + "epoch": 1.1558890383634086, + "grad_norm": 0.10585318505764008, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 299010 + }, + { + "epoch": 1.1559276955667919, + "grad_norm": 0.0990624874830246, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 299020 + }, + { + "epoch": 1.1559663527701751, + "grad_norm": 0.1242714673280716, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 299030 + }, + { + "epoch": 1.1560050099735584, + "grad_norm": 0.12910033762454987, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 299040 + }, + { + "epoch": 1.1560436671769418, + "grad_norm": 0.09899185597896576, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 299050 + }, + { + "epoch": 1.156082324380325, + "grad_norm": 0.11427902430295944, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 299060 + }, + { + "epoch": 1.1561209815837084, + "grad_norm": 0.27933862805366516, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 299070 + }, + { + "epoch": 1.1561596387870916, + "grad_norm": 0.12222559005022049, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 299080 + }, + { + "epoch": 1.1561982959904749, + "grad_norm": 0.09958814084529877, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 299090 + }, + { + "epoch": 1.156236953193858, + "grad_norm": 0.10184434056282043, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 299100 + }, + { + "epoch": 1.1562756103972414, + "grad_norm": 0.10524575412273407, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 299110 + }, + { + "epoch": 1.1563142676006246, + "grad_norm": 0.10867523401975632, + "learning_rate": 0.002, + "loss": 2.327, + "step": 299120 + }, + { + "epoch": 1.156352924804008, + "grad_norm": 0.11048688739538193, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 299130 + }, + { + "epoch": 1.1563915820073913, + "grad_norm": 0.09732969850301743, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 299140 + }, + { + "epoch": 1.1564302392107746, + "grad_norm": 0.0917268842458725, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 299150 + }, + { + "epoch": 1.1564688964141578, + "grad_norm": 0.10696238279342651, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 299160 + }, + { + "epoch": 1.156507553617541, + "grad_norm": 0.10692931711673737, + "learning_rate": 0.002, + "loss": 2.33, + "step": 299170 + }, + { + "epoch": 1.1565462108209243, + "grad_norm": 0.10884692519903183, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 299180 + }, + { + "epoch": 1.1565848680243076, + "grad_norm": 0.11779770255088806, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 299190 + }, + { + "epoch": 1.1566235252276909, + "grad_norm": 0.09254894405603409, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 299200 + }, + { + "epoch": 1.156662182431074, + "grad_norm": 0.10647129267454147, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 299210 + }, + { + "epoch": 1.1567008396344576, + "grad_norm": 0.11991509050130844, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 299220 + }, + { + "epoch": 1.1567394968378408, + "grad_norm": 0.11554388701915741, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 299230 + }, + { + "epoch": 1.156778154041224, + "grad_norm": 0.10474992543458939, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 299240 + }, + { + "epoch": 1.1568168112446073, + "grad_norm": 0.11388864368200302, + "learning_rate": 0.002, + "loss": 2.3149, + "step": 299250 + }, + { + "epoch": 1.1568554684479906, + "grad_norm": 0.10299711674451828, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 299260 + }, + { + "epoch": 1.1568941256513738, + "grad_norm": 0.14622846245765686, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 299270 + }, + { + "epoch": 1.156932782854757, + "grad_norm": 0.09922489523887634, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 299280 + }, + { + "epoch": 1.1569714400581403, + "grad_norm": 0.11386466026306152, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 299290 + }, + { + "epoch": 1.1570100972615238, + "grad_norm": 0.10140512138605118, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 299300 + }, + { + "epoch": 1.157048754464907, + "grad_norm": 0.10422612726688385, + "learning_rate": 0.002, + "loss": 2.33, + "step": 299310 + }, + { + "epoch": 1.1570874116682903, + "grad_norm": 0.10468500852584839, + "learning_rate": 0.002, + "loss": 2.332, + "step": 299320 + }, + { + "epoch": 1.1571260688716736, + "grad_norm": 0.10063187032938004, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 299330 + }, + { + "epoch": 1.1571647260750568, + "grad_norm": 0.09477391839027405, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 299340 + }, + { + "epoch": 1.15720338327844, + "grad_norm": 0.09196572005748749, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 299350 + }, + { + "epoch": 1.1572420404818233, + "grad_norm": 0.10172398388385773, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 299360 + }, + { + "epoch": 1.1572806976852066, + "grad_norm": 0.1202421635389328, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 299370 + }, + { + "epoch": 1.1573193548885898, + "grad_norm": 0.10497753322124481, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 299380 + }, + { + "epoch": 1.1573580120919733, + "grad_norm": 0.12195313721895218, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 299390 + }, + { + "epoch": 1.1573966692953566, + "grad_norm": 0.09654326736927032, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 299400 + }, + { + "epoch": 1.1574353264987398, + "grad_norm": 0.09133104979991913, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 299410 + }, + { + "epoch": 1.157473983702123, + "grad_norm": 0.11477682739496231, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 299420 + }, + { + "epoch": 1.1575126409055063, + "grad_norm": 0.10247191786766052, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 299430 + }, + { + "epoch": 1.1575512981088896, + "grad_norm": 0.10641482472419739, + "learning_rate": 0.002, + "loss": 2.333, + "step": 299440 + }, + { + "epoch": 1.1575899553122728, + "grad_norm": 0.2622292637825012, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 299450 + }, + { + "epoch": 1.157628612515656, + "grad_norm": 0.7004684209823608, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 299460 + }, + { + "epoch": 1.1576672697190395, + "grad_norm": 0.11828330159187317, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 299470 + }, + { + "epoch": 1.1577059269224228, + "grad_norm": 0.09065783023834229, + "learning_rate": 0.002, + "loss": 2.33, + "step": 299480 + }, + { + "epoch": 1.157744584125806, + "grad_norm": 0.10334282368421555, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 299490 + }, + { + "epoch": 1.1577832413291893, + "grad_norm": 0.11078079044818878, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 299500 + }, + { + "epoch": 1.1578218985325726, + "grad_norm": 0.10222682356834412, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 299510 + }, + { + "epoch": 1.1578605557359558, + "grad_norm": 0.09242222458124161, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 299520 + }, + { + "epoch": 1.157899212939339, + "grad_norm": 0.13665029406547546, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 299530 + }, + { + "epoch": 1.1579378701427223, + "grad_norm": 0.11644630134105682, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 299540 + }, + { + "epoch": 1.1579765273461056, + "grad_norm": 0.10311318188905716, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 299550 + }, + { + "epoch": 1.158015184549489, + "grad_norm": 0.0962674617767334, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 299560 + }, + { + "epoch": 1.1580538417528723, + "grad_norm": 0.10548513382673264, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 299570 + }, + { + "epoch": 1.1580924989562555, + "grad_norm": 0.10236866027116776, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 299580 + }, + { + "epoch": 1.1581311561596388, + "grad_norm": 0.09971366077661514, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 299590 + }, + { + "epoch": 1.158169813363022, + "grad_norm": 0.10626211017370224, + "learning_rate": 0.002, + "loss": 2.335, + "step": 299600 + }, + { + "epoch": 1.1582084705664053, + "grad_norm": 0.11893949657678604, + "learning_rate": 0.002, + "loss": 2.332, + "step": 299610 + }, + { + "epoch": 1.1582471277697886, + "grad_norm": 0.10046777874231339, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 299620 + }, + { + "epoch": 1.1582857849731718, + "grad_norm": 0.09620541334152222, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 299630 + }, + { + "epoch": 1.1583244421765553, + "grad_norm": 0.13971824944019318, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 299640 + }, + { + "epoch": 1.1583630993799385, + "grad_norm": 0.10550142079591751, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 299650 + }, + { + "epoch": 1.1584017565833218, + "grad_norm": 0.11489161103963852, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 299660 + }, + { + "epoch": 1.158440413786705, + "grad_norm": 0.09856657683849335, + "learning_rate": 0.002, + "loss": 2.333, + "step": 299670 + }, + { + "epoch": 1.1584790709900883, + "grad_norm": 0.10443252325057983, + "learning_rate": 0.002, + "loss": 2.3627, + "step": 299680 + }, + { + "epoch": 1.1585177281934715, + "grad_norm": 0.09363368898630142, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 299690 + }, + { + "epoch": 1.1585563853968548, + "grad_norm": 0.13245029747486115, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 299700 + }, + { + "epoch": 1.158595042600238, + "grad_norm": 0.12561266124248505, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 299710 + }, + { + "epoch": 1.1586336998036213, + "grad_norm": 0.0979650467634201, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 299720 + }, + { + "epoch": 1.1586723570070048, + "grad_norm": 0.10856011509895325, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 299730 + }, + { + "epoch": 1.158711014210388, + "grad_norm": 0.1100655347108841, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 299740 + }, + { + "epoch": 1.1587496714137713, + "grad_norm": 0.09626168012619019, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 299750 + }, + { + "epoch": 1.1587883286171545, + "grad_norm": 0.1044035255908966, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 299760 + }, + { + "epoch": 1.1588269858205378, + "grad_norm": 0.1007981076836586, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 299770 + }, + { + "epoch": 1.158865643023921, + "grad_norm": 0.10776758193969727, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 299780 + }, + { + "epoch": 1.1589043002273043, + "grad_norm": 0.11865066736936569, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 299790 + }, + { + "epoch": 1.1589429574306878, + "grad_norm": 0.09247344732284546, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 299800 + }, + { + "epoch": 1.158981614634071, + "grad_norm": 0.09969916939735413, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 299810 + }, + { + "epoch": 1.1590202718374543, + "grad_norm": 0.11441514641046524, + "learning_rate": 0.002, + "loss": 2.33, + "step": 299820 + }, + { + "epoch": 1.1590589290408375, + "grad_norm": 0.12042112648487091, + "learning_rate": 0.002, + "loss": 2.35, + "step": 299830 + }, + { + "epoch": 1.1590975862442208, + "grad_norm": 0.13869859278202057, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 299840 + }, + { + "epoch": 1.159136243447604, + "grad_norm": 0.09707216918468475, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 299850 + }, + { + "epoch": 1.1591749006509873, + "grad_norm": 0.11898298561573029, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 299860 + }, + { + "epoch": 1.1592135578543705, + "grad_norm": 0.09777060896158218, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 299870 + }, + { + "epoch": 1.1592522150577538, + "grad_norm": 0.10587567090988159, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 299880 + }, + { + "epoch": 1.159290872261137, + "grad_norm": 0.11703342199325562, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 299890 + }, + { + "epoch": 1.1593295294645205, + "grad_norm": 0.106162890791893, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 299900 + }, + { + "epoch": 1.1593681866679038, + "grad_norm": 0.11302954703569412, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 299910 + }, + { + "epoch": 1.159406843871287, + "grad_norm": 0.12867295742034912, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 299920 + }, + { + "epoch": 1.1594455010746703, + "grad_norm": 0.09764662384986877, + "learning_rate": 0.002, + "loss": 2.339, + "step": 299930 + }, + { + "epoch": 1.1594841582780535, + "grad_norm": 0.09465903043746948, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 299940 + }, + { + "epoch": 1.1595228154814368, + "grad_norm": 0.10889971256256104, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 299950 + }, + { + "epoch": 1.15956147268482, + "grad_norm": 0.09882204234600067, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 299960 + }, + { + "epoch": 1.1596001298882035, + "grad_norm": 0.10860224068164825, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 299970 + }, + { + "epoch": 1.1596387870915867, + "grad_norm": 0.11030543595552444, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 299980 + }, + { + "epoch": 1.15967744429497, + "grad_norm": 0.1054888442158699, + "learning_rate": 0.002, + "loss": 2.336, + "step": 299990 + }, + { + "epoch": 1.1597161014983532, + "grad_norm": 0.10351574420928955, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 300000 + }, + { + "epoch": 1.1597547587017365, + "grad_norm": 0.11095496267080307, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 300010 + }, + { + "epoch": 1.1597934159051198, + "grad_norm": 0.09909340739250183, + "learning_rate": 0.002, + "loss": 2.346, + "step": 300020 + }, + { + "epoch": 1.159832073108503, + "grad_norm": 0.12616288661956787, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 300030 + }, + { + "epoch": 1.1598707303118863, + "grad_norm": 0.09794657677412033, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 300040 + }, + { + "epoch": 1.1599093875152695, + "grad_norm": 0.10776015371084213, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 300050 + }, + { + "epoch": 1.1599480447186528, + "grad_norm": 0.10282982885837555, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 300060 + }, + { + "epoch": 1.1599867019220362, + "grad_norm": 0.121549591422081, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 300070 + }, + { + "epoch": 1.1600253591254195, + "grad_norm": 0.1109287366271019, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 300080 + }, + { + "epoch": 1.1600640163288027, + "grad_norm": 0.08838099241256714, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 300090 + }, + { + "epoch": 1.160102673532186, + "grad_norm": 0.12162969261407852, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 300100 + }, + { + "epoch": 1.1601413307355692, + "grad_norm": 0.098938949406147, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 300110 + }, + { + "epoch": 1.1601799879389525, + "grad_norm": 0.09758912026882172, + "learning_rate": 0.002, + "loss": 2.342, + "step": 300120 + }, + { + "epoch": 1.1602186451423357, + "grad_norm": 0.10965708643198013, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 300130 + }, + { + "epoch": 1.1602573023457192, + "grad_norm": 0.08661700785160065, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 300140 + }, + { + "epoch": 1.1602959595491025, + "grad_norm": 0.09226927161216736, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 300150 + }, + { + "epoch": 1.1603346167524857, + "grad_norm": 0.09487424045801163, + "learning_rate": 0.002, + "loss": 2.33, + "step": 300160 + }, + { + "epoch": 1.160373273955869, + "grad_norm": 0.12119285762310028, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 300170 + }, + { + "epoch": 1.1604119311592522, + "grad_norm": 0.11557340621948242, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 300180 + }, + { + "epoch": 1.1604505883626355, + "grad_norm": 0.11036872118711472, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 300190 + }, + { + "epoch": 1.1604892455660187, + "grad_norm": 0.10435856133699417, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 300200 + }, + { + "epoch": 1.160527902769402, + "grad_norm": 0.10671596229076385, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 300210 + }, + { + "epoch": 1.1605665599727852, + "grad_norm": 0.14279340207576752, + "learning_rate": 0.002, + "loss": 2.34, + "step": 300220 + }, + { + "epoch": 1.1606052171761685, + "grad_norm": 0.10160546004772186, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 300230 + }, + { + "epoch": 1.160643874379552, + "grad_norm": 0.10072829574346542, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 300240 + }, + { + "epoch": 1.1606825315829352, + "grad_norm": 0.10195444524288177, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 300250 + }, + { + "epoch": 1.1607211887863185, + "grad_norm": 0.12637607753276825, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 300260 + }, + { + "epoch": 1.1607598459897017, + "grad_norm": 0.10484391450881958, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 300270 + }, + { + "epoch": 1.160798503193085, + "grad_norm": 0.18085014820098877, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 300280 + }, + { + "epoch": 1.1608371603964682, + "grad_norm": 0.10291527211666107, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 300290 + }, + { + "epoch": 1.1608758175998515, + "grad_norm": 0.09656839817762375, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 300300 + }, + { + "epoch": 1.160914474803235, + "grad_norm": 0.08661667257547379, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 300310 + }, + { + "epoch": 1.1609531320066182, + "grad_norm": 0.11928417533636093, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 300320 + }, + { + "epoch": 1.1609917892100015, + "grad_norm": 0.09696167707443237, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 300330 + }, + { + "epoch": 1.1610304464133847, + "grad_norm": 0.11764390766620636, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 300340 + }, + { + "epoch": 1.161069103616768, + "grad_norm": 0.11150548607110977, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 300350 + }, + { + "epoch": 1.1611077608201512, + "grad_norm": 0.12289080023765564, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 300360 + }, + { + "epoch": 1.1611464180235345, + "grad_norm": 0.1032310277223587, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 300370 + }, + { + "epoch": 1.1611850752269177, + "grad_norm": 0.13653236627578735, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 300380 + }, + { + "epoch": 1.161223732430301, + "grad_norm": 0.10622701793909073, + "learning_rate": 0.002, + "loss": 2.349, + "step": 300390 + }, + { + "epoch": 1.1612623896336842, + "grad_norm": 0.09135434776544571, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 300400 + }, + { + "epoch": 1.1613010468370677, + "grad_norm": 0.1155262440443039, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 300410 + }, + { + "epoch": 1.161339704040451, + "grad_norm": 0.09453856945037842, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 300420 + }, + { + "epoch": 1.1613783612438342, + "grad_norm": 0.08795636147260666, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 300430 + }, + { + "epoch": 1.1614170184472175, + "grad_norm": 0.14006325602531433, + "learning_rate": 0.002, + "loss": 2.335, + "step": 300440 + }, + { + "epoch": 1.1614556756506007, + "grad_norm": 0.09395135939121246, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 300450 + }, + { + "epoch": 1.161494332853984, + "grad_norm": 0.1190577894449234, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 300460 + }, + { + "epoch": 1.1615329900573672, + "grad_norm": 0.10218417644500732, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 300470 + }, + { + "epoch": 1.1615716472607507, + "grad_norm": 0.12376347929239273, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 300480 + }, + { + "epoch": 1.161610304464134, + "grad_norm": 0.10159707069396973, + "learning_rate": 0.002, + "loss": 2.334, + "step": 300490 + }, + { + "epoch": 1.1616489616675172, + "grad_norm": 0.09371952712535858, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 300500 + }, + { + "epoch": 1.1616876188709004, + "grad_norm": 0.11684858053922653, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 300510 + }, + { + "epoch": 1.1617262760742837, + "grad_norm": 0.0982179343700409, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 300520 + }, + { + "epoch": 1.161764933277667, + "grad_norm": 0.0905216634273529, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 300530 + }, + { + "epoch": 1.1618035904810502, + "grad_norm": 0.12256705015897751, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 300540 + }, + { + "epoch": 1.1618422476844334, + "grad_norm": 0.10113391280174255, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 300550 + }, + { + "epoch": 1.1618809048878167, + "grad_norm": 0.09468615800142288, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 300560 + }, + { + "epoch": 1.1619195620912, + "grad_norm": 0.09727149456739426, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 300570 + }, + { + "epoch": 1.1619582192945834, + "grad_norm": 0.09404943883419037, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 300580 + }, + { + "epoch": 1.1619968764979667, + "grad_norm": 0.1004868894815445, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 300590 + }, + { + "epoch": 1.16203553370135, + "grad_norm": 0.12360697239637375, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 300600 + }, + { + "epoch": 1.1620741909047332, + "grad_norm": 0.13018663227558136, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 300610 + }, + { + "epoch": 1.1621128481081164, + "grad_norm": 0.09137805551290512, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 300620 + }, + { + "epoch": 1.1621515053114997, + "grad_norm": 0.10897412896156311, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 300630 + }, + { + "epoch": 1.162190162514883, + "grad_norm": 0.09854017198085785, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 300640 + }, + { + "epoch": 1.1622288197182664, + "grad_norm": 0.10869092494249344, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 300650 + }, + { + "epoch": 1.1622674769216497, + "grad_norm": 0.1176285594701767, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 300660 + }, + { + "epoch": 1.162306134125033, + "grad_norm": 0.13885082304477692, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 300670 + }, + { + "epoch": 1.1623447913284162, + "grad_norm": 0.10269021987915039, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 300680 + }, + { + "epoch": 1.1623834485317994, + "grad_norm": 0.11646575480699539, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 300690 + }, + { + "epoch": 1.1624221057351827, + "grad_norm": 0.11593282222747803, + "learning_rate": 0.002, + "loss": 2.354, + "step": 300700 + }, + { + "epoch": 1.162460762938566, + "grad_norm": 0.10426647216081619, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 300710 + }, + { + "epoch": 1.1624994201419492, + "grad_norm": 0.09929317235946655, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 300720 + }, + { + "epoch": 1.1625380773453324, + "grad_norm": 0.09835363924503326, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 300730 + }, + { + "epoch": 1.1625767345487157, + "grad_norm": 0.10210787504911423, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 300740 + }, + { + "epoch": 1.1626153917520992, + "grad_norm": 0.121326744556427, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 300750 + }, + { + "epoch": 1.1626540489554824, + "grad_norm": 0.11139842122793198, + "learning_rate": 0.002, + "loss": 2.336, + "step": 300760 + }, + { + "epoch": 1.1626927061588657, + "grad_norm": 0.11265687644481659, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 300770 + }, + { + "epoch": 1.162731363362249, + "grad_norm": 0.10704000294208527, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 300780 + }, + { + "epoch": 1.1627700205656322, + "grad_norm": 0.12270093709230423, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 300790 + }, + { + "epoch": 1.1628086777690154, + "grad_norm": 0.10371486097574234, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 300800 + }, + { + "epoch": 1.1628473349723987, + "grad_norm": 0.10047317296266556, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 300810 + }, + { + "epoch": 1.1628859921757821, + "grad_norm": 0.10786613076925278, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 300820 + }, + { + "epoch": 1.1629246493791654, + "grad_norm": 0.1134653314948082, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 300830 + }, + { + "epoch": 1.1629633065825487, + "grad_norm": 0.16030678153038025, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 300840 + }, + { + "epoch": 1.163001963785932, + "grad_norm": 0.12873561680316925, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 300850 + }, + { + "epoch": 1.1630406209893152, + "grad_norm": 0.09747027605772018, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 300860 + }, + { + "epoch": 1.1630792781926984, + "grad_norm": 0.1002790704369545, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 300870 + }, + { + "epoch": 1.1631179353960817, + "grad_norm": 0.1007324680685997, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 300880 + }, + { + "epoch": 1.163156592599465, + "grad_norm": 0.09922447055578232, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 300890 + }, + { + "epoch": 1.1631952498028482, + "grad_norm": 0.09478519856929779, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 300900 + }, + { + "epoch": 1.1632339070062316, + "grad_norm": 0.12346886843442917, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 300910 + }, + { + "epoch": 1.163272564209615, + "grad_norm": 0.1043015792965889, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 300920 + }, + { + "epoch": 1.1633112214129981, + "grad_norm": 0.09666992723941803, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 300930 + }, + { + "epoch": 1.1633498786163814, + "grad_norm": 0.0899510309100151, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 300940 + }, + { + "epoch": 1.1633885358197646, + "grad_norm": 0.12516461312770844, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 300950 + }, + { + "epoch": 1.163427193023148, + "grad_norm": 0.10826769471168518, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 300960 + }, + { + "epoch": 1.1634658502265312, + "grad_norm": 0.11351073533296585, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 300970 + }, + { + "epoch": 1.1635045074299144, + "grad_norm": 0.1089121550321579, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 300980 + }, + { + "epoch": 1.1635431646332979, + "grad_norm": 0.10068459808826447, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 300990 + }, + { + "epoch": 1.1635818218366811, + "grad_norm": 0.1085510179400444, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 301000 + }, + { + "epoch": 1.1636204790400644, + "grad_norm": 0.09401760250329971, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 301010 + }, + { + "epoch": 1.1636591362434476, + "grad_norm": 0.1090821698307991, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 301020 + }, + { + "epoch": 1.1636977934468309, + "grad_norm": 0.10806293040513992, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 301030 + }, + { + "epoch": 1.1637364506502141, + "grad_norm": 0.13532325625419617, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 301040 + }, + { + "epoch": 1.1637751078535974, + "grad_norm": 0.09232982993125916, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 301050 + }, + { + "epoch": 1.1638137650569806, + "grad_norm": 0.0929589793086052, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 301060 + }, + { + "epoch": 1.163852422260364, + "grad_norm": 0.11074736714363098, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 301070 + }, + { + "epoch": 1.1638910794637474, + "grad_norm": 0.11797834932804108, + "learning_rate": 0.002, + "loss": 2.32, + "step": 301080 + }, + { + "epoch": 1.1639297366671306, + "grad_norm": 0.0900818407535553, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 301090 + }, + { + "epoch": 1.1639683938705139, + "grad_norm": 0.11992789804935455, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 301100 + }, + { + "epoch": 1.1640070510738971, + "grad_norm": 0.1221378892660141, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 301110 + }, + { + "epoch": 1.1640457082772804, + "grad_norm": 0.09956325590610504, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 301120 + }, + { + "epoch": 1.1640843654806636, + "grad_norm": 0.10660485178232193, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 301130 + }, + { + "epoch": 1.1641230226840469, + "grad_norm": 0.09304320812225342, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 301140 + }, + { + "epoch": 1.1641616798874301, + "grad_norm": 0.10756511986255646, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 301150 + }, + { + "epoch": 1.1642003370908136, + "grad_norm": 0.10929480940103531, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 301160 + }, + { + "epoch": 1.1642389942941969, + "grad_norm": 0.09262862801551819, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 301170 + }, + { + "epoch": 1.1642776514975801, + "grad_norm": 0.12419795989990234, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 301180 + }, + { + "epoch": 1.1643163087009634, + "grad_norm": 0.11665025353431702, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 301190 + }, + { + "epoch": 1.1643549659043466, + "grad_norm": 0.10153454542160034, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 301200 + }, + { + "epoch": 1.1643936231077299, + "grad_norm": 0.10074479132890701, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 301210 + }, + { + "epoch": 1.1644322803111131, + "grad_norm": 0.10060901939868927, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 301220 + }, + { + "epoch": 1.1644709375144964, + "grad_norm": 0.10998406261205673, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 301230 + }, + { + "epoch": 1.1645095947178796, + "grad_norm": 0.09824924916028976, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 301240 + }, + { + "epoch": 1.164548251921263, + "grad_norm": 0.09302695840597153, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 301250 + }, + { + "epoch": 1.1645869091246464, + "grad_norm": 0.1218177080154419, + "learning_rate": 0.002, + "loss": 2.335, + "step": 301260 + }, + { + "epoch": 1.1646255663280296, + "grad_norm": 0.10029064863920212, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 301270 + }, + { + "epoch": 1.1646642235314129, + "grad_norm": 0.17595061659812927, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 301280 + }, + { + "epoch": 1.164702880734796, + "grad_norm": 0.0965089499950409, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 301290 + }, + { + "epoch": 1.1647415379381794, + "grad_norm": 0.1053343340754509, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 301300 + }, + { + "epoch": 1.1647801951415626, + "grad_norm": 0.13139332830905914, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 301310 + }, + { + "epoch": 1.1648188523449459, + "grad_norm": 0.10064920783042908, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 301320 + }, + { + "epoch": 1.1648575095483293, + "grad_norm": 0.10264275968074799, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 301330 + }, + { + "epoch": 1.1648961667517126, + "grad_norm": 0.1274469792842865, + "learning_rate": 0.002, + "loss": 2.348, + "step": 301340 + }, + { + "epoch": 1.1649348239550958, + "grad_norm": 0.09462037682533264, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 301350 + }, + { + "epoch": 1.164973481158479, + "grad_norm": 0.11224725842475891, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 301360 + }, + { + "epoch": 1.1650121383618623, + "grad_norm": 0.12100022286176682, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 301370 + }, + { + "epoch": 1.1650507955652456, + "grad_norm": 0.08697990328073502, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 301380 + }, + { + "epoch": 1.1650894527686289, + "grad_norm": 0.1286000907421112, + "learning_rate": 0.002, + "loss": 2.331, + "step": 301390 + }, + { + "epoch": 1.165128109972012, + "grad_norm": 0.10235783457756042, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 301400 + }, + { + "epoch": 1.1651667671753954, + "grad_norm": 0.10327660292387009, + "learning_rate": 0.002, + "loss": 2.328, + "step": 301410 + }, + { + "epoch": 1.1652054243787788, + "grad_norm": 0.09840571880340576, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 301420 + }, + { + "epoch": 1.165244081582162, + "grad_norm": 0.10375815629959106, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 301430 + }, + { + "epoch": 1.1652827387855453, + "grad_norm": 0.11783238500356674, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 301440 + }, + { + "epoch": 1.1653213959889286, + "grad_norm": 0.10080355405807495, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 301450 + }, + { + "epoch": 1.1653600531923118, + "grad_norm": 0.11816217750310898, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 301460 + }, + { + "epoch": 1.165398710395695, + "grad_norm": 0.10482607036828995, + "learning_rate": 0.002, + "loss": 2.3127, + "step": 301470 + }, + { + "epoch": 1.1654373675990783, + "grad_norm": 0.11168037354946136, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 301480 + }, + { + "epoch": 1.1654760248024616, + "grad_norm": 0.09168479591608047, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 301490 + }, + { + "epoch": 1.165514682005845, + "grad_norm": 0.10162169486284256, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 301500 + }, + { + "epoch": 1.1655533392092283, + "grad_norm": 0.12645724415779114, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 301510 + }, + { + "epoch": 1.1655919964126116, + "grad_norm": 0.1105879694223404, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 301520 + }, + { + "epoch": 1.1656306536159948, + "grad_norm": 0.10556762665510178, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 301530 + }, + { + "epoch": 1.165669310819378, + "grad_norm": 0.1095210388302803, + "learning_rate": 0.002, + "loss": 2.343, + "step": 301540 + }, + { + "epoch": 1.1657079680227613, + "grad_norm": 0.11712250858545303, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 301550 + }, + { + "epoch": 1.1657466252261446, + "grad_norm": 0.11337719112634659, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 301560 + }, + { + "epoch": 1.1657852824295278, + "grad_norm": 0.1021040752530098, + "learning_rate": 0.002, + "loss": 2.342, + "step": 301570 + }, + { + "epoch": 1.165823939632911, + "grad_norm": 0.09224560111761093, + "learning_rate": 0.002, + "loss": 2.326, + "step": 301580 + }, + { + "epoch": 1.1658625968362946, + "grad_norm": 0.1109401062130928, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 301590 + }, + { + "epoch": 1.1659012540396778, + "grad_norm": 0.08976784348487854, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 301600 + }, + { + "epoch": 1.165939911243061, + "grad_norm": 0.10225795954465866, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 301610 + }, + { + "epoch": 1.1659785684464443, + "grad_norm": 0.12347820401191711, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 301620 + }, + { + "epoch": 1.1660172256498276, + "grad_norm": 0.17781856656074524, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 301630 + }, + { + "epoch": 1.1660558828532108, + "grad_norm": 0.10366151481866837, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 301640 + }, + { + "epoch": 1.166094540056594, + "grad_norm": 0.10425670444965363, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 301650 + }, + { + "epoch": 1.1661331972599773, + "grad_norm": 0.10662340372800827, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 301660 + }, + { + "epoch": 1.1661718544633608, + "grad_norm": 0.10434211045503616, + "learning_rate": 0.002, + "loss": 2.3608, + "step": 301670 + }, + { + "epoch": 1.166210511666744, + "grad_norm": 0.09581685811281204, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 301680 + }, + { + "epoch": 1.1662491688701273, + "grad_norm": 0.0873362123966217, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 301690 + }, + { + "epoch": 1.1662878260735106, + "grad_norm": 0.1252460479736328, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 301700 + }, + { + "epoch": 1.1663264832768938, + "grad_norm": 0.119962178170681, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 301710 + }, + { + "epoch": 1.166365140480277, + "grad_norm": 0.09406163543462753, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 301720 + }, + { + "epoch": 1.1664037976836603, + "grad_norm": 0.10572811961174011, + "learning_rate": 0.002, + "loss": 2.33, + "step": 301730 + }, + { + "epoch": 1.1664424548870436, + "grad_norm": 0.10888563841581345, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 301740 + }, + { + "epoch": 1.1664811120904268, + "grad_norm": 0.0975589007139206, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 301750 + }, + { + "epoch": 1.1665197692938103, + "grad_norm": 0.11016840487718582, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 301760 + }, + { + "epoch": 1.1665584264971935, + "grad_norm": 0.12181409448385239, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 301770 + }, + { + "epoch": 1.1665970837005768, + "grad_norm": 0.10931305587291718, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 301780 + }, + { + "epoch": 1.16663574090396, + "grad_norm": 0.11234242469072342, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 301790 + }, + { + "epoch": 1.1666743981073433, + "grad_norm": 0.09115167707204819, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 301800 + }, + { + "epoch": 1.1667130553107266, + "grad_norm": 0.15512128174304962, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 301810 + }, + { + "epoch": 1.1667517125141098, + "grad_norm": 0.11527229100465775, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 301820 + }, + { + "epoch": 1.1667903697174933, + "grad_norm": 0.10287356376647949, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 301830 + }, + { + "epoch": 1.1668290269208765, + "grad_norm": 0.09924790263175964, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 301840 + }, + { + "epoch": 1.1668676841242598, + "grad_norm": 0.11103297024965286, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 301850 + }, + { + "epoch": 1.166906341327643, + "grad_norm": 0.10338755697011948, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 301860 + }, + { + "epoch": 1.1669449985310263, + "grad_norm": 0.11973705142736435, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 301870 + }, + { + "epoch": 1.1669836557344095, + "grad_norm": 0.09499425441026688, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 301880 + }, + { + "epoch": 1.1670223129377928, + "grad_norm": 0.09334953129291534, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 301890 + }, + { + "epoch": 1.167060970141176, + "grad_norm": 0.12852221727371216, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 301900 + }, + { + "epoch": 1.1670996273445593, + "grad_norm": 0.12621502578258514, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 301910 + }, + { + "epoch": 1.1671382845479426, + "grad_norm": 0.09067445993423462, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 301920 + }, + { + "epoch": 1.167176941751326, + "grad_norm": 0.09515558183193207, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 301930 + }, + { + "epoch": 1.1672155989547093, + "grad_norm": 0.10094108432531357, + "learning_rate": 0.002, + "loss": 2.348, + "step": 301940 + }, + { + "epoch": 1.1672542561580925, + "grad_norm": 0.10715378075838089, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 301950 + }, + { + "epoch": 1.1672929133614758, + "grad_norm": 0.11361619085073471, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 301960 + }, + { + "epoch": 1.167331570564859, + "grad_norm": 0.13124187290668488, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 301970 + }, + { + "epoch": 1.1673702277682423, + "grad_norm": 0.11959458887577057, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 301980 + }, + { + "epoch": 1.1674088849716255, + "grad_norm": 0.09688874334096909, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 301990 + }, + { + "epoch": 1.167447542175009, + "grad_norm": 0.10897421091794968, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 302000 + }, + { + "epoch": 1.1674861993783923, + "grad_norm": 0.11167865991592407, + "learning_rate": 0.002, + "loss": 2.337, + "step": 302010 + }, + { + "epoch": 1.1675248565817755, + "grad_norm": 0.0992354303598404, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 302020 + }, + { + "epoch": 1.1675635137851588, + "grad_norm": 0.10574154555797577, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 302030 + }, + { + "epoch": 1.167602170988542, + "grad_norm": 0.097483791410923, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 302040 + }, + { + "epoch": 1.1676408281919253, + "grad_norm": 0.10599792748689651, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 302050 + }, + { + "epoch": 1.1676794853953085, + "grad_norm": 0.11261613667011261, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 302060 + }, + { + "epoch": 1.1677181425986918, + "grad_norm": 0.12978191673755646, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 302070 + }, + { + "epoch": 1.167756799802075, + "grad_norm": 0.10361035168170929, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 302080 + }, + { + "epoch": 1.1677954570054583, + "grad_norm": 0.1083601862192154, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 302090 + }, + { + "epoch": 1.1678341142088418, + "grad_norm": 0.10080970823764801, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 302100 + }, + { + "epoch": 1.167872771412225, + "grad_norm": 0.09778130054473877, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 302110 + }, + { + "epoch": 1.1679114286156083, + "grad_norm": 0.11359573900699615, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 302120 + }, + { + "epoch": 1.1679500858189915, + "grad_norm": 0.09689043462276459, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 302130 + }, + { + "epoch": 1.1679887430223748, + "grad_norm": 0.09919068217277527, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 302140 + }, + { + "epoch": 1.168027400225758, + "grad_norm": 0.09681098163127899, + "learning_rate": 0.002, + "loss": 2.319, + "step": 302150 + }, + { + "epoch": 1.1680660574291413, + "grad_norm": 0.09828266501426697, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 302160 + }, + { + "epoch": 1.1681047146325247, + "grad_norm": 0.12220165133476257, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 302170 + }, + { + "epoch": 1.168143371835908, + "grad_norm": 0.09539945423603058, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 302180 + }, + { + "epoch": 1.1681820290392912, + "grad_norm": 0.09313678741455078, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 302190 + }, + { + "epoch": 1.1682206862426745, + "grad_norm": 0.11383495479822159, + "learning_rate": 0.002, + "loss": 2.342, + "step": 302200 + }, + { + "epoch": 1.1682593434460578, + "grad_norm": 0.10649847239255905, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 302210 + }, + { + "epoch": 1.168298000649441, + "grad_norm": 0.12820562720298767, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 302220 + }, + { + "epoch": 1.1683366578528243, + "grad_norm": 0.11359477043151855, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 302230 + }, + { + "epoch": 1.1683753150562075, + "grad_norm": 0.10154304653406143, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 302240 + }, + { + "epoch": 1.1684139722595908, + "grad_norm": 0.10479036718606949, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 302250 + }, + { + "epoch": 1.168452629462974, + "grad_norm": 0.15693366527557373, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 302260 + }, + { + "epoch": 1.1684912866663575, + "grad_norm": 0.09982585906982422, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 302270 + }, + { + "epoch": 1.1685299438697407, + "grad_norm": 0.10710987448692322, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 302280 + }, + { + "epoch": 1.168568601073124, + "grad_norm": 0.10061867535114288, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 302290 + }, + { + "epoch": 1.1686072582765072, + "grad_norm": 0.1065753772854805, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 302300 + }, + { + "epoch": 1.1686459154798905, + "grad_norm": 0.10224214941263199, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 302310 + }, + { + "epoch": 1.1686845726832737, + "grad_norm": 0.09375665336847305, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 302320 + }, + { + "epoch": 1.168723229886657, + "grad_norm": 0.08769812434911728, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 302330 + }, + { + "epoch": 1.1687618870900405, + "grad_norm": 0.10518988221883774, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 302340 + }, + { + "epoch": 1.1688005442934237, + "grad_norm": 0.137264683842659, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 302350 + }, + { + "epoch": 1.168839201496807, + "grad_norm": 0.12920604646205902, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 302360 + }, + { + "epoch": 1.1688778587001902, + "grad_norm": 0.1093267872929573, + "learning_rate": 0.002, + "loss": 2.354, + "step": 302370 + }, + { + "epoch": 1.1689165159035735, + "grad_norm": 0.09726943075656891, + "learning_rate": 0.002, + "loss": 2.33, + "step": 302380 + }, + { + "epoch": 1.1689551731069567, + "grad_norm": 0.11821261048316956, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 302390 + }, + { + "epoch": 1.16899383031034, + "grad_norm": 0.09697729349136353, + "learning_rate": 0.002, + "loss": 2.328, + "step": 302400 + }, + { + "epoch": 1.1690324875137232, + "grad_norm": 0.12189007550477982, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 302410 + }, + { + "epoch": 1.1690711447171065, + "grad_norm": 0.11940648406744003, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 302420 + }, + { + "epoch": 1.1691098019204897, + "grad_norm": 0.1193215548992157, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 302430 + }, + { + "epoch": 1.1691484591238732, + "grad_norm": 0.10657306015491486, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 302440 + }, + { + "epoch": 1.1691871163272565, + "grad_norm": 0.16606692969799042, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 302450 + }, + { + "epoch": 1.1692257735306397, + "grad_norm": 0.12425743043422699, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 302460 + }, + { + "epoch": 1.169264430734023, + "grad_norm": 0.09916871786117554, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 302470 + }, + { + "epoch": 1.1693030879374062, + "grad_norm": 0.1128845140337944, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 302480 + }, + { + "epoch": 1.1693417451407895, + "grad_norm": 0.10971315205097198, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 302490 + }, + { + "epoch": 1.1693804023441727, + "grad_norm": 0.10512728989124298, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 302500 + }, + { + "epoch": 1.1694190595475562, + "grad_norm": 0.1052752435207367, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 302510 + }, + { + "epoch": 1.1694577167509395, + "grad_norm": 0.11025025695562363, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 302520 + }, + { + "epoch": 1.1694963739543227, + "grad_norm": 0.09883037954568863, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 302530 + }, + { + "epoch": 1.169535031157706, + "grad_norm": 0.12725435197353363, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 302540 + }, + { + "epoch": 1.1695736883610892, + "grad_norm": 0.19296719133853912, + "learning_rate": 0.002, + "loss": 2.337, + "step": 302550 + }, + { + "epoch": 1.1696123455644725, + "grad_norm": 0.10766754299402237, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 302560 + }, + { + "epoch": 1.1696510027678557, + "grad_norm": 0.09638424962759018, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 302570 + }, + { + "epoch": 1.169689659971239, + "grad_norm": 0.1059780865907669, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 302580 + }, + { + "epoch": 1.1697283171746222, + "grad_norm": 0.11075137555599213, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 302590 + }, + { + "epoch": 1.1697669743780055, + "grad_norm": 0.12818937003612518, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 302600 + }, + { + "epoch": 1.169805631581389, + "grad_norm": 0.11721966415643692, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 302610 + }, + { + "epoch": 1.1698442887847722, + "grad_norm": 0.08859899640083313, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 302620 + }, + { + "epoch": 1.1698829459881555, + "grad_norm": 0.1014174371957779, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 302630 + }, + { + "epoch": 1.1699216031915387, + "grad_norm": 0.11284614354372025, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 302640 + }, + { + "epoch": 1.169960260394922, + "grad_norm": 0.10882255434989929, + "learning_rate": 0.002, + "loss": 2.325, + "step": 302650 + }, + { + "epoch": 1.1699989175983052, + "grad_norm": 0.08628625422716141, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 302660 + }, + { + "epoch": 1.1700375748016885, + "grad_norm": 0.12380296736955643, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 302670 + }, + { + "epoch": 1.170076232005072, + "grad_norm": 0.10757222771644592, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 302680 + }, + { + "epoch": 1.1701148892084552, + "grad_norm": 0.10542470961809158, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 302690 + }, + { + "epoch": 1.1701535464118384, + "grad_norm": 0.10293202847242355, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 302700 + }, + { + "epoch": 1.1701922036152217, + "grad_norm": 0.14387407898902893, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 302710 + }, + { + "epoch": 1.170230860818605, + "grad_norm": 0.09444009512662888, + "learning_rate": 0.002, + "loss": 2.354, + "step": 302720 + }, + { + "epoch": 1.1702695180219882, + "grad_norm": 0.09907840192317963, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 302730 + }, + { + "epoch": 1.1703081752253714, + "grad_norm": 0.09588519483804703, + "learning_rate": 0.002, + "loss": 2.324, + "step": 302740 + }, + { + "epoch": 1.1703468324287547, + "grad_norm": 0.10444150120019913, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 302750 + }, + { + "epoch": 1.170385489632138, + "grad_norm": 0.13282561302185059, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 302760 + }, + { + "epoch": 1.1704241468355212, + "grad_norm": 0.1018977016210556, + "learning_rate": 0.002, + "loss": 2.33, + "step": 302770 + }, + { + "epoch": 1.1704628040389047, + "grad_norm": 0.09958646446466446, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 302780 + }, + { + "epoch": 1.170501461242288, + "grad_norm": 0.10159620642662048, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 302790 + }, + { + "epoch": 1.1705401184456712, + "grad_norm": 0.10525692254304886, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 302800 + }, + { + "epoch": 1.1705787756490544, + "grad_norm": 0.12873084843158722, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 302810 + }, + { + "epoch": 1.1706174328524377, + "grad_norm": 0.09802334755659103, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 302820 + }, + { + "epoch": 1.170656090055821, + "grad_norm": 0.10764884948730469, + "learning_rate": 0.002, + "loss": 2.341, + "step": 302830 + }, + { + "epoch": 1.1706947472592042, + "grad_norm": 0.1302516609430313, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 302840 + }, + { + "epoch": 1.1707334044625877, + "grad_norm": 0.09868675470352173, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 302850 + }, + { + "epoch": 1.170772061665971, + "grad_norm": 0.09090667217969894, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 302860 + }, + { + "epoch": 1.1708107188693542, + "grad_norm": 0.09880606085062027, + "learning_rate": 0.002, + "loss": 2.333, + "step": 302870 + }, + { + "epoch": 1.1708493760727374, + "grad_norm": 0.09945500642061234, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 302880 + }, + { + "epoch": 1.1708880332761207, + "grad_norm": 0.09477779269218445, + "learning_rate": 0.002, + "loss": 2.337, + "step": 302890 + }, + { + "epoch": 1.170926690479504, + "grad_norm": 0.12763994932174683, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 302900 + }, + { + "epoch": 1.1709653476828872, + "grad_norm": 0.11147284507751465, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 302910 + }, + { + "epoch": 1.1710040048862704, + "grad_norm": 0.11019851267337799, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 302920 + }, + { + "epoch": 1.1710426620896537, + "grad_norm": 0.09784045815467834, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 302930 + }, + { + "epoch": 1.1710813192930372, + "grad_norm": 0.10193490236997604, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 302940 + }, + { + "epoch": 1.1711199764964204, + "grad_norm": 0.1023782268166542, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 302950 + }, + { + "epoch": 1.1711586336998037, + "grad_norm": 0.1048358753323555, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 302960 + }, + { + "epoch": 1.171197290903187, + "grad_norm": 0.10859642177820206, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 302970 + }, + { + "epoch": 1.1712359481065702, + "grad_norm": 0.10615593940019608, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 302980 + }, + { + "epoch": 1.1712746053099534, + "grad_norm": 0.10078933089971542, + "learning_rate": 0.002, + "loss": 2.348, + "step": 302990 + }, + { + "epoch": 1.1713132625133367, + "grad_norm": 0.09982361644506454, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 303000 + }, + { + "epoch": 1.17135191971672, + "grad_norm": 0.09770236164331436, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 303010 + }, + { + "epoch": 1.1713905769201034, + "grad_norm": 0.11166229099035263, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 303020 + }, + { + "epoch": 1.1714292341234867, + "grad_norm": 0.09842663258314133, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 303030 + }, + { + "epoch": 1.17146789132687, + "grad_norm": 0.09624983370304108, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 303040 + }, + { + "epoch": 1.1715065485302532, + "grad_norm": 0.11118954420089722, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 303050 + }, + { + "epoch": 1.1715452057336364, + "grad_norm": 0.10093878209590912, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 303060 + }, + { + "epoch": 1.1715838629370197, + "grad_norm": 0.1114550307393074, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 303070 + }, + { + "epoch": 1.171622520140403, + "grad_norm": 0.10811921954154968, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 303080 + }, + { + "epoch": 1.1716611773437862, + "grad_norm": 0.0978502631187439, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 303090 + }, + { + "epoch": 1.1716998345471694, + "grad_norm": 0.1217813491821289, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 303100 + }, + { + "epoch": 1.171738491750553, + "grad_norm": 0.10993924736976624, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 303110 + }, + { + "epoch": 1.1717771489539361, + "grad_norm": 0.0941442996263504, + "learning_rate": 0.002, + "loss": 2.342, + "step": 303120 + }, + { + "epoch": 1.1718158061573194, + "grad_norm": 0.09519714117050171, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 303130 + }, + { + "epoch": 1.1718544633607026, + "grad_norm": 0.09512632340192795, + "learning_rate": 0.002, + "loss": 2.329, + "step": 303140 + }, + { + "epoch": 1.171893120564086, + "grad_norm": 0.08752937614917755, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 303150 + }, + { + "epoch": 1.1719317777674692, + "grad_norm": 0.1074802577495575, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 303160 + }, + { + "epoch": 1.1719704349708524, + "grad_norm": 0.09255402535200119, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 303170 + }, + { + "epoch": 1.1720090921742357, + "grad_norm": 0.09914116561412811, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 303180 + }, + { + "epoch": 1.1720477493776191, + "grad_norm": 0.0997026115655899, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 303190 + }, + { + "epoch": 1.1720864065810024, + "grad_norm": 0.12617866694927216, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 303200 + }, + { + "epoch": 1.1721250637843856, + "grad_norm": 0.11828405410051346, + "learning_rate": 0.002, + "loss": 2.337, + "step": 303210 + }, + { + "epoch": 1.1721637209877689, + "grad_norm": 0.1121009960770607, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 303220 + }, + { + "epoch": 1.1722023781911521, + "grad_norm": 0.09488152712583542, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 303230 + }, + { + "epoch": 1.1722410353945354, + "grad_norm": 0.13893936574459076, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 303240 + }, + { + "epoch": 1.1722796925979186, + "grad_norm": 0.09628939628601074, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 303250 + }, + { + "epoch": 1.172318349801302, + "grad_norm": 0.09990405291318893, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 303260 + }, + { + "epoch": 1.1723570070046851, + "grad_norm": 0.10545917600393295, + "learning_rate": 0.002, + "loss": 2.352, + "step": 303270 + }, + { + "epoch": 1.1723956642080686, + "grad_norm": 0.09982466697692871, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 303280 + }, + { + "epoch": 1.1724343214114519, + "grad_norm": 0.09827154129743576, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 303290 + }, + { + "epoch": 1.1724729786148351, + "grad_norm": 0.09286846220493317, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 303300 + }, + { + "epoch": 1.1725116358182184, + "grad_norm": 0.10099194198846817, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 303310 + }, + { + "epoch": 1.1725502930216016, + "grad_norm": 0.11508749425411224, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 303320 + }, + { + "epoch": 1.1725889502249849, + "grad_norm": 0.10250307619571686, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 303330 + }, + { + "epoch": 1.1726276074283681, + "grad_norm": 0.10944922268390656, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 303340 + }, + { + "epoch": 1.1726662646317514, + "grad_norm": 0.11692958325147629, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 303350 + }, + { + "epoch": 1.1727049218351349, + "grad_norm": 0.10119730234146118, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 303360 + }, + { + "epoch": 1.1727435790385181, + "grad_norm": 0.12241552770137787, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 303370 + }, + { + "epoch": 1.1727822362419014, + "grad_norm": 0.0938178300857544, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 303380 + }, + { + "epoch": 1.1728208934452846, + "grad_norm": 0.11530961096286774, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 303390 + }, + { + "epoch": 1.1728595506486679, + "grad_norm": 0.09542708843946457, + "learning_rate": 0.002, + "loss": 2.347, + "step": 303400 + }, + { + "epoch": 1.1728982078520511, + "grad_norm": 0.10534845292568207, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 303410 + }, + { + "epoch": 1.1729368650554344, + "grad_norm": 0.10652362555265427, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 303420 + }, + { + "epoch": 1.1729755222588176, + "grad_norm": 0.09927625954151154, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 303430 + }, + { + "epoch": 1.1730141794622009, + "grad_norm": 0.12564003467559814, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 303440 + }, + { + "epoch": 1.1730528366655844, + "grad_norm": 0.15445850789546967, + "learning_rate": 0.002, + "loss": 2.329, + "step": 303450 + }, + { + "epoch": 1.1730914938689676, + "grad_norm": 0.11700819432735443, + "learning_rate": 0.002, + "loss": 2.324, + "step": 303460 + }, + { + "epoch": 1.1731301510723509, + "grad_norm": 0.11349461227655411, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 303470 + }, + { + "epoch": 1.173168808275734, + "grad_norm": 0.10206008702516556, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 303480 + }, + { + "epoch": 1.1732074654791174, + "grad_norm": 0.10352712124586105, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 303490 + }, + { + "epoch": 1.1732461226825006, + "grad_norm": 0.09197427332401276, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 303500 + }, + { + "epoch": 1.1732847798858839, + "grad_norm": 0.11469713598489761, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 303510 + }, + { + "epoch": 1.1733234370892671, + "grad_norm": 0.10551794618368149, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 303520 + }, + { + "epoch": 1.1733620942926506, + "grad_norm": 0.10328248888254166, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 303530 + }, + { + "epoch": 1.1734007514960338, + "grad_norm": 0.11079932749271393, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 303540 + }, + { + "epoch": 1.173439408699417, + "grad_norm": 0.10771428048610687, + "learning_rate": 0.002, + "loss": 2.347, + "step": 303550 + }, + { + "epoch": 1.1734780659028003, + "grad_norm": 0.10114746540784836, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 303560 + }, + { + "epoch": 1.1735167231061836, + "grad_norm": 0.10057065635919571, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 303570 + }, + { + "epoch": 1.1735553803095669, + "grad_norm": 0.10123337060213089, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 303580 + }, + { + "epoch": 1.17359403751295, + "grad_norm": 0.10665623843669891, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 303590 + }, + { + "epoch": 1.1736326947163334, + "grad_norm": 0.094930000603199, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 303600 + }, + { + "epoch": 1.1736713519197166, + "grad_norm": 0.12668092548847198, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 303610 + }, + { + "epoch": 1.1737100091231, + "grad_norm": 0.09292471408843994, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 303620 + }, + { + "epoch": 1.1737486663264833, + "grad_norm": 0.10419165343046188, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 303630 + }, + { + "epoch": 1.1737873235298666, + "grad_norm": 0.12584923207759857, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 303640 + }, + { + "epoch": 1.1738259807332498, + "grad_norm": 0.10469865798950195, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 303650 + }, + { + "epoch": 1.173864637936633, + "grad_norm": 0.10127655416727066, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 303660 + }, + { + "epoch": 1.1739032951400163, + "grad_norm": 0.10466544330120087, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 303670 + }, + { + "epoch": 1.1739419523433996, + "grad_norm": 0.11453257501125336, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 303680 + }, + { + "epoch": 1.173980609546783, + "grad_norm": 0.10881021618843079, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 303690 + }, + { + "epoch": 1.1740192667501663, + "grad_norm": 0.10869313031435013, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 303700 + }, + { + "epoch": 1.1740579239535496, + "grad_norm": 0.10462198406457901, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 303710 + }, + { + "epoch": 1.1740965811569328, + "grad_norm": 0.10045907646417618, + "learning_rate": 0.002, + "loss": 2.326, + "step": 303720 + }, + { + "epoch": 1.174135238360316, + "grad_norm": 0.09981822222471237, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 303730 + }, + { + "epoch": 1.1741738955636993, + "grad_norm": 0.11632543802261353, + "learning_rate": 0.002, + "loss": 2.341, + "step": 303740 + }, + { + "epoch": 1.1742125527670826, + "grad_norm": 0.13264437019824982, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 303750 + }, + { + "epoch": 1.1742512099704658, + "grad_norm": 0.1124536469578743, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 303760 + }, + { + "epoch": 1.174289867173849, + "grad_norm": 0.10196714848279953, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 303770 + }, + { + "epoch": 1.1743285243772323, + "grad_norm": 0.1239824965596199, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 303780 + }, + { + "epoch": 1.1743671815806158, + "grad_norm": 0.10366879403591156, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 303790 + }, + { + "epoch": 1.174405838783999, + "grad_norm": 0.09123245626688004, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 303800 + }, + { + "epoch": 1.1744444959873823, + "grad_norm": 0.10823673009872437, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 303810 + }, + { + "epoch": 1.1744831531907656, + "grad_norm": 0.107994444668293, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 303820 + }, + { + "epoch": 1.1745218103941488, + "grad_norm": 0.10749204456806183, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 303830 + }, + { + "epoch": 1.174560467597532, + "grad_norm": 0.10459636151790619, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 303840 + }, + { + "epoch": 1.1745991248009153, + "grad_norm": 0.10358193516731262, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 303850 + }, + { + "epoch": 1.1746377820042988, + "grad_norm": 0.09980751574039459, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 303860 + }, + { + "epoch": 1.174676439207682, + "grad_norm": 0.11811071634292603, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 303870 + }, + { + "epoch": 1.1747150964110653, + "grad_norm": 0.10448575764894485, + "learning_rate": 0.002, + "loss": 2.3629, + "step": 303880 + }, + { + "epoch": 1.1747537536144486, + "grad_norm": 0.092103011906147, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 303890 + }, + { + "epoch": 1.1747924108178318, + "grad_norm": 0.11232535541057587, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 303900 + }, + { + "epoch": 1.174831068021215, + "grad_norm": 0.10447873175144196, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 303910 + }, + { + "epoch": 1.1748697252245983, + "grad_norm": 0.11118560284376144, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 303920 + }, + { + "epoch": 1.1749083824279816, + "grad_norm": 0.08857395499944687, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 303930 + }, + { + "epoch": 1.1749470396313648, + "grad_norm": 0.13168229162693024, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 303940 + }, + { + "epoch": 1.174985696834748, + "grad_norm": 0.11878544837236404, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 303950 + }, + { + "epoch": 1.1750243540381315, + "grad_norm": 0.10220491141080856, + "learning_rate": 0.002, + "loss": 2.32, + "step": 303960 + }, + { + "epoch": 1.1750630112415148, + "grad_norm": 0.11073249578475952, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 303970 + }, + { + "epoch": 1.175101668444898, + "grad_norm": 0.089708112180233, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 303980 + }, + { + "epoch": 1.1751403256482813, + "grad_norm": 0.12098492681980133, + "learning_rate": 0.002, + "loss": 2.334, + "step": 303990 + }, + { + "epoch": 1.1751789828516646, + "grad_norm": 0.11484367400407791, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 304000 + }, + { + "epoch": 1.1752176400550478, + "grad_norm": 0.09806149452924728, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 304010 + }, + { + "epoch": 1.175256297258431, + "grad_norm": 0.1285572201013565, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 304020 + }, + { + "epoch": 1.1752949544618145, + "grad_norm": 0.1325317770242691, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 304030 + }, + { + "epoch": 1.1753336116651978, + "grad_norm": 0.11979369074106216, + "learning_rate": 0.002, + "loss": 2.344, + "step": 304040 + }, + { + "epoch": 1.175372268868581, + "grad_norm": 0.11940714716911316, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 304050 + }, + { + "epoch": 1.1754109260719643, + "grad_norm": 0.09162340313196182, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 304060 + }, + { + "epoch": 1.1754495832753475, + "grad_norm": 0.1013336181640625, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 304070 + }, + { + "epoch": 1.1754882404787308, + "grad_norm": 0.11899504065513611, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 304080 + }, + { + "epoch": 1.175526897682114, + "grad_norm": 0.10176719725131989, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 304090 + }, + { + "epoch": 1.1755655548854973, + "grad_norm": 0.11070800572633743, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 304100 + }, + { + "epoch": 1.1756042120888806, + "grad_norm": 0.10064385086297989, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 304110 + }, + { + "epoch": 1.1756428692922638, + "grad_norm": 0.10409154742956161, + "learning_rate": 0.002, + "loss": 2.323, + "step": 304120 + }, + { + "epoch": 1.1756815264956473, + "grad_norm": 0.09488542377948761, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 304130 + }, + { + "epoch": 1.1757201836990305, + "grad_norm": 0.1136431023478508, + "learning_rate": 0.002, + "loss": 2.346, + "step": 304140 + }, + { + "epoch": 1.1757588409024138, + "grad_norm": 0.09265240281820297, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 304150 + }, + { + "epoch": 1.175797498105797, + "grad_norm": 0.09490280598402023, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 304160 + }, + { + "epoch": 1.1758361553091803, + "grad_norm": 0.1054140031337738, + "learning_rate": 0.002, + "loss": 2.332, + "step": 304170 + }, + { + "epoch": 1.1758748125125635, + "grad_norm": 0.10391158610582352, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 304180 + }, + { + "epoch": 1.1759134697159468, + "grad_norm": 0.09150063991546631, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 304190 + }, + { + "epoch": 1.1759521269193303, + "grad_norm": 0.11067679524421692, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 304200 + }, + { + "epoch": 1.1759907841227135, + "grad_norm": 0.10968092083930969, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 304210 + }, + { + "epoch": 1.1760294413260968, + "grad_norm": 0.10392303019762039, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 304220 + }, + { + "epoch": 1.17606809852948, + "grad_norm": 0.11467640101909637, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 304230 + }, + { + "epoch": 1.1761067557328633, + "grad_norm": 0.10126051306724548, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 304240 + }, + { + "epoch": 1.1761454129362465, + "grad_norm": 0.10900580137968063, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 304250 + }, + { + "epoch": 1.1761840701396298, + "grad_norm": 0.09736665338277817, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 304260 + }, + { + "epoch": 1.176222727343013, + "grad_norm": 0.10468364506959915, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 304270 + }, + { + "epoch": 1.1762613845463963, + "grad_norm": 0.11530701816082001, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 304280 + }, + { + "epoch": 1.1763000417497795, + "grad_norm": 0.13199259340763092, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 304290 + }, + { + "epoch": 1.176338698953163, + "grad_norm": 0.10489901900291443, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 304300 + }, + { + "epoch": 1.1763773561565463, + "grad_norm": 0.09021489322185516, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 304310 + }, + { + "epoch": 1.1764160133599295, + "grad_norm": 0.11300509423017502, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 304320 + }, + { + "epoch": 1.1764546705633128, + "grad_norm": 0.1115007996559143, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 304330 + }, + { + "epoch": 1.176493327766696, + "grad_norm": 0.10999033600091934, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 304340 + }, + { + "epoch": 1.1765319849700793, + "grad_norm": 0.09730103611946106, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 304350 + }, + { + "epoch": 1.1765706421734625, + "grad_norm": 0.0897442027926445, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 304360 + }, + { + "epoch": 1.176609299376846, + "grad_norm": 0.10270145535469055, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 304370 + }, + { + "epoch": 1.1766479565802292, + "grad_norm": 0.11997896432876587, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 304380 + }, + { + "epoch": 1.1766866137836125, + "grad_norm": 0.11275748163461685, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 304390 + }, + { + "epoch": 1.1767252709869958, + "grad_norm": 0.09913094341754913, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 304400 + }, + { + "epoch": 1.176763928190379, + "grad_norm": 0.13795678317546844, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 304410 + }, + { + "epoch": 1.1768025853937623, + "grad_norm": 0.1019025593996048, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 304420 + }, + { + "epoch": 1.1768412425971455, + "grad_norm": 0.12087249010801315, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 304430 + }, + { + "epoch": 1.1768798998005288, + "grad_norm": 0.10497815907001495, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 304440 + }, + { + "epoch": 1.176918557003912, + "grad_norm": 0.14686134457588196, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 304450 + }, + { + "epoch": 1.1769572142072953, + "grad_norm": 0.1088930293917656, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 304460 + }, + { + "epoch": 1.1769958714106787, + "grad_norm": 0.11077285557985306, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 304470 + }, + { + "epoch": 1.177034528614062, + "grad_norm": 0.10116249322891235, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 304480 + }, + { + "epoch": 1.1770731858174452, + "grad_norm": 0.09450482577085495, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 304490 + }, + { + "epoch": 1.1771118430208285, + "grad_norm": 0.11417514830827713, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 304500 + }, + { + "epoch": 1.1771505002242117, + "grad_norm": 0.10279672592878342, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 304510 + }, + { + "epoch": 1.177189157427595, + "grad_norm": 0.10613597929477692, + "learning_rate": 0.002, + "loss": 2.335, + "step": 304520 + }, + { + "epoch": 1.1772278146309783, + "grad_norm": 0.11793537437915802, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 304530 + }, + { + "epoch": 1.1772664718343617, + "grad_norm": 0.10077495127916336, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 304540 + }, + { + "epoch": 1.177305129037745, + "grad_norm": 0.10949613153934479, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 304550 + }, + { + "epoch": 1.1773437862411282, + "grad_norm": 0.11719959229230881, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 304560 + }, + { + "epoch": 1.1773824434445115, + "grad_norm": 0.1240655705332756, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 304570 + }, + { + "epoch": 1.1774211006478947, + "grad_norm": 0.0896514281630516, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 304580 + }, + { + "epoch": 1.177459757851278, + "grad_norm": 0.11183775216341019, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 304590 + }, + { + "epoch": 1.1774984150546612, + "grad_norm": 0.12366892397403717, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 304600 + }, + { + "epoch": 1.1775370722580445, + "grad_norm": 0.11976148188114166, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 304610 + }, + { + "epoch": 1.1775757294614277, + "grad_norm": 0.10143893957138062, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 304620 + }, + { + "epoch": 1.177614386664811, + "grad_norm": 0.09807173907756805, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 304630 + }, + { + "epoch": 1.1776530438681945, + "grad_norm": 0.08996410667896271, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 304640 + }, + { + "epoch": 1.1776917010715777, + "grad_norm": 0.15061452984809875, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 304650 + }, + { + "epoch": 1.177730358274961, + "grad_norm": 0.10375141352415085, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 304660 + }, + { + "epoch": 1.1777690154783442, + "grad_norm": 0.0976865217089653, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 304670 + }, + { + "epoch": 1.1778076726817275, + "grad_norm": 0.10766734182834625, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 304680 + }, + { + "epoch": 1.1778463298851107, + "grad_norm": 0.10150058567523956, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 304690 + }, + { + "epoch": 1.177884987088494, + "grad_norm": 0.11639443784952164, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 304700 + }, + { + "epoch": 1.1779236442918775, + "grad_norm": 0.10969920456409454, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 304710 + }, + { + "epoch": 1.1779623014952607, + "grad_norm": 0.11082258075475693, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 304720 + }, + { + "epoch": 1.178000958698644, + "grad_norm": 0.10220003128051758, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 304730 + }, + { + "epoch": 1.1780396159020272, + "grad_norm": 0.1293516606092453, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 304740 + }, + { + "epoch": 1.1780782731054105, + "grad_norm": 0.10310104489326477, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 304750 + }, + { + "epoch": 1.1781169303087937, + "grad_norm": 0.10579738020896912, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 304760 + }, + { + "epoch": 1.178155587512177, + "grad_norm": 0.09480555355548859, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 304770 + }, + { + "epoch": 1.1781942447155602, + "grad_norm": 0.1701122671365738, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 304780 + }, + { + "epoch": 1.1782329019189435, + "grad_norm": 0.12626154720783234, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 304790 + }, + { + "epoch": 1.178271559122327, + "grad_norm": 0.10779381543397903, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 304800 + }, + { + "epoch": 1.1783102163257102, + "grad_norm": 0.10039182752370834, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 304810 + }, + { + "epoch": 1.1783488735290935, + "grad_norm": 0.09189040213823318, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 304820 + }, + { + "epoch": 1.1783875307324767, + "grad_norm": 0.1019379049539566, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 304830 + }, + { + "epoch": 1.17842618793586, + "grad_norm": 0.14611029624938965, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 304840 + }, + { + "epoch": 1.1784648451392432, + "grad_norm": 0.10264472663402557, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 304850 + }, + { + "epoch": 1.1785035023426265, + "grad_norm": 0.10579685121774673, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 304860 + }, + { + "epoch": 1.1785421595460097, + "grad_norm": 0.09584686905145645, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 304870 + }, + { + "epoch": 1.1785808167493932, + "grad_norm": 0.13405711948871613, + "learning_rate": 0.002, + "loss": 2.329, + "step": 304880 + }, + { + "epoch": 1.1786194739527764, + "grad_norm": 0.12086565047502518, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 304890 + }, + { + "epoch": 1.1786581311561597, + "grad_norm": 0.10096962749958038, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 304900 + }, + { + "epoch": 1.178696788359543, + "grad_norm": 0.10008525103330612, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 304910 + }, + { + "epoch": 1.1787354455629262, + "grad_norm": 0.10858751833438873, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 304920 + }, + { + "epoch": 1.1787741027663095, + "grad_norm": 0.10242888331413269, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 304930 + }, + { + "epoch": 1.1788127599696927, + "grad_norm": 0.1024758443236351, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 304940 + }, + { + "epoch": 1.178851417173076, + "grad_norm": 0.10521946847438812, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 304950 + }, + { + "epoch": 1.1788900743764592, + "grad_norm": 0.0923774316906929, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 304960 + }, + { + "epoch": 1.1789287315798427, + "grad_norm": 0.10218238085508347, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 304970 + }, + { + "epoch": 1.178967388783226, + "grad_norm": 0.10977078229188919, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 304980 + }, + { + "epoch": 1.1790060459866092, + "grad_norm": 0.09633048623800278, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 304990 + }, + { + "epoch": 1.1790447031899924, + "grad_norm": 0.10134069621562958, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 305000 + }, + { + "epoch": 1.1790833603933757, + "grad_norm": 0.10490524023771286, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 305010 + }, + { + "epoch": 1.179122017596759, + "grad_norm": 0.1142604649066925, + "learning_rate": 0.002, + "loss": 2.332, + "step": 305020 + }, + { + "epoch": 1.1791606748001422, + "grad_norm": 0.09350310266017914, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 305030 + }, + { + "epoch": 1.1791993320035254, + "grad_norm": 0.1101241186261177, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 305040 + }, + { + "epoch": 1.179237989206909, + "grad_norm": 0.11008507758378983, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 305050 + }, + { + "epoch": 1.1792766464102922, + "grad_norm": 0.10210888832807541, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 305060 + }, + { + "epoch": 1.1793153036136754, + "grad_norm": 0.12260551750659943, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 305070 + }, + { + "epoch": 1.1793539608170587, + "grad_norm": 0.09672050178050995, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 305080 + }, + { + "epoch": 1.179392618020442, + "grad_norm": 0.11338058114051819, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 305090 + }, + { + "epoch": 1.1794312752238252, + "grad_norm": 0.12634330987930298, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 305100 + }, + { + "epoch": 1.1794699324272084, + "grad_norm": 0.10544847697019577, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 305110 + }, + { + "epoch": 1.1795085896305917, + "grad_norm": 0.1192038282752037, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 305120 + }, + { + "epoch": 1.179547246833975, + "grad_norm": 0.08795661479234695, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 305130 + }, + { + "epoch": 1.1795859040373584, + "grad_norm": 0.10282469540834427, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 305140 + }, + { + "epoch": 1.1796245612407417, + "grad_norm": 0.1073727011680603, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 305150 + }, + { + "epoch": 1.179663218444125, + "grad_norm": 0.09340602159500122, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 305160 + }, + { + "epoch": 1.1797018756475082, + "grad_norm": 0.09694849699735641, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 305170 + }, + { + "epoch": 1.1797405328508914, + "grad_norm": 0.1085776686668396, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 305180 + }, + { + "epoch": 1.1797791900542747, + "grad_norm": 0.10637427866458893, + "learning_rate": 0.002, + "loss": 2.331, + "step": 305190 + }, + { + "epoch": 1.179817847257658, + "grad_norm": 0.13425855338573456, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 305200 + }, + { + "epoch": 1.1798565044610412, + "grad_norm": 0.11008214950561523, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 305210 + }, + { + "epoch": 1.1798951616644247, + "grad_norm": 0.10857033729553223, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 305220 + }, + { + "epoch": 1.179933818867808, + "grad_norm": 0.10989338159561157, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 305230 + }, + { + "epoch": 1.1799724760711912, + "grad_norm": 0.10804920643568039, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 305240 + }, + { + "epoch": 1.1800111332745744, + "grad_norm": 0.09076432883739471, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 305250 + }, + { + "epoch": 1.1800497904779577, + "grad_norm": 0.10450518876314163, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 305260 + }, + { + "epoch": 1.180088447681341, + "grad_norm": 0.11721612513065338, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 305270 + }, + { + "epoch": 1.1801271048847242, + "grad_norm": 0.11377738416194916, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 305280 + }, + { + "epoch": 1.1801657620881074, + "grad_norm": 0.10185536742210388, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 305290 + }, + { + "epoch": 1.1802044192914907, + "grad_norm": 0.1519765406847, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 305300 + }, + { + "epoch": 1.1802430764948741, + "grad_norm": 0.0957445278763771, + "learning_rate": 0.002, + "loss": 2.327, + "step": 305310 + }, + { + "epoch": 1.1802817336982574, + "grad_norm": 0.1012631356716156, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 305320 + }, + { + "epoch": 1.1803203909016406, + "grad_norm": 0.10935018211603165, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 305330 + }, + { + "epoch": 1.180359048105024, + "grad_norm": 0.09782283008098602, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 305340 + }, + { + "epoch": 1.1803977053084072, + "grad_norm": 0.11596140265464783, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 305350 + }, + { + "epoch": 1.1804363625117904, + "grad_norm": 0.1043427437543869, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 305360 + }, + { + "epoch": 1.1804750197151737, + "grad_norm": 0.09407017379999161, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 305370 + }, + { + "epoch": 1.180513676918557, + "grad_norm": 0.08882657438516617, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 305380 + }, + { + "epoch": 1.1805523341219404, + "grad_norm": 0.07942847907543182, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 305390 + }, + { + "epoch": 1.1805909913253236, + "grad_norm": 0.12886931002140045, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 305400 + }, + { + "epoch": 1.1806296485287069, + "grad_norm": 0.10612437129020691, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 305410 + }, + { + "epoch": 1.1806683057320901, + "grad_norm": 0.11033554375171661, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 305420 + }, + { + "epoch": 1.1807069629354734, + "grad_norm": 0.11206621676683426, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 305430 + }, + { + "epoch": 1.1807456201388566, + "grad_norm": 0.10127930343151093, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 305440 + }, + { + "epoch": 1.18078427734224, + "grad_norm": 0.10065993666648865, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 305450 + }, + { + "epoch": 1.1808229345456231, + "grad_norm": 0.11005466431379318, + "learning_rate": 0.002, + "loss": 2.3102, + "step": 305460 + }, + { + "epoch": 1.1808615917490064, + "grad_norm": 0.11241529881954193, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 305470 + }, + { + "epoch": 1.1809002489523899, + "grad_norm": 0.09118768572807312, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 305480 + }, + { + "epoch": 1.1809389061557731, + "grad_norm": 0.1035250723361969, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 305490 + }, + { + "epoch": 1.1809775633591564, + "grad_norm": 0.10607394576072693, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 305500 + }, + { + "epoch": 1.1810162205625396, + "grad_norm": 0.10815197974443436, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 305510 + }, + { + "epoch": 1.1810548777659229, + "grad_norm": 0.10144860297441483, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 305520 + }, + { + "epoch": 1.1810935349693061, + "grad_norm": 0.10189273208379745, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 305530 + }, + { + "epoch": 1.1811321921726894, + "grad_norm": 0.09582088142633438, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 305540 + }, + { + "epoch": 1.1811708493760729, + "grad_norm": 0.10352307558059692, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 305550 + }, + { + "epoch": 1.1812095065794561, + "grad_norm": 0.13020190596580505, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 305560 + }, + { + "epoch": 1.1812481637828394, + "grad_norm": 0.09354320168495178, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 305570 + }, + { + "epoch": 1.1812868209862226, + "grad_norm": 0.09511909633874893, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 305580 + }, + { + "epoch": 1.1813254781896059, + "grad_norm": 0.1106218695640564, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 305590 + }, + { + "epoch": 1.1813641353929891, + "grad_norm": 0.11186060309410095, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 305600 + }, + { + "epoch": 1.1814027925963724, + "grad_norm": 0.0967664122581482, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 305610 + }, + { + "epoch": 1.1814414497997556, + "grad_norm": 0.09282700717449188, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 305620 + }, + { + "epoch": 1.1814801070031389, + "grad_norm": 0.11425047367811203, + "learning_rate": 0.002, + "loss": 2.332, + "step": 305630 + }, + { + "epoch": 1.1815187642065221, + "grad_norm": 0.11561277508735657, + "learning_rate": 0.002, + "loss": 2.319, + "step": 305640 + }, + { + "epoch": 1.1815574214099056, + "grad_norm": 0.11100748181343079, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 305650 + }, + { + "epoch": 1.1815960786132889, + "grad_norm": 0.10870927572250366, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 305660 + }, + { + "epoch": 1.181634735816672, + "grad_norm": 0.10777224600315094, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 305670 + }, + { + "epoch": 1.1816733930200554, + "grad_norm": 0.09503760188817978, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 305680 + }, + { + "epoch": 1.1817120502234386, + "grad_norm": 0.12765321135520935, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 305690 + }, + { + "epoch": 1.1817507074268219, + "grad_norm": 0.10698419064283371, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 305700 + }, + { + "epoch": 1.1817893646302051, + "grad_norm": 0.09345374256372452, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 305710 + }, + { + "epoch": 1.1818280218335886, + "grad_norm": 0.10894269496202469, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 305720 + }, + { + "epoch": 1.1818666790369718, + "grad_norm": 0.10499607026576996, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 305730 + }, + { + "epoch": 1.181905336240355, + "grad_norm": 0.10457663983106613, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 305740 + }, + { + "epoch": 1.1819439934437383, + "grad_norm": 0.11890369653701782, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 305750 + }, + { + "epoch": 1.1819826506471216, + "grad_norm": 0.11439453065395355, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 305760 + }, + { + "epoch": 1.1820213078505049, + "grad_norm": 0.11376997083425522, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 305770 + }, + { + "epoch": 1.182059965053888, + "grad_norm": 0.09751281887292862, + "learning_rate": 0.002, + "loss": 2.332, + "step": 305780 + }, + { + "epoch": 1.1820986222572714, + "grad_norm": 0.10192884504795074, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 305790 + }, + { + "epoch": 1.1821372794606546, + "grad_norm": 0.12316975742578506, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 305800 + }, + { + "epoch": 1.1821759366640379, + "grad_norm": 0.09169697761535645, + "learning_rate": 0.002, + "loss": 2.333, + "step": 305810 + }, + { + "epoch": 1.1822145938674213, + "grad_norm": 0.09697745740413666, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 305820 + }, + { + "epoch": 1.1822532510708046, + "grad_norm": 0.10638635605573654, + "learning_rate": 0.002, + "loss": 2.335, + "step": 305830 + }, + { + "epoch": 1.1822919082741878, + "grad_norm": 0.1196802407503128, + "learning_rate": 0.002, + "loss": 2.3123, + "step": 305840 + }, + { + "epoch": 1.182330565477571, + "grad_norm": 0.09272005409002304, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 305850 + }, + { + "epoch": 1.1823692226809543, + "grad_norm": 0.10029997676610947, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 305860 + }, + { + "epoch": 1.1824078798843376, + "grad_norm": 0.09517457336187363, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 305870 + }, + { + "epoch": 1.1824465370877209, + "grad_norm": 0.11812178045511246, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 305880 + }, + { + "epoch": 1.1824851942911043, + "grad_norm": 0.11352573335170746, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 305890 + }, + { + "epoch": 1.1825238514944876, + "grad_norm": 0.11691530048847198, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 305900 + }, + { + "epoch": 1.1825625086978708, + "grad_norm": 0.102568119764328, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 305910 + }, + { + "epoch": 1.182601165901254, + "grad_norm": 0.09333674609661102, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 305920 + }, + { + "epoch": 1.1826398231046373, + "grad_norm": 0.1106361448764801, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 305930 + }, + { + "epoch": 1.1826784803080206, + "grad_norm": 0.10933764278888702, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 305940 + }, + { + "epoch": 1.1827171375114038, + "grad_norm": 0.11312533169984818, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 305950 + }, + { + "epoch": 1.182755794714787, + "grad_norm": 0.10010302811861038, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 305960 + }, + { + "epoch": 1.1827944519181703, + "grad_norm": 0.09435338526964188, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 305970 + }, + { + "epoch": 1.1828331091215536, + "grad_norm": 0.1275791972875595, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 305980 + }, + { + "epoch": 1.182871766324937, + "grad_norm": 0.11196912080049515, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 305990 + }, + { + "epoch": 1.1829104235283203, + "grad_norm": 0.09606979042291641, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 306000 + }, + { + "epoch": 1.1829490807317036, + "grad_norm": 0.12343128025531769, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 306010 + }, + { + "epoch": 1.1829877379350868, + "grad_norm": 0.13962876796722412, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 306020 + }, + { + "epoch": 1.18302639513847, + "grad_norm": 0.10752856731414795, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 306030 + }, + { + "epoch": 1.1830650523418533, + "grad_norm": 0.10863500088453293, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 306040 + }, + { + "epoch": 1.1831037095452366, + "grad_norm": 0.11633353680372238, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 306050 + }, + { + "epoch": 1.18314236674862, + "grad_norm": 0.09035677462816238, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 306060 + }, + { + "epoch": 1.1831810239520033, + "grad_norm": 0.12252160161733627, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 306070 + }, + { + "epoch": 1.1832196811553866, + "grad_norm": 0.1086413711309433, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 306080 + }, + { + "epoch": 1.1832583383587698, + "grad_norm": 0.09670509397983551, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 306090 + }, + { + "epoch": 1.183296995562153, + "grad_norm": 0.12193197757005692, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 306100 + }, + { + "epoch": 1.1833356527655363, + "grad_norm": 0.11630985885858536, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 306110 + }, + { + "epoch": 1.1833743099689196, + "grad_norm": 0.10431554168462753, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 306120 + }, + { + "epoch": 1.1834129671723028, + "grad_norm": 0.11279870569705963, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 306130 + }, + { + "epoch": 1.183451624375686, + "grad_norm": 0.10690067708492279, + "learning_rate": 0.002, + "loss": 2.331, + "step": 306140 + }, + { + "epoch": 1.1834902815790693, + "grad_norm": 0.10031410306692123, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 306150 + }, + { + "epoch": 1.1835289387824528, + "grad_norm": 0.10549402236938477, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 306160 + }, + { + "epoch": 1.183567595985836, + "grad_norm": 0.09717525541782379, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 306170 + }, + { + "epoch": 1.1836062531892193, + "grad_norm": 0.10498525202274323, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 306180 + }, + { + "epoch": 1.1836449103926026, + "grad_norm": 0.10035532712936401, + "learning_rate": 0.002, + "loss": 2.3133, + "step": 306190 + }, + { + "epoch": 1.1836835675959858, + "grad_norm": 0.1097157821059227, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 306200 + }, + { + "epoch": 1.183722224799369, + "grad_norm": 0.10666971653699875, + "learning_rate": 0.002, + "loss": 2.325, + "step": 306210 + }, + { + "epoch": 1.1837608820027523, + "grad_norm": 0.10622602701187134, + "learning_rate": 0.002, + "loss": 2.326, + "step": 306220 + }, + { + "epoch": 1.1837995392061358, + "grad_norm": 0.11170188337564468, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 306230 + }, + { + "epoch": 1.183838196409519, + "grad_norm": 0.09753941744565964, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 306240 + }, + { + "epoch": 1.1838768536129023, + "grad_norm": 0.12126488238573074, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 306250 + }, + { + "epoch": 1.1839155108162855, + "grad_norm": 0.10929860919713974, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 306260 + }, + { + "epoch": 1.1839541680196688, + "grad_norm": 0.09766397625207901, + "learning_rate": 0.002, + "loss": 2.346, + "step": 306270 + }, + { + "epoch": 1.183992825223052, + "grad_norm": 0.11956153064966202, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 306280 + }, + { + "epoch": 1.1840314824264353, + "grad_norm": 0.13645967841148376, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 306290 + }, + { + "epoch": 1.1840701396298186, + "grad_norm": 0.132292702794075, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 306300 + }, + { + "epoch": 1.1841087968332018, + "grad_norm": 0.09791436791419983, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 306310 + }, + { + "epoch": 1.184147454036585, + "grad_norm": 0.10473255813121796, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 306320 + }, + { + "epoch": 1.1841861112399685, + "grad_norm": 0.11555669456720352, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 306330 + }, + { + "epoch": 1.1842247684433518, + "grad_norm": 0.11690516024827957, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 306340 + }, + { + "epoch": 1.184263425646735, + "grad_norm": 0.095167376101017, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 306350 + }, + { + "epoch": 1.1843020828501183, + "grad_norm": 0.10989649593830109, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 306360 + }, + { + "epoch": 1.1843407400535015, + "grad_norm": 0.13234752416610718, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 306370 + }, + { + "epoch": 1.1843793972568848, + "grad_norm": 0.11061105132102966, + "learning_rate": 0.002, + "loss": 2.3651, + "step": 306380 + }, + { + "epoch": 1.184418054460268, + "grad_norm": 0.10562548041343689, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 306390 + }, + { + "epoch": 1.1844567116636515, + "grad_norm": 0.09931013733148575, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 306400 + }, + { + "epoch": 1.1844953688670348, + "grad_norm": 0.1872059553861618, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 306410 + }, + { + "epoch": 1.184534026070418, + "grad_norm": 0.09803324192762375, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 306420 + }, + { + "epoch": 1.1845726832738013, + "grad_norm": 0.10257833451032639, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 306430 + }, + { + "epoch": 1.1846113404771845, + "grad_norm": 0.11944346129894257, + "learning_rate": 0.002, + "loss": 2.321, + "step": 306440 + }, + { + "epoch": 1.1846499976805678, + "grad_norm": 0.11018768697977066, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 306450 + }, + { + "epoch": 1.184688654883951, + "grad_norm": 0.09937886148691177, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 306460 + }, + { + "epoch": 1.1847273120873343, + "grad_norm": 0.11394789069890976, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 306470 + }, + { + "epoch": 1.1847659692907175, + "grad_norm": 0.10206689685583115, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 306480 + }, + { + "epoch": 1.1848046264941008, + "grad_norm": 0.09132876992225647, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 306490 + }, + { + "epoch": 1.1848432836974843, + "grad_norm": 0.10233450680971146, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 306500 + }, + { + "epoch": 1.1848819409008675, + "grad_norm": 0.11387529969215393, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 306510 + }, + { + "epoch": 1.1849205981042508, + "grad_norm": 0.09129679203033447, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 306520 + }, + { + "epoch": 1.184959255307634, + "grad_norm": 0.12205521762371063, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 306530 + }, + { + "epoch": 1.1849979125110173, + "grad_norm": 0.11332284659147263, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 306540 + }, + { + "epoch": 1.1850365697144005, + "grad_norm": 0.10139425843954086, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 306550 + }, + { + "epoch": 1.1850752269177838, + "grad_norm": 0.10110003501176834, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 306560 + }, + { + "epoch": 1.1851138841211672, + "grad_norm": 0.23061221837997437, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 306570 + }, + { + "epoch": 1.1851525413245505, + "grad_norm": 0.11319497227668762, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 306580 + }, + { + "epoch": 1.1851911985279338, + "grad_norm": 0.09993323683738708, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 306590 + }, + { + "epoch": 1.185229855731317, + "grad_norm": 0.11044862866401672, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 306600 + }, + { + "epoch": 1.1852685129347003, + "grad_norm": 0.09655289351940155, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 306610 + }, + { + "epoch": 1.1853071701380835, + "grad_norm": 0.13763998448848724, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 306620 + }, + { + "epoch": 1.1853458273414668, + "grad_norm": 0.10221410542726517, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 306630 + }, + { + "epoch": 1.18538448454485, + "grad_norm": 0.09295745193958282, + "learning_rate": 0.002, + "loss": 2.338, + "step": 306640 + }, + { + "epoch": 1.1854231417482333, + "grad_norm": 0.0991741344332695, + "learning_rate": 0.002, + "loss": 2.325, + "step": 306650 + }, + { + "epoch": 1.1854617989516167, + "grad_norm": 0.12523503601551056, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 306660 + }, + { + "epoch": 1.185500456155, + "grad_norm": 0.09871948510408401, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 306670 + }, + { + "epoch": 1.1855391133583832, + "grad_norm": 0.11182510107755661, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 306680 + }, + { + "epoch": 1.1855777705617665, + "grad_norm": 0.09244689345359802, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 306690 + }, + { + "epoch": 1.1856164277651497, + "grad_norm": 0.09097203612327576, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 306700 + }, + { + "epoch": 1.185655084968533, + "grad_norm": 0.10416053980588913, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 306710 + }, + { + "epoch": 1.1856937421719163, + "grad_norm": 0.09652557224035263, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 306720 + }, + { + "epoch": 1.1857323993752995, + "grad_norm": 0.10522696375846863, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 306730 + }, + { + "epoch": 1.185771056578683, + "grad_norm": 0.12889423966407776, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 306740 + }, + { + "epoch": 1.1858097137820662, + "grad_norm": 0.11473888903856277, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 306750 + }, + { + "epoch": 1.1858483709854495, + "grad_norm": 0.1227368712425232, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 306760 + }, + { + "epoch": 1.1858870281888327, + "grad_norm": 0.12399069219827652, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 306770 + }, + { + "epoch": 1.185925685392216, + "grad_norm": 0.09926015883684158, + "learning_rate": 0.002, + "loss": 2.336, + "step": 306780 + }, + { + "epoch": 1.1859643425955992, + "grad_norm": 0.09733019024133682, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 306790 + }, + { + "epoch": 1.1860029997989825, + "grad_norm": 0.09353894740343094, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 306800 + }, + { + "epoch": 1.1860416570023657, + "grad_norm": 0.11982983350753784, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 306810 + }, + { + "epoch": 1.186080314205749, + "grad_norm": 0.10334306210279465, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 306820 + }, + { + "epoch": 1.1861189714091325, + "grad_norm": 0.09221477806568146, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 306830 + }, + { + "epoch": 1.1861576286125157, + "grad_norm": 0.10017237812280655, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 306840 + }, + { + "epoch": 1.186196285815899, + "grad_norm": 0.10200434923171997, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 306850 + }, + { + "epoch": 1.1862349430192822, + "grad_norm": 0.12413059175014496, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 306860 + }, + { + "epoch": 1.1862736002226655, + "grad_norm": 0.10810477286577225, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 306870 + }, + { + "epoch": 1.1863122574260487, + "grad_norm": 0.1036938801407814, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 306880 + }, + { + "epoch": 1.186350914629432, + "grad_norm": 0.0994827151298523, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 306890 + }, + { + "epoch": 1.1863895718328152, + "grad_norm": 0.10518717765808105, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 306900 + }, + { + "epoch": 1.1864282290361987, + "grad_norm": 0.09604518115520477, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 306910 + }, + { + "epoch": 1.186466886239582, + "grad_norm": 0.10765660554170609, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 306920 + }, + { + "epoch": 1.1865055434429652, + "grad_norm": 0.1137009784579277, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 306930 + }, + { + "epoch": 1.1865442006463485, + "grad_norm": 0.10713685303926468, + "learning_rate": 0.002, + "loss": 2.334, + "step": 306940 + }, + { + "epoch": 1.1865828578497317, + "grad_norm": 0.09157906472682953, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 306950 + }, + { + "epoch": 1.186621515053115, + "grad_norm": 0.11402291059494019, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 306960 + }, + { + "epoch": 1.1866601722564982, + "grad_norm": 0.09192168712615967, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 306970 + }, + { + "epoch": 1.1866988294598815, + "grad_norm": 0.10053259879350662, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 306980 + }, + { + "epoch": 1.1867374866632647, + "grad_norm": 0.11963258683681488, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 306990 + }, + { + "epoch": 1.1867761438666482, + "grad_norm": 0.11115763336420059, + "learning_rate": 0.002, + "loss": 2.321, + "step": 307000 + }, + { + "epoch": 1.1868148010700315, + "grad_norm": 0.12055559456348419, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 307010 + }, + { + "epoch": 1.1868534582734147, + "grad_norm": 0.11069901287555695, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 307020 + }, + { + "epoch": 1.186892115476798, + "grad_norm": 0.10177513211965561, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 307030 + }, + { + "epoch": 1.1869307726801812, + "grad_norm": 0.10583703964948654, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 307040 + }, + { + "epoch": 1.1869694298835645, + "grad_norm": 0.10791445523500443, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 307050 + }, + { + "epoch": 1.1870080870869477, + "grad_norm": 0.10395360738039017, + "learning_rate": 0.002, + "loss": 2.3593, + "step": 307060 + }, + { + "epoch": 1.187046744290331, + "grad_norm": 0.09354843199253082, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 307070 + }, + { + "epoch": 1.1870854014937144, + "grad_norm": 0.11032462120056152, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 307080 + }, + { + "epoch": 1.1871240586970977, + "grad_norm": 0.12376872450113297, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 307090 + }, + { + "epoch": 1.187162715900481, + "grad_norm": 0.1092623770236969, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 307100 + }, + { + "epoch": 1.1872013731038642, + "grad_norm": 0.11651972681283951, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 307110 + }, + { + "epoch": 1.1872400303072475, + "grad_norm": 0.10235489159822464, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 307120 + }, + { + "epoch": 1.1872786875106307, + "grad_norm": 0.10163053870201111, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 307130 + }, + { + "epoch": 1.187317344714014, + "grad_norm": 0.10021232813596725, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 307140 + }, + { + "epoch": 1.1873560019173972, + "grad_norm": 0.10669775307178497, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 307150 + }, + { + "epoch": 1.1873946591207805, + "grad_norm": 0.08023542910814285, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 307160 + }, + { + "epoch": 1.187433316324164, + "grad_norm": 0.10091197490692139, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 307170 + }, + { + "epoch": 1.1874719735275472, + "grad_norm": 0.11032316088676453, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 307180 + }, + { + "epoch": 1.1875106307309304, + "grad_norm": 0.10547507554292679, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 307190 + }, + { + "epoch": 1.1875492879343137, + "grad_norm": 0.11119785159826279, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 307200 + }, + { + "epoch": 1.187587945137697, + "grad_norm": 0.10898708552122116, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 307210 + }, + { + "epoch": 1.1876266023410802, + "grad_norm": 0.11538541316986084, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 307220 + }, + { + "epoch": 1.1876652595444634, + "grad_norm": 0.10693303495645523, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 307230 + }, + { + "epoch": 1.1877039167478467, + "grad_norm": 0.09446164220571518, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 307240 + }, + { + "epoch": 1.1877425739512302, + "grad_norm": 0.11978152394294739, + "learning_rate": 0.002, + "loss": 2.337, + "step": 307250 + }, + { + "epoch": 1.1877812311546134, + "grad_norm": 0.10594190657138824, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 307260 + }, + { + "epoch": 1.1878198883579967, + "grad_norm": 0.11744443327188492, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 307270 + }, + { + "epoch": 1.18785854556138, + "grad_norm": 0.10290765762329102, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 307280 + }, + { + "epoch": 1.1878972027647632, + "grad_norm": 0.10236617177724838, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 307290 + }, + { + "epoch": 1.1879358599681464, + "grad_norm": 0.10091518610715866, + "learning_rate": 0.002, + "loss": 2.33, + "step": 307300 + }, + { + "epoch": 1.1879745171715297, + "grad_norm": 0.09385214000940323, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 307310 + }, + { + "epoch": 1.188013174374913, + "grad_norm": 0.11631542444229126, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 307320 + }, + { + "epoch": 1.1880518315782962, + "grad_norm": 0.10773419588804245, + "learning_rate": 0.002, + "loss": 2.306, + "step": 307330 + }, + { + "epoch": 1.1880904887816797, + "grad_norm": 0.11131122708320618, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 307340 + }, + { + "epoch": 1.188129145985063, + "grad_norm": 0.1060451790690422, + "learning_rate": 0.002, + "loss": 2.336, + "step": 307350 + }, + { + "epoch": 1.1881678031884462, + "grad_norm": 0.10052044689655304, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 307360 + }, + { + "epoch": 1.1882064603918294, + "grad_norm": 0.10508698970079422, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 307370 + }, + { + "epoch": 1.1882451175952127, + "grad_norm": 0.18196623027324677, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 307380 + }, + { + "epoch": 1.188283774798596, + "grad_norm": 0.10565753281116486, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 307390 + }, + { + "epoch": 1.1883224320019792, + "grad_norm": 0.09839676320552826, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 307400 + }, + { + "epoch": 1.1883610892053624, + "grad_norm": 0.09454700350761414, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 307410 + }, + { + "epoch": 1.188399746408746, + "grad_norm": 0.11287378519773483, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 307420 + }, + { + "epoch": 1.1884384036121292, + "grad_norm": 0.10098222643136978, + "learning_rate": 0.002, + "loss": 2.333, + "step": 307430 + }, + { + "epoch": 1.1884770608155124, + "grad_norm": 0.10282760858535767, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 307440 + }, + { + "epoch": 1.1885157180188957, + "grad_norm": 0.09915308654308319, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 307450 + }, + { + "epoch": 1.188554375222279, + "grad_norm": 0.10907262563705444, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 307460 + }, + { + "epoch": 1.1885930324256622, + "grad_norm": 0.1083667203783989, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 307470 + }, + { + "epoch": 1.1886316896290454, + "grad_norm": 0.11217281967401505, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 307480 + }, + { + "epoch": 1.1886703468324287, + "grad_norm": 0.09714347869157791, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 307490 + }, + { + "epoch": 1.188709004035812, + "grad_norm": 0.0932031124830246, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 307500 + }, + { + "epoch": 1.1887476612391954, + "grad_norm": 0.09969399124383926, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 307510 + }, + { + "epoch": 1.1887863184425786, + "grad_norm": 0.10963352024555206, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 307520 + }, + { + "epoch": 1.188824975645962, + "grad_norm": 0.10182205587625504, + "learning_rate": 0.002, + "loss": 2.341, + "step": 307530 + }, + { + "epoch": 1.1888636328493452, + "grad_norm": 0.13690140843391418, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 307540 + }, + { + "epoch": 1.1889022900527284, + "grad_norm": 0.11846937984228134, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 307550 + }, + { + "epoch": 1.1889409472561117, + "grad_norm": 0.1026109978556633, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 307560 + }, + { + "epoch": 1.188979604459495, + "grad_norm": 0.1018030047416687, + "learning_rate": 0.002, + "loss": 2.33, + "step": 307570 + }, + { + "epoch": 1.1890182616628784, + "grad_norm": 0.09329404681921005, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 307580 + }, + { + "epoch": 1.1890569188662616, + "grad_norm": 0.09284912049770355, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 307590 + }, + { + "epoch": 1.1890955760696449, + "grad_norm": 0.10771986097097397, + "learning_rate": 0.002, + "loss": 2.332, + "step": 307600 + }, + { + "epoch": 1.1891342332730281, + "grad_norm": 0.10230138152837753, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 307610 + }, + { + "epoch": 1.1891728904764114, + "grad_norm": 0.09888580441474915, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 307620 + }, + { + "epoch": 1.1892115476797946, + "grad_norm": 0.10737486183643341, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 307630 + }, + { + "epoch": 1.189250204883178, + "grad_norm": 0.12650904059410095, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 307640 + }, + { + "epoch": 1.1892888620865611, + "grad_norm": 0.08823800086975098, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 307650 + }, + { + "epoch": 1.1893275192899444, + "grad_norm": 0.12838537991046906, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 307660 + }, + { + "epoch": 1.1893661764933277, + "grad_norm": 0.12015782296657562, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 307670 + }, + { + "epoch": 1.1894048336967111, + "grad_norm": 0.09876890480518341, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 307680 + }, + { + "epoch": 1.1894434909000944, + "grad_norm": 0.11343344300985336, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 307690 + }, + { + "epoch": 1.1894821481034776, + "grad_norm": 0.10124673694372177, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 307700 + }, + { + "epoch": 1.1895208053068609, + "grad_norm": 0.11742060631513596, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 307710 + }, + { + "epoch": 1.1895594625102441, + "grad_norm": 0.10189960896968842, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 307720 + }, + { + "epoch": 1.1895981197136274, + "grad_norm": 0.12576916813850403, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 307730 + }, + { + "epoch": 1.1896367769170106, + "grad_norm": 0.1032869890332222, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 307740 + }, + { + "epoch": 1.1896754341203941, + "grad_norm": 0.09936468303203583, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 307750 + }, + { + "epoch": 1.1897140913237774, + "grad_norm": 0.1241552010178566, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 307760 + }, + { + "epoch": 1.1897527485271606, + "grad_norm": 0.0964624434709549, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 307770 + }, + { + "epoch": 1.1897914057305439, + "grad_norm": 0.12272000312805176, + "learning_rate": 0.002, + "loss": 2.333, + "step": 307780 + }, + { + "epoch": 1.1898300629339271, + "grad_norm": 0.08760685473680496, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 307790 + }, + { + "epoch": 1.1898687201373104, + "grad_norm": 0.0961800292134285, + "learning_rate": 0.002, + "loss": 2.316, + "step": 307800 + }, + { + "epoch": 1.1899073773406936, + "grad_norm": 0.1039305329322815, + "learning_rate": 0.002, + "loss": 2.3136, + "step": 307810 + }, + { + "epoch": 1.1899460345440769, + "grad_norm": 0.09458538889884949, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 307820 + }, + { + "epoch": 1.1899846917474601, + "grad_norm": 0.12459837645292282, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 307830 + }, + { + "epoch": 1.1900233489508434, + "grad_norm": 0.10995468497276306, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 307840 + }, + { + "epoch": 1.1900620061542269, + "grad_norm": 0.1075374186038971, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 307850 + }, + { + "epoch": 1.1901006633576101, + "grad_norm": 0.130019411444664, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 307860 + }, + { + "epoch": 1.1901393205609934, + "grad_norm": 0.10518362373113632, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 307870 + }, + { + "epoch": 1.1901779777643766, + "grad_norm": 0.10554949939250946, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 307880 + }, + { + "epoch": 1.1902166349677599, + "grad_norm": 0.10739036649465561, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 307890 + }, + { + "epoch": 1.1902552921711431, + "grad_norm": 0.11622469127178192, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 307900 + }, + { + "epoch": 1.1902939493745264, + "grad_norm": 0.1241396814584732, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 307910 + }, + { + "epoch": 1.1903326065779098, + "grad_norm": 0.1170787587761879, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 307920 + }, + { + "epoch": 1.190371263781293, + "grad_norm": 0.10361120849847794, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 307930 + }, + { + "epoch": 1.1904099209846764, + "grad_norm": 0.11528493463993073, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 307940 + }, + { + "epoch": 1.1904485781880596, + "grad_norm": 0.10670798271894455, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 307950 + }, + { + "epoch": 1.1904872353914429, + "grad_norm": 0.1060328334569931, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 307960 + }, + { + "epoch": 1.190525892594826, + "grad_norm": 0.096269890666008, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 307970 + }, + { + "epoch": 1.1905645497982094, + "grad_norm": 0.10902507603168488, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 307980 + }, + { + "epoch": 1.1906032070015926, + "grad_norm": 0.11386062949895859, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 307990 + }, + { + "epoch": 1.1906418642049759, + "grad_norm": 0.11864448338747025, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 308000 + }, + { + "epoch": 1.1906805214083591, + "grad_norm": 0.10171358287334442, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 308010 + }, + { + "epoch": 1.1907191786117426, + "grad_norm": 0.09052585065364838, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 308020 + }, + { + "epoch": 1.1907578358151258, + "grad_norm": 0.09586137533187866, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 308030 + }, + { + "epoch": 1.190796493018509, + "grad_norm": 0.10091507434844971, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 308040 + }, + { + "epoch": 1.1908351502218923, + "grad_norm": 0.10137958079576492, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 308050 + }, + { + "epoch": 1.1908738074252756, + "grad_norm": 0.10727639496326447, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 308060 + }, + { + "epoch": 1.1909124646286589, + "grad_norm": 0.23738090693950653, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 308070 + }, + { + "epoch": 1.190951121832042, + "grad_norm": 0.20171679556369781, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 308080 + }, + { + "epoch": 1.1909897790354256, + "grad_norm": 0.10972524434328079, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 308090 + }, + { + "epoch": 1.1910284362388088, + "grad_norm": 0.09481258690357208, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 308100 + }, + { + "epoch": 1.191067093442192, + "grad_norm": 0.09582295268774033, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 308110 + }, + { + "epoch": 1.1911057506455753, + "grad_norm": 0.1289292275905609, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 308120 + }, + { + "epoch": 1.1911444078489586, + "grad_norm": 0.10114498436450958, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 308130 + }, + { + "epoch": 1.1911830650523418, + "grad_norm": 0.09929239004850388, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 308140 + }, + { + "epoch": 1.191221722255725, + "grad_norm": 0.10156530886888504, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 308150 + }, + { + "epoch": 1.1912603794591083, + "grad_norm": 0.10426311939954758, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 308160 + }, + { + "epoch": 1.1912990366624916, + "grad_norm": 0.11024799942970276, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 308170 + }, + { + "epoch": 1.1913376938658748, + "grad_norm": 0.10496527701616287, + "learning_rate": 0.002, + "loss": 2.343, + "step": 308180 + }, + { + "epoch": 1.1913763510692583, + "grad_norm": 0.09811306744813919, + "learning_rate": 0.002, + "loss": 2.333, + "step": 308190 + }, + { + "epoch": 1.1914150082726416, + "grad_norm": 0.1302216500043869, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 308200 + }, + { + "epoch": 1.1914536654760248, + "grad_norm": 0.11187312752008438, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 308210 + }, + { + "epoch": 1.191492322679408, + "grad_norm": 0.10793672502040863, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 308220 + }, + { + "epoch": 1.1915309798827913, + "grad_norm": 0.08793167024850845, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 308230 + }, + { + "epoch": 1.1915696370861746, + "grad_norm": 0.10832060128450394, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 308240 + }, + { + "epoch": 1.1916082942895578, + "grad_norm": 0.10909979790449142, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 308250 + }, + { + "epoch": 1.1916469514929413, + "grad_norm": 0.11267384886741638, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 308260 + }, + { + "epoch": 1.1916856086963246, + "grad_norm": 0.09032999724149704, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 308270 + }, + { + "epoch": 1.1917242658997078, + "grad_norm": 0.1101817861199379, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 308280 + }, + { + "epoch": 1.191762923103091, + "grad_norm": 0.11183900386095047, + "learning_rate": 0.002, + "loss": 2.346, + "step": 308290 + }, + { + "epoch": 1.1918015803064743, + "grad_norm": 0.09554877132177353, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 308300 + }, + { + "epoch": 1.1918402375098576, + "grad_norm": 0.10025066882371902, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 308310 + }, + { + "epoch": 1.1918788947132408, + "grad_norm": 0.11711962521076202, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 308320 + }, + { + "epoch": 1.191917551916624, + "grad_norm": 0.10263378918170929, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 308330 + }, + { + "epoch": 1.1919562091200073, + "grad_norm": 0.09514163434505463, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 308340 + }, + { + "epoch": 1.1919948663233906, + "grad_norm": 0.13325776159763336, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 308350 + }, + { + "epoch": 1.192033523526774, + "grad_norm": 0.09737804532051086, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 308360 + }, + { + "epoch": 1.1920721807301573, + "grad_norm": 0.09637849032878876, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 308370 + }, + { + "epoch": 1.1921108379335406, + "grad_norm": 0.10954175144433975, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 308380 + }, + { + "epoch": 1.1921494951369238, + "grad_norm": 0.09258211404085159, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 308390 + }, + { + "epoch": 1.192188152340307, + "grad_norm": 0.1098412349820137, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 308400 + }, + { + "epoch": 1.1922268095436903, + "grad_norm": 0.14261560142040253, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 308410 + }, + { + "epoch": 1.1922654667470736, + "grad_norm": 0.09297990798950195, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 308420 + }, + { + "epoch": 1.192304123950457, + "grad_norm": 0.09771433472633362, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 308430 + }, + { + "epoch": 1.1923427811538403, + "grad_norm": 0.12001529335975647, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 308440 + }, + { + "epoch": 1.1923814383572235, + "grad_norm": 0.1097932979464531, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 308450 + }, + { + "epoch": 1.1924200955606068, + "grad_norm": 0.09968653321266174, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 308460 + }, + { + "epoch": 1.19245875276399, + "grad_norm": 0.11144307255744934, + "learning_rate": 0.002, + "loss": 2.333, + "step": 308470 + }, + { + "epoch": 1.1924974099673733, + "grad_norm": 0.17210029065608978, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 308480 + }, + { + "epoch": 1.1925360671707566, + "grad_norm": 0.11730711162090302, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 308490 + }, + { + "epoch": 1.1925747243741398, + "grad_norm": 0.12075983732938766, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 308500 + }, + { + "epoch": 1.192613381577523, + "grad_norm": 0.10987797379493713, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 308510 + }, + { + "epoch": 1.1926520387809063, + "grad_norm": 0.10271605104207993, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 308520 + }, + { + "epoch": 1.1926906959842898, + "grad_norm": 0.10429355502128601, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 308530 + }, + { + "epoch": 1.192729353187673, + "grad_norm": 0.10565821826457977, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 308540 + }, + { + "epoch": 1.1927680103910563, + "grad_norm": 0.09842915832996368, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 308550 + }, + { + "epoch": 1.1928066675944395, + "grad_norm": 0.10916607081890106, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 308560 + }, + { + "epoch": 1.1928453247978228, + "grad_norm": 0.10457335412502289, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 308570 + }, + { + "epoch": 1.192883982001206, + "grad_norm": 0.13191327452659607, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 308580 + }, + { + "epoch": 1.1929226392045893, + "grad_norm": 0.0972403958439827, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 308590 + }, + { + "epoch": 1.1929612964079728, + "grad_norm": 0.11111482977867126, + "learning_rate": 0.002, + "loss": 2.335, + "step": 308600 + }, + { + "epoch": 1.192999953611356, + "grad_norm": 0.08959123492240906, + "learning_rate": 0.002, + "loss": 2.3079, + "step": 308610 + }, + { + "epoch": 1.1930386108147393, + "grad_norm": 0.09942329674959183, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 308620 + }, + { + "epoch": 1.1930772680181225, + "grad_norm": 0.10511796176433563, + "learning_rate": 0.002, + "loss": 2.3048, + "step": 308630 + }, + { + "epoch": 1.1931159252215058, + "grad_norm": 0.11657918989658356, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 308640 + }, + { + "epoch": 1.193154582424889, + "grad_norm": 0.10845893621444702, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 308650 + }, + { + "epoch": 1.1931932396282723, + "grad_norm": 0.09751000255346298, + "learning_rate": 0.002, + "loss": 2.324, + "step": 308660 + }, + { + "epoch": 1.1932318968316555, + "grad_norm": 0.11037997156381607, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 308670 + }, + { + "epoch": 1.1932705540350388, + "grad_norm": 0.10616657137870789, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 308680 + }, + { + "epoch": 1.1933092112384223, + "grad_norm": 0.11541463434696198, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 308690 + }, + { + "epoch": 1.1933478684418055, + "grad_norm": 0.10183534026145935, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 308700 + }, + { + "epoch": 1.1933865256451888, + "grad_norm": 0.11032002419233322, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 308710 + }, + { + "epoch": 1.193425182848572, + "grad_norm": 0.11707691103219986, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 308720 + }, + { + "epoch": 1.1934638400519553, + "grad_norm": 0.1008838638663292, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 308730 + }, + { + "epoch": 1.1935024972553385, + "grad_norm": 0.09674558788537979, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 308740 + }, + { + "epoch": 1.1935411544587218, + "grad_norm": 0.10396383702754974, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 308750 + }, + { + "epoch": 1.193579811662105, + "grad_norm": 0.10542046278715134, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 308760 + }, + { + "epoch": 1.1936184688654885, + "grad_norm": 0.12094923853874207, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 308770 + }, + { + "epoch": 1.1936571260688718, + "grad_norm": 0.10676782578229904, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 308780 + }, + { + "epoch": 1.193695783272255, + "grad_norm": 0.10295173525810242, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 308790 + }, + { + "epoch": 1.1937344404756383, + "grad_norm": 0.10381203889846802, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 308800 + }, + { + "epoch": 1.1937730976790215, + "grad_norm": 0.11356569081544876, + "learning_rate": 0.002, + "loss": 2.334, + "step": 308810 + }, + { + "epoch": 1.1938117548824048, + "grad_norm": 0.10669662803411484, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 308820 + }, + { + "epoch": 1.193850412085788, + "grad_norm": 0.10124512761831284, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 308830 + }, + { + "epoch": 1.1938890692891713, + "grad_norm": 0.1617117077112198, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 308840 + }, + { + "epoch": 1.1939277264925545, + "grad_norm": 0.10875289887189865, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 308850 + }, + { + "epoch": 1.193966383695938, + "grad_norm": 0.10443301498889923, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 308860 + }, + { + "epoch": 1.1940050408993212, + "grad_norm": 0.1163385659456253, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 308870 + }, + { + "epoch": 1.1940436981027045, + "grad_norm": 0.09189847856760025, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 308880 + }, + { + "epoch": 1.1940823553060878, + "grad_norm": 0.1170836016535759, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 308890 + }, + { + "epoch": 1.194121012509471, + "grad_norm": 0.1277143359184265, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 308900 + }, + { + "epoch": 1.1941596697128543, + "grad_norm": 0.09729064255952835, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 308910 + }, + { + "epoch": 1.1941983269162375, + "grad_norm": 0.10895591974258423, + "learning_rate": 0.002, + "loss": 2.338, + "step": 308920 + }, + { + "epoch": 1.1942369841196208, + "grad_norm": 0.10426922887563705, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 308930 + }, + { + "epoch": 1.1942756413230042, + "grad_norm": 0.09164383262395859, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 308940 + }, + { + "epoch": 1.1943142985263875, + "grad_norm": 0.08422087132930756, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 308950 + }, + { + "epoch": 1.1943529557297707, + "grad_norm": 0.09703510999679565, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 308960 + }, + { + "epoch": 1.194391612933154, + "grad_norm": 0.09442820399999619, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 308970 + }, + { + "epoch": 1.1944302701365372, + "grad_norm": 0.19480974972248077, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 308980 + }, + { + "epoch": 1.1944689273399205, + "grad_norm": 0.10483382642269135, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 308990 + }, + { + "epoch": 1.1945075845433037, + "grad_norm": 0.08840036392211914, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 309000 + }, + { + "epoch": 1.194546241746687, + "grad_norm": 0.09493028372526169, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 309010 + }, + { + "epoch": 1.1945848989500703, + "grad_norm": 0.10036586970090866, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 309020 + }, + { + "epoch": 1.1946235561534537, + "grad_norm": 0.11401264369487762, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 309030 + }, + { + "epoch": 1.194662213356837, + "grad_norm": 0.09835246205329895, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 309040 + }, + { + "epoch": 1.1947008705602202, + "grad_norm": 0.09693726152181625, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 309050 + }, + { + "epoch": 1.1947395277636035, + "grad_norm": 0.10586172342300415, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 309060 + }, + { + "epoch": 1.1947781849669867, + "grad_norm": 0.10866979509592056, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 309070 + }, + { + "epoch": 1.19481684217037, + "grad_norm": 0.15142571926116943, + "learning_rate": 0.002, + "loss": 2.343, + "step": 309080 + }, + { + "epoch": 1.1948554993737532, + "grad_norm": 0.10809877514839172, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 309090 + }, + { + "epoch": 1.1948941565771365, + "grad_norm": 0.12076469510793686, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 309100 + }, + { + "epoch": 1.19493281378052, + "grad_norm": 0.09712011367082596, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 309110 + }, + { + "epoch": 1.1949714709839032, + "grad_norm": 0.11490646004676819, + "learning_rate": 0.002, + "loss": 2.332, + "step": 309120 + }, + { + "epoch": 1.1950101281872865, + "grad_norm": 0.12239948660135269, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 309130 + }, + { + "epoch": 1.1950487853906697, + "grad_norm": 0.099088653922081, + "learning_rate": 0.002, + "loss": 2.329, + "step": 309140 + }, + { + "epoch": 1.195087442594053, + "grad_norm": 0.10289102792739868, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 309150 + }, + { + "epoch": 1.1951260997974362, + "grad_norm": 0.1293850541114807, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 309160 + }, + { + "epoch": 1.1951647570008195, + "grad_norm": 0.09299208968877792, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 309170 + }, + { + "epoch": 1.1952034142042027, + "grad_norm": 0.10709170997142792, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 309180 + }, + { + "epoch": 1.195242071407586, + "grad_norm": 0.09683641791343689, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 309190 + }, + { + "epoch": 1.1952807286109695, + "grad_norm": 0.0931752473115921, + "learning_rate": 0.002, + "loss": 2.33, + "step": 309200 + }, + { + "epoch": 1.1953193858143527, + "grad_norm": 0.11616582423448563, + "learning_rate": 0.002, + "loss": 2.338, + "step": 309210 + }, + { + "epoch": 1.195358043017736, + "grad_norm": 0.10611452162265778, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 309220 + }, + { + "epoch": 1.1953967002211192, + "grad_norm": 0.11353754252195358, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 309230 + }, + { + "epoch": 1.1954353574245025, + "grad_norm": 0.10875255614519119, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 309240 + }, + { + "epoch": 1.1954740146278857, + "grad_norm": 0.11347277462482452, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 309250 + }, + { + "epoch": 1.195512671831269, + "grad_norm": 0.09928679466247559, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 309260 + }, + { + "epoch": 1.1955513290346522, + "grad_norm": 0.09654610604047775, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 309270 + }, + { + "epoch": 1.1955899862380357, + "grad_norm": 0.10184766352176666, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 309280 + }, + { + "epoch": 1.195628643441419, + "grad_norm": 0.11834858357906342, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 309290 + }, + { + "epoch": 1.1956673006448022, + "grad_norm": 0.1013941541314125, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 309300 + }, + { + "epoch": 1.1957059578481855, + "grad_norm": 0.12373871356248856, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 309310 + }, + { + "epoch": 1.1957446150515687, + "grad_norm": 0.11233308166265488, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 309320 + }, + { + "epoch": 1.195783272254952, + "grad_norm": 0.12040448933839798, + "learning_rate": 0.002, + "loss": 2.34, + "step": 309330 + }, + { + "epoch": 1.1958219294583352, + "grad_norm": 0.1115075871348381, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 309340 + }, + { + "epoch": 1.1958605866617185, + "grad_norm": 0.09439297765493393, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 309350 + }, + { + "epoch": 1.1958992438651017, + "grad_norm": 0.11103499680757523, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 309360 + }, + { + "epoch": 1.1959379010684852, + "grad_norm": 0.09440824389457703, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 309370 + }, + { + "epoch": 1.1959765582718684, + "grad_norm": 0.10800778120756149, + "learning_rate": 0.002, + "loss": 2.322, + "step": 309380 + }, + { + "epoch": 1.1960152154752517, + "grad_norm": 0.11891094595193863, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 309390 + }, + { + "epoch": 1.196053872678635, + "grad_norm": 0.09633566439151764, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 309400 + }, + { + "epoch": 1.1960925298820182, + "grad_norm": 0.1220220997929573, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 309410 + }, + { + "epoch": 1.1961311870854014, + "grad_norm": 0.09684797376394272, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 309420 + }, + { + "epoch": 1.1961698442887847, + "grad_norm": 0.12197108566761017, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 309430 + }, + { + "epoch": 1.1962085014921682, + "grad_norm": 0.0932028740644455, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 309440 + }, + { + "epoch": 1.1962471586955514, + "grad_norm": 0.09771830588579178, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 309450 + }, + { + "epoch": 1.1962858158989347, + "grad_norm": 0.10045291483402252, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 309460 + }, + { + "epoch": 1.196324473102318, + "grad_norm": 0.16125184297561646, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 309470 + }, + { + "epoch": 1.1963631303057012, + "grad_norm": 0.11066312342882156, + "learning_rate": 0.002, + "loss": 2.339, + "step": 309480 + }, + { + "epoch": 1.1964017875090844, + "grad_norm": 0.11836784332990646, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 309490 + }, + { + "epoch": 1.1964404447124677, + "grad_norm": 0.11377326399087906, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 309500 + }, + { + "epoch": 1.196479101915851, + "grad_norm": 0.11057248711585999, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 309510 + }, + { + "epoch": 1.1965177591192342, + "grad_norm": 0.09580224752426147, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 309520 + }, + { + "epoch": 1.1965564163226174, + "grad_norm": 0.09781431406736374, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 309530 + }, + { + "epoch": 1.196595073526001, + "grad_norm": 0.09017840027809143, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 309540 + }, + { + "epoch": 1.1966337307293842, + "grad_norm": 0.10557594895362854, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 309550 + }, + { + "epoch": 1.1966723879327674, + "grad_norm": 0.1149761825799942, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 309560 + }, + { + "epoch": 1.1967110451361507, + "grad_norm": 0.11471065133810043, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 309570 + }, + { + "epoch": 1.196749702339534, + "grad_norm": 0.10117927193641663, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 309580 + }, + { + "epoch": 1.1967883595429172, + "grad_norm": 0.11146955192089081, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 309590 + }, + { + "epoch": 1.1968270167463004, + "grad_norm": 0.1146436259150505, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 309600 + }, + { + "epoch": 1.196865673949684, + "grad_norm": 0.09842398762702942, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 309610 + }, + { + "epoch": 1.1969043311530672, + "grad_norm": 0.1109817773103714, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 309620 + }, + { + "epoch": 1.1969429883564504, + "grad_norm": 0.09087683260440826, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 309630 + }, + { + "epoch": 1.1969816455598337, + "grad_norm": 0.11437252908945084, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 309640 + }, + { + "epoch": 1.197020302763217, + "grad_norm": 0.10380634665489197, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 309650 + }, + { + "epoch": 1.1970589599666002, + "grad_norm": 0.10666163265705109, + "learning_rate": 0.002, + "loss": 2.329, + "step": 309660 + }, + { + "epoch": 1.1970976171699834, + "grad_norm": 0.09751375764608383, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 309670 + }, + { + "epoch": 1.1971362743733667, + "grad_norm": 0.10477735102176666, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 309680 + }, + { + "epoch": 1.19717493157675, + "grad_norm": 0.09767846763134003, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 309690 + }, + { + "epoch": 1.1972135887801332, + "grad_norm": 0.109504334628582, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 309700 + }, + { + "epoch": 1.1972522459835166, + "grad_norm": 0.10675853490829468, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 309710 + }, + { + "epoch": 1.1972909031869, + "grad_norm": 0.09702813625335693, + "learning_rate": 0.002, + "loss": 2.3149, + "step": 309720 + }, + { + "epoch": 1.1973295603902832, + "grad_norm": 0.09669061750173569, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 309730 + }, + { + "epoch": 1.1973682175936664, + "grad_norm": 0.11900770664215088, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 309740 + }, + { + "epoch": 1.1974068747970497, + "grad_norm": 0.09379424154758453, + "learning_rate": 0.002, + "loss": 2.335, + "step": 309750 + }, + { + "epoch": 1.197445532000433, + "grad_norm": 0.11037370562553406, + "learning_rate": 0.002, + "loss": 2.342, + "step": 309760 + }, + { + "epoch": 1.1974841892038162, + "grad_norm": 0.10939163714647293, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 309770 + }, + { + "epoch": 1.1975228464071996, + "grad_norm": 0.1152690127491951, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 309780 + }, + { + "epoch": 1.197561503610583, + "grad_norm": 0.10573794692754745, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 309790 + }, + { + "epoch": 1.1976001608139661, + "grad_norm": 0.11093052476644516, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 309800 + }, + { + "epoch": 1.1976388180173494, + "grad_norm": 0.11789587885141373, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 309810 + }, + { + "epoch": 1.1976774752207326, + "grad_norm": 0.11578092724084854, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 309820 + }, + { + "epoch": 1.197716132424116, + "grad_norm": 0.10635251551866531, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 309830 + }, + { + "epoch": 1.1977547896274992, + "grad_norm": 0.0906798392534256, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 309840 + }, + { + "epoch": 1.1977934468308824, + "grad_norm": 0.10510555654764175, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 309850 + }, + { + "epoch": 1.1978321040342657, + "grad_norm": 0.10239271819591522, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 309860 + }, + { + "epoch": 1.197870761237649, + "grad_norm": 0.11979003995656967, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 309870 + }, + { + "epoch": 1.1979094184410324, + "grad_norm": 0.10474664717912674, + "learning_rate": 0.002, + "loss": 2.354, + "step": 309880 + }, + { + "epoch": 1.1979480756444156, + "grad_norm": 0.09589861333370209, + "learning_rate": 0.002, + "loss": 2.351, + "step": 309890 + }, + { + "epoch": 1.1979867328477989, + "grad_norm": 0.10829038172960281, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 309900 + }, + { + "epoch": 1.1980253900511821, + "grad_norm": 0.11464477330446243, + "learning_rate": 0.002, + "loss": 2.342, + "step": 309910 + }, + { + "epoch": 1.1980640472545654, + "grad_norm": 0.10862333327531815, + "learning_rate": 0.002, + "loss": 2.349, + "step": 309920 + }, + { + "epoch": 1.1981027044579486, + "grad_norm": 0.10340090841054916, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 309930 + }, + { + "epoch": 1.198141361661332, + "grad_norm": 0.10448790341615677, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 309940 + }, + { + "epoch": 1.1981800188647154, + "grad_norm": 0.10916253179311752, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 309950 + }, + { + "epoch": 1.1982186760680986, + "grad_norm": 0.105451300740242, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 309960 + }, + { + "epoch": 1.1982573332714819, + "grad_norm": 0.10273198038339615, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 309970 + }, + { + "epoch": 1.1982959904748651, + "grad_norm": 0.09579921513795853, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 309980 + }, + { + "epoch": 1.1983346476782484, + "grad_norm": 0.10059081017971039, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 309990 + }, + { + "epoch": 1.1983733048816316, + "grad_norm": 0.09338857233524323, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 310000 + }, + { + "epoch": 1.1984119620850149, + "grad_norm": 0.10279089212417603, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 310010 + }, + { + "epoch": 1.1984506192883981, + "grad_norm": 0.08862245827913284, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 310020 + }, + { + "epoch": 1.1984892764917814, + "grad_norm": 0.12492749840021133, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 310030 + }, + { + "epoch": 1.1985279336951646, + "grad_norm": 0.11897465586662292, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 310040 + }, + { + "epoch": 1.1985665908985481, + "grad_norm": 0.10090432316064835, + "learning_rate": 0.002, + "loss": 2.336, + "step": 310050 + }, + { + "epoch": 1.1986052481019314, + "grad_norm": 0.09931127727031708, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 310060 + }, + { + "epoch": 1.1986439053053146, + "grad_norm": 0.10696078836917877, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 310070 + }, + { + "epoch": 1.1986825625086979, + "grad_norm": 0.11368277668952942, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 310080 + }, + { + "epoch": 1.1987212197120811, + "grad_norm": 0.10651197284460068, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 310090 + }, + { + "epoch": 1.1987598769154644, + "grad_norm": 0.12357331067323685, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 310100 + }, + { + "epoch": 1.1987985341188476, + "grad_norm": 0.1091422438621521, + "learning_rate": 0.002, + "loss": 2.333, + "step": 310110 + }, + { + "epoch": 1.198837191322231, + "grad_norm": 0.10718175768852234, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 310120 + }, + { + "epoch": 1.1988758485256144, + "grad_norm": 0.10889726132154465, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 310130 + }, + { + "epoch": 1.1989145057289976, + "grad_norm": 0.10348767787218094, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 310140 + }, + { + "epoch": 1.1989531629323809, + "grad_norm": 0.11551138013601303, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 310150 + }, + { + "epoch": 1.198991820135764, + "grad_norm": 0.103383868932724, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 310160 + }, + { + "epoch": 1.1990304773391474, + "grad_norm": 0.10193579643964767, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 310170 + }, + { + "epoch": 1.1990691345425306, + "grad_norm": 0.10685243457555771, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 310180 + }, + { + "epoch": 1.1991077917459139, + "grad_norm": 0.159051775932312, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 310190 + }, + { + "epoch": 1.1991464489492971, + "grad_norm": 0.10236582159996033, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 310200 + }, + { + "epoch": 1.1991851061526804, + "grad_norm": 0.1219271719455719, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 310210 + }, + { + "epoch": 1.1992237633560638, + "grad_norm": 0.11734987050294876, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 310220 + }, + { + "epoch": 1.199262420559447, + "grad_norm": 0.09976553916931152, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 310230 + }, + { + "epoch": 1.1993010777628303, + "grad_norm": 0.15345707535743713, + "learning_rate": 0.002, + "loss": 2.34, + "step": 310240 + }, + { + "epoch": 1.1993397349662136, + "grad_norm": 0.10631512105464935, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 310250 + }, + { + "epoch": 1.1993783921695969, + "grad_norm": 0.09638812392950058, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 310260 + }, + { + "epoch": 1.19941704937298, + "grad_norm": 0.10293331742286682, + "learning_rate": 0.002, + "loss": 2.3568, + "step": 310270 + }, + { + "epoch": 1.1994557065763634, + "grad_norm": 0.10437485575675964, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 310280 + }, + { + "epoch": 1.1994943637797468, + "grad_norm": 0.12006824463605881, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 310290 + }, + { + "epoch": 1.19953302098313, + "grad_norm": 0.09263867884874344, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 310300 + }, + { + "epoch": 1.1995716781865133, + "grad_norm": 0.1929297298192978, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 310310 + }, + { + "epoch": 1.1996103353898966, + "grad_norm": 0.12172416597604752, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 310320 + }, + { + "epoch": 1.1996489925932798, + "grad_norm": 0.1284799724817276, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 310330 + }, + { + "epoch": 1.199687649796663, + "grad_norm": 0.09342958778142929, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 310340 + }, + { + "epoch": 1.1997263070000463, + "grad_norm": 0.10736458003520966, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 310350 + }, + { + "epoch": 1.1997649642034296, + "grad_norm": 0.11657021939754486, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 310360 + }, + { + "epoch": 1.1998036214068128, + "grad_norm": 0.1261918991804123, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 310370 + }, + { + "epoch": 1.199842278610196, + "grad_norm": 0.10105116665363312, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 310380 + }, + { + "epoch": 1.1998809358135796, + "grad_norm": 0.11543400585651398, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 310390 + }, + { + "epoch": 1.1999195930169628, + "grad_norm": 0.09686581790447235, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 310400 + }, + { + "epoch": 1.199958250220346, + "grad_norm": 0.0990409329533577, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 310410 + }, + { + "epoch": 1.1999969074237293, + "grad_norm": 0.12822741270065308, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 310420 + }, + { + "epoch": 1.2000355646271126, + "grad_norm": 0.108786940574646, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 310430 + }, + { + "epoch": 1.2000742218304958, + "grad_norm": 0.11611306667327881, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 310440 + }, + { + "epoch": 1.200112879033879, + "grad_norm": 0.12393523007631302, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 310450 + }, + { + "epoch": 1.2001515362372626, + "grad_norm": 0.10479535162448883, + "learning_rate": 0.002, + "loss": 2.321, + "step": 310460 + }, + { + "epoch": 1.2001901934406458, + "grad_norm": 0.20911908149719238, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 310470 + }, + { + "epoch": 1.200228850644029, + "grad_norm": 0.11443410813808441, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 310480 + }, + { + "epoch": 1.2002675078474123, + "grad_norm": 0.10956830531358719, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 310490 + }, + { + "epoch": 1.2003061650507956, + "grad_norm": 0.135431170463562, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 310500 + }, + { + "epoch": 1.2003448222541788, + "grad_norm": 0.10912695527076721, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 310510 + }, + { + "epoch": 1.200383479457562, + "grad_norm": 0.09585479646921158, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 310520 + }, + { + "epoch": 1.2004221366609453, + "grad_norm": 0.12916892766952515, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 310530 + }, + { + "epoch": 1.2004607938643286, + "grad_norm": 0.10099588334560394, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 310540 + }, + { + "epoch": 1.200499451067712, + "grad_norm": 0.08918637037277222, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 310550 + }, + { + "epoch": 1.2005381082710953, + "grad_norm": 0.12469889968633652, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 310560 + }, + { + "epoch": 1.2005767654744786, + "grad_norm": 0.10082226246595383, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 310570 + }, + { + "epoch": 1.2006154226778618, + "grad_norm": 0.10894294828176498, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 310580 + }, + { + "epoch": 1.200654079881245, + "grad_norm": 0.1181500107049942, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 310590 + }, + { + "epoch": 1.2006927370846283, + "grad_norm": 0.09749269485473633, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 310600 + }, + { + "epoch": 1.2007313942880116, + "grad_norm": 0.12218060344457626, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 310610 + }, + { + "epoch": 1.2007700514913948, + "grad_norm": 0.11155841499567032, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 310620 + }, + { + "epoch": 1.2008087086947783, + "grad_norm": 0.09525422006845474, + "learning_rate": 0.002, + "loss": 2.338, + "step": 310630 + }, + { + "epoch": 1.2008473658981615, + "grad_norm": 0.12496226280927658, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 310640 + }, + { + "epoch": 1.2008860231015448, + "grad_norm": 0.11647341400384903, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 310650 + }, + { + "epoch": 1.200924680304928, + "grad_norm": 0.10525427013635635, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 310660 + }, + { + "epoch": 1.2009633375083113, + "grad_norm": 0.10254848748445511, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 310670 + }, + { + "epoch": 1.2010019947116946, + "grad_norm": 0.11311331391334534, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 310680 + }, + { + "epoch": 1.2010406519150778, + "grad_norm": 0.12352067232131958, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 310690 + }, + { + "epoch": 1.201079309118461, + "grad_norm": 0.30723175406455994, + "learning_rate": 0.002, + "loss": 2.344, + "step": 310700 + }, + { + "epoch": 1.2011179663218443, + "grad_norm": 0.11771678924560547, + "learning_rate": 0.002, + "loss": 2.347, + "step": 310710 + }, + { + "epoch": 1.2011566235252278, + "grad_norm": 0.1137363612651825, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 310720 + }, + { + "epoch": 1.201195280728611, + "grad_norm": 0.11905137449502945, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 310730 + }, + { + "epoch": 1.2012339379319943, + "grad_norm": 0.12443353980779648, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 310740 + }, + { + "epoch": 1.2012725951353775, + "grad_norm": 0.11437974125146866, + "learning_rate": 0.002, + "loss": 2.33, + "step": 310750 + }, + { + "epoch": 1.2013112523387608, + "grad_norm": 0.1042497307062149, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 310760 + }, + { + "epoch": 1.201349909542144, + "grad_norm": 0.10717729479074478, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 310770 + }, + { + "epoch": 1.2013885667455273, + "grad_norm": 0.09702154248952866, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 310780 + }, + { + "epoch": 1.2014272239489105, + "grad_norm": 0.15113039314746857, + "learning_rate": 0.002, + "loss": 2.328, + "step": 310790 + }, + { + "epoch": 1.201465881152294, + "grad_norm": 0.09931427240371704, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 310800 + }, + { + "epoch": 1.2015045383556773, + "grad_norm": 0.10436324030160904, + "learning_rate": 0.002, + "loss": 2.328, + "step": 310810 + }, + { + "epoch": 1.2015431955590605, + "grad_norm": 0.14245779812335968, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 310820 + }, + { + "epoch": 1.2015818527624438, + "grad_norm": 0.11675668507814407, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 310830 + }, + { + "epoch": 1.201620509965827, + "grad_norm": 0.09828370809555054, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 310840 + }, + { + "epoch": 1.2016591671692103, + "grad_norm": 0.09691742807626724, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 310850 + }, + { + "epoch": 1.2016978243725935, + "grad_norm": 0.09506635367870331, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 310860 + }, + { + "epoch": 1.2017364815759768, + "grad_norm": 0.10716848075389862, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 310870 + }, + { + "epoch": 1.20177513877936, + "grad_norm": 0.11083471029996872, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 310880 + }, + { + "epoch": 1.2018137959827435, + "grad_norm": 0.10451477020978928, + "learning_rate": 0.002, + "loss": 2.3109, + "step": 310890 + }, + { + "epoch": 1.2018524531861268, + "grad_norm": 0.12889741361141205, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 310900 + }, + { + "epoch": 1.20189111038951, + "grad_norm": 0.11663840711116791, + "learning_rate": 0.002, + "loss": 2.335, + "step": 310910 + }, + { + "epoch": 1.2019297675928933, + "grad_norm": 0.09428218007087708, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 310920 + }, + { + "epoch": 1.2019684247962765, + "grad_norm": 0.10899657756090164, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 310930 + }, + { + "epoch": 1.2020070819996598, + "grad_norm": 0.0901101604104042, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 310940 + }, + { + "epoch": 1.202045739203043, + "grad_norm": 0.1286320686340332, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 310950 + }, + { + "epoch": 1.2020843964064263, + "grad_norm": 0.09208814799785614, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 310960 + }, + { + "epoch": 1.2021230536098098, + "grad_norm": 0.09907399863004684, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 310970 + }, + { + "epoch": 1.202161710813193, + "grad_norm": 0.10375700891017914, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 310980 + }, + { + "epoch": 1.2022003680165763, + "grad_norm": 0.1176227405667305, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 310990 + }, + { + "epoch": 1.2022390252199595, + "grad_norm": 0.10297471284866333, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 311000 + }, + { + "epoch": 1.2022776824233428, + "grad_norm": 0.10666001588106155, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 311010 + }, + { + "epoch": 1.202316339626726, + "grad_norm": 0.10830654203891754, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 311020 + }, + { + "epoch": 1.2023549968301093, + "grad_norm": 0.13971230387687683, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 311030 + }, + { + "epoch": 1.2023936540334925, + "grad_norm": 0.1161842793226242, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 311040 + }, + { + "epoch": 1.2024323112368758, + "grad_norm": 0.11453959345817566, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 311050 + }, + { + "epoch": 1.2024709684402592, + "grad_norm": 0.10962758213281631, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 311060 + }, + { + "epoch": 1.2025096256436425, + "grad_norm": 0.10430661588907242, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 311070 + }, + { + "epoch": 1.2025482828470258, + "grad_norm": 0.10192476212978363, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 311080 + }, + { + "epoch": 1.202586940050409, + "grad_norm": 0.0879945158958435, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 311090 + }, + { + "epoch": 1.2026255972537923, + "grad_norm": 0.10970672965049744, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 311100 + }, + { + "epoch": 1.2026642544571755, + "grad_norm": 0.10071579366922379, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 311110 + }, + { + "epoch": 1.2027029116605588, + "grad_norm": 0.11206358671188354, + "learning_rate": 0.002, + "loss": 2.351, + "step": 311120 + }, + { + "epoch": 1.202741568863942, + "grad_norm": 0.10092099756002426, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 311130 + }, + { + "epoch": 1.2027802260673255, + "grad_norm": 0.09487482905387878, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 311140 + }, + { + "epoch": 1.2028188832707087, + "grad_norm": 0.10168376564979553, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 311150 + }, + { + "epoch": 1.202857540474092, + "grad_norm": 0.11102360486984253, + "learning_rate": 0.002, + "loss": 2.332, + "step": 311160 + }, + { + "epoch": 1.2028961976774752, + "grad_norm": 0.10950905084609985, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 311170 + }, + { + "epoch": 1.2029348548808585, + "grad_norm": 0.09757718443870544, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 311180 + }, + { + "epoch": 1.2029735120842417, + "grad_norm": 0.10875348001718521, + "learning_rate": 0.002, + "loss": 2.347, + "step": 311190 + }, + { + "epoch": 1.203012169287625, + "grad_norm": 0.10386452078819275, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 311200 + }, + { + "epoch": 1.2030508264910083, + "grad_norm": 0.12517563998699188, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 311210 + }, + { + "epoch": 1.2030894836943915, + "grad_norm": 0.11543699353933334, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 311220 + }, + { + "epoch": 1.203128140897775, + "grad_norm": 0.11032703518867493, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 311230 + }, + { + "epoch": 1.2031667981011582, + "grad_norm": 0.10646285861730576, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 311240 + }, + { + "epoch": 1.2032054553045415, + "grad_norm": 0.12593898177146912, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 311250 + }, + { + "epoch": 1.2032441125079247, + "grad_norm": 0.21965470910072327, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 311260 + }, + { + "epoch": 1.203282769711308, + "grad_norm": 0.12008555978536606, + "learning_rate": 0.002, + "loss": 2.324, + "step": 311270 + }, + { + "epoch": 1.2033214269146912, + "grad_norm": 0.09635363519191742, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 311280 + }, + { + "epoch": 1.2033600841180745, + "grad_norm": 0.13143694400787354, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 311290 + }, + { + "epoch": 1.2033987413214577, + "grad_norm": 0.09808219969272614, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 311300 + }, + { + "epoch": 1.2034373985248412, + "grad_norm": 0.12062323093414307, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 311310 + }, + { + "epoch": 1.2034760557282245, + "grad_norm": 0.1174851804971695, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 311320 + }, + { + "epoch": 1.2035147129316077, + "grad_norm": 0.10547292977571487, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 311330 + }, + { + "epoch": 1.203553370134991, + "grad_norm": 0.11646241694688797, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 311340 + }, + { + "epoch": 1.2035920273383742, + "grad_norm": 0.12754155695438385, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 311350 + }, + { + "epoch": 1.2036306845417575, + "grad_norm": 0.10357648134231567, + "learning_rate": 0.002, + "loss": 2.3129, + "step": 311360 + }, + { + "epoch": 1.2036693417451407, + "grad_norm": 0.13172759115695953, + "learning_rate": 0.002, + "loss": 2.335, + "step": 311370 + }, + { + "epoch": 1.203707998948524, + "grad_norm": 0.10856924951076508, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 311380 + }, + { + "epoch": 1.2037466561519072, + "grad_norm": 0.09741537272930145, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 311390 + }, + { + "epoch": 1.2037853133552907, + "grad_norm": 0.09914067387580872, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 311400 + }, + { + "epoch": 1.203823970558674, + "grad_norm": 0.09517299383878708, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 311410 + }, + { + "epoch": 1.2038626277620572, + "grad_norm": 0.1110980212688446, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 311420 + }, + { + "epoch": 1.2039012849654405, + "grad_norm": 0.092925526201725, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 311430 + }, + { + "epoch": 1.2039399421688237, + "grad_norm": 0.09799396991729736, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 311440 + }, + { + "epoch": 1.203978599372207, + "grad_norm": 0.09903433918952942, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 311450 + }, + { + "epoch": 1.2040172565755902, + "grad_norm": 0.11398271471261978, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 311460 + }, + { + "epoch": 1.2040559137789737, + "grad_norm": 0.16214625537395477, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 311470 + }, + { + "epoch": 1.204094570982357, + "grad_norm": 0.11520147323608398, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 311480 + }, + { + "epoch": 1.2041332281857402, + "grad_norm": 0.10259690880775452, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 311490 + }, + { + "epoch": 1.2041718853891235, + "grad_norm": 0.09407929331064224, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 311500 + }, + { + "epoch": 1.2042105425925067, + "grad_norm": 0.10382479429244995, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 311510 + }, + { + "epoch": 1.20424919979589, + "grad_norm": 0.12786495685577393, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 311520 + }, + { + "epoch": 1.2042878569992732, + "grad_norm": 0.10865487158298492, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 311530 + }, + { + "epoch": 1.2043265142026565, + "grad_norm": 0.09080743789672852, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 311540 + }, + { + "epoch": 1.2043651714060397, + "grad_norm": 0.11545614153146744, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 311550 + }, + { + "epoch": 1.204403828609423, + "grad_norm": 0.11504527926445007, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 311560 + }, + { + "epoch": 1.2044424858128064, + "grad_norm": 0.11840058863162994, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 311570 + }, + { + "epoch": 1.2044811430161897, + "grad_norm": 0.10996630787849426, + "learning_rate": 0.002, + "loss": 2.335, + "step": 311580 + }, + { + "epoch": 1.204519800219573, + "grad_norm": 0.08883655816316605, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 311590 + }, + { + "epoch": 1.2045584574229562, + "grad_norm": 0.10874045640230179, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 311600 + }, + { + "epoch": 1.2045971146263394, + "grad_norm": 0.11322815716266632, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 311610 + }, + { + "epoch": 1.2046357718297227, + "grad_norm": 0.11345550417900085, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 311620 + }, + { + "epoch": 1.204674429033106, + "grad_norm": 0.10841508954763412, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 311630 + }, + { + "epoch": 1.2047130862364894, + "grad_norm": 0.10970387607812881, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 311640 + }, + { + "epoch": 1.2047517434398727, + "grad_norm": 0.11749184131622314, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 311650 + }, + { + "epoch": 1.204790400643256, + "grad_norm": 0.0987529382109642, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 311660 + }, + { + "epoch": 1.2048290578466392, + "grad_norm": 0.09749867022037506, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 311670 + }, + { + "epoch": 1.2048677150500224, + "grad_norm": 0.09900732338428497, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 311680 + }, + { + "epoch": 1.2049063722534057, + "grad_norm": 0.09547862410545349, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 311690 + }, + { + "epoch": 1.204945029456789, + "grad_norm": 0.10451550036668777, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 311700 + }, + { + "epoch": 1.2049836866601722, + "grad_norm": 0.10269980877637863, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 311710 + }, + { + "epoch": 1.2050223438635554, + "grad_norm": 0.13897179067134857, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 311720 + }, + { + "epoch": 1.2050610010669387, + "grad_norm": 0.09629146009683609, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 311730 + }, + { + "epoch": 1.2050996582703222, + "grad_norm": 0.1335947960615158, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 311740 + }, + { + "epoch": 1.2051383154737054, + "grad_norm": 0.10571476817131042, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 311750 + }, + { + "epoch": 1.2051769726770887, + "grad_norm": 0.10034831613302231, + "learning_rate": 0.002, + "loss": 2.334, + "step": 311760 + }, + { + "epoch": 1.205215629880472, + "grad_norm": 0.11033013463020325, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 311770 + }, + { + "epoch": 1.2052542870838552, + "grad_norm": 0.09822043031454086, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 311780 + }, + { + "epoch": 1.2052929442872384, + "grad_norm": 0.11147646605968475, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 311790 + }, + { + "epoch": 1.2053316014906217, + "grad_norm": 0.10494410246610641, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 311800 + }, + { + "epoch": 1.2053702586940052, + "grad_norm": 0.09362241625785828, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 311810 + }, + { + "epoch": 1.2054089158973884, + "grad_norm": 0.09579791128635406, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 311820 + }, + { + "epoch": 1.2054475731007717, + "grad_norm": 0.1143781915307045, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 311830 + }, + { + "epoch": 1.205486230304155, + "grad_norm": 0.10467755794525146, + "learning_rate": 0.002, + "loss": 2.327, + "step": 311840 + }, + { + "epoch": 1.2055248875075382, + "grad_norm": 0.10779611766338348, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 311850 + }, + { + "epoch": 1.2055635447109214, + "grad_norm": 0.0962359607219696, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 311860 + }, + { + "epoch": 1.2056022019143047, + "grad_norm": 0.09667246788740158, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 311870 + }, + { + "epoch": 1.205640859117688, + "grad_norm": 0.09373782575130463, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 311880 + }, + { + "epoch": 1.2056795163210712, + "grad_norm": 0.1168796718120575, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 311890 + }, + { + "epoch": 1.2057181735244544, + "grad_norm": 0.08741265535354614, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 311900 + }, + { + "epoch": 1.205756830727838, + "grad_norm": 0.10326257348060608, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 311910 + }, + { + "epoch": 1.2057954879312212, + "grad_norm": 0.10092765092849731, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 311920 + }, + { + "epoch": 1.2058341451346044, + "grad_norm": 0.10536117851734161, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 311930 + }, + { + "epoch": 1.2058728023379877, + "grad_norm": 0.09137813001871109, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 311940 + }, + { + "epoch": 1.205911459541371, + "grad_norm": 0.10496657341718674, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 311950 + }, + { + "epoch": 1.2059501167447542, + "grad_norm": 0.10132060945034027, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 311960 + }, + { + "epoch": 1.2059887739481374, + "grad_norm": 0.11203447729349136, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 311970 + }, + { + "epoch": 1.206027431151521, + "grad_norm": 0.10228228569030762, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 311980 + }, + { + "epoch": 1.2060660883549041, + "grad_norm": 0.12769025564193726, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 311990 + }, + { + "epoch": 1.2061047455582874, + "grad_norm": 0.10657043755054474, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 312000 + }, + { + "epoch": 1.2061434027616706, + "grad_norm": 0.10488114506006241, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 312010 + }, + { + "epoch": 1.206182059965054, + "grad_norm": 0.1053939238190651, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 312020 + }, + { + "epoch": 1.2062207171684372, + "grad_norm": 0.10338485985994339, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 312030 + }, + { + "epoch": 1.2062593743718204, + "grad_norm": 0.09866391867399216, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 312040 + }, + { + "epoch": 1.2062980315752037, + "grad_norm": 0.09621734172105789, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 312050 + }, + { + "epoch": 1.206336688778587, + "grad_norm": 0.10968897491693497, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 312060 + }, + { + "epoch": 1.2063753459819702, + "grad_norm": 0.12046651542186737, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 312070 + }, + { + "epoch": 1.2064140031853536, + "grad_norm": 0.11235171556472778, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 312080 + }, + { + "epoch": 1.2064526603887369, + "grad_norm": 0.09258708357810974, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 312090 + }, + { + "epoch": 1.2064913175921201, + "grad_norm": 0.11983036249876022, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 312100 + }, + { + "epoch": 1.2065299747955034, + "grad_norm": 0.0992218405008316, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 312110 + }, + { + "epoch": 1.2065686319988866, + "grad_norm": 0.1157960370182991, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 312120 + }, + { + "epoch": 1.20660728920227, + "grad_norm": 0.11695519089698792, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 312130 + }, + { + "epoch": 1.2066459464056531, + "grad_norm": 0.14703066647052765, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 312140 + }, + { + "epoch": 1.2066846036090366, + "grad_norm": 0.10208508372306824, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 312150 + }, + { + "epoch": 1.2067232608124199, + "grad_norm": 0.10670216381549835, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 312160 + }, + { + "epoch": 1.2067619180158031, + "grad_norm": 0.1255640834569931, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 312170 + }, + { + "epoch": 1.2068005752191864, + "grad_norm": 0.09390535205602646, + "learning_rate": 0.002, + "loss": 2.3108, + "step": 312180 + }, + { + "epoch": 1.2068392324225696, + "grad_norm": 0.10780937224626541, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 312190 + }, + { + "epoch": 1.2068778896259529, + "grad_norm": 0.10417443513870239, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 312200 + }, + { + "epoch": 1.2069165468293361, + "grad_norm": 0.1079430803656578, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 312210 + }, + { + "epoch": 1.2069552040327194, + "grad_norm": 0.10621048510074615, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 312220 + }, + { + "epoch": 1.2069938612361026, + "grad_norm": 0.10394273698329926, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 312230 + }, + { + "epoch": 1.207032518439486, + "grad_norm": 0.1757679581642151, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 312240 + }, + { + "epoch": 1.2070711756428694, + "grad_norm": 0.09549301862716675, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 312250 + }, + { + "epoch": 1.2071098328462526, + "grad_norm": 0.08968798816204071, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 312260 + }, + { + "epoch": 1.2071484900496359, + "grad_norm": 0.1374940127134323, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 312270 + }, + { + "epoch": 1.2071871472530191, + "grad_norm": 0.11503007262945175, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 312280 + }, + { + "epoch": 1.2072258044564024, + "grad_norm": 0.09821897000074387, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 312290 + }, + { + "epoch": 1.2072644616597856, + "grad_norm": 0.16372337937355042, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 312300 + }, + { + "epoch": 1.2073031188631689, + "grad_norm": 0.0956815704703331, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 312310 + }, + { + "epoch": 1.2073417760665524, + "grad_norm": 0.09556156396865845, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 312320 + }, + { + "epoch": 1.2073804332699356, + "grad_norm": 0.09671517461538315, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 312330 + }, + { + "epoch": 1.2074190904733189, + "grad_norm": 0.09471859037876129, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 312340 + }, + { + "epoch": 1.207457747676702, + "grad_norm": 0.09411439299583435, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 312350 + }, + { + "epoch": 1.2074964048800854, + "grad_norm": 0.09138353914022446, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 312360 + }, + { + "epoch": 1.2075350620834686, + "grad_norm": 0.1586178094148636, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 312370 + }, + { + "epoch": 1.2075737192868519, + "grad_norm": 0.10865724086761475, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 312380 + }, + { + "epoch": 1.2076123764902351, + "grad_norm": 0.14529366791248322, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 312390 + }, + { + "epoch": 1.2076510336936184, + "grad_norm": 0.10525742173194885, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 312400 + }, + { + "epoch": 1.2076896908970018, + "grad_norm": 0.10483916103839874, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 312410 + }, + { + "epoch": 1.207728348100385, + "grad_norm": 0.097395159304142, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 312420 + }, + { + "epoch": 1.2077670053037683, + "grad_norm": 0.09661631286144257, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 312430 + }, + { + "epoch": 1.2078056625071516, + "grad_norm": 0.10141381621360779, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 312440 + }, + { + "epoch": 1.2078443197105349, + "grad_norm": 0.10463304072618484, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 312450 + }, + { + "epoch": 1.207882976913918, + "grad_norm": 0.09635960310697556, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 312460 + }, + { + "epoch": 1.2079216341173014, + "grad_norm": 0.09967007488012314, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 312470 + }, + { + "epoch": 1.2079602913206846, + "grad_norm": 0.10142817348241806, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 312480 + }, + { + "epoch": 1.207998948524068, + "grad_norm": 0.09476986527442932, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 312490 + }, + { + "epoch": 1.2080376057274513, + "grad_norm": 0.09432511031627655, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 312500 + }, + { + "epoch": 1.2080762629308346, + "grad_norm": 0.10480641573667526, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 312510 + }, + { + "epoch": 1.2081149201342178, + "grad_norm": 0.09587416052818298, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 312520 + }, + { + "epoch": 1.208153577337601, + "grad_norm": 0.1027066707611084, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 312530 + }, + { + "epoch": 1.2081922345409843, + "grad_norm": 0.1043514683842659, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 312540 + }, + { + "epoch": 1.2082308917443676, + "grad_norm": 0.10770545899868011, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 312550 + }, + { + "epoch": 1.2082695489477508, + "grad_norm": 0.09790293127298355, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 312560 + }, + { + "epoch": 1.208308206151134, + "grad_norm": 0.09138157218694687, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 312570 + }, + { + "epoch": 1.2083468633545176, + "grad_norm": 0.10595359653234482, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 312580 + }, + { + "epoch": 1.2083855205579008, + "grad_norm": 0.10801562666893005, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 312590 + }, + { + "epoch": 1.208424177761284, + "grad_norm": 0.09553363919258118, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 312600 + }, + { + "epoch": 1.2084628349646673, + "grad_norm": 0.10602040588855743, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 312610 + }, + { + "epoch": 1.2085014921680506, + "grad_norm": 0.11849401891231537, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 312620 + }, + { + "epoch": 1.2085401493714338, + "grad_norm": 0.1252974569797516, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 312630 + }, + { + "epoch": 1.208578806574817, + "grad_norm": 0.09098678082227707, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 312640 + }, + { + "epoch": 1.2086174637782003, + "grad_norm": 0.10646401345729828, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 312650 + }, + { + "epoch": 1.2086561209815838, + "grad_norm": 0.09371813386678696, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 312660 + }, + { + "epoch": 1.208694778184967, + "grad_norm": 0.11778826266527176, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 312670 + }, + { + "epoch": 1.2087334353883503, + "grad_norm": 0.11409354209899902, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 312680 + }, + { + "epoch": 1.2087720925917336, + "grad_norm": 0.1014944463968277, + "learning_rate": 0.002, + "loss": 2.332, + "step": 312690 + }, + { + "epoch": 1.2088107497951168, + "grad_norm": 0.09537586569786072, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 312700 + }, + { + "epoch": 1.2088494069985, + "grad_norm": 0.1035507544875145, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 312710 + }, + { + "epoch": 1.2088880642018833, + "grad_norm": 0.10714036971330643, + "learning_rate": 0.002, + "loss": 2.339, + "step": 312720 + }, + { + "epoch": 1.2089267214052666, + "grad_norm": 0.11011461913585663, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 312730 + }, + { + "epoch": 1.2089653786086498, + "grad_norm": 0.09917209297418594, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 312740 + }, + { + "epoch": 1.2090040358120333, + "grad_norm": 0.1113947182893753, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 312750 + }, + { + "epoch": 1.2090426930154166, + "grad_norm": 0.10072827339172363, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 312760 + }, + { + "epoch": 1.2090813502187998, + "grad_norm": 0.09403202682733536, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 312770 + }, + { + "epoch": 1.209120007422183, + "grad_norm": 0.13177041709423065, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 312780 + }, + { + "epoch": 1.2091586646255663, + "grad_norm": 0.12640121579170227, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 312790 + }, + { + "epoch": 1.2091973218289496, + "grad_norm": 0.09275078028440475, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 312800 + }, + { + "epoch": 1.2092359790323328, + "grad_norm": 0.11911021918058395, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 312810 + }, + { + "epoch": 1.209274636235716, + "grad_norm": 0.09386845678091049, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 312820 + }, + { + "epoch": 1.2093132934390995, + "grad_norm": 0.0880388468503952, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 312830 + }, + { + "epoch": 1.2093519506424828, + "grad_norm": 0.09610741585493088, + "learning_rate": 0.002, + "loss": 2.337, + "step": 312840 + }, + { + "epoch": 1.209390607845866, + "grad_norm": 0.10509993880987167, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 312850 + }, + { + "epoch": 1.2094292650492493, + "grad_norm": 0.11510360240936279, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 312860 + }, + { + "epoch": 1.2094679222526326, + "grad_norm": 0.10064709186553955, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 312870 + }, + { + "epoch": 1.2095065794560158, + "grad_norm": 0.10817084461450577, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 312880 + }, + { + "epoch": 1.209545236659399, + "grad_norm": 0.10244237631559372, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 312890 + }, + { + "epoch": 1.2095838938627823, + "grad_norm": 0.10062064230442047, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 312900 + }, + { + "epoch": 1.2096225510661656, + "grad_norm": 0.10862799733877182, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 312910 + }, + { + "epoch": 1.209661208269549, + "grad_norm": 0.11865110695362091, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 312920 + }, + { + "epoch": 1.2096998654729323, + "grad_norm": 0.1131138727068901, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 312930 + }, + { + "epoch": 1.2097385226763155, + "grad_norm": 0.08708862215280533, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 312940 + }, + { + "epoch": 1.2097771798796988, + "grad_norm": 0.1317860186100006, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 312950 + }, + { + "epoch": 1.209815837083082, + "grad_norm": 0.09572873264551163, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 312960 + }, + { + "epoch": 1.2098544942864653, + "grad_norm": 0.09569898992776871, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 312970 + }, + { + "epoch": 1.2098931514898486, + "grad_norm": 0.11135315150022507, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 312980 + }, + { + "epoch": 1.2099318086932318, + "grad_norm": 0.12566828727722168, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 312990 + }, + { + "epoch": 1.2099704658966153, + "grad_norm": 0.09661416709423065, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 313000 + }, + { + "epoch": 1.2100091230999985, + "grad_norm": 0.10898587852716446, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 313010 + }, + { + "epoch": 1.2100477803033818, + "grad_norm": 0.10114371031522751, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 313020 + }, + { + "epoch": 1.210086437506765, + "grad_norm": 0.10844885557889938, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 313030 + }, + { + "epoch": 1.2101250947101483, + "grad_norm": 0.10523076355457306, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 313040 + }, + { + "epoch": 1.2101637519135315, + "grad_norm": 0.10654118657112122, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 313050 + }, + { + "epoch": 1.2102024091169148, + "grad_norm": 0.10612224787473679, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 313060 + }, + { + "epoch": 1.210241066320298, + "grad_norm": 0.11240912228822708, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 313070 + }, + { + "epoch": 1.2102797235236813, + "grad_norm": 0.1061621680855751, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 313080 + }, + { + "epoch": 1.2103183807270648, + "grad_norm": 0.11006850004196167, + "learning_rate": 0.002, + "loss": 2.325, + "step": 313090 + }, + { + "epoch": 1.210357037930448, + "grad_norm": 0.09734800457954407, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 313100 + }, + { + "epoch": 1.2103956951338313, + "grad_norm": 0.10108668357133865, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 313110 + }, + { + "epoch": 1.2104343523372145, + "grad_norm": 0.12004440277814865, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 313120 + }, + { + "epoch": 1.2104730095405978, + "grad_norm": 0.11126869916915894, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 313130 + }, + { + "epoch": 1.210511666743981, + "grad_norm": 0.11197934299707413, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 313140 + }, + { + "epoch": 1.2105503239473643, + "grad_norm": 0.0979895368218422, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 313150 + }, + { + "epoch": 1.2105889811507475, + "grad_norm": 0.10922550410032272, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 313160 + }, + { + "epoch": 1.210627638354131, + "grad_norm": 0.12186593562364578, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 313170 + }, + { + "epoch": 1.2106662955575143, + "grad_norm": 0.10034412890672684, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 313180 + }, + { + "epoch": 1.2107049527608975, + "grad_norm": 0.47450196743011475, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 313190 + }, + { + "epoch": 1.2107436099642808, + "grad_norm": 0.1090787947177887, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 313200 + }, + { + "epoch": 1.210782267167664, + "grad_norm": 0.08918242901563644, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 313210 + }, + { + "epoch": 1.2108209243710473, + "grad_norm": 0.13283796608448029, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 313220 + }, + { + "epoch": 1.2108595815744305, + "grad_norm": 0.11585294455289841, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 313230 + }, + { + "epoch": 1.2108982387778138, + "grad_norm": 0.10596839338541031, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 313240 + }, + { + "epoch": 1.210936895981197, + "grad_norm": 0.09580260515213013, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 313250 + }, + { + "epoch": 1.2109755531845805, + "grad_norm": 0.1149885281920433, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 313260 + }, + { + "epoch": 1.2110142103879638, + "grad_norm": 0.10573212802410126, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 313270 + }, + { + "epoch": 1.211052867591347, + "grad_norm": 0.09755532443523407, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 313280 + }, + { + "epoch": 1.2110915247947303, + "grad_norm": 0.09388445317745209, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 313290 + }, + { + "epoch": 1.2111301819981135, + "grad_norm": 0.10133211314678192, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 313300 + }, + { + "epoch": 1.2111688392014968, + "grad_norm": 0.11387095600366592, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 313310 + }, + { + "epoch": 1.21120749640488, + "grad_norm": 0.10835465788841248, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 313320 + }, + { + "epoch": 1.2112461536082635, + "grad_norm": 0.10440170764923096, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 313330 + }, + { + "epoch": 1.2112848108116467, + "grad_norm": 0.09477156400680542, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 313340 + }, + { + "epoch": 1.21132346801503, + "grad_norm": 0.11110051721334457, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 313350 + }, + { + "epoch": 1.2113621252184132, + "grad_norm": 0.10621228069067001, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 313360 + }, + { + "epoch": 1.2114007824217965, + "grad_norm": 0.10771118849515915, + "learning_rate": 0.002, + "loss": 2.35, + "step": 313370 + }, + { + "epoch": 1.2114394396251797, + "grad_norm": 0.11031128466129303, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 313380 + }, + { + "epoch": 1.211478096828563, + "grad_norm": 0.10690969973802567, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 313390 + }, + { + "epoch": 1.2115167540319463, + "grad_norm": 0.12000640481710434, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 313400 + }, + { + "epoch": 1.2115554112353295, + "grad_norm": 0.11352237313985825, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 313410 + }, + { + "epoch": 1.2115940684387128, + "grad_norm": 0.11398783326148987, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 313420 + }, + { + "epoch": 1.2116327256420962, + "grad_norm": 0.1145615205168724, + "learning_rate": 0.002, + "loss": 2.33, + "step": 313430 + }, + { + "epoch": 1.2116713828454795, + "grad_norm": 0.11705408245325089, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 313440 + }, + { + "epoch": 1.2117100400488627, + "grad_norm": 0.09482872486114502, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 313450 + }, + { + "epoch": 1.211748697252246, + "grad_norm": 0.10971511900424957, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 313460 + }, + { + "epoch": 1.2117873544556292, + "grad_norm": 0.11067706346511841, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 313470 + }, + { + "epoch": 1.2118260116590125, + "grad_norm": 0.09540124237537384, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 313480 + }, + { + "epoch": 1.2118646688623957, + "grad_norm": 0.0967855378985405, + "learning_rate": 0.002, + "loss": 2.345, + "step": 313490 + }, + { + "epoch": 1.2119033260657792, + "grad_norm": 0.1019299179315567, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 313500 + }, + { + "epoch": 1.2119419832691625, + "grad_norm": 0.1015499085187912, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 313510 + }, + { + "epoch": 1.2119806404725457, + "grad_norm": 0.08799052983522415, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 313520 + }, + { + "epoch": 1.212019297675929, + "grad_norm": 0.09830170124769211, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 313530 + }, + { + "epoch": 1.2120579548793122, + "grad_norm": 0.14667995274066925, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 313540 + }, + { + "epoch": 1.2120966120826955, + "grad_norm": 0.0980951189994812, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 313550 + }, + { + "epoch": 1.2121352692860787, + "grad_norm": 0.10406613349914551, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 313560 + }, + { + "epoch": 1.212173926489462, + "grad_norm": 0.09259974956512451, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 313570 + }, + { + "epoch": 1.2122125836928452, + "grad_norm": 0.10583232343196869, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 313580 + }, + { + "epoch": 1.2122512408962285, + "grad_norm": 0.11357071995735168, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 313590 + }, + { + "epoch": 1.212289898099612, + "grad_norm": 0.1437079757452011, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 313600 + }, + { + "epoch": 1.2123285553029952, + "grad_norm": 0.1140052005648613, + "learning_rate": 0.002, + "loss": 2.338, + "step": 313610 + }, + { + "epoch": 1.2123672125063785, + "grad_norm": 0.10280095785856247, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 313620 + }, + { + "epoch": 1.2124058697097617, + "grad_norm": 0.09381737560033798, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 313630 + }, + { + "epoch": 1.212444526913145, + "grad_norm": 0.12156960368156433, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 313640 + }, + { + "epoch": 1.2124831841165282, + "grad_norm": 0.10479841381311417, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 313650 + }, + { + "epoch": 1.2125218413199115, + "grad_norm": 0.12612693011760712, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 313660 + }, + { + "epoch": 1.212560498523295, + "grad_norm": 0.10016128420829773, + "learning_rate": 0.002, + "loss": 2.33, + "step": 313670 + }, + { + "epoch": 1.2125991557266782, + "grad_norm": 0.11436531692743301, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 313680 + }, + { + "epoch": 1.2126378129300615, + "grad_norm": 0.10612929612398148, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 313690 + }, + { + "epoch": 1.2126764701334447, + "grad_norm": 0.13559940457344055, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 313700 + }, + { + "epoch": 1.212715127336828, + "grad_norm": 0.12024310231208801, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 313710 + }, + { + "epoch": 1.2127537845402112, + "grad_norm": 0.12355874478816986, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 313720 + }, + { + "epoch": 1.2127924417435945, + "grad_norm": 0.10474978387355804, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 313730 + }, + { + "epoch": 1.2128310989469777, + "grad_norm": 0.10101180523633957, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 313740 + }, + { + "epoch": 1.212869756150361, + "grad_norm": 0.09087458997964859, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 313750 + }, + { + "epoch": 1.2129084133537442, + "grad_norm": 0.0855170488357544, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 313760 + }, + { + "epoch": 1.2129470705571277, + "grad_norm": 0.11923859268426895, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 313770 + }, + { + "epoch": 1.212985727760511, + "grad_norm": 0.12264309078454971, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 313780 + }, + { + "epoch": 1.2130243849638942, + "grad_norm": 0.10807929933071136, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 313790 + }, + { + "epoch": 1.2130630421672774, + "grad_norm": 0.09908405691385269, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 313800 + }, + { + "epoch": 1.2131016993706607, + "grad_norm": 0.10864149034023285, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 313810 + }, + { + "epoch": 1.213140356574044, + "grad_norm": 0.09791579842567444, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 313820 + }, + { + "epoch": 1.2131790137774272, + "grad_norm": 0.09416855126619339, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 313830 + }, + { + "epoch": 1.2132176709808107, + "grad_norm": 0.11778811365365982, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 313840 + }, + { + "epoch": 1.213256328184194, + "grad_norm": 0.10956388711929321, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 313850 + }, + { + "epoch": 1.2132949853875772, + "grad_norm": 0.10013221949338913, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 313860 + }, + { + "epoch": 1.2133336425909604, + "grad_norm": 0.10736773908138275, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 313870 + }, + { + "epoch": 1.2133722997943437, + "grad_norm": 0.09436599910259247, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 313880 + }, + { + "epoch": 1.213410956997727, + "grad_norm": 0.11187426745891571, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 313890 + }, + { + "epoch": 1.2134496142011102, + "grad_norm": 0.10453151166439056, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 313900 + }, + { + "epoch": 1.2134882714044934, + "grad_norm": 0.11502104997634888, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 313910 + }, + { + "epoch": 1.2135269286078767, + "grad_norm": 0.10102856159210205, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 313920 + }, + { + "epoch": 1.21356558581126, + "grad_norm": 0.09505726397037506, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 313930 + }, + { + "epoch": 1.2136042430146434, + "grad_norm": 0.09413469582796097, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 313940 + }, + { + "epoch": 1.2136429002180267, + "grad_norm": 0.11024481058120728, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 313950 + }, + { + "epoch": 1.21368155742141, + "grad_norm": 0.0951482281088829, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 313960 + }, + { + "epoch": 1.2137202146247932, + "grad_norm": 0.1029760017991066, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 313970 + }, + { + "epoch": 1.2137588718281764, + "grad_norm": 0.10141143202781677, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 313980 + }, + { + "epoch": 1.2137975290315597, + "grad_norm": 0.09400579333305359, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 313990 + }, + { + "epoch": 1.213836186234943, + "grad_norm": 0.5630881786346436, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 314000 + }, + { + "epoch": 1.2138748434383264, + "grad_norm": 0.1237252727150917, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 314010 + }, + { + "epoch": 1.2139135006417097, + "grad_norm": 0.09510214626789093, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 314020 + }, + { + "epoch": 1.213952157845093, + "grad_norm": 0.09647620469331741, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 314030 + }, + { + "epoch": 1.2139908150484762, + "grad_norm": 0.10102799534797668, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 314040 + }, + { + "epoch": 1.2140294722518594, + "grad_norm": 0.0946962758898735, + "learning_rate": 0.002, + "loss": 2.33, + "step": 314050 + }, + { + "epoch": 1.2140681294552427, + "grad_norm": 0.10435698181390762, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 314060 + }, + { + "epoch": 1.214106786658626, + "grad_norm": 0.10194170475006104, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 314070 + }, + { + "epoch": 1.2141454438620092, + "grad_norm": 0.12994642555713654, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 314080 + }, + { + "epoch": 1.2141841010653924, + "grad_norm": 0.09357239305973053, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 314090 + }, + { + "epoch": 1.2142227582687757, + "grad_norm": 0.09413903206586838, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 314100 + }, + { + "epoch": 1.2142614154721592, + "grad_norm": 0.13679394125938416, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 314110 + }, + { + "epoch": 1.2143000726755424, + "grad_norm": 0.09909292310476303, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 314120 + }, + { + "epoch": 1.2143387298789257, + "grad_norm": 0.09950712323188782, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 314130 + }, + { + "epoch": 1.214377387082309, + "grad_norm": 0.09253106266260147, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 314140 + }, + { + "epoch": 1.2144160442856922, + "grad_norm": 0.10468227416276932, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 314150 + }, + { + "epoch": 1.2144547014890754, + "grad_norm": 0.09493131935596466, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 314160 + }, + { + "epoch": 1.2144933586924587, + "grad_norm": 0.1063208281993866, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 314170 + }, + { + "epoch": 1.2145320158958421, + "grad_norm": 0.09948117285966873, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 314180 + }, + { + "epoch": 1.2145706730992254, + "grad_norm": 0.10003940761089325, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 314190 + }, + { + "epoch": 1.2146093303026086, + "grad_norm": 0.10735779255628586, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 314200 + }, + { + "epoch": 1.214647987505992, + "grad_norm": 0.10176675766706467, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 314210 + }, + { + "epoch": 1.2146866447093752, + "grad_norm": 0.10090947151184082, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 314220 + }, + { + "epoch": 1.2147253019127584, + "grad_norm": 0.0865064188838005, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 314230 + }, + { + "epoch": 1.2147639591161417, + "grad_norm": 0.10404794663190842, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 314240 + }, + { + "epoch": 1.214802616319525, + "grad_norm": 0.10815301537513733, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 314250 + }, + { + "epoch": 1.2148412735229082, + "grad_norm": 0.10101471096277237, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 314260 + }, + { + "epoch": 1.2148799307262914, + "grad_norm": 0.11894841492176056, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 314270 + }, + { + "epoch": 1.2149185879296749, + "grad_norm": 0.11323252320289612, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 314280 + }, + { + "epoch": 1.2149572451330581, + "grad_norm": 0.1007615402340889, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 314290 + }, + { + "epoch": 1.2149959023364414, + "grad_norm": 0.0997728779911995, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 314300 + }, + { + "epoch": 1.2150345595398246, + "grad_norm": 0.11074774712324142, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 314310 + }, + { + "epoch": 1.215073216743208, + "grad_norm": 0.11069449782371521, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 314320 + }, + { + "epoch": 1.2151118739465911, + "grad_norm": 0.12711121141910553, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 314330 + }, + { + "epoch": 1.2151505311499744, + "grad_norm": 0.10363688319921494, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 314340 + }, + { + "epoch": 1.2151891883533579, + "grad_norm": 0.11091338843107224, + "learning_rate": 0.002, + "loss": 2.332, + "step": 314350 + }, + { + "epoch": 1.2152278455567411, + "grad_norm": 0.10448306798934937, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 314360 + }, + { + "epoch": 1.2152665027601244, + "grad_norm": 0.10605467110872269, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 314370 + }, + { + "epoch": 1.2153051599635076, + "grad_norm": 0.09692072123289108, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 314380 + }, + { + "epoch": 1.2153438171668909, + "grad_norm": 0.09874030947685242, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 314390 + }, + { + "epoch": 1.2153824743702741, + "grad_norm": 0.09986934810876846, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 314400 + }, + { + "epoch": 1.2154211315736574, + "grad_norm": 0.10098947584629059, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 314410 + }, + { + "epoch": 1.2154597887770406, + "grad_norm": 0.10305691510438919, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 314420 + }, + { + "epoch": 1.215498445980424, + "grad_norm": 0.10658477991819382, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 314430 + }, + { + "epoch": 1.2155371031838074, + "grad_norm": 0.10453560948371887, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 314440 + }, + { + "epoch": 1.2155757603871906, + "grad_norm": 0.12518541514873505, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 314450 + }, + { + "epoch": 1.2156144175905739, + "grad_norm": 0.10759428143501282, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 314460 + }, + { + "epoch": 1.2156530747939571, + "grad_norm": 0.139102503657341, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 314470 + }, + { + "epoch": 1.2156917319973404, + "grad_norm": 0.12430110573768616, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 314480 + }, + { + "epoch": 1.2157303892007236, + "grad_norm": 0.10704773664474487, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 314490 + }, + { + "epoch": 1.2157690464041069, + "grad_norm": 0.09441366046667099, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 314500 + }, + { + "epoch": 1.2158077036074901, + "grad_norm": 0.1318897306919098, + "learning_rate": 0.002, + "loss": 2.341, + "step": 314510 + }, + { + "epoch": 1.2158463608108736, + "grad_norm": 0.09802470356225967, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 314520 + }, + { + "epoch": 1.2158850180142569, + "grad_norm": 0.09931337088346481, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 314530 + }, + { + "epoch": 1.21592367521764, + "grad_norm": 0.5061547756195068, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 314540 + }, + { + "epoch": 1.2159623324210234, + "grad_norm": 0.10328753292560577, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 314550 + }, + { + "epoch": 1.2160009896244066, + "grad_norm": 0.09852741658687592, + "learning_rate": 0.002, + "loss": 2.3575, + "step": 314560 + }, + { + "epoch": 1.2160396468277899, + "grad_norm": 0.0911833718419075, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 314570 + }, + { + "epoch": 1.2160783040311731, + "grad_norm": 0.10800252854824066, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 314580 + }, + { + "epoch": 1.2161169612345564, + "grad_norm": 0.10898005962371826, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 314590 + }, + { + "epoch": 1.2161556184379396, + "grad_norm": 0.12181519716978073, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 314600 + }, + { + "epoch": 1.216194275641323, + "grad_norm": 0.10061764717102051, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 314610 + }, + { + "epoch": 1.2162329328447063, + "grad_norm": 0.09571439027786255, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 314620 + }, + { + "epoch": 1.2162715900480896, + "grad_norm": 0.17038051784038544, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 314630 + }, + { + "epoch": 1.2163102472514729, + "grad_norm": 0.1250414401292801, + "learning_rate": 0.002, + "loss": 2.344, + "step": 314640 + }, + { + "epoch": 1.216348904454856, + "grad_norm": 0.09042432904243469, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 314650 + }, + { + "epoch": 1.2163875616582394, + "grad_norm": 0.09905265271663666, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 314660 + }, + { + "epoch": 1.2164262188616226, + "grad_norm": 0.11659180372953415, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 314670 + }, + { + "epoch": 1.2164648760650059, + "grad_norm": 0.10536820441484451, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 314680 + }, + { + "epoch": 1.2165035332683893, + "grad_norm": 0.10633594542741776, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 314690 + }, + { + "epoch": 1.2165421904717726, + "grad_norm": 0.10117705166339874, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 314700 + }, + { + "epoch": 1.2165808476751558, + "grad_norm": 0.11793801188468933, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 314710 + }, + { + "epoch": 1.216619504878539, + "grad_norm": 0.1170022115111351, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 314720 + }, + { + "epoch": 1.2166581620819223, + "grad_norm": 0.108283132314682, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 314730 + }, + { + "epoch": 1.2166968192853056, + "grad_norm": 0.2359154373407364, + "learning_rate": 0.002, + "loss": 2.344, + "step": 314740 + }, + { + "epoch": 1.2167354764886888, + "grad_norm": 0.10665125399827957, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 314750 + }, + { + "epoch": 1.216774133692072, + "grad_norm": 0.11395130306482315, + "learning_rate": 0.002, + "loss": 2.334, + "step": 314760 + }, + { + "epoch": 1.2168127908954554, + "grad_norm": 0.0980997234582901, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 314770 + }, + { + "epoch": 1.2168514480988388, + "grad_norm": 0.10164439678192139, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 314780 + }, + { + "epoch": 1.216890105302222, + "grad_norm": 0.2382308840751648, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 314790 + }, + { + "epoch": 1.2169287625056053, + "grad_norm": 0.10206914693117142, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 314800 + }, + { + "epoch": 1.2169674197089886, + "grad_norm": 0.1029033362865448, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 314810 + }, + { + "epoch": 1.2170060769123718, + "grad_norm": 0.11026882380247116, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 314820 + }, + { + "epoch": 1.217044734115755, + "grad_norm": 0.09904853254556656, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 314830 + }, + { + "epoch": 1.2170833913191383, + "grad_norm": 0.10073044896125793, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 314840 + }, + { + "epoch": 1.2171220485225216, + "grad_norm": 0.10072771459817886, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 314850 + }, + { + "epoch": 1.217160705725905, + "grad_norm": 0.11892835050821304, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 314860 + }, + { + "epoch": 1.2171993629292883, + "grad_norm": 0.10275215655565262, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 314870 + }, + { + "epoch": 1.2172380201326716, + "grad_norm": 0.13225539028644562, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 314880 + }, + { + "epoch": 1.2172766773360548, + "grad_norm": 0.10573133081197739, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 314890 + }, + { + "epoch": 1.217315334539438, + "grad_norm": 0.1118333712220192, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 314900 + }, + { + "epoch": 1.2173539917428213, + "grad_norm": 0.1679127812385559, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 314910 + }, + { + "epoch": 1.2173926489462046, + "grad_norm": 0.10684831440448761, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 314920 + }, + { + "epoch": 1.2174313061495878, + "grad_norm": 0.10050488263368607, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 314930 + }, + { + "epoch": 1.217469963352971, + "grad_norm": 0.11194442957639694, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 314940 + }, + { + "epoch": 1.2175086205563546, + "grad_norm": 0.10014735907316208, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 314950 + }, + { + "epoch": 1.2175472777597378, + "grad_norm": 0.1252419799566269, + "learning_rate": 0.002, + "loss": 2.338, + "step": 314960 + }, + { + "epoch": 1.217585934963121, + "grad_norm": 0.11564713716506958, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 314970 + }, + { + "epoch": 1.2176245921665043, + "grad_norm": 0.11097046732902527, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 314980 + }, + { + "epoch": 1.2176632493698876, + "grad_norm": 0.09906239062547684, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 314990 + }, + { + "epoch": 1.2177019065732708, + "grad_norm": 0.11770200729370117, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 315000 + }, + { + "epoch": 1.217740563776654, + "grad_norm": 0.10562057793140411, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 315010 + }, + { + "epoch": 1.2177792209800373, + "grad_norm": 0.16002397239208221, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 315020 + }, + { + "epoch": 1.2178178781834208, + "grad_norm": 0.14758117496967316, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 315030 + }, + { + "epoch": 1.217856535386804, + "grad_norm": 0.11798250675201416, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 315040 + }, + { + "epoch": 1.2178951925901873, + "grad_norm": 0.1063949316740036, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 315050 + }, + { + "epoch": 1.2179338497935706, + "grad_norm": 0.10853171348571777, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 315060 + }, + { + "epoch": 1.2179725069969538, + "grad_norm": 0.10362334549427032, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 315070 + }, + { + "epoch": 1.218011164200337, + "grad_norm": 0.10557835549116135, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 315080 + }, + { + "epoch": 1.2180498214037203, + "grad_norm": 0.13382239639759064, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 315090 + }, + { + "epoch": 1.2180884786071036, + "grad_norm": 0.10299064218997955, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 315100 + }, + { + "epoch": 1.2181271358104868, + "grad_norm": 0.11686524003744125, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 315110 + }, + { + "epoch": 1.2181657930138703, + "grad_norm": 0.09765559434890747, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 315120 + }, + { + "epoch": 1.2182044502172535, + "grad_norm": 0.12375412881374359, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 315130 + }, + { + "epoch": 1.2182431074206368, + "grad_norm": 0.10474059730768204, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 315140 + }, + { + "epoch": 1.21828176462402, + "grad_norm": 0.11837010085582733, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 315150 + }, + { + "epoch": 1.2183204218274033, + "grad_norm": 0.11289907246828079, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 315160 + }, + { + "epoch": 1.2183590790307866, + "grad_norm": 0.11022341251373291, + "learning_rate": 0.002, + "loss": 2.34, + "step": 315170 + }, + { + "epoch": 1.2183977362341698, + "grad_norm": 0.11386298388242722, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 315180 + }, + { + "epoch": 1.2184363934375533, + "grad_norm": 0.12257837504148483, + "learning_rate": 0.002, + "loss": 2.327, + "step": 315190 + }, + { + "epoch": 1.2184750506409365, + "grad_norm": 0.11156097054481506, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 315200 + }, + { + "epoch": 1.2185137078443198, + "grad_norm": 0.10310366004705429, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 315210 + }, + { + "epoch": 1.218552365047703, + "grad_norm": 0.10431301593780518, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 315220 + }, + { + "epoch": 1.2185910222510863, + "grad_norm": 0.09562677890062332, + "learning_rate": 0.002, + "loss": 2.332, + "step": 315230 + }, + { + "epoch": 1.2186296794544695, + "grad_norm": 0.09988576918840408, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 315240 + }, + { + "epoch": 1.2186683366578528, + "grad_norm": 0.11268305033445358, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 315250 + }, + { + "epoch": 1.218706993861236, + "grad_norm": 0.1090041846036911, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 315260 + }, + { + "epoch": 1.2187456510646193, + "grad_norm": 0.09609409421682358, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 315270 + }, + { + "epoch": 1.2187843082680025, + "grad_norm": 0.10149850696325302, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 315280 + }, + { + "epoch": 1.218822965471386, + "grad_norm": 0.1283339262008667, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 315290 + }, + { + "epoch": 1.2188616226747693, + "grad_norm": 0.11716891080141068, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 315300 + }, + { + "epoch": 1.2189002798781525, + "grad_norm": 0.10556206852197647, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 315310 + }, + { + "epoch": 1.2189389370815358, + "grad_norm": 0.1255723237991333, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 315320 + }, + { + "epoch": 1.218977594284919, + "grad_norm": 0.09731069207191467, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 315330 + }, + { + "epoch": 1.2190162514883023, + "grad_norm": 0.11111821234226227, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 315340 + }, + { + "epoch": 1.2190549086916855, + "grad_norm": 0.10132262110710144, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 315350 + }, + { + "epoch": 1.219093565895069, + "grad_norm": 0.11700930446386337, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 315360 + }, + { + "epoch": 1.2191322230984523, + "grad_norm": 0.10876394808292389, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 315370 + }, + { + "epoch": 1.2191708803018355, + "grad_norm": 0.1077607125043869, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 315380 + }, + { + "epoch": 1.2192095375052188, + "grad_norm": 0.10873237252235413, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 315390 + }, + { + "epoch": 1.219248194708602, + "grad_norm": 0.10627356171607971, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 315400 + }, + { + "epoch": 1.2192868519119853, + "grad_norm": 0.1141803041100502, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 315410 + }, + { + "epoch": 1.2193255091153685, + "grad_norm": 0.1050865575671196, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 315420 + }, + { + "epoch": 1.2193641663187518, + "grad_norm": 0.09554886817932129, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 315430 + }, + { + "epoch": 1.219402823522135, + "grad_norm": 0.10202132165431976, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 315440 + }, + { + "epoch": 1.2194414807255183, + "grad_norm": 0.11309561878442764, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 315450 + }, + { + "epoch": 1.2194801379289018, + "grad_norm": 0.14103220403194427, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 315460 + }, + { + "epoch": 1.219518795132285, + "grad_norm": 0.10153191536664963, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 315470 + }, + { + "epoch": 1.2195574523356683, + "grad_norm": 0.09797697514295578, + "learning_rate": 0.002, + "loss": 2.336, + "step": 315480 + }, + { + "epoch": 1.2195961095390515, + "grad_norm": 0.11822252720594406, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 315490 + }, + { + "epoch": 1.2196347667424348, + "grad_norm": 0.09600470960140228, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 315500 + }, + { + "epoch": 1.219673423945818, + "grad_norm": 0.09823154658079147, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 315510 + }, + { + "epoch": 1.2197120811492013, + "grad_norm": 0.1113450825214386, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 315520 + }, + { + "epoch": 1.2197507383525847, + "grad_norm": 0.11101141571998596, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 315530 + }, + { + "epoch": 1.219789395555968, + "grad_norm": 0.1019652783870697, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 315540 + }, + { + "epoch": 1.2198280527593512, + "grad_norm": 0.13949422538280487, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 315550 + }, + { + "epoch": 1.2198667099627345, + "grad_norm": 0.10365574806928635, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 315560 + }, + { + "epoch": 1.2199053671661177, + "grad_norm": 0.10307446122169495, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 315570 + }, + { + "epoch": 1.219944024369501, + "grad_norm": 0.09809573739767075, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 315580 + }, + { + "epoch": 1.2199826815728843, + "grad_norm": 0.11042384803295135, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 315590 + }, + { + "epoch": 1.2200213387762675, + "grad_norm": 0.10844165086746216, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 315600 + }, + { + "epoch": 1.2200599959796508, + "grad_norm": 0.10989883542060852, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 315610 + }, + { + "epoch": 1.220098653183034, + "grad_norm": 0.10099762678146362, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 315620 + }, + { + "epoch": 1.2201373103864175, + "grad_norm": 0.14972710609436035, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 315630 + }, + { + "epoch": 1.2201759675898007, + "grad_norm": 0.103081613779068, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 315640 + }, + { + "epoch": 1.220214624793184, + "grad_norm": 0.10600491613149643, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 315650 + }, + { + "epoch": 1.2202532819965672, + "grad_norm": 0.11377498507499695, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 315660 + }, + { + "epoch": 1.2202919391999505, + "grad_norm": 0.1005830317735672, + "learning_rate": 0.002, + "loss": 2.3121, + "step": 315670 + }, + { + "epoch": 1.2203305964033337, + "grad_norm": 0.10253360122442245, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 315680 + }, + { + "epoch": 1.220369253606717, + "grad_norm": 0.11248026043176651, + "learning_rate": 0.002, + "loss": 2.34, + "step": 315690 + }, + { + "epoch": 1.2204079108101005, + "grad_norm": 0.11137398332357407, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 315700 + }, + { + "epoch": 1.2204465680134837, + "grad_norm": 0.10283508896827698, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 315710 + }, + { + "epoch": 1.220485225216867, + "grad_norm": 0.1013549268245697, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 315720 + }, + { + "epoch": 1.2205238824202502, + "grad_norm": 0.10436509549617767, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 315730 + }, + { + "epoch": 1.2205625396236335, + "grad_norm": 0.11275258660316467, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 315740 + }, + { + "epoch": 1.2206011968270167, + "grad_norm": 0.1164979338645935, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 315750 + }, + { + "epoch": 1.2206398540304, + "grad_norm": 0.11312644928693771, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 315760 + }, + { + "epoch": 1.2206785112337832, + "grad_norm": 0.09488767385482788, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 315770 + }, + { + "epoch": 1.2207171684371665, + "grad_norm": 0.11264296621084213, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 315780 + }, + { + "epoch": 1.2207558256405497, + "grad_norm": 0.10742346942424774, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 315790 + }, + { + "epoch": 1.2207944828439332, + "grad_norm": 0.10454041510820389, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 315800 + }, + { + "epoch": 1.2208331400473165, + "grad_norm": 0.11628180742263794, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 315810 + }, + { + "epoch": 1.2208717972506997, + "grad_norm": 0.10877648741006851, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 315820 + }, + { + "epoch": 1.220910454454083, + "grad_norm": 0.09616900235414505, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 315830 + }, + { + "epoch": 1.2209491116574662, + "grad_norm": 0.10478873550891876, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 315840 + }, + { + "epoch": 1.2209877688608495, + "grad_norm": 0.09449452906847, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 315850 + }, + { + "epoch": 1.2210264260642327, + "grad_norm": 0.09216703474521637, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 315860 + }, + { + "epoch": 1.2210650832676162, + "grad_norm": 0.09965404868125916, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 315870 + }, + { + "epoch": 1.2211037404709995, + "grad_norm": 0.119733527302742, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 315880 + }, + { + "epoch": 1.2211423976743827, + "grad_norm": 0.09414691478013992, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 315890 + }, + { + "epoch": 1.221181054877766, + "grad_norm": 0.10925483703613281, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 315900 + }, + { + "epoch": 1.2212197120811492, + "grad_norm": 0.10677146911621094, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 315910 + }, + { + "epoch": 1.2212583692845325, + "grad_norm": 0.09887148439884186, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 315920 + }, + { + "epoch": 1.2212970264879157, + "grad_norm": 0.09858877211809158, + "learning_rate": 0.002, + "loss": 2.328, + "step": 315930 + }, + { + "epoch": 1.221335683691299, + "grad_norm": 0.16472014784812927, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 315940 + }, + { + "epoch": 1.2213743408946822, + "grad_norm": 0.11468665301799774, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 315950 + }, + { + "epoch": 1.2214129980980655, + "grad_norm": 0.12694212794303894, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 315960 + }, + { + "epoch": 1.221451655301449, + "grad_norm": 0.12337895482778549, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 315970 + }, + { + "epoch": 1.2214903125048322, + "grad_norm": 0.10966597497463226, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 315980 + }, + { + "epoch": 1.2215289697082155, + "grad_norm": 0.17105089128017426, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 315990 + }, + { + "epoch": 1.2215676269115987, + "grad_norm": 0.13237425684928894, + "learning_rate": 0.002, + "loss": 2.342, + "step": 316000 + }, + { + "epoch": 1.221606284114982, + "grad_norm": 0.08818645030260086, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 316010 + }, + { + "epoch": 1.2216449413183652, + "grad_norm": 0.09917477518320084, + "learning_rate": 0.002, + "loss": 2.338, + "step": 316020 + }, + { + "epoch": 1.2216835985217485, + "grad_norm": 0.10293343663215637, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 316030 + }, + { + "epoch": 1.221722255725132, + "grad_norm": 0.11665060371160507, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 316040 + }, + { + "epoch": 1.2217609129285152, + "grad_norm": 0.09830315411090851, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 316050 + }, + { + "epoch": 1.2217995701318984, + "grad_norm": 0.13115578889846802, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 316060 + }, + { + "epoch": 1.2218382273352817, + "grad_norm": 0.10233128070831299, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 316070 + }, + { + "epoch": 1.221876884538665, + "grad_norm": 0.09380917251110077, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 316080 + }, + { + "epoch": 1.2219155417420482, + "grad_norm": 0.10349245369434357, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 316090 + }, + { + "epoch": 1.2219541989454314, + "grad_norm": 0.10417374223470688, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 316100 + }, + { + "epoch": 1.2219928561488147, + "grad_norm": 0.10841956734657288, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 316110 + }, + { + "epoch": 1.222031513352198, + "grad_norm": 0.12777014076709747, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 316120 + }, + { + "epoch": 1.2220701705555812, + "grad_norm": 0.12356674671173096, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 316130 + }, + { + "epoch": 1.2221088277589647, + "grad_norm": 0.11700879037380219, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 316140 + }, + { + "epoch": 1.222147484962348, + "grad_norm": 0.10147671401500702, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 316150 + }, + { + "epoch": 1.2221861421657312, + "grad_norm": 0.12761598825454712, + "learning_rate": 0.002, + "loss": 2.339, + "step": 316160 + }, + { + "epoch": 1.2222247993691144, + "grad_norm": 0.10373452305793762, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 316170 + }, + { + "epoch": 1.2222634565724977, + "grad_norm": 0.10208388417959213, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 316180 + }, + { + "epoch": 1.222302113775881, + "grad_norm": 0.0917031541466713, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 316190 + }, + { + "epoch": 1.2223407709792642, + "grad_norm": 0.10595124959945679, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 316200 + }, + { + "epoch": 1.2223794281826477, + "grad_norm": 0.1208588257431984, + "learning_rate": 0.002, + "loss": 2.3159, + "step": 316210 + }, + { + "epoch": 1.222418085386031, + "grad_norm": 0.1297757923603058, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 316220 + }, + { + "epoch": 1.2224567425894142, + "grad_norm": 0.1111166700720787, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 316230 + }, + { + "epoch": 1.2224953997927974, + "grad_norm": 0.12035632878541946, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 316240 + }, + { + "epoch": 1.2225340569961807, + "grad_norm": 0.0926397442817688, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 316250 + }, + { + "epoch": 1.222572714199564, + "grad_norm": 0.109648197889328, + "learning_rate": 0.002, + "loss": 2.334, + "step": 316260 + }, + { + "epoch": 1.2226113714029472, + "grad_norm": 0.097147636115551, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 316270 + }, + { + "epoch": 1.2226500286063304, + "grad_norm": 0.10870775580406189, + "learning_rate": 0.002, + "loss": 2.337, + "step": 316280 + }, + { + "epoch": 1.2226886858097137, + "grad_norm": 0.09800266474485397, + "learning_rate": 0.002, + "loss": 2.345, + "step": 316290 + }, + { + "epoch": 1.2227273430130972, + "grad_norm": 0.11024896800518036, + "learning_rate": 0.002, + "loss": 2.336, + "step": 316300 + }, + { + "epoch": 1.2227660002164804, + "grad_norm": 0.10475356131792068, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 316310 + }, + { + "epoch": 1.2228046574198637, + "grad_norm": 0.11313185840845108, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 316320 + }, + { + "epoch": 1.222843314623247, + "grad_norm": 0.10262496024370193, + "learning_rate": 0.002, + "loss": 2.33, + "step": 316330 + }, + { + "epoch": 1.2228819718266302, + "grad_norm": 0.09623805433511734, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 316340 + }, + { + "epoch": 1.2229206290300134, + "grad_norm": 0.10645546764135361, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 316350 + }, + { + "epoch": 1.2229592862333967, + "grad_norm": 0.1368713229894638, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 316360 + }, + { + "epoch": 1.22299794343678, + "grad_norm": 0.2733914256095886, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 316370 + }, + { + "epoch": 1.2230366006401634, + "grad_norm": 0.111940898001194, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 316380 + }, + { + "epoch": 1.2230752578435466, + "grad_norm": 0.0978144183754921, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 316390 + }, + { + "epoch": 1.22311391504693, + "grad_norm": 0.11087758839130402, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 316400 + }, + { + "epoch": 1.2231525722503132, + "grad_norm": 0.10408814996480942, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 316410 + }, + { + "epoch": 1.2231912294536964, + "grad_norm": 0.09981653094291687, + "learning_rate": 0.002, + "loss": 2.34, + "step": 316420 + }, + { + "epoch": 1.2232298866570797, + "grad_norm": 0.10293741524219513, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 316430 + }, + { + "epoch": 1.223268543860463, + "grad_norm": 0.10573162138462067, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 316440 + }, + { + "epoch": 1.2233072010638462, + "grad_norm": 0.09074226766824722, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 316450 + }, + { + "epoch": 1.2233458582672294, + "grad_norm": 0.103923000395298, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 316460 + }, + { + "epoch": 1.2233845154706129, + "grad_norm": 0.10840972512960434, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 316470 + }, + { + "epoch": 1.2234231726739961, + "grad_norm": 0.10516613721847534, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 316480 + }, + { + "epoch": 1.2234618298773794, + "grad_norm": 0.10513375699520111, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 316490 + }, + { + "epoch": 1.2235004870807626, + "grad_norm": 0.0963342934846878, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 316500 + }, + { + "epoch": 1.223539144284146, + "grad_norm": 0.114549919962883, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 316510 + }, + { + "epoch": 1.2235778014875291, + "grad_norm": 0.11324504762887955, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 316520 + }, + { + "epoch": 1.2236164586909124, + "grad_norm": 0.11725080758333206, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 316530 + }, + { + "epoch": 1.2236551158942957, + "grad_norm": 0.10726740211248398, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 316540 + }, + { + "epoch": 1.2236937730976791, + "grad_norm": 0.10920976847410202, + "learning_rate": 0.002, + "loss": 2.34, + "step": 316550 + }, + { + "epoch": 1.2237324303010624, + "grad_norm": 0.11233483254909515, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 316560 + }, + { + "epoch": 1.2237710875044456, + "grad_norm": 0.10754575580358505, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 316570 + }, + { + "epoch": 1.2238097447078289, + "grad_norm": 0.26863402128219604, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 316580 + }, + { + "epoch": 1.2238484019112121, + "grad_norm": 1.1302027702331543, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 316590 + }, + { + "epoch": 1.2238870591145954, + "grad_norm": 1.478796124458313, + "learning_rate": 0.002, + "loss": 2.3738, + "step": 316600 + }, + { + "epoch": 1.2239257163179786, + "grad_norm": 0.12988591194152832, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 316610 + }, + { + "epoch": 1.223964373521362, + "grad_norm": 0.09178368747234344, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 316620 + }, + { + "epoch": 1.2240030307247451, + "grad_norm": 0.10592716187238693, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 316630 + }, + { + "epoch": 1.2240416879281286, + "grad_norm": 0.09742153435945511, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 316640 + }, + { + "epoch": 1.2240803451315119, + "grad_norm": 0.11273608356714249, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 316650 + }, + { + "epoch": 1.2241190023348951, + "grad_norm": 0.09497487545013428, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 316660 + }, + { + "epoch": 1.2241576595382784, + "grad_norm": 0.0983516052365303, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 316670 + }, + { + "epoch": 1.2241963167416616, + "grad_norm": 0.11890359967947006, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 316680 + }, + { + "epoch": 1.2242349739450449, + "grad_norm": 0.09866367280483246, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 316690 + }, + { + "epoch": 1.2242736311484281, + "grad_norm": 0.1096860021352768, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 316700 + }, + { + "epoch": 1.2243122883518114, + "grad_norm": 0.13388590514659882, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 316710 + }, + { + "epoch": 1.2243509455551949, + "grad_norm": 0.1254730373620987, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 316720 + }, + { + "epoch": 1.224389602758578, + "grad_norm": 0.09265025705099106, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 316730 + }, + { + "epoch": 1.2244282599619614, + "grad_norm": 0.1134009137749672, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 316740 + }, + { + "epoch": 1.2244669171653446, + "grad_norm": 0.10628406703472137, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 316750 + }, + { + "epoch": 1.2245055743687279, + "grad_norm": 0.11307065188884735, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 316760 + }, + { + "epoch": 1.2245442315721111, + "grad_norm": 0.10712608695030212, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 316770 + }, + { + "epoch": 1.2245828887754944, + "grad_norm": 0.10583803802728653, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 316780 + }, + { + "epoch": 1.2246215459788776, + "grad_norm": 0.1089818999171257, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 316790 + }, + { + "epoch": 1.2246602031822609, + "grad_norm": 0.10267559438943863, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 316800 + }, + { + "epoch": 1.2246988603856444, + "grad_norm": 0.11662919074296951, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 316810 + }, + { + "epoch": 1.2247375175890276, + "grad_norm": 0.10340824723243713, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 316820 + }, + { + "epoch": 1.2247761747924109, + "grad_norm": 0.10434640944004059, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 316830 + }, + { + "epoch": 1.224814831995794, + "grad_norm": 0.10398906469345093, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 316840 + }, + { + "epoch": 1.2248534891991774, + "grad_norm": 0.10468418896198273, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 316850 + }, + { + "epoch": 1.2248921464025606, + "grad_norm": 0.11407443881034851, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 316860 + }, + { + "epoch": 1.2249308036059439, + "grad_norm": 0.10805194824934006, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 316870 + }, + { + "epoch": 1.2249694608093271, + "grad_norm": 0.11241121590137482, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 316880 + }, + { + "epoch": 1.2250081180127106, + "grad_norm": 0.13383576273918152, + "learning_rate": 0.002, + "loss": 2.345, + "step": 316890 + }, + { + "epoch": 1.2250467752160938, + "grad_norm": 0.1047653928399086, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 316900 + }, + { + "epoch": 1.225085432419477, + "grad_norm": 0.10415913909673691, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 316910 + }, + { + "epoch": 1.2251240896228603, + "grad_norm": 0.09722809493541718, + "learning_rate": 0.002, + "loss": 2.352, + "step": 316920 + }, + { + "epoch": 1.2251627468262436, + "grad_norm": 0.11248863488435745, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 316930 + }, + { + "epoch": 1.2252014040296269, + "grad_norm": 0.10968249291181564, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 316940 + }, + { + "epoch": 1.22524006123301, + "grad_norm": 0.1054498702287674, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 316950 + }, + { + "epoch": 1.2252787184363934, + "grad_norm": 0.1123088076710701, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 316960 + }, + { + "epoch": 1.2253173756397766, + "grad_norm": 0.09681358933448792, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 316970 + }, + { + "epoch": 1.22535603284316, + "grad_norm": 0.10066763311624527, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 316980 + }, + { + "epoch": 1.2253946900465433, + "grad_norm": 0.11117463558912277, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 316990 + }, + { + "epoch": 1.2254333472499266, + "grad_norm": 0.10142451524734497, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 317000 + }, + { + "epoch": 1.2254720044533098, + "grad_norm": 0.11557786911725998, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 317010 + }, + { + "epoch": 1.225510661656693, + "grad_norm": 0.10839302092790604, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 317020 + }, + { + "epoch": 1.2255493188600763, + "grad_norm": 0.10059081763029099, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 317030 + }, + { + "epoch": 1.2255879760634596, + "grad_norm": 0.0907895416021347, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 317040 + }, + { + "epoch": 1.2256266332668428, + "grad_norm": 0.11913354694843292, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 317050 + }, + { + "epoch": 1.2256652904702263, + "grad_norm": 0.10284785181283951, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 317060 + }, + { + "epoch": 1.2257039476736096, + "grad_norm": 0.10316737741231918, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 317070 + }, + { + "epoch": 1.2257426048769928, + "grad_norm": 0.1105693131685257, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 317080 + }, + { + "epoch": 1.225781262080376, + "grad_norm": 0.10793313384056091, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 317090 + }, + { + "epoch": 1.2258199192837593, + "grad_norm": 0.09296276420354843, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 317100 + }, + { + "epoch": 1.2258585764871426, + "grad_norm": 0.11745022982358932, + "learning_rate": 0.002, + "loss": 2.347, + "step": 317110 + }, + { + "epoch": 1.2258972336905258, + "grad_norm": 0.1112813726067543, + "learning_rate": 0.002, + "loss": 2.327, + "step": 317120 + }, + { + "epoch": 1.225935890893909, + "grad_norm": 0.1136942058801651, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 317130 + }, + { + "epoch": 1.2259745480972923, + "grad_norm": 0.10509467124938965, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 317140 + }, + { + "epoch": 1.2260132053006758, + "grad_norm": 0.09688248485326767, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 317150 + }, + { + "epoch": 1.226051862504059, + "grad_norm": 0.10055442899465561, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 317160 + }, + { + "epoch": 1.2260905197074423, + "grad_norm": 0.09972388297319412, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 317170 + }, + { + "epoch": 1.2261291769108256, + "grad_norm": 0.103391632437706, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 317180 + }, + { + "epoch": 1.2261678341142088, + "grad_norm": 0.11840787529945374, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 317190 + }, + { + "epoch": 1.226206491317592, + "grad_norm": 0.10957502573728561, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 317200 + }, + { + "epoch": 1.2262451485209753, + "grad_norm": 0.09851084649562836, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 317210 + }, + { + "epoch": 1.2262838057243588, + "grad_norm": 0.1071065366268158, + "learning_rate": 0.002, + "loss": 2.342, + "step": 317220 + }, + { + "epoch": 1.226322462927742, + "grad_norm": 0.10723765939474106, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 317230 + }, + { + "epoch": 1.2263611201311253, + "grad_norm": 0.10659579187631607, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 317240 + }, + { + "epoch": 1.2263997773345086, + "grad_norm": 0.11792304366827011, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 317250 + }, + { + "epoch": 1.2264384345378918, + "grad_norm": 0.10275556892156601, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 317260 + }, + { + "epoch": 1.226477091741275, + "grad_norm": 0.10414616018533707, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 317270 + }, + { + "epoch": 1.2265157489446583, + "grad_norm": 0.10112849622964859, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 317280 + }, + { + "epoch": 1.2265544061480416, + "grad_norm": 0.11342323571443558, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 317290 + }, + { + "epoch": 1.2265930633514248, + "grad_norm": 0.3798435926437378, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 317300 + }, + { + "epoch": 1.226631720554808, + "grad_norm": 0.11032916605472565, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 317310 + }, + { + "epoch": 1.2266703777581915, + "grad_norm": 0.15820318460464478, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 317320 + }, + { + "epoch": 1.2267090349615748, + "grad_norm": 0.14765627682209015, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 317330 + }, + { + "epoch": 1.226747692164958, + "grad_norm": 0.12677901983261108, + "learning_rate": 0.002, + "loss": 2.3611, + "step": 317340 + }, + { + "epoch": 1.2267863493683413, + "grad_norm": 0.1373942494392395, + "learning_rate": 0.002, + "loss": 2.3747, + "step": 317350 + }, + { + "epoch": 1.2268250065717246, + "grad_norm": 0.1087678000330925, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 317360 + }, + { + "epoch": 1.2268636637751078, + "grad_norm": 0.09919967502355576, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 317370 + }, + { + "epoch": 1.226902320978491, + "grad_norm": 0.10102380067110062, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 317380 + }, + { + "epoch": 1.2269409781818745, + "grad_norm": 0.10804580897092819, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 317390 + }, + { + "epoch": 1.2269796353852578, + "grad_norm": 0.10194896906614304, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 317400 + }, + { + "epoch": 1.227018292588641, + "grad_norm": 0.10182174295186996, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 317410 + }, + { + "epoch": 1.2270569497920243, + "grad_norm": 0.09750444442033768, + "learning_rate": 0.002, + "loss": 2.324, + "step": 317420 + }, + { + "epoch": 1.2270956069954075, + "grad_norm": 0.10217190533876419, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 317430 + }, + { + "epoch": 1.2271342641987908, + "grad_norm": 0.11747973412275314, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 317440 + }, + { + "epoch": 1.227172921402174, + "grad_norm": 0.10178996622562408, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 317450 + }, + { + "epoch": 1.2272115786055573, + "grad_norm": 0.2366355061531067, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 317460 + }, + { + "epoch": 1.2272502358089405, + "grad_norm": 0.09020444750785828, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 317470 + }, + { + "epoch": 1.2272888930123238, + "grad_norm": 0.11036652326583862, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 317480 + }, + { + "epoch": 1.2273275502157073, + "grad_norm": 0.10769999772310257, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 317490 + }, + { + "epoch": 1.2273662074190905, + "grad_norm": 0.1324654221534729, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 317500 + }, + { + "epoch": 1.2274048646224738, + "grad_norm": 0.09234415739774704, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 317510 + }, + { + "epoch": 1.227443521825857, + "grad_norm": 0.08859698474407196, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 317520 + }, + { + "epoch": 1.2274821790292403, + "grad_norm": 0.08461683243513107, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 317530 + }, + { + "epoch": 1.2275208362326235, + "grad_norm": 0.10731338709592819, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 317540 + }, + { + "epoch": 1.2275594934360068, + "grad_norm": 0.12990544736385345, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 317550 + }, + { + "epoch": 1.2275981506393903, + "grad_norm": 0.1029435470700264, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 317560 + }, + { + "epoch": 1.2276368078427735, + "grad_norm": 0.10510992258787155, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 317570 + }, + { + "epoch": 1.2276754650461568, + "grad_norm": 0.13130173087120056, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 317580 + }, + { + "epoch": 1.22771412224954, + "grad_norm": 0.1059836894273758, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 317590 + }, + { + "epoch": 1.2277527794529233, + "grad_norm": 0.11718516796827316, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 317600 + }, + { + "epoch": 1.2277914366563065, + "grad_norm": 0.09132679551839828, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 317610 + }, + { + "epoch": 1.2278300938596898, + "grad_norm": 0.10622987151145935, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 317620 + }, + { + "epoch": 1.227868751063073, + "grad_norm": 0.09866461902856827, + "learning_rate": 0.002, + "loss": 2.341, + "step": 317630 + }, + { + "epoch": 1.2279074082664563, + "grad_norm": 0.12461042404174805, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 317640 + }, + { + "epoch": 1.2279460654698395, + "grad_norm": 0.10212338715791702, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 317650 + }, + { + "epoch": 1.227984722673223, + "grad_norm": 0.1242091953754425, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 317660 + }, + { + "epoch": 1.2280233798766063, + "grad_norm": 0.12155944108963013, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 317670 + }, + { + "epoch": 1.2280620370799895, + "grad_norm": 0.11292026191949844, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 317680 + }, + { + "epoch": 1.2281006942833728, + "grad_norm": 0.10036251693964005, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 317690 + }, + { + "epoch": 1.228139351486756, + "grad_norm": 0.10223782062530518, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 317700 + }, + { + "epoch": 1.2281780086901393, + "grad_norm": 0.1443469524383545, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 317710 + }, + { + "epoch": 1.2282166658935225, + "grad_norm": 0.13119737803936005, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 317720 + }, + { + "epoch": 1.228255323096906, + "grad_norm": 0.10402494668960571, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 317730 + }, + { + "epoch": 1.2282939803002892, + "grad_norm": 0.1034676656126976, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 317740 + }, + { + "epoch": 1.2283326375036725, + "grad_norm": 0.12188990414142609, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 317750 + }, + { + "epoch": 1.2283712947070557, + "grad_norm": 0.09709770977497101, + "learning_rate": 0.002, + "loss": 2.351, + "step": 317760 + }, + { + "epoch": 1.228409951910439, + "grad_norm": 0.1213182806968689, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 317770 + }, + { + "epoch": 1.2284486091138223, + "grad_norm": 0.1171458512544632, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 317780 + }, + { + "epoch": 1.2284872663172055, + "grad_norm": 0.09407136589288712, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 317790 + }, + { + "epoch": 1.2285259235205888, + "grad_norm": 0.09715291857719421, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 317800 + }, + { + "epoch": 1.228564580723972, + "grad_norm": 0.12914592027664185, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 317810 + }, + { + "epoch": 1.2286032379273553, + "grad_norm": 0.09152337163686752, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 317820 + }, + { + "epoch": 1.2286418951307387, + "grad_norm": 0.10001769661903381, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 317830 + }, + { + "epoch": 1.228680552334122, + "grad_norm": 0.10625078529119492, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 317840 + }, + { + "epoch": 1.2287192095375052, + "grad_norm": 0.09744622558355331, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 317850 + }, + { + "epoch": 1.2287578667408885, + "grad_norm": 0.10693547129631042, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 317860 + }, + { + "epoch": 1.2287965239442717, + "grad_norm": 0.11540830135345459, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 317870 + }, + { + "epoch": 1.228835181147655, + "grad_norm": 0.11118239164352417, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 317880 + }, + { + "epoch": 1.2288738383510383, + "grad_norm": 0.09778404235839844, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 317890 + }, + { + "epoch": 1.2289124955544217, + "grad_norm": 0.09472370147705078, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 317900 + }, + { + "epoch": 1.228951152757805, + "grad_norm": 0.09750627726316452, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 317910 + }, + { + "epoch": 1.2289898099611882, + "grad_norm": 0.11595270037651062, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 317920 + }, + { + "epoch": 1.2290284671645715, + "grad_norm": 0.12342119961977005, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 317930 + }, + { + "epoch": 1.2290671243679547, + "grad_norm": 0.09711139649152756, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 317940 + }, + { + "epoch": 1.229105781571338, + "grad_norm": 0.09201836585998535, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 317950 + }, + { + "epoch": 1.2291444387747212, + "grad_norm": 0.11474625766277313, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 317960 + }, + { + "epoch": 1.2291830959781045, + "grad_norm": 0.10597923398017883, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 317970 + }, + { + "epoch": 1.2292217531814877, + "grad_norm": 0.10793079435825348, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 317980 + }, + { + "epoch": 1.229260410384871, + "grad_norm": 0.10415330529212952, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 317990 + }, + { + "epoch": 1.2292990675882545, + "grad_norm": 0.213987335562706, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 318000 + }, + { + "epoch": 1.2293377247916377, + "grad_norm": 0.09513404965400696, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 318010 + }, + { + "epoch": 1.229376381995021, + "grad_norm": 0.10263754427433014, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 318020 + }, + { + "epoch": 1.2294150391984042, + "grad_norm": 0.10201618075370789, + "learning_rate": 0.002, + "loss": 2.344, + "step": 318030 + }, + { + "epoch": 1.2294536964017875, + "grad_norm": 0.10374777764081955, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 318040 + }, + { + "epoch": 1.2294923536051707, + "grad_norm": 0.11623865365982056, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 318050 + }, + { + "epoch": 1.229531010808554, + "grad_norm": 0.09610338509082794, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 318060 + }, + { + "epoch": 1.2295696680119375, + "grad_norm": 0.10656408220529556, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 318070 + }, + { + "epoch": 1.2296083252153207, + "grad_norm": 0.08608821034431458, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 318080 + }, + { + "epoch": 1.229646982418704, + "grad_norm": 0.11922752857208252, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 318090 + }, + { + "epoch": 1.2296856396220872, + "grad_norm": 0.1097605973482132, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 318100 + }, + { + "epoch": 1.2297242968254705, + "grad_norm": 0.10531909018754959, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 318110 + }, + { + "epoch": 1.2297629540288537, + "grad_norm": 0.11432632803916931, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 318120 + }, + { + "epoch": 1.229801611232237, + "grad_norm": 0.09751730412244797, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 318130 + }, + { + "epoch": 1.2298402684356202, + "grad_norm": 0.11484427750110626, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 318140 + }, + { + "epoch": 1.2298789256390035, + "grad_norm": 0.1034855917096138, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 318150 + }, + { + "epoch": 1.229917582842387, + "grad_norm": 0.09148947149515152, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 318160 + }, + { + "epoch": 1.2299562400457702, + "grad_norm": 0.10319654643535614, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 318170 + }, + { + "epoch": 1.2299948972491535, + "grad_norm": 0.10570099204778671, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 318180 + }, + { + "epoch": 1.2300335544525367, + "grad_norm": 0.08839817345142365, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 318190 + }, + { + "epoch": 1.23007221165592, + "grad_norm": 0.09677153825759888, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 318200 + }, + { + "epoch": 1.2301108688593032, + "grad_norm": 0.11502538621425629, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 318210 + }, + { + "epoch": 1.2301495260626865, + "grad_norm": 0.11625374108552933, + "learning_rate": 0.002, + "loss": 2.359, + "step": 318220 + }, + { + "epoch": 1.2301881832660697, + "grad_norm": 0.10126259177923203, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 318230 + }, + { + "epoch": 1.2302268404694532, + "grad_norm": 0.100708968937397, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 318240 + }, + { + "epoch": 1.2302654976728364, + "grad_norm": 0.10420756787061691, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 318250 + }, + { + "epoch": 1.2303041548762197, + "grad_norm": 0.11100134998559952, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 318260 + }, + { + "epoch": 1.230342812079603, + "grad_norm": 0.12913987040519714, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 318270 + }, + { + "epoch": 1.2303814692829862, + "grad_norm": 0.09815038740634918, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 318280 + }, + { + "epoch": 1.2304201264863694, + "grad_norm": 0.09525801986455917, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 318290 + }, + { + "epoch": 1.2304587836897527, + "grad_norm": 0.0997467041015625, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 318300 + }, + { + "epoch": 1.230497440893136, + "grad_norm": 0.10505304485559464, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 318310 + }, + { + "epoch": 1.2305360980965192, + "grad_norm": 0.11818020790815353, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 318320 + }, + { + "epoch": 1.2305747552999027, + "grad_norm": 0.12198269367218018, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 318330 + }, + { + "epoch": 1.230613412503286, + "grad_norm": 0.10771975666284561, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 318340 + }, + { + "epoch": 1.2306520697066692, + "grad_norm": 0.10391145944595337, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 318350 + }, + { + "epoch": 1.2306907269100524, + "grad_norm": 0.10317855328321457, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 318360 + }, + { + "epoch": 1.2307293841134357, + "grad_norm": 0.11238475143909454, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 318370 + }, + { + "epoch": 1.230768041316819, + "grad_norm": 0.10218264162540436, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 318380 + }, + { + "epoch": 1.2308066985202022, + "grad_norm": 0.10200051963329315, + "learning_rate": 0.002, + "loss": 2.33, + "step": 318390 + }, + { + "epoch": 1.2308453557235854, + "grad_norm": 0.10236311703920364, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 318400 + }, + { + "epoch": 1.230884012926969, + "grad_norm": 0.10305205732584, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 318410 + }, + { + "epoch": 1.2309226701303522, + "grad_norm": 0.08705495297908783, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 318420 + }, + { + "epoch": 1.2309613273337354, + "grad_norm": 0.14205306768417358, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 318430 + }, + { + "epoch": 1.2309999845371187, + "grad_norm": 0.1121765598654747, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 318440 + }, + { + "epoch": 1.231038641740502, + "grad_norm": 0.09126516431570053, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 318450 + }, + { + "epoch": 1.2310772989438852, + "grad_norm": 0.13235266506671906, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 318460 + }, + { + "epoch": 1.2311159561472684, + "grad_norm": 0.10639721155166626, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 318470 + }, + { + "epoch": 1.2311546133506517, + "grad_norm": 0.15126991271972656, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 318480 + }, + { + "epoch": 1.231193270554035, + "grad_norm": 0.116932213306427, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 318490 + }, + { + "epoch": 1.2312319277574184, + "grad_norm": 0.10263022780418396, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 318500 + }, + { + "epoch": 1.2312705849608017, + "grad_norm": 0.09604064375162125, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 318510 + }, + { + "epoch": 1.231309242164185, + "grad_norm": 0.1132984459400177, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 318520 + }, + { + "epoch": 1.2313478993675682, + "grad_norm": 0.10443414002656937, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 318530 + }, + { + "epoch": 1.2313865565709514, + "grad_norm": 0.12972725927829742, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 318540 + }, + { + "epoch": 1.2314252137743347, + "grad_norm": 0.10199279338121414, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 318550 + }, + { + "epoch": 1.231463870977718, + "grad_norm": 0.10174122452735901, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 318560 + }, + { + "epoch": 1.2315025281811012, + "grad_norm": 0.10849396884441376, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 318570 + }, + { + "epoch": 1.2315411853844846, + "grad_norm": 0.09650886058807373, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 318580 + }, + { + "epoch": 1.231579842587868, + "grad_norm": 0.09846727550029755, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 318590 + }, + { + "epoch": 1.2316184997912512, + "grad_norm": 0.10801167041063309, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 318600 + }, + { + "epoch": 1.2316571569946344, + "grad_norm": 0.1104314848780632, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 318610 + }, + { + "epoch": 1.2316958141980177, + "grad_norm": 0.11098235100507736, + "learning_rate": 0.002, + "loss": 2.336, + "step": 318620 + }, + { + "epoch": 1.231734471401401, + "grad_norm": 0.09050797671079636, + "learning_rate": 0.002, + "loss": 2.333, + "step": 318630 + }, + { + "epoch": 1.2317731286047842, + "grad_norm": 0.12480476498603821, + "learning_rate": 0.002, + "loss": 2.34, + "step": 318640 + }, + { + "epoch": 1.2318117858081674, + "grad_norm": 0.09416493028402328, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 318650 + }, + { + "epoch": 1.2318504430115507, + "grad_norm": 0.10407298058271408, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 318660 + }, + { + "epoch": 1.2318891002149341, + "grad_norm": 0.10351002961397171, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 318670 + }, + { + "epoch": 1.2319277574183174, + "grad_norm": 0.12359218299388885, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 318680 + }, + { + "epoch": 1.2319664146217006, + "grad_norm": 0.10347049683332443, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 318690 + }, + { + "epoch": 1.232005071825084, + "grad_norm": 0.11429308354854584, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 318700 + }, + { + "epoch": 1.2320437290284671, + "grad_norm": 0.0998351201415062, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 318710 + }, + { + "epoch": 1.2320823862318504, + "grad_norm": 0.09996669739484787, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 318720 + }, + { + "epoch": 1.2321210434352337, + "grad_norm": 0.11034220457077026, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 318730 + }, + { + "epoch": 1.232159700638617, + "grad_norm": 0.10390716046094894, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 318740 + }, + { + "epoch": 1.2321983578420004, + "grad_norm": 0.11131299287080765, + "learning_rate": 0.002, + "loss": 2.342, + "step": 318750 + }, + { + "epoch": 1.2322370150453836, + "grad_norm": 0.10226169228553772, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 318760 + }, + { + "epoch": 1.2322756722487669, + "grad_norm": 0.11132095754146576, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 318770 + }, + { + "epoch": 1.2323143294521501, + "grad_norm": 0.10304592549800873, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 318780 + }, + { + "epoch": 1.2323529866555334, + "grad_norm": 0.10334457457065582, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 318790 + }, + { + "epoch": 1.2323916438589166, + "grad_norm": 0.09750176966190338, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 318800 + }, + { + "epoch": 1.2324303010623, + "grad_norm": 0.1015402153134346, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 318810 + }, + { + "epoch": 1.2324689582656831, + "grad_norm": 0.09915605187416077, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 318820 + }, + { + "epoch": 1.2325076154690664, + "grad_norm": 0.14046163856983185, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 318830 + }, + { + "epoch": 1.2325462726724499, + "grad_norm": 0.09245661646127701, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 318840 + }, + { + "epoch": 1.2325849298758331, + "grad_norm": 0.1208549439907074, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 318850 + }, + { + "epoch": 1.2326235870792164, + "grad_norm": 0.11933174729347229, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 318860 + }, + { + "epoch": 1.2326622442825996, + "grad_norm": 0.10953730344772339, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 318870 + }, + { + "epoch": 1.2327009014859829, + "grad_norm": 0.09889823198318481, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 318880 + }, + { + "epoch": 1.2327395586893661, + "grad_norm": 0.11832044273614883, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 318890 + }, + { + "epoch": 1.2327782158927494, + "grad_norm": 0.09988594800233841, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 318900 + }, + { + "epoch": 1.2328168730961326, + "grad_norm": 0.11520710587501526, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 318910 + }, + { + "epoch": 1.2328555302995161, + "grad_norm": 0.11049897968769073, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 318920 + }, + { + "epoch": 1.2328941875028994, + "grad_norm": 0.09919171780347824, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 318930 + }, + { + "epoch": 1.2329328447062826, + "grad_norm": 0.11593607813119888, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 318940 + }, + { + "epoch": 1.2329715019096659, + "grad_norm": 0.10163404792547226, + "learning_rate": 0.002, + "loss": 2.347, + "step": 318950 + }, + { + "epoch": 1.2330101591130491, + "grad_norm": 0.12897267937660217, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 318960 + }, + { + "epoch": 1.2330488163164324, + "grad_norm": 0.09961461275815964, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 318970 + }, + { + "epoch": 1.2330874735198156, + "grad_norm": 0.11196241527795792, + "learning_rate": 0.002, + "loss": 2.335, + "step": 318980 + }, + { + "epoch": 1.2331261307231989, + "grad_norm": 0.10454481095075607, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 318990 + }, + { + "epoch": 1.2331647879265821, + "grad_norm": 0.10805007815361023, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 319000 + }, + { + "epoch": 1.2332034451299656, + "grad_norm": 0.14677643775939941, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 319010 + }, + { + "epoch": 1.2332421023333489, + "grad_norm": 0.09992588311433792, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 319020 + }, + { + "epoch": 1.233280759536732, + "grad_norm": 0.09757273644208908, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 319030 + }, + { + "epoch": 1.2333194167401154, + "grad_norm": 0.10164768248796463, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 319040 + }, + { + "epoch": 1.2333580739434986, + "grad_norm": 0.10568263381719589, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 319050 + }, + { + "epoch": 1.2333967311468819, + "grad_norm": 0.1015145406126976, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 319060 + }, + { + "epoch": 1.2334353883502651, + "grad_norm": 0.11714985966682434, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 319070 + }, + { + "epoch": 1.2334740455536486, + "grad_norm": 0.10222362726926804, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 319080 + }, + { + "epoch": 1.2335127027570318, + "grad_norm": 0.1122443675994873, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 319090 + }, + { + "epoch": 1.233551359960415, + "grad_norm": 0.09449625760316849, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 319100 + }, + { + "epoch": 1.2335900171637983, + "grad_norm": 0.11275065690279007, + "learning_rate": 0.002, + "loss": 2.325, + "step": 319110 + }, + { + "epoch": 1.2336286743671816, + "grad_norm": 0.09365367144346237, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 319120 + }, + { + "epoch": 1.2336673315705649, + "grad_norm": 0.09719277918338776, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 319130 + }, + { + "epoch": 1.233705988773948, + "grad_norm": 0.11273623257875443, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 319140 + }, + { + "epoch": 1.2337446459773314, + "grad_norm": 0.11690273135900497, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 319150 + }, + { + "epoch": 1.2337833031807146, + "grad_norm": 0.10188272595405579, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 319160 + }, + { + "epoch": 1.2338219603840979, + "grad_norm": 0.09970256686210632, + "learning_rate": 0.002, + "loss": 2.335, + "step": 319170 + }, + { + "epoch": 1.2338606175874813, + "grad_norm": 0.112132728099823, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 319180 + }, + { + "epoch": 1.2338992747908646, + "grad_norm": 0.10451371222734451, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 319190 + }, + { + "epoch": 1.2339379319942478, + "grad_norm": 0.11047357320785522, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 319200 + }, + { + "epoch": 1.233976589197631, + "grad_norm": 0.10152693837881088, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 319210 + }, + { + "epoch": 1.2340152464010143, + "grad_norm": 0.10769753903150558, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 319220 + }, + { + "epoch": 1.2340539036043976, + "grad_norm": 0.10410183668136597, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 319230 + }, + { + "epoch": 1.2340925608077808, + "grad_norm": 0.09276141226291656, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 319240 + }, + { + "epoch": 1.2341312180111643, + "grad_norm": 0.09850878268480301, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 319250 + }, + { + "epoch": 1.2341698752145476, + "grad_norm": 0.10244489461183548, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 319260 + }, + { + "epoch": 1.2342085324179308, + "grad_norm": 0.11640128493309021, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 319270 + }, + { + "epoch": 1.234247189621314, + "grad_norm": 0.1207994893193245, + "learning_rate": 0.002, + "loss": 2.332, + "step": 319280 + }, + { + "epoch": 1.2342858468246973, + "grad_norm": 0.10476819425821304, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 319290 + }, + { + "epoch": 1.2343245040280806, + "grad_norm": 0.10477297753095627, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 319300 + }, + { + "epoch": 1.2343631612314638, + "grad_norm": 0.10364378988742828, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 319310 + }, + { + "epoch": 1.234401818434847, + "grad_norm": 0.10070034861564636, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 319320 + }, + { + "epoch": 1.2344404756382303, + "grad_norm": 0.1035122498869896, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 319330 + }, + { + "epoch": 1.2344791328416136, + "grad_norm": 0.11822505295276642, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 319340 + }, + { + "epoch": 1.234517790044997, + "grad_norm": 0.10413868725299835, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 319350 + }, + { + "epoch": 1.2345564472483803, + "grad_norm": 0.11653405427932739, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 319360 + }, + { + "epoch": 1.2345951044517636, + "grad_norm": 0.09522883594036102, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 319370 + }, + { + "epoch": 1.2346337616551468, + "grad_norm": 0.13596872985363007, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 319380 + }, + { + "epoch": 1.23467241885853, + "grad_norm": 0.09495305269956589, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 319390 + }, + { + "epoch": 1.2347110760619133, + "grad_norm": 0.09859729558229446, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 319400 + }, + { + "epoch": 1.2347497332652966, + "grad_norm": 0.09662479907274246, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 319410 + }, + { + "epoch": 1.23478839046868, + "grad_norm": 0.11425012350082397, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 319420 + }, + { + "epoch": 1.2348270476720633, + "grad_norm": 0.12249550223350525, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 319430 + }, + { + "epoch": 1.2348657048754466, + "grad_norm": 0.09997022151947021, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 319440 + }, + { + "epoch": 1.2349043620788298, + "grad_norm": 0.11657397449016571, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 319450 + }, + { + "epoch": 1.234943019282213, + "grad_norm": 0.08549603074789047, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 319460 + }, + { + "epoch": 1.2349816764855963, + "grad_norm": 0.10350823402404785, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 319470 + }, + { + "epoch": 1.2350203336889796, + "grad_norm": 0.10171400755643845, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 319480 + }, + { + "epoch": 1.2350589908923628, + "grad_norm": 0.1119752749800682, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 319490 + }, + { + "epoch": 1.235097648095746, + "grad_norm": 0.11628837138414383, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 319500 + }, + { + "epoch": 1.2351363052991293, + "grad_norm": 0.10625061392784119, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 319510 + }, + { + "epoch": 1.2351749625025128, + "grad_norm": 0.09767068922519684, + "learning_rate": 0.002, + "loss": 2.3076, + "step": 319520 + }, + { + "epoch": 1.235213619705896, + "grad_norm": 0.10150033980607986, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 319530 + }, + { + "epoch": 1.2352522769092793, + "grad_norm": 0.10505492240190506, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 319540 + }, + { + "epoch": 1.2352909341126626, + "grad_norm": 0.09431470930576324, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 319550 + }, + { + "epoch": 1.2353295913160458, + "grad_norm": 0.0902949795126915, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 319560 + }, + { + "epoch": 1.235368248519429, + "grad_norm": 0.09864936023950577, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 319570 + }, + { + "epoch": 1.2354069057228123, + "grad_norm": 0.12213067710399628, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 319580 + }, + { + "epoch": 1.2354455629261958, + "grad_norm": 0.0943695455789566, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 319590 + }, + { + "epoch": 1.235484220129579, + "grad_norm": 0.12795186042785645, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 319600 + }, + { + "epoch": 1.2355228773329623, + "grad_norm": 0.1095963716506958, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 319610 + }, + { + "epoch": 1.2355615345363455, + "grad_norm": 0.10382067412137985, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 319620 + }, + { + "epoch": 1.2356001917397288, + "grad_norm": 0.11466068774461746, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 319630 + }, + { + "epoch": 1.235638848943112, + "grad_norm": 0.1038190945982933, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 319640 + }, + { + "epoch": 1.2356775061464953, + "grad_norm": 0.13916151225566864, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 319650 + }, + { + "epoch": 1.2357161633498785, + "grad_norm": 0.14593204855918884, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 319660 + }, + { + "epoch": 1.2357548205532618, + "grad_norm": 0.22053200006484985, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 319670 + }, + { + "epoch": 1.235793477756645, + "grad_norm": 0.10943808406591415, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 319680 + }, + { + "epoch": 1.2358321349600285, + "grad_norm": 0.09973354637622833, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 319690 + }, + { + "epoch": 1.2358707921634118, + "grad_norm": 0.09205275028944016, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 319700 + }, + { + "epoch": 1.235909449366795, + "grad_norm": 0.1062823161482811, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 319710 + }, + { + "epoch": 1.2359481065701783, + "grad_norm": 0.10272687673568726, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 319720 + }, + { + "epoch": 1.2359867637735615, + "grad_norm": 0.1765047013759613, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 319730 + }, + { + "epoch": 1.2360254209769448, + "grad_norm": 0.10312442481517792, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 319740 + }, + { + "epoch": 1.236064078180328, + "grad_norm": 0.10451663285493851, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 319750 + }, + { + "epoch": 1.2361027353837115, + "grad_norm": 0.11653146147727966, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 319760 + }, + { + "epoch": 1.2361413925870948, + "grad_norm": 0.11353792995214462, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 319770 + }, + { + "epoch": 1.236180049790478, + "grad_norm": 0.10471892356872559, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 319780 + }, + { + "epoch": 1.2362187069938613, + "grad_norm": 0.11989996582269669, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 319790 + }, + { + "epoch": 1.2362573641972445, + "grad_norm": 0.09007346630096436, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 319800 + }, + { + "epoch": 1.2362960214006278, + "grad_norm": 0.10456572473049164, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 319810 + }, + { + "epoch": 1.236334678604011, + "grad_norm": 0.11958345770835876, + "learning_rate": 0.002, + "loss": 2.326, + "step": 319820 + }, + { + "epoch": 1.2363733358073943, + "grad_norm": 0.10415913909673691, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 319830 + }, + { + "epoch": 1.2364119930107775, + "grad_norm": 0.10972173511981964, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 319840 + }, + { + "epoch": 1.2364506502141608, + "grad_norm": 0.12542030215263367, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 319850 + }, + { + "epoch": 1.2364893074175443, + "grad_norm": 0.10410010814666748, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 319860 + }, + { + "epoch": 1.2365279646209275, + "grad_norm": 0.10815926641225815, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 319870 + }, + { + "epoch": 1.2365666218243108, + "grad_norm": 0.11116991937160492, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 319880 + }, + { + "epoch": 1.236605279027694, + "grad_norm": 0.11590959131717682, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 319890 + }, + { + "epoch": 1.2366439362310773, + "grad_norm": 0.22155319154262543, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 319900 + }, + { + "epoch": 1.2366825934344605, + "grad_norm": 0.11598339676856995, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 319910 + }, + { + "epoch": 1.2367212506378438, + "grad_norm": 0.11123668402433395, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 319920 + }, + { + "epoch": 1.2367599078412272, + "grad_norm": 0.1021030843257904, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 319930 + }, + { + "epoch": 1.2367985650446105, + "grad_norm": 0.12723006308078766, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 319940 + }, + { + "epoch": 1.2368372222479938, + "grad_norm": 0.10411926358938217, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 319950 + }, + { + "epoch": 1.236875879451377, + "grad_norm": 0.11432521045207977, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 319960 + }, + { + "epoch": 1.2369145366547603, + "grad_norm": 0.10154687613248825, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 319970 + }, + { + "epoch": 1.2369531938581435, + "grad_norm": 0.10039026290178299, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 319980 + }, + { + "epoch": 1.2369918510615268, + "grad_norm": 0.10765459388494492, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 319990 + }, + { + "epoch": 1.23703050826491, + "grad_norm": 0.113979272544384, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 320000 + }, + { + "epoch": 1.2370691654682933, + "grad_norm": 0.096729576587677, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 320010 + }, + { + "epoch": 1.2371078226716765, + "grad_norm": 0.10932206362485886, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 320020 + }, + { + "epoch": 1.23714647987506, + "grad_norm": 0.10438414663076401, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 320030 + }, + { + "epoch": 1.2371851370784432, + "grad_norm": 0.09965009987354279, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 320040 + }, + { + "epoch": 1.2372237942818265, + "grad_norm": 0.10202410072088242, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 320050 + }, + { + "epoch": 1.2372624514852097, + "grad_norm": 0.10439611971378326, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 320060 + }, + { + "epoch": 1.237301108688593, + "grad_norm": 0.10762651264667511, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 320070 + }, + { + "epoch": 1.2373397658919763, + "grad_norm": 0.10113856941461563, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 320080 + }, + { + "epoch": 1.2373784230953595, + "grad_norm": 0.09861285239458084, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 320090 + }, + { + "epoch": 1.237417080298743, + "grad_norm": 0.12643854320049286, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 320100 + }, + { + "epoch": 1.2374557375021262, + "grad_norm": 0.10227780789136887, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 320110 + }, + { + "epoch": 1.2374943947055095, + "grad_norm": 0.10449239611625671, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 320120 + }, + { + "epoch": 1.2375330519088927, + "grad_norm": 0.10152996331453323, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 320130 + }, + { + "epoch": 1.237571709112276, + "grad_norm": 0.11013568937778473, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 320140 + }, + { + "epoch": 1.2376103663156592, + "grad_norm": 0.12788760662078857, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 320150 + }, + { + "epoch": 1.2376490235190425, + "grad_norm": 0.27631428837776184, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 320160 + }, + { + "epoch": 1.2376876807224257, + "grad_norm": 0.1050380989909172, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 320170 + }, + { + "epoch": 1.237726337925809, + "grad_norm": 0.19374831020832062, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 320180 + }, + { + "epoch": 1.2377649951291925, + "grad_norm": 0.1197795569896698, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 320190 + }, + { + "epoch": 1.2378036523325757, + "grad_norm": 0.10829479247331619, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 320200 + }, + { + "epoch": 1.237842309535959, + "grad_norm": 0.11106017231941223, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 320210 + }, + { + "epoch": 1.2378809667393422, + "grad_norm": 0.10681232810020447, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 320220 + }, + { + "epoch": 1.2379196239427255, + "grad_norm": 0.09508251398801804, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 320230 + }, + { + "epoch": 1.2379582811461087, + "grad_norm": 0.10988068580627441, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 320240 + }, + { + "epoch": 1.237996938349492, + "grad_norm": 0.09734196215867996, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 320250 + }, + { + "epoch": 1.2380355955528752, + "grad_norm": 0.10664012283086777, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 320260 + }, + { + "epoch": 1.2380742527562587, + "grad_norm": 0.10162442177534103, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 320270 + }, + { + "epoch": 1.238112909959642, + "grad_norm": 0.11559383571147919, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 320280 + }, + { + "epoch": 1.2381515671630252, + "grad_norm": 0.10670052468776703, + "learning_rate": 0.002, + "loss": 2.331, + "step": 320290 + }, + { + "epoch": 1.2381902243664085, + "grad_norm": 0.11281143128871918, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 320300 + }, + { + "epoch": 1.2382288815697917, + "grad_norm": 0.10445000231266022, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 320310 + }, + { + "epoch": 1.238267538773175, + "grad_norm": 0.1028844341635704, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 320320 + }, + { + "epoch": 1.2383061959765582, + "grad_norm": 0.10735354572534561, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 320330 + }, + { + "epoch": 1.2383448531799415, + "grad_norm": 0.10678750276565552, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 320340 + }, + { + "epoch": 1.2383835103833247, + "grad_norm": 0.09609895199537277, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 320350 + }, + { + "epoch": 1.2384221675867082, + "grad_norm": 0.10678284615278244, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 320360 + }, + { + "epoch": 1.2384608247900915, + "grad_norm": 0.10990513861179352, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 320370 + }, + { + "epoch": 1.2384994819934747, + "grad_norm": 0.12047272175550461, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 320380 + }, + { + "epoch": 1.238538139196858, + "grad_norm": 0.09940064698457718, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 320390 + }, + { + "epoch": 1.2385767964002412, + "grad_norm": 0.09595578163862228, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 320400 + }, + { + "epoch": 1.2386154536036245, + "grad_norm": 0.11717253923416138, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 320410 + }, + { + "epoch": 1.2386541108070077, + "grad_norm": 0.1013934463262558, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 320420 + }, + { + "epoch": 1.238692768010391, + "grad_norm": 0.1056397333741188, + "learning_rate": 0.002, + "loss": 2.3122, + "step": 320430 + }, + { + "epoch": 1.2387314252137744, + "grad_norm": 0.11160103231668472, + "learning_rate": 0.002, + "loss": 2.332, + "step": 320440 + }, + { + "epoch": 1.2387700824171577, + "grad_norm": 0.09870457649230957, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 320450 + }, + { + "epoch": 1.238808739620541, + "grad_norm": 0.10088534653186798, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 320460 + }, + { + "epoch": 1.2388473968239242, + "grad_norm": 0.09906332194805145, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 320470 + }, + { + "epoch": 1.2388860540273074, + "grad_norm": 0.10245811194181442, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 320480 + }, + { + "epoch": 1.2389247112306907, + "grad_norm": 0.11680164188146591, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 320490 + }, + { + "epoch": 1.238963368434074, + "grad_norm": 0.09580478072166443, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 320500 + }, + { + "epoch": 1.2390020256374572, + "grad_norm": 0.10380738228559494, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 320510 + }, + { + "epoch": 1.2390406828408405, + "grad_norm": 0.10249635577201843, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 320520 + }, + { + "epoch": 1.239079340044224, + "grad_norm": 0.11178512126207352, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 320530 + }, + { + "epoch": 1.2391179972476072, + "grad_norm": 0.10207261145114899, + "learning_rate": 0.002, + "loss": 2.327, + "step": 320540 + }, + { + "epoch": 1.2391566544509904, + "grad_norm": 0.11807084828615189, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 320550 + }, + { + "epoch": 1.2391953116543737, + "grad_norm": 0.10369338095188141, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 320560 + }, + { + "epoch": 1.239233968857757, + "grad_norm": 0.09732040017843246, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 320570 + }, + { + "epoch": 1.2392726260611402, + "grad_norm": 0.10082051903009415, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 320580 + }, + { + "epoch": 1.2393112832645234, + "grad_norm": 0.10379715263843536, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 320590 + }, + { + "epoch": 1.2393499404679067, + "grad_norm": 0.10741373896598816, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 320600 + }, + { + "epoch": 1.2393885976712902, + "grad_norm": 0.0996238961815834, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 320610 + }, + { + "epoch": 1.2394272548746734, + "grad_norm": 0.10056623816490173, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 320620 + }, + { + "epoch": 1.2394659120780567, + "grad_norm": 0.11594576388597488, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 320630 + }, + { + "epoch": 1.23950456928144, + "grad_norm": 0.11052383482456207, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 320640 + }, + { + "epoch": 1.2395432264848232, + "grad_norm": 0.10528027266263962, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 320650 + }, + { + "epoch": 1.2395818836882064, + "grad_norm": 0.08858749270439148, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 320660 + }, + { + "epoch": 1.2396205408915897, + "grad_norm": 0.1268991082906723, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 320670 + }, + { + "epoch": 1.239659198094973, + "grad_norm": 0.10651437938213348, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 320680 + }, + { + "epoch": 1.2396978552983562, + "grad_norm": 0.10093619674444199, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 320690 + }, + { + "epoch": 1.2397365125017397, + "grad_norm": 0.10272887349128723, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 320700 + }, + { + "epoch": 1.239775169705123, + "grad_norm": 0.10681917518377304, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 320710 + }, + { + "epoch": 1.2398138269085062, + "grad_norm": 0.10045037418603897, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 320720 + }, + { + "epoch": 1.2398524841118894, + "grad_norm": 0.10670724511146545, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 320730 + }, + { + "epoch": 1.2398911413152727, + "grad_norm": 0.11286060512065887, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 320740 + }, + { + "epoch": 1.239929798518656, + "grad_norm": 0.09124636650085449, + "learning_rate": 0.002, + "loss": 2.323, + "step": 320750 + }, + { + "epoch": 1.2399684557220392, + "grad_norm": 0.11720071732997894, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 320760 + }, + { + "epoch": 1.2400071129254224, + "grad_norm": 0.11184117943048477, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 320770 + }, + { + "epoch": 1.240045770128806, + "grad_norm": 0.09715303778648376, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 320780 + }, + { + "epoch": 1.2400844273321892, + "grad_norm": 0.10788849741220474, + "learning_rate": 0.002, + "loss": 2.3138, + "step": 320790 + }, + { + "epoch": 1.2401230845355724, + "grad_norm": 0.10525120049715042, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 320800 + }, + { + "epoch": 1.2401617417389557, + "grad_norm": 0.10268529504537582, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 320810 + }, + { + "epoch": 1.240200398942339, + "grad_norm": 0.11665871739387512, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 320820 + }, + { + "epoch": 1.2402390561457222, + "grad_norm": 0.09270110726356506, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 320830 + }, + { + "epoch": 1.2402777133491054, + "grad_norm": 0.12644942104816437, + "learning_rate": 0.002, + "loss": 2.338, + "step": 320840 + }, + { + "epoch": 1.2403163705524887, + "grad_norm": 0.1157117709517479, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 320850 + }, + { + "epoch": 1.240355027755872, + "grad_norm": 0.11809808015823364, + "learning_rate": 0.002, + "loss": 2.351, + "step": 320860 + }, + { + "epoch": 1.2403936849592554, + "grad_norm": 0.114087775349617, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 320870 + }, + { + "epoch": 1.2404323421626386, + "grad_norm": 0.10386577993631363, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 320880 + }, + { + "epoch": 1.240470999366022, + "grad_norm": 0.11299306899309158, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 320890 + }, + { + "epoch": 1.2405096565694052, + "grad_norm": 0.09175659716129303, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 320900 + }, + { + "epoch": 1.2405483137727884, + "grad_norm": 0.1031114012002945, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 320910 + }, + { + "epoch": 1.2405869709761717, + "grad_norm": 0.13444626331329346, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 320920 + }, + { + "epoch": 1.240625628179555, + "grad_norm": 0.10885864496231079, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 320930 + }, + { + "epoch": 1.2406642853829384, + "grad_norm": 0.09645969420671463, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 320940 + }, + { + "epoch": 1.2407029425863216, + "grad_norm": 0.10317646712064743, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 320950 + }, + { + "epoch": 1.2407415997897049, + "grad_norm": 0.10633083432912827, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 320960 + }, + { + "epoch": 1.2407802569930881, + "grad_norm": 0.09729152172803879, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 320970 + }, + { + "epoch": 1.2408189141964714, + "grad_norm": 0.1272486001253128, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 320980 + }, + { + "epoch": 1.2408575713998546, + "grad_norm": 0.1055624857544899, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 320990 + }, + { + "epoch": 1.240896228603238, + "grad_norm": 0.09390482306480408, + "learning_rate": 0.002, + "loss": 2.341, + "step": 321000 + }, + { + "epoch": 1.2409348858066211, + "grad_norm": 0.09995166212320328, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 321010 + }, + { + "epoch": 1.2409735430100044, + "grad_norm": 0.10061180591583252, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 321020 + }, + { + "epoch": 1.2410122002133877, + "grad_norm": 0.10821964591741562, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 321030 + }, + { + "epoch": 1.2410508574167711, + "grad_norm": 0.10576090961694717, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 321040 + }, + { + "epoch": 1.2410895146201544, + "grad_norm": 0.10806534439325333, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 321050 + }, + { + "epoch": 1.2411281718235376, + "grad_norm": 0.11444949358701706, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 321060 + }, + { + "epoch": 1.2411668290269209, + "grad_norm": 0.10749319940805435, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 321070 + }, + { + "epoch": 1.2412054862303041, + "grad_norm": 0.12176235020160675, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 321080 + }, + { + "epoch": 1.2412441434336874, + "grad_norm": 0.10878723859786987, + "learning_rate": 0.002, + "loss": 2.341, + "step": 321090 + }, + { + "epoch": 1.2412828006370706, + "grad_norm": 0.11336250603199005, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 321100 + }, + { + "epoch": 1.2413214578404541, + "grad_norm": 0.0931602269411087, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 321110 + }, + { + "epoch": 1.2413601150438374, + "grad_norm": 0.10910608619451523, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 321120 + }, + { + "epoch": 1.2413987722472206, + "grad_norm": 0.1164921373128891, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 321130 + }, + { + "epoch": 1.2414374294506039, + "grad_norm": 0.09855716675519943, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 321140 + }, + { + "epoch": 1.2414760866539871, + "grad_norm": 0.10300564020872116, + "learning_rate": 0.002, + "loss": 2.336, + "step": 321150 + }, + { + "epoch": 1.2415147438573704, + "grad_norm": 0.11619854718446732, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 321160 + }, + { + "epoch": 1.2415534010607536, + "grad_norm": 0.11751411855220795, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 321170 + }, + { + "epoch": 1.2415920582641369, + "grad_norm": 0.10656020045280457, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 321180 + }, + { + "epoch": 1.2416307154675201, + "grad_norm": 0.09795525670051575, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 321190 + }, + { + "epoch": 1.2416693726709034, + "grad_norm": 0.12078218907117844, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 321200 + }, + { + "epoch": 1.2417080298742869, + "grad_norm": 0.1273619532585144, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 321210 + }, + { + "epoch": 1.24174668707767, + "grad_norm": 0.0964427962899208, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 321220 + }, + { + "epoch": 1.2417853442810534, + "grad_norm": 0.11902978271245956, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 321230 + }, + { + "epoch": 1.2418240014844366, + "grad_norm": 0.1078677698969841, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 321240 + }, + { + "epoch": 1.2418626586878199, + "grad_norm": 0.08856847137212753, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 321250 + }, + { + "epoch": 1.2419013158912031, + "grad_norm": 0.09772702306509018, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 321260 + }, + { + "epoch": 1.2419399730945864, + "grad_norm": 0.11561083793640137, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 321270 + }, + { + "epoch": 1.2419786302979698, + "grad_norm": 0.10868635773658752, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 321280 + }, + { + "epoch": 1.242017287501353, + "grad_norm": 0.09984630346298218, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 321290 + }, + { + "epoch": 1.2420559447047363, + "grad_norm": 0.10971865803003311, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 321300 + }, + { + "epoch": 1.2420946019081196, + "grad_norm": 0.10339515656232834, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 321310 + }, + { + "epoch": 1.2421332591115029, + "grad_norm": 0.1033627837896347, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 321320 + }, + { + "epoch": 1.242171916314886, + "grad_norm": 0.11540020257234573, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 321330 + }, + { + "epoch": 1.2422105735182694, + "grad_norm": 0.11354392021894455, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 321340 + }, + { + "epoch": 1.2422492307216526, + "grad_norm": 0.11125141382217407, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 321350 + }, + { + "epoch": 1.2422878879250359, + "grad_norm": 0.09528633952140808, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 321360 + }, + { + "epoch": 1.2423265451284191, + "grad_norm": 0.10134680569171906, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 321370 + }, + { + "epoch": 1.2423652023318026, + "grad_norm": 0.10753732174634933, + "learning_rate": 0.002, + "loss": 2.335, + "step": 321380 + }, + { + "epoch": 1.2424038595351858, + "grad_norm": 0.10672889649868011, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 321390 + }, + { + "epoch": 1.242442516738569, + "grad_norm": 0.11310621351003647, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 321400 + }, + { + "epoch": 1.2424811739419523, + "grad_norm": 0.10450848191976547, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 321410 + }, + { + "epoch": 1.2425198311453356, + "grad_norm": 0.11112019419670105, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 321420 + }, + { + "epoch": 1.2425584883487188, + "grad_norm": 0.10467560589313507, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 321430 + }, + { + "epoch": 1.242597145552102, + "grad_norm": 0.11490056663751602, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 321440 + }, + { + "epoch": 1.2426358027554856, + "grad_norm": 0.11514199525117874, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 321450 + }, + { + "epoch": 1.2426744599588688, + "grad_norm": 0.09664133936166763, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 321460 + }, + { + "epoch": 1.242713117162252, + "grad_norm": 0.09833734482526779, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 321470 + }, + { + "epoch": 1.2427517743656353, + "grad_norm": 0.0944104790687561, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 321480 + }, + { + "epoch": 1.2427904315690186, + "grad_norm": 0.113669253885746, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 321490 + }, + { + "epoch": 1.2428290887724018, + "grad_norm": 0.11991959810256958, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 321500 + }, + { + "epoch": 1.242867745975785, + "grad_norm": 0.11033076792955399, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 321510 + }, + { + "epoch": 1.2429064031791683, + "grad_norm": 0.10738854855298996, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 321520 + }, + { + "epoch": 1.2429450603825516, + "grad_norm": 0.09296815097332001, + "learning_rate": 0.002, + "loss": 2.3152, + "step": 321530 + }, + { + "epoch": 1.2429837175859348, + "grad_norm": 0.09881693869829178, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 321540 + }, + { + "epoch": 1.2430223747893183, + "grad_norm": 0.10417408496141434, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 321550 + }, + { + "epoch": 1.2430610319927016, + "grad_norm": 0.11233276128768921, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 321560 + }, + { + "epoch": 1.2430996891960848, + "grad_norm": 0.12582659721374512, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 321570 + }, + { + "epoch": 1.243138346399468, + "grad_norm": 0.10311932116746902, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 321580 + }, + { + "epoch": 1.2431770036028513, + "grad_norm": 0.11493781954050064, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 321590 + }, + { + "epoch": 1.2432156608062346, + "grad_norm": 0.1054019182920456, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 321600 + }, + { + "epoch": 1.2432543180096178, + "grad_norm": 0.09729453176259995, + "learning_rate": 0.002, + "loss": 2.351, + "step": 321610 + }, + { + "epoch": 1.2432929752130013, + "grad_norm": 0.11637569218873978, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 321620 + }, + { + "epoch": 1.2433316324163846, + "grad_norm": 0.1038183718919754, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 321630 + }, + { + "epoch": 1.2433702896197678, + "grad_norm": 0.13066309690475464, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 321640 + }, + { + "epoch": 1.243408946823151, + "grad_norm": 0.0983934998512268, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 321650 + }, + { + "epoch": 1.2434476040265343, + "grad_norm": 0.08770925551652908, + "learning_rate": 0.002, + "loss": 2.334, + "step": 321660 + }, + { + "epoch": 1.2434862612299176, + "grad_norm": 0.11882904917001724, + "learning_rate": 0.002, + "loss": 2.321, + "step": 321670 + }, + { + "epoch": 1.2435249184333008, + "grad_norm": 0.0967506393790245, + "learning_rate": 0.002, + "loss": 2.335, + "step": 321680 + }, + { + "epoch": 1.243563575636684, + "grad_norm": 0.10030996799468994, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 321690 + }, + { + "epoch": 1.2436022328400673, + "grad_norm": 0.11539009213447571, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 321700 + }, + { + "epoch": 1.2436408900434506, + "grad_norm": 0.10054624825716019, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 321710 + }, + { + "epoch": 1.243679547246834, + "grad_norm": 0.11041809618473053, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 321720 + }, + { + "epoch": 1.2437182044502173, + "grad_norm": 0.10580580681562424, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 321730 + }, + { + "epoch": 1.2437568616536006, + "grad_norm": 0.10055314004421234, + "learning_rate": 0.002, + "loss": 2.326, + "step": 321740 + }, + { + "epoch": 1.2437955188569838, + "grad_norm": 0.1091417744755745, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 321750 + }, + { + "epoch": 1.243834176060367, + "grad_norm": 0.10327596962451935, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 321760 + }, + { + "epoch": 1.2438728332637503, + "grad_norm": 0.09359539300203323, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 321770 + }, + { + "epoch": 1.2439114904671336, + "grad_norm": 0.10791551321744919, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 321780 + }, + { + "epoch": 1.243950147670517, + "grad_norm": 0.11361528187990189, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 321790 + }, + { + "epoch": 1.2439888048739003, + "grad_norm": 0.1098126620054245, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 321800 + }, + { + "epoch": 1.2440274620772835, + "grad_norm": 0.1013980358839035, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 321810 + }, + { + "epoch": 1.2440661192806668, + "grad_norm": 0.1106821596622467, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 321820 + }, + { + "epoch": 1.24410477648405, + "grad_norm": 0.10736387223005295, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 321830 + }, + { + "epoch": 1.2441434336874333, + "grad_norm": 0.09489861130714417, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 321840 + }, + { + "epoch": 1.2441820908908165, + "grad_norm": 0.11261710524559021, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 321850 + }, + { + "epoch": 1.2442207480941998, + "grad_norm": 0.09783985465765, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 321860 + }, + { + "epoch": 1.244259405297583, + "grad_norm": 0.11677198112010956, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 321870 + }, + { + "epoch": 1.2442980625009663, + "grad_norm": 0.11479967087507248, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 321880 + }, + { + "epoch": 1.2443367197043498, + "grad_norm": 0.10969112068414688, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 321890 + }, + { + "epoch": 1.244375376907733, + "grad_norm": 0.11333106458187103, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 321900 + }, + { + "epoch": 1.2444140341111163, + "grad_norm": 0.11218047887086868, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 321910 + }, + { + "epoch": 1.2444526913144995, + "grad_norm": 0.11694422364234924, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 321920 + }, + { + "epoch": 1.2444913485178828, + "grad_norm": 0.10061642527580261, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 321930 + }, + { + "epoch": 1.244530005721266, + "grad_norm": 0.11137649416923523, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 321940 + }, + { + "epoch": 1.2445686629246493, + "grad_norm": 0.10614387691020966, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 321950 + }, + { + "epoch": 1.2446073201280328, + "grad_norm": 0.11743594706058502, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 321960 + }, + { + "epoch": 1.244645977331416, + "grad_norm": 0.11147256195545197, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 321970 + }, + { + "epoch": 1.2446846345347993, + "grad_norm": 0.0961056798696518, + "learning_rate": 0.002, + "loss": 2.344, + "step": 321980 + }, + { + "epoch": 1.2447232917381825, + "grad_norm": 0.11435211449861526, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 321990 + }, + { + "epoch": 1.2447619489415658, + "grad_norm": 0.10239315778017044, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 322000 + }, + { + "epoch": 1.244800606144949, + "grad_norm": 0.10872689634561539, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 322010 + }, + { + "epoch": 1.2448392633483323, + "grad_norm": 0.10963756591081619, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 322020 + }, + { + "epoch": 1.2448779205517155, + "grad_norm": 0.11242996156215668, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 322030 + }, + { + "epoch": 1.2449165777550988, + "grad_norm": 0.1051398292183876, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 322040 + }, + { + "epoch": 1.2449552349584823, + "grad_norm": 0.11519026011228561, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 322050 + }, + { + "epoch": 1.2449938921618655, + "grad_norm": 0.09578767418861389, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 322060 + }, + { + "epoch": 1.2450325493652488, + "grad_norm": 0.10644643753767014, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 322070 + }, + { + "epoch": 1.245071206568632, + "grad_norm": 0.11046939343214035, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 322080 + }, + { + "epoch": 1.2451098637720153, + "grad_norm": 0.15813681483268738, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 322090 + }, + { + "epoch": 1.2451485209753985, + "grad_norm": 0.09647150337696075, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 322100 + }, + { + "epoch": 1.2451871781787818, + "grad_norm": 0.1031809076666832, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 322110 + }, + { + "epoch": 1.245225835382165, + "grad_norm": 0.09915446490049362, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 322120 + }, + { + "epoch": 1.2452644925855485, + "grad_norm": 0.0994412750005722, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 322130 + }, + { + "epoch": 1.2453031497889318, + "grad_norm": 0.0929538905620575, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 322140 + }, + { + "epoch": 1.245341806992315, + "grad_norm": 0.10131704062223434, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 322150 + }, + { + "epoch": 1.2453804641956983, + "grad_norm": 0.10987386107444763, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 322160 + }, + { + "epoch": 1.2454191213990815, + "grad_norm": 0.13560396432876587, + "learning_rate": 0.002, + "loss": 2.346, + "step": 322170 + }, + { + "epoch": 1.2454577786024648, + "grad_norm": 0.10473495721817017, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 322180 + }, + { + "epoch": 1.245496435805848, + "grad_norm": 0.09378011524677277, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 322190 + }, + { + "epoch": 1.2455350930092313, + "grad_norm": 0.10519225895404816, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 322200 + }, + { + "epoch": 1.2455737502126145, + "grad_norm": 0.11469787359237671, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 322210 + }, + { + "epoch": 1.245612407415998, + "grad_norm": 0.10396713018417358, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 322220 + }, + { + "epoch": 1.2456510646193812, + "grad_norm": 0.09106788784265518, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 322230 + }, + { + "epoch": 1.2456897218227645, + "grad_norm": 0.10046976059675217, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 322240 + }, + { + "epoch": 1.2457283790261477, + "grad_norm": 0.10367235541343689, + "learning_rate": 0.002, + "loss": 2.334, + "step": 322250 + }, + { + "epoch": 1.245767036229531, + "grad_norm": 0.11166314780712128, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 322260 + }, + { + "epoch": 1.2458056934329143, + "grad_norm": 0.09072278439998627, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 322270 + }, + { + "epoch": 1.2458443506362975, + "grad_norm": 0.10355474799871445, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 322280 + }, + { + "epoch": 1.2458830078396808, + "grad_norm": 0.10259489715099335, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 322290 + }, + { + "epoch": 1.2459216650430642, + "grad_norm": 0.12334828078746796, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 322300 + }, + { + "epoch": 1.2459603222464475, + "grad_norm": 0.12639813125133514, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 322310 + }, + { + "epoch": 1.2459989794498307, + "grad_norm": 0.10717806220054626, + "learning_rate": 0.002, + "loss": 2.34, + "step": 322320 + }, + { + "epoch": 1.246037636653214, + "grad_norm": 0.10780540853738785, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 322330 + }, + { + "epoch": 1.2460762938565972, + "grad_norm": 0.12163043767213821, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 322340 + }, + { + "epoch": 1.2461149510599805, + "grad_norm": 0.10512256622314453, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 322350 + }, + { + "epoch": 1.2461536082633637, + "grad_norm": 0.11891256272792816, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 322360 + }, + { + "epoch": 1.246192265466747, + "grad_norm": 0.08721699565649033, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 322370 + }, + { + "epoch": 1.2462309226701302, + "grad_norm": 0.10301852226257324, + "learning_rate": 0.002, + "loss": 2.316, + "step": 322380 + }, + { + "epoch": 1.2462695798735137, + "grad_norm": 0.10433807969093323, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 322390 + }, + { + "epoch": 1.246308237076897, + "grad_norm": 0.11882524937391281, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 322400 + }, + { + "epoch": 1.2463468942802802, + "grad_norm": 0.09801863133907318, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 322410 + }, + { + "epoch": 1.2463855514836635, + "grad_norm": 0.10055270045995712, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 322420 + }, + { + "epoch": 1.2464242086870467, + "grad_norm": 0.11083211749792099, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 322430 + }, + { + "epoch": 1.24646286589043, + "grad_norm": 0.11715829372406006, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 322440 + }, + { + "epoch": 1.2465015230938132, + "grad_norm": 0.10977619141340256, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 322450 + }, + { + "epoch": 1.2465401802971965, + "grad_norm": 0.0934174656867981, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 322460 + }, + { + "epoch": 1.24657883750058, + "grad_norm": 0.10772105306386948, + "learning_rate": 0.002, + "loss": 2.3542, + "step": 322470 + }, + { + "epoch": 1.2466174947039632, + "grad_norm": 0.10033559799194336, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 322480 + }, + { + "epoch": 1.2466561519073465, + "grad_norm": 0.1018151119351387, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 322490 + }, + { + "epoch": 1.2466948091107297, + "grad_norm": 0.11213316768407822, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 322500 + }, + { + "epoch": 1.246733466314113, + "grad_norm": 0.10616469383239746, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 322510 + }, + { + "epoch": 1.2467721235174962, + "grad_norm": 0.09601625800132751, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 322520 + }, + { + "epoch": 1.2468107807208795, + "grad_norm": 0.09934542328119278, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 322530 + }, + { + "epoch": 1.2468494379242627, + "grad_norm": 0.09939441829919815, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 322540 + }, + { + "epoch": 1.246888095127646, + "grad_norm": 0.11408967524766922, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 322550 + }, + { + "epoch": 1.2469267523310295, + "grad_norm": 0.10870948433876038, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 322560 + }, + { + "epoch": 1.2469654095344127, + "grad_norm": 0.10187172889709473, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 322570 + }, + { + "epoch": 1.247004066737796, + "grad_norm": 0.09450990706682205, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 322580 + }, + { + "epoch": 1.2470427239411792, + "grad_norm": 0.09321904927492142, + "learning_rate": 0.002, + "loss": 2.324, + "step": 322590 + }, + { + "epoch": 1.2470813811445625, + "grad_norm": 0.09764519333839417, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 322600 + }, + { + "epoch": 1.2471200383479457, + "grad_norm": 0.10473806411027908, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 322610 + }, + { + "epoch": 1.247158695551329, + "grad_norm": 0.107349693775177, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 322620 + }, + { + "epoch": 1.2471973527547122, + "grad_norm": 0.12023581564426422, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 322630 + }, + { + "epoch": 1.2472360099580957, + "grad_norm": 0.09608040750026703, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 322640 + }, + { + "epoch": 1.247274667161479, + "grad_norm": 0.10328279435634613, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 322650 + }, + { + "epoch": 1.2473133243648622, + "grad_norm": 0.09133925288915634, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 322660 + }, + { + "epoch": 1.2473519815682454, + "grad_norm": 0.10648240149021149, + "learning_rate": 0.002, + "loss": 2.342, + "step": 322670 + }, + { + "epoch": 1.2473906387716287, + "grad_norm": 0.10589372366666794, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 322680 + }, + { + "epoch": 1.247429295975012, + "grad_norm": 0.10525405406951904, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 322690 + }, + { + "epoch": 1.2474679531783952, + "grad_norm": 0.10498455166816711, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 322700 + }, + { + "epoch": 1.2475066103817785, + "grad_norm": 0.10058266669511795, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 322710 + }, + { + "epoch": 1.2475452675851617, + "grad_norm": 0.11225242912769318, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 322720 + }, + { + "epoch": 1.2475839247885452, + "grad_norm": 0.122991181910038, + "learning_rate": 0.002, + "loss": 2.334, + "step": 322730 + }, + { + "epoch": 1.2476225819919284, + "grad_norm": 0.09694638103246689, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 322740 + }, + { + "epoch": 1.2476612391953117, + "grad_norm": 0.09865941852331161, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 322750 + }, + { + "epoch": 1.247699896398695, + "grad_norm": 0.10031592845916748, + "learning_rate": 0.002, + "loss": 2.327, + "step": 322760 + }, + { + "epoch": 1.2477385536020782, + "grad_norm": 0.11467334628105164, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 322770 + }, + { + "epoch": 1.2477772108054614, + "grad_norm": 0.10169904679059982, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 322780 + }, + { + "epoch": 1.2478158680088447, + "grad_norm": 0.09758496284484863, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 322790 + }, + { + "epoch": 1.247854525212228, + "grad_norm": 0.12482014298439026, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 322800 + }, + { + "epoch": 1.2478931824156114, + "grad_norm": 0.11974287778139114, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 322810 + }, + { + "epoch": 1.2479318396189947, + "grad_norm": 0.1108810156583786, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 322820 + }, + { + "epoch": 1.247970496822378, + "grad_norm": 0.09258130937814713, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 322830 + }, + { + "epoch": 1.2480091540257612, + "grad_norm": 0.09061979502439499, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 322840 + }, + { + "epoch": 1.2480478112291444, + "grad_norm": 0.10905580222606659, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 322850 + }, + { + "epoch": 1.2480864684325277, + "grad_norm": 0.11101836711168289, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 322860 + }, + { + "epoch": 1.248125125635911, + "grad_norm": 0.11768799275159836, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 322870 + }, + { + "epoch": 1.2481637828392942, + "grad_norm": 0.11336436867713928, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 322880 + }, + { + "epoch": 1.2482024400426774, + "grad_norm": 0.11874739080667496, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 322890 + }, + { + "epoch": 1.248241097246061, + "grad_norm": 0.10988987237215042, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 322900 + }, + { + "epoch": 1.2482797544494442, + "grad_norm": 0.09862332046031952, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 322910 + }, + { + "epoch": 1.2483184116528274, + "grad_norm": 0.10089791566133499, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 322920 + }, + { + "epoch": 1.2483570688562107, + "grad_norm": 0.10909537971019745, + "learning_rate": 0.002, + "loss": 2.322, + "step": 322930 + }, + { + "epoch": 1.248395726059594, + "grad_norm": 0.13516369462013245, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 322940 + }, + { + "epoch": 1.2484343832629772, + "grad_norm": 0.1360653042793274, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 322950 + }, + { + "epoch": 1.2484730404663604, + "grad_norm": 0.10364606231451035, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 322960 + }, + { + "epoch": 1.248511697669744, + "grad_norm": 0.09633604437112808, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 322970 + }, + { + "epoch": 1.2485503548731272, + "grad_norm": 0.37557491660118103, + "learning_rate": 0.002, + "loss": 2.3774, + "step": 322980 + }, + { + "epoch": 1.2485890120765104, + "grad_norm": 0.11646705120801926, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 322990 + }, + { + "epoch": 1.2486276692798937, + "grad_norm": 0.13214127719402313, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 323000 + }, + { + "epoch": 1.248666326483277, + "grad_norm": 0.08824113756418228, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 323010 + }, + { + "epoch": 1.2487049836866602, + "grad_norm": 0.10804717242717743, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 323020 + }, + { + "epoch": 1.2487436408900434, + "grad_norm": 0.10116700828075409, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 323030 + }, + { + "epoch": 1.2487822980934267, + "grad_norm": 0.121637724339962, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 323040 + }, + { + "epoch": 1.24882095529681, + "grad_norm": 0.12432374805212021, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 323050 + }, + { + "epoch": 1.2488596125001932, + "grad_norm": 0.09569886326789856, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 323060 + }, + { + "epoch": 1.2488982697035766, + "grad_norm": 0.08894059807062149, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 323070 + }, + { + "epoch": 1.24893692690696, + "grad_norm": 0.10931940376758575, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 323080 + }, + { + "epoch": 1.2489755841103432, + "grad_norm": 0.11267866939306259, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 323090 + }, + { + "epoch": 1.2490142413137264, + "grad_norm": 0.09634686261415482, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 323100 + }, + { + "epoch": 1.2490528985171097, + "grad_norm": 0.10163258016109467, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 323110 + }, + { + "epoch": 1.249091555720493, + "grad_norm": 0.09575599431991577, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 323120 + }, + { + "epoch": 1.2491302129238762, + "grad_norm": 0.10602719336748123, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 323130 + }, + { + "epoch": 1.2491688701272596, + "grad_norm": 0.10932130366563797, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 323140 + }, + { + "epoch": 1.2492075273306429, + "grad_norm": 0.10526705533266068, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 323150 + }, + { + "epoch": 1.2492461845340261, + "grad_norm": 0.0988556370139122, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 323160 + }, + { + "epoch": 1.2492848417374094, + "grad_norm": 0.10179826617240906, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 323170 + }, + { + "epoch": 1.2493234989407926, + "grad_norm": 0.0974382758140564, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 323180 + }, + { + "epoch": 1.249362156144176, + "grad_norm": 0.10001240670681, + "learning_rate": 0.002, + "loss": 2.328, + "step": 323190 + }, + { + "epoch": 1.2494008133475591, + "grad_norm": 0.10301165282726288, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 323200 + }, + { + "epoch": 1.2494394705509424, + "grad_norm": 0.1391913890838623, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 323210 + }, + { + "epoch": 1.2494781277543257, + "grad_norm": 0.10194188356399536, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 323220 + }, + { + "epoch": 1.249516784957709, + "grad_norm": 0.10584280639886856, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 323230 + }, + { + "epoch": 1.2495554421610924, + "grad_norm": 0.10867606103420258, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 323240 + }, + { + "epoch": 1.2495940993644756, + "grad_norm": 0.0992872565984726, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 323250 + }, + { + "epoch": 1.2496327565678589, + "grad_norm": 0.12357765436172485, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 323260 + }, + { + "epoch": 1.2496714137712421, + "grad_norm": 0.08753013610839844, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 323270 + }, + { + "epoch": 1.2497100709746254, + "grad_norm": 0.09192200005054474, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 323280 + }, + { + "epoch": 1.2497487281780086, + "grad_norm": 0.1059727743268013, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 323290 + }, + { + "epoch": 1.249787385381392, + "grad_norm": 0.10856989771127701, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 323300 + }, + { + "epoch": 1.2498260425847754, + "grad_norm": 0.10081683099269867, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 323310 + }, + { + "epoch": 1.2498646997881586, + "grad_norm": 0.10372840613126755, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 323320 + }, + { + "epoch": 1.2499033569915419, + "grad_norm": 0.09857896715402603, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 323330 + }, + { + "epoch": 1.2499420141949251, + "grad_norm": 0.10680362582206726, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 323340 + }, + { + "epoch": 1.2499806713983084, + "grad_norm": 0.1055355817079544, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 323350 + }, + { + "epoch": 1.2500193286016916, + "grad_norm": 0.12696675956249237, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 323360 + }, + { + "epoch": 1.2500579858050749, + "grad_norm": 0.09672246128320694, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 323370 + }, + { + "epoch": 1.2500966430084581, + "grad_norm": 0.0960371345281601, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 323380 + }, + { + "epoch": 1.2501353002118414, + "grad_norm": 0.09530273079872131, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 323390 + }, + { + "epoch": 1.2501739574152246, + "grad_norm": 0.17731760442256927, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 323400 + }, + { + "epoch": 1.250212614618608, + "grad_norm": 0.11541884392499924, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 323410 + }, + { + "epoch": 1.2502512718219914, + "grad_norm": 0.09873352199792862, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 323420 + }, + { + "epoch": 1.2502899290253746, + "grad_norm": 0.13132119178771973, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 323430 + }, + { + "epoch": 1.2503285862287579, + "grad_norm": 0.11925432085990906, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 323440 + }, + { + "epoch": 1.2503672434321411, + "grad_norm": 0.10793092101812363, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 323450 + }, + { + "epoch": 1.2504059006355244, + "grad_norm": 0.10153786092996597, + "learning_rate": 0.002, + "loss": 2.336, + "step": 323460 + }, + { + "epoch": 1.2504445578389076, + "grad_norm": 0.10431569069623947, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 323470 + }, + { + "epoch": 1.250483215042291, + "grad_norm": 0.10726450383663177, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 323480 + }, + { + "epoch": 1.2505218722456743, + "grad_norm": 0.10803461819887161, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 323490 + }, + { + "epoch": 1.2505605294490576, + "grad_norm": 0.10541044175624847, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 323500 + }, + { + "epoch": 1.2505991866524409, + "grad_norm": 0.10016429424285889, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 323510 + }, + { + "epoch": 1.250637843855824, + "grad_norm": 0.09697940200567245, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 323520 + }, + { + "epoch": 1.2506765010592074, + "grad_norm": 0.09459806233644485, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 323530 + }, + { + "epoch": 1.2507151582625906, + "grad_norm": 0.10965185612440109, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 323540 + }, + { + "epoch": 1.2507538154659739, + "grad_norm": 0.1285904347896576, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 323550 + }, + { + "epoch": 1.2507924726693571, + "grad_norm": 0.11946802586317062, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 323560 + }, + { + "epoch": 1.2508311298727404, + "grad_norm": 0.11244595050811768, + "learning_rate": 0.002, + "loss": 2.35, + "step": 323570 + }, + { + "epoch": 1.2508697870761238, + "grad_norm": 0.10817580670118332, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 323580 + }, + { + "epoch": 1.250908444279507, + "grad_norm": 0.12300717085599899, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 323590 + }, + { + "epoch": 1.2509471014828903, + "grad_norm": 0.10982823371887207, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 323600 + }, + { + "epoch": 1.2509857586862736, + "grad_norm": 0.10455646365880966, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 323610 + }, + { + "epoch": 1.2510244158896568, + "grad_norm": 0.09407669305801392, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 323620 + }, + { + "epoch": 1.25106307309304, + "grad_norm": 0.14270919561386108, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 323630 + }, + { + "epoch": 1.2511017302964234, + "grad_norm": 0.09745250642299652, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 323640 + }, + { + "epoch": 1.2511403874998068, + "grad_norm": 0.10952335596084595, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 323650 + }, + { + "epoch": 1.25117904470319, + "grad_norm": 0.10126937925815582, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 323660 + }, + { + "epoch": 1.2512177019065733, + "grad_norm": 0.09247476607561111, + "learning_rate": 0.002, + "loss": 2.343, + "step": 323670 + }, + { + "epoch": 1.2512563591099566, + "grad_norm": 0.1153465211391449, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 323680 + }, + { + "epoch": 1.2512950163133398, + "grad_norm": 0.1340150535106659, + "learning_rate": 0.002, + "loss": 2.353, + "step": 323690 + }, + { + "epoch": 1.251333673516723, + "grad_norm": 0.11080127209424973, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 323700 + }, + { + "epoch": 1.2513723307201063, + "grad_norm": 0.13601601123809814, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 323710 + }, + { + "epoch": 1.2514109879234896, + "grad_norm": 0.10433941334486008, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 323720 + }, + { + "epoch": 1.2514496451268728, + "grad_norm": 0.10418447852134705, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 323730 + }, + { + "epoch": 1.251488302330256, + "grad_norm": 0.10729973763227463, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 323740 + }, + { + "epoch": 1.2515269595336396, + "grad_norm": 0.09255962818861008, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 323750 + }, + { + "epoch": 1.2515656167370228, + "grad_norm": 0.12399463355541229, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 323760 + }, + { + "epoch": 1.251604273940406, + "grad_norm": 0.1212589368224144, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 323770 + }, + { + "epoch": 1.2516429311437893, + "grad_norm": 0.10145343095064163, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 323780 + }, + { + "epoch": 1.2516815883471726, + "grad_norm": 0.11932121962308884, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 323790 + }, + { + "epoch": 1.2517202455505558, + "grad_norm": 0.08953940868377686, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 323800 + }, + { + "epoch": 1.2517589027539393, + "grad_norm": 0.09370152652263641, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 323810 + }, + { + "epoch": 1.2517975599573226, + "grad_norm": 0.10805421322584152, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 323820 + }, + { + "epoch": 1.2518362171607058, + "grad_norm": 0.09570808708667755, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 323830 + }, + { + "epoch": 1.251874874364089, + "grad_norm": 0.13799162209033966, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 323840 + }, + { + "epoch": 1.2519135315674723, + "grad_norm": 0.11238276213407516, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 323850 + }, + { + "epoch": 1.2519521887708556, + "grad_norm": 0.12940475344657898, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 323860 + }, + { + "epoch": 1.2519908459742388, + "grad_norm": 0.1093161404132843, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 323870 + }, + { + "epoch": 1.252029503177622, + "grad_norm": 0.1395666003227234, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 323880 + }, + { + "epoch": 1.2520681603810053, + "grad_norm": 0.1023389920592308, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 323890 + }, + { + "epoch": 1.2521068175843886, + "grad_norm": 0.1550685316324234, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 323900 + }, + { + "epoch": 1.2521454747877718, + "grad_norm": 0.11096793413162231, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 323910 + }, + { + "epoch": 1.2521841319911553, + "grad_norm": 0.0967497006058693, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 323920 + }, + { + "epoch": 1.2522227891945386, + "grad_norm": 0.11543576419353485, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 323930 + }, + { + "epoch": 1.2522614463979218, + "grad_norm": 0.11807817220687866, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 323940 + }, + { + "epoch": 1.252300103601305, + "grad_norm": 0.10806535929441452, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 323950 + }, + { + "epoch": 1.2523387608046883, + "grad_norm": 0.12325377762317657, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 323960 + }, + { + "epoch": 1.2523774180080716, + "grad_norm": 0.10087354481220245, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 323970 + }, + { + "epoch": 1.252416075211455, + "grad_norm": 0.12561774253845215, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 323980 + }, + { + "epoch": 1.2524547324148383, + "grad_norm": 0.3844805657863617, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 323990 + }, + { + "epoch": 1.2524933896182215, + "grad_norm": 0.0988330990076065, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 324000 + }, + { + "epoch": 1.2525320468216048, + "grad_norm": 0.11373604089021683, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 324010 + }, + { + "epoch": 1.252570704024988, + "grad_norm": 0.10148126631975174, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 324020 + }, + { + "epoch": 1.2526093612283713, + "grad_norm": 0.09937749803066254, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 324030 + }, + { + "epoch": 1.2526480184317546, + "grad_norm": 0.11062469333410263, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 324040 + }, + { + "epoch": 1.2526866756351378, + "grad_norm": 0.09238646179437637, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 324050 + }, + { + "epoch": 1.252725332838521, + "grad_norm": 0.11890768259763718, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 324060 + }, + { + "epoch": 1.2527639900419043, + "grad_norm": 0.1008533462882042, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 324070 + }, + { + "epoch": 1.2528026472452876, + "grad_norm": 0.11144950240850449, + "learning_rate": 0.002, + "loss": 2.3587, + "step": 324080 + }, + { + "epoch": 1.252841304448671, + "grad_norm": 0.09708874672651291, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 324090 + }, + { + "epoch": 1.2528799616520543, + "grad_norm": 0.10695436596870422, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 324100 + }, + { + "epoch": 1.2529186188554375, + "grad_norm": 0.09778022766113281, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 324110 + }, + { + "epoch": 1.2529572760588208, + "grad_norm": 0.09368152171373367, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 324120 + }, + { + "epoch": 1.252995933262204, + "grad_norm": 0.09820955246686935, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 324130 + }, + { + "epoch": 1.2530345904655873, + "grad_norm": 0.09251420944929123, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 324140 + }, + { + "epoch": 1.2530732476689708, + "grad_norm": 0.09938377887010574, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 324150 + }, + { + "epoch": 1.253111904872354, + "grad_norm": 0.11175873130559921, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 324160 + }, + { + "epoch": 1.2531505620757373, + "grad_norm": 0.1269582211971283, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 324170 + }, + { + "epoch": 1.2531892192791205, + "grad_norm": 0.1151537373661995, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 324180 + }, + { + "epoch": 1.2532278764825038, + "grad_norm": 0.10594908148050308, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 324190 + }, + { + "epoch": 1.253266533685887, + "grad_norm": 0.11179118603467941, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 324200 + }, + { + "epoch": 1.2533051908892703, + "grad_norm": 0.10564833134412766, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 324210 + }, + { + "epoch": 1.2533438480926535, + "grad_norm": 0.12424666434526443, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 324220 + }, + { + "epoch": 1.2533825052960368, + "grad_norm": 0.10340416431427002, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 324230 + }, + { + "epoch": 1.25342116249942, + "grad_norm": 0.10386926680803299, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 324240 + }, + { + "epoch": 1.2534598197028033, + "grad_norm": 0.3834894001483917, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 324250 + }, + { + "epoch": 1.2534984769061868, + "grad_norm": 0.11524228006601334, + "learning_rate": 0.002, + "loss": 2.34, + "step": 324260 + }, + { + "epoch": 1.25353713410957, + "grad_norm": 0.12251781672239304, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 324270 + }, + { + "epoch": 1.2535757913129533, + "grad_norm": 0.10764903575181961, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 324280 + }, + { + "epoch": 1.2536144485163365, + "grad_norm": 0.09343133121728897, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 324290 + }, + { + "epoch": 1.2536531057197198, + "grad_norm": 0.11491622775793076, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 324300 + }, + { + "epoch": 1.253691762923103, + "grad_norm": 0.112935371696949, + "learning_rate": 0.002, + "loss": 2.349, + "step": 324310 + }, + { + "epoch": 1.2537304201264865, + "grad_norm": 0.09517694264650345, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 324320 + }, + { + "epoch": 1.2537690773298698, + "grad_norm": 0.10657618939876556, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 324330 + }, + { + "epoch": 1.253807734533253, + "grad_norm": 0.10911275446414948, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 324340 + }, + { + "epoch": 1.2538463917366363, + "grad_norm": 0.10177423059940338, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 324350 + }, + { + "epoch": 1.2538850489400195, + "grad_norm": 0.09765344858169556, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 324360 + }, + { + "epoch": 1.2539237061434028, + "grad_norm": 0.10047116875648499, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 324370 + }, + { + "epoch": 1.253962363346786, + "grad_norm": 0.10966706275939941, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 324380 + }, + { + "epoch": 1.2540010205501693, + "grad_norm": 0.11583580821752548, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 324390 + }, + { + "epoch": 1.2540396777535525, + "grad_norm": 0.09870652109384537, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 324400 + }, + { + "epoch": 1.2540783349569358, + "grad_norm": 0.08850183337926865, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 324410 + }, + { + "epoch": 1.254116992160319, + "grad_norm": 0.09612442553043365, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 324420 + }, + { + "epoch": 1.2541556493637025, + "grad_norm": 0.12101586163043976, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 324430 + }, + { + "epoch": 1.2541943065670857, + "grad_norm": 0.10274821519851685, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 324440 + }, + { + "epoch": 1.254232963770469, + "grad_norm": 0.09802849590778351, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 324450 + }, + { + "epoch": 1.2542716209738523, + "grad_norm": 0.09413763135671616, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 324460 + }, + { + "epoch": 1.2543102781772355, + "grad_norm": 0.0963965356349945, + "learning_rate": 0.002, + "loss": 2.33, + "step": 324470 + }, + { + "epoch": 1.2543489353806188, + "grad_norm": 0.14099185168743134, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 324480 + }, + { + "epoch": 1.2543875925840022, + "grad_norm": 0.09496639668941498, + "learning_rate": 0.002, + "loss": 2.332, + "step": 324490 + }, + { + "epoch": 1.2544262497873855, + "grad_norm": 0.09903492778539658, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 324500 + }, + { + "epoch": 1.2544649069907687, + "grad_norm": 0.11639129370450974, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 324510 + }, + { + "epoch": 1.254503564194152, + "grad_norm": 0.1040710061788559, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 324520 + }, + { + "epoch": 1.2545422213975352, + "grad_norm": 0.09413284063339233, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 324530 + }, + { + "epoch": 1.2545808786009185, + "grad_norm": 0.09563596546649933, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 324540 + }, + { + "epoch": 1.2546195358043017, + "grad_norm": 0.10063523054122925, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 324550 + }, + { + "epoch": 1.254658193007685, + "grad_norm": 0.1292758584022522, + "learning_rate": 0.002, + "loss": 2.329, + "step": 324560 + }, + { + "epoch": 1.2546968502110682, + "grad_norm": 0.12233975529670715, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 324570 + }, + { + "epoch": 1.2547355074144515, + "grad_norm": 0.10087815672159195, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 324580 + }, + { + "epoch": 1.2547741646178348, + "grad_norm": 0.09283623099327087, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 324590 + }, + { + "epoch": 1.2548128218212182, + "grad_norm": 0.11724922806024551, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 324600 + }, + { + "epoch": 1.2548514790246015, + "grad_norm": 0.0939897671341896, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 324610 + }, + { + "epoch": 1.2548901362279847, + "grad_norm": 0.09834831953048706, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 324620 + }, + { + "epoch": 1.254928793431368, + "grad_norm": 0.10058873146772385, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 324630 + }, + { + "epoch": 1.2549674506347512, + "grad_norm": 0.11193490773439407, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 324640 + }, + { + "epoch": 1.2550061078381345, + "grad_norm": 0.1047125980257988, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 324650 + }, + { + "epoch": 1.255044765041518, + "grad_norm": 0.12173046916723251, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 324660 + }, + { + "epoch": 1.2550834222449012, + "grad_norm": 0.09190022200345993, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 324670 + }, + { + "epoch": 1.2551220794482845, + "grad_norm": 0.09540484100580215, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 324680 + }, + { + "epoch": 1.2551607366516677, + "grad_norm": 0.10608922690153122, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 324690 + }, + { + "epoch": 1.255199393855051, + "grad_norm": 0.10679183155298233, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 324700 + }, + { + "epoch": 1.2552380510584342, + "grad_norm": 0.10198316723108292, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 324710 + }, + { + "epoch": 1.2552767082618175, + "grad_norm": 0.09413036704063416, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 324720 + }, + { + "epoch": 1.2553153654652007, + "grad_norm": 0.09942302852869034, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 324730 + }, + { + "epoch": 1.255354022668584, + "grad_norm": 0.10988292098045349, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 324740 + }, + { + "epoch": 1.2553926798719672, + "grad_norm": 0.10978883504867554, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 324750 + }, + { + "epoch": 1.2554313370753505, + "grad_norm": 0.09921663254499435, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 324760 + }, + { + "epoch": 1.255469994278734, + "grad_norm": 0.10996361821889877, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 324770 + }, + { + "epoch": 1.2555086514821172, + "grad_norm": 0.10230149328708649, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 324780 + }, + { + "epoch": 1.2555473086855005, + "grad_norm": 0.11551018804311752, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 324790 + }, + { + "epoch": 1.2555859658888837, + "grad_norm": 0.10201035439968109, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 324800 + }, + { + "epoch": 1.255624623092267, + "grad_norm": 0.09801744669675827, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 324810 + }, + { + "epoch": 1.2556632802956502, + "grad_norm": 0.0981050655245781, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 324820 + }, + { + "epoch": 1.2557019374990337, + "grad_norm": 0.11296948045492172, + "learning_rate": 0.002, + "loss": 2.332, + "step": 324830 + }, + { + "epoch": 1.255740594702417, + "grad_norm": 0.11516579985618591, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 324840 + }, + { + "epoch": 1.2557792519058002, + "grad_norm": 0.10614938288927078, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 324850 + }, + { + "epoch": 1.2558179091091835, + "grad_norm": 0.09583014249801636, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 324860 + }, + { + "epoch": 1.2558565663125667, + "grad_norm": 0.10878030955791473, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 324870 + }, + { + "epoch": 1.25589522351595, + "grad_norm": 0.09600366652011871, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 324880 + }, + { + "epoch": 1.2559338807193332, + "grad_norm": 0.10757233202457428, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 324890 + }, + { + "epoch": 1.2559725379227165, + "grad_norm": 0.11732026934623718, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 324900 + }, + { + "epoch": 1.2560111951260997, + "grad_norm": 0.09119915962219238, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 324910 + }, + { + "epoch": 1.256049852329483, + "grad_norm": 0.10209295153617859, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 324920 + }, + { + "epoch": 1.2560885095328664, + "grad_norm": 0.1162097305059433, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 324930 + }, + { + "epoch": 1.2561271667362497, + "grad_norm": 0.1079174280166626, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 324940 + }, + { + "epoch": 1.256165823939633, + "grad_norm": 0.12759262323379517, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 324950 + }, + { + "epoch": 1.2562044811430162, + "grad_norm": 0.10023845732212067, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 324960 + }, + { + "epoch": 1.2562431383463994, + "grad_norm": 0.10516351461410522, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 324970 + }, + { + "epoch": 1.2562817955497827, + "grad_norm": 0.11115523427724838, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 324980 + }, + { + "epoch": 1.256320452753166, + "grad_norm": 0.11688680946826935, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 324990 + }, + { + "epoch": 1.2563591099565494, + "grad_norm": 0.09157036989927292, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 325000 + }, + { + "epoch": 1.2563977671599327, + "grad_norm": 0.09125621616840363, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 325010 + }, + { + "epoch": 1.256436424363316, + "grad_norm": 0.1386650800704956, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 325020 + }, + { + "epoch": 1.2564750815666992, + "grad_norm": 0.10481204837560654, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 325030 + }, + { + "epoch": 1.2565137387700824, + "grad_norm": 0.11478083580732346, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 325040 + }, + { + "epoch": 1.2565523959734657, + "grad_norm": 0.09643019735813141, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 325050 + }, + { + "epoch": 1.256591053176849, + "grad_norm": 0.11193980276584625, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 325060 + }, + { + "epoch": 1.2566297103802322, + "grad_norm": 0.10178869217634201, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 325070 + }, + { + "epoch": 1.2566683675836154, + "grad_norm": 0.13125890493392944, + "learning_rate": 0.002, + "loss": 2.336, + "step": 325080 + }, + { + "epoch": 1.2567070247869987, + "grad_norm": 0.09475627541542053, + "learning_rate": 0.002, + "loss": 2.337, + "step": 325090 + }, + { + "epoch": 1.2567456819903822, + "grad_norm": 0.0955866202712059, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 325100 + }, + { + "epoch": 1.2567843391937654, + "grad_norm": 0.10707160085439682, + "learning_rate": 0.002, + "loss": 2.331, + "step": 325110 + }, + { + "epoch": 1.2568229963971487, + "grad_norm": 0.09212376177310944, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 325120 + }, + { + "epoch": 1.256861653600532, + "grad_norm": 0.12452114373445511, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 325130 + }, + { + "epoch": 1.2569003108039152, + "grad_norm": 0.12266793102025986, + "learning_rate": 0.002, + "loss": 2.349, + "step": 325140 + }, + { + "epoch": 1.2569389680072984, + "grad_norm": 0.1223602294921875, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 325150 + }, + { + "epoch": 1.2569776252106817, + "grad_norm": 0.1065744161605835, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 325160 + }, + { + "epoch": 1.2570162824140652, + "grad_norm": 0.09754288196563721, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 325170 + }, + { + "epoch": 1.2570549396174484, + "grad_norm": 0.1234099268913269, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 325180 + }, + { + "epoch": 1.2570935968208317, + "grad_norm": 0.1373051553964615, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 325190 + }, + { + "epoch": 1.257132254024215, + "grad_norm": 0.09725300222635269, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 325200 + }, + { + "epoch": 1.2571709112275982, + "grad_norm": 0.10427434742450714, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 325210 + }, + { + "epoch": 1.2572095684309814, + "grad_norm": 0.1223810687661171, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 325220 + }, + { + "epoch": 1.2572482256343647, + "grad_norm": 0.09645023941993713, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 325230 + }, + { + "epoch": 1.257286882837748, + "grad_norm": 0.08541225641965866, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 325240 + }, + { + "epoch": 1.2573255400411312, + "grad_norm": 0.09978261590003967, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 325250 + }, + { + "epoch": 1.2573641972445144, + "grad_norm": 0.11347880959510803, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 325260 + }, + { + "epoch": 1.257402854447898, + "grad_norm": 0.11747587472200394, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 325270 + }, + { + "epoch": 1.2574415116512812, + "grad_norm": 0.1173802837729454, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 325280 + }, + { + "epoch": 1.2574801688546644, + "grad_norm": 0.10309915244579315, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 325290 + }, + { + "epoch": 1.2575188260580477, + "grad_norm": 0.0944056510925293, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 325300 + }, + { + "epoch": 1.257557483261431, + "grad_norm": 0.1091337502002716, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 325310 + }, + { + "epoch": 1.2575961404648142, + "grad_norm": 0.11433182656764984, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 325320 + }, + { + "epoch": 1.2576347976681974, + "grad_norm": 0.13041917979717255, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 325330 + }, + { + "epoch": 1.2576734548715809, + "grad_norm": 0.1184714138507843, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 325340 + }, + { + "epoch": 1.2577121120749641, + "grad_norm": 0.0960577055811882, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 325350 + }, + { + "epoch": 1.2577507692783474, + "grad_norm": 0.09737391024827957, + "learning_rate": 0.002, + "loss": 2.322, + "step": 325360 + }, + { + "epoch": 1.2577894264817306, + "grad_norm": 0.11727464944124222, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 325370 + }, + { + "epoch": 1.257828083685114, + "grad_norm": 0.1169314980506897, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 325380 + }, + { + "epoch": 1.2578667408884971, + "grad_norm": 0.09046033769845963, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 325390 + }, + { + "epoch": 1.2579053980918804, + "grad_norm": 0.09294285625219345, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 325400 + }, + { + "epoch": 1.2579440552952637, + "grad_norm": 0.1019824743270874, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 325410 + }, + { + "epoch": 1.257982712498647, + "grad_norm": 0.11393209546804428, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 325420 + }, + { + "epoch": 1.2580213697020302, + "grad_norm": 0.10201390832662582, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 325430 + }, + { + "epoch": 1.2580600269054136, + "grad_norm": 0.09985291212797165, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 325440 + }, + { + "epoch": 1.2580986841087969, + "grad_norm": 0.10562863200902939, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 325450 + }, + { + "epoch": 1.2581373413121801, + "grad_norm": 0.09784512966871262, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 325460 + }, + { + "epoch": 1.2581759985155634, + "grad_norm": 0.10671749711036682, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 325470 + }, + { + "epoch": 1.2582146557189466, + "grad_norm": 0.12761123478412628, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 325480 + }, + { + "epoch": 1.25825331292233, + "grad_norm": 0.09099403768777847, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 325490 + }, + { + "epoch": 1.2582919701257131, + "grad_norm": 0.0952981561422348, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 325500 + }, + { + "epoch": 1.2583306273290966, + "grad_norm": 0.12638869881629944, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 325510 + }, + { + "epoch": 1.2583692845324799, + "grad_norm": 0.09895528107881546, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 325520 + }, + { + "epoch": 1.2584079417358631, + "grad_norm": 0.10875288397073746, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 325530 + }, + { + "epoch": 1.2584465989392464, + "grad_norm": 0.10601435601711273, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 325540 + }, + { + "epoch": 1.2584852561426296, + "grad_norm": 0.09812024980783463, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 325550 + }, + { + "epoch": 1.2585239133460129, + "grad_norm": 0.10472586005926132, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 325560 + }, + { + "epoch": 1.2585625705493961, + "grad_norm": 0.09921694546937943, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 325570 + }, + { + "epoch": 1.2586012277527794, + "grad_norm": 0.10477118194103241, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 325580 + }, + { + "epoch": 1.2586398849561626, + "grad_norm": 0.09114798158407211, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 325590 + }, + { + "epoch": 1.2586785421595459, + "grad_norm": 0.0994727835059166, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 325600 + }, + { + "epoch": 1.2587171993629294, + "grad_norm": 0.09729986637830734, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 325610 + }, + { + "epoch": 1.2587558565663126, + "grad_norm": 0.1109003946185112, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 325620 + }, + { + "epoch": 1.2587945137696959, + "grad_norm": 0.09484990686178207, + "learning_rate": 0.002, + "loss": 2.349, + "step": 325630 + }, + { + "epoch": 1.2588331709730791, + "grad_norm": 0.11547428369522095, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 325640 + }, + { + "epoch": 1.2588718281764624, + "grad_norm": 0.11740986257791519, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 325650 + }, + { + "epoch": 1.2589104853798456, + "grad_norm": 0.10123540461063385, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 325660 + }, + { + "epoch": 1.258949142583229, + "grad_norm": 0.1007891520857811, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 325670 + }, + { + "epoch": 1.2589877997866123, + "grad_norm": 0.1192469522356987, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 325680 + }, + { + "epoch": 1.2590264569899956, + "grad_norm": 0.10603582859039307, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 325690 + }, + { + "epoch": 1.2590651141933789, + "grad_norm": 0.10165327042341232, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 325700 + }, + { + "epoch": 1.259103771396762, + "grad_norm": 0.11069201678037643, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 325710 + }, + { + "epoch": 1.2591424286001454, + "grad_norm": 0.10442770272493362, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 325720 + }, + { + "epoch": 1.2591810858035286, + "grad_norm": 0.10615067183971405, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 325730 + }, + { + "epoch": 1.2592197430069119, + "grad_norm": 0.09877041727304459, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 325740 + }, + { + "epoch": 1.2592584002102951, + "grad_norm": 0.1348433494567871, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 325750 + }, + { + "epoch": 1.2592970574136784, + "grad_norm": 0.12218829244375229, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 325760 + }, + { + "epoch": 1.2593357146170616, + "grad_norm": 0.10086431354284286, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 325770 + }, + { + "epoch": 1.259374371820445, + "grad_norm": 0.11661353707313538, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 325780 + }, + { + "epoch": 1.2594130290238283, + "grad_norm": 0.08707497268915176, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 325790 + }, + { + "epoch": 1.2594516862272116, + "grad_norm": 0.1499214470386505, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 325800 + }, + { + "epoch": 1.2594903434305948, + "grad_norm": 0.10032685846090317, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 325810 + }, + { + "epoch": 1.259529000633978, + "grad_norm": 0.10852710157632828, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 325820 + }, + { + "epoch": 1.2595676578373614, + "grad_norm": 0.11643079668283463, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 325830 + }, + { + "epoch": 1.2596063150407448, + "grad_norm": 0.11143416911363602, + "learning_rate": 0.002, + "loss": 2.324, + "step": 325840 + }, + { + "epoch": 1.259644972244128, + "grad_norm": 0.10897476971149445, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 325850 + }, + { + "epoch": 1.2596836294475113, + "grad_norm": 0.09578459709882736, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 325860 + }, + { + "epoch": 1.2597222866508946, + "grad_norm": 0.10422952473163605, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 325870 + }, + { + "epoch": 1.2597609438542778, + "grad_norm": 0.10057570785284042, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 325880 + }, + { + "epoch": 1.259799601057661, + "grad_norm": 0.10279736667871475, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 325890 + }, + { + "epoch": 1.2598382582610443, + "grad_norm": 0.10968570411205292, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 325900 + }, + { + "epoch": 1.2598769154644276, + "grad_norm": 0.09762471914291382, + "learning_rate": 0.002, + "loss": 2.3134, + "step": 325910 + }, + { + "epoch": 1.2599155726678108, + "grad_norm": 0.1026439517736435, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 325920 + }, + { + "epoch": 1.259954229871194, + "grad_norm": 0.10050418972969055, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 325930 + }, + { + "epoch": 1.2599928870745774, + "grad_norm": 0.10055512189865112, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 325940 + }, + { + "epoch": 1.2600315442779608, + "grad_norm": 0.11480934917926788, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 325950 + }, + { + "epoch": 1.260070201481344, + "grad_norm": 0.12158878892660141, + "learning_rate": 0.002, + "loss": 2.334, + "step": 325960 + }, + { + "epoch": 1.2601088586847273, + "grad_norm": 0.11016429215669632, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 325970 + }, + { + "epoch": 1.2601475158881106, + "grad_norm": 0.1013168916106224, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 325980 + }, + { + "epoch": 1.2601861730914938, + "grad_norm": 0.10989508777856827, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 325990 + }, + { + "epoch": 1.260224830294877, + "grad_norm": 0.10646646469831467, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 326000 + }, + { + "epoch": 1.2602634874982606, + "grad_norm": 0.09758585691452026, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 326010 + }, + { + "epoch": 1.2603021447016438, + "grad_norm": 0.11484365910291672, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 326020 + }, + { + "epoch": 1.260340801905027, + "grad_norm": 0.10497698932886124, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 326030 + }, + { + "epoch": 1.2603794591084103, + "grad_norm": 0.11317858844995499, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 326040 + }, + { + "epoch": 1.2604181163117936, + "grad_norm": 0.08910156786441803, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 326050 + }, + { + "epoch": 1.2604567735151768, + "grad_norm": 0.10349009931087494, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 326060 + }, + { + "epoch": 1.26049543071856, + "grad_norm": 0.1077611893415451, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 326070 + }, + { + "epoch": 1.2605340879219433, + "grad_norm": 0.10131070762872696, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 326080 + }, + { + "epoch": 1.2605727451253266, + "grad_norm": 0.1114729791879654, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 326090 + }, + { + "epoch": 1.2606114023287098, + "grad_norm": 0.10723952203989029, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 326100 + }, + { + "epoch": 1.260650059532093, + "grad_norm": 0.10471288859844208, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 326110 + }, + { + "epoch": 1.2606887167354766, + "grad_norm": 0.1356627196073532, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 326120 + }, + { + "epoch": 1.2607273739388598, + "grad_norm": 0.10309799760580063, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 326130 + }, + { + "epoch": 1.260766031142243, + "grad_norm": 0.10922736674547195, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 326140 + }, + { + "epoch": 1.2608046883456263, + "grad_norm": 0.09780388325452805, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 326150 + }, + { + "epoch": 1.2608433455490096, + "grad_norm": 0.10111238062381744, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 326160 + }, + { + "epoch": 1.2608820027523928, + "grad_norm": 0.11071167141199112, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 326170 + }, + { + "epoch": 1.2609206599557763, + "grad_norm": 0.10419418662786484, + "learning_rate": 0.002, + "loss": 2.3665, + "step": 326180 + }, + { + "epoch": 1.2609593171591595, + "grad_norm": 0.13271836936473846, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 326190 + }, + { + "epoch": 1.2609979743625428, + "grad_norm": 0.10130574554204941, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 326200 + }, + { + "epoch": 1.261036631565926, + "grad_norm": 0.11065841466188431, + "learning_rate": 0.002, + "loss": 2.34, + "step": 326210 + }, + { + "epoch": 1.2610752887693093, + "grad_norm": 0.10128871351480484, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 326220 + }, + { + "epoch": 1.2611139459726926, + "grad_norm": 0.11795809119939804, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 326230 + }, + { + "epoch": 1.2611526031760758, + "grad_norm": 0.10937850177288055, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 326240 + }, + { + "epoch": 1.261191260379459, + "grad_norm": 0.10301138460636139, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 326250 + }, + { + "epoch": 1.2612299175828423, + "grad_norm": 0.1003199890255928, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 326260 + }, + { + "epoch": 1.2612685747862256, + "grad_norm": 0.09486802667379379, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 326270 + }, + { + "epoch": 1.2613072319896088, + "grad_norm": 0.12239663302898407, + "learning_rate": 0.002, + "loss": 2.335, + "step": 326280 + }, + { + "epoch": 1.2613458891929923, + "grad_norm": 0.10332158952951431, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 326290 + }, + { + "epoch": 1.2613845463963755, + "grad_norm": 0.10419623553752899, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 326300 + }, + { + "epoch": 1.2614232035997588, + "grad_norm": 0.08871430903673172, + "learning_rate": 0.002, + "loss": 2.345, + "step": 326310 + }, + { + "epoch": 1.261461860803142, + "grad_norm": 0.13712400197982788, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 326320 + }, + { + "epoch": 1.2615005180065253, + "grad_norm": 0.09635666012763977, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 326330 + }, + { + "epoch": 1.2615391752099085, + "grad_norm": 0.0998094230890274, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 326340 + }, + { + "epoch": 1.261577832413292, + "grad_norm": 0.21075473725795746, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 326350 + }, + { + "epoch": 1.2616164896166753, + "grad_norm": 0.1039869412779808, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 326360 + }, + { + "epoch": 1.2616551468200585, + "grad_norm": 0.0978020578622818, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 326370 + }, + { + "epoch": 1.2616938040234418, + "grad_norm": 0.10820144414901733, + "learning_rate": 0.002, + "loss": 2.338, + "step": 326380 + }, + { + "epoch": 1.261732461226825, + "grad_norm": 0.11902282387018204, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 326390 + }, + { + "epoch": 1.2617711184302083, + "grad_norm": 0.09906861186027527, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 326400 + }, + { + "epoch": 1.2618097756335915, + "grad_norm": 0.09596288204193115, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 326410 + }, + { + "epoch": 1.2618484328369748, + "grad_norm": 0.12837019562721252, + "learning_rate": 0.002, + "loss": 2.339, + "step": 326420 + }, + { + "epoch": 1.261887090040358, + "grad_norm": 0.10290003567934036, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 326430 + }, + { + "epoch": 1.2619257472437413, + "grad_norm": 0.11014819890260696, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 326440 + }, + { + "epoch": 1.2619644044471245, + "grad_norm": 0.11158356070518494, + "learning_rate": 0.002, + "loss": 2.332, + "step": 326450 + }, + { + "epoch": 1.262003061650508, + "grad_norm": 0.09634177386760712, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 326460 + }, + { + "epoch": 1.2620417188538913, + "grad_norm": 0.10639488697052002, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 326470 + }, + { + "epoch": 1.2620803760572745, + "grad_norm": 0.11883672326803207, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 326480 + }, + { + "epoch": 1.2621190332606578, + "grad_norm": 0.13678741455078125, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 326490 + }, + { + "epoch": 1.262157690464041, + "grad_norm": 0.10750120133161545, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 326500 + }, + { + "epoch": 1.2621963476674243, + "grad_norm": 0.16615253686904907, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 326510 + }, + { + "epoch": 1.2622350048708078, + "grad_norm": 0.11281480640172958, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 326520 + }, + { + "epoch": 1.262273662074191, + "grad_norm": 0.1095518171787262, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 326530 + }, + { + "epoch": 1.2623123192775743, + "grad_norm": 0.10316134244203568, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 326540 + }, + { + "epoch": 1.2623509764809575, + "grad_norm": 0.09717752784490585, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 326550 + }, + { + "epoch": 1.2623896336843408, + "grad_norm": 0.11651014536619186, + "learning_rate": 0.002, + "loss": 2.345, + "step": 326560 + }, + { + "epoch": 1.262428290887724, + "grad_norm": 0.11065381020307541, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 326570 + }, + { + "epoch": 1.2624669480911073, + "grad_norm": 0.1010802835226059, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 326580 + }, + { + "epoch": 1.2625056052944905, + "grad_norm": 0.09426004439592361, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 326590 + }, + { + "epoch": 1.2625442624978738, + "grad_norm": 0.16192200779914856, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 326600 + }, + { + "epoch": 1.262582919701257, + "grad_norm": 0.09397521615028381, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 326610 + }, + { + "epoch": 1.2626215769046403, + "grad_norm": 0.13081824779510498, + "learning_rate": 0.002, + "loss": 2.342, + "step": 326620 + }, + { + "epoch": 1.2626602341080237, + "grad_norm": 0.09981647878885269, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 326630 + }, + { + "epoch": 1.262698891311407, + "grad_norm": 0.12131404131650925, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 326640 + }, + { + "epoch": 1.2627375485147903, + "grad_norm": 0.11333523690700531, + "learning_rate": 0.002, + "loss": 2.326, + "step": 326650 + }, + { + "epoch": 1.2627762057181735, + "grad_norm": 0.10390151292085648, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 326660 + }, + { + "epoch": 1.2628148629215568, + "grad_norm": 0.10886949300765991, + "learning_rate": 0.002, + "loss": 2.329, + "step": 326670 + }, + { + "epoch": 1.26285352012494, + "grad_norm": 0.10427232086658478, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 326680 + }, + { + "epoch": 1.2628921773283235, + "grad_norm": 0.10599779337644577, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 326690 + }, + { + "epoch": 1.2629308345317067, + "grad_norm": 0.10859518498182297, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 326700 + }, + { + "epoch": 1.26296949173509, + "grad_norm": 0.09477894753217697, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 326710 + }, + { + "epoch": 1.2630081489384732, + "grad_norm": 0.11893752217292786, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 326720 + }, + { + "epoch": 1.2630468061418565, + "grad_norm": 0.10002786666154861, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 326730 + }, + { + "epoch": 1.2630854633452397, + "grad_norm": 0.110787034034729, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 326740 + }, + { + "epoch": 1.263124120548623, + "grad_norm": 0.10369330644607544, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 326750 + }, + { + "epoch": 1.2631627777520062, + "grad_norm": 0.09941741079092026, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 326760 + }, + { + "epoch": 1.2632014349553895, + "grad_norm": 0.11788026988506317, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 326770 + }, + { + "epoch": 1.2632400921587728, + "grad_norm": 0.10224135965108871, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 326780 + }, + { + "epoch": 1.263278749362156, + "grad_norm": 0.10844755917787552, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 326790 + }, + { + "epoch": 1.2633174065655395, + "grad_norm": 0.12195967137813568, + "learning_rate": 0.002, + "loss": 2.346, + "step": 326800 + }, + { + "epoch": 1.2633560637689227, + "grad_norm": 0.10885465145111084, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 326810 + }, + { + "epoch": 1.263394720972306, + "grad_norm": 0.09080061316490173, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 326820 + }, + { + "epoch": 1.2634333781756892, + "grad_norm": 0.1021483764052391, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 326830 + }, + { + "epoch": 1.2634720353790725, + "grad_norm": 0.12462744116783142, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 326840 + }, + { + "epoch": 1.2635106925824557, + "grad_norm": 0.11733567714691162, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 326850 + }, + { + "epoch": 1.2635493497858392, + "grad_norm": 0.10920538753271103, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 326860 + }, + { + "epoch": 1.2635880069892225, + "grad_norm": 0.10772059857845306, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 326870 + }, + { + "epoch": 1.2636266641926057, + "grad_norm": 0.09897726029157639, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 326880 + }, + { + "epoch": 1.263665321395989, + "grad_norm": 0.09287161380052567, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 326890 + }, + { + "epoch": 1.2637039785993722, + "grad_norm": 0.10096503049135208, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 326900 + }, + { + "epoch": 1.2637426358027555, + "grad_norm": 0.10740195959806442, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 326910 + }, + { + "epoch": 1.2637812930061387, + "grad_norm": 0.12665532529354095, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 326920 + }, + { + "epoch": 1.263819950209522, + "grad_norm": 0.09911788254976273, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 326930 + }, + { + "epoch": 1.2638586074129052, + "grad_norm": 0.11081048846244812, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 326940 + }, + { + "epoch": 1.2638972646162885, + "grad_norm": 0.10057511925697327, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 326950 + }, + { + "epoch": 1.263935921819672, + "grad_norm": 0.09975273907184601, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 326960 + }, + { + "epoch": 1.2639745790230552, + "grad_norm": 0.09870759397745132, + "learning_rate": 0.002, + "loss": 2.33, + "step": 326970 + }, + { + "epoch": 1.2640132362264385, + "grad_norm": 0.1000353991985321, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 326980 + }, + { + "epoch": 1.2640518934298217, + "grad_norm": 0.08643193542957306, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 326990 + }, + { + "epoch": 1.264090550633205, + "grad_norm": 0.1016695648431778, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 327000 + }, + { + "epoch": 1.2641292078365882, + "grad_norm": 0.11448567360639572, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 327010 + }, + { + "epoch": 1.2641678650399715, + "grad_norm": 0.1103028953075409, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 327020 + }, + { + "epoch": 1.264206522243355, + "grad_norm": 0.10044880211353302, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 327030 + }, + { + "epoch": 1.2642451794467382, + "grad_norm": 0.10634750872850418, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 327040 + }, + { + "epoch": 1.2642838366501215, + "grad_norm": 0.10897121578454971, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 327050 + }, + { + "epoch": 1.2643224938535047, + "grad_norm": 0.10211598128080368, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 327060 + }, + { + "epoch": 1.264361151056888, + "grad_norm": 0.13065731525421143, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 327070 + }, + { + "epoch": 1.2643998082602712, + "grad_norm": 0.08493944257497787, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 327080 + }, + { + "epoch": 1.2644384654636545, + "grad_norm": 0.09587639570236206, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 327090 + }, + { + "epoch": 1.2644771226670377, + "grad_norm": 0.10556354373693466, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 327100 + }, + { + "epoch": 1.264515779870421, + "grad_norm": 0.10257713496685028, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 327110 + }, + { + "epoch": 1.2645544370738042, + "grad_norm": 0.09609122574329376, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 327120 + }, + { + "epoch": 1.2645930942771877, + "grad_norm": 0.12300848960876465, + "learning_rate": 0.002, + "loss": 2.33, + "step": 327130 + }, + { + "epoch": 1.264631751480571, + "grad_norm": 0.0994371771812439, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 327140 + }, + { + "epoch": 1.2646704086839542, + "grad_norm": 0.08351749926805496, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 327150 + }, + { + "epoch": 1.2647090658873374, + "grad_norm": 0.11216479539871216, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 327160 + }, + { + "epoch": 1.2647477230907207, + "grad_norm": 0.10732587426900864, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 327170 + }, + { + "epoch": 1.264786380294104, + "grad_norm": 0.12903602421283722, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 327180 + }, + { + "epoch": 1.2648250374974872, + "grad_norm": 0.10329445451498032, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 327190 + }, + { + "epoch": 1.2648636947008707, + "grad_norm": 0.10292506217956543, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 327200 + }, + { + "epoch": 1.264902351904254, + "grad_norm": 0.25355979800224304, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 327210 + }, + { + "epoch": 1.2649410091076372, + "grad_norm": 0.10248008370399475, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 327220 + }, + { + "epoch": 1.2649796663110204, + "grad_norm": 0.10794755816459656, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 327230 + }, + { + "epoch": 1.2650183235144037, + "grad_norm": 0.2157856971025467, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 327240 + }, + { + "epoch": 1.265056980717787, + "grad_norm": 0.12279105186462402, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 327250 + }, + { + "epoch": 1.2650956379211702, + "grad_norm": 0.254301518201828, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 327260 + }, + { + "epoch": 1.2651342951245534, + "grad_norm": 0.12824511528015137, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 327270 + }, + { + "epoch": 1.2651729523279367, + "grad_norm": 0.09473264962434769, + "learning_rate": 0.002, + "loss": 2.334, + "step": 327280 + }, + { + "epoch": 1.26521160953132, + "grad_norm": 0.09445297718048096, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 327290 + }, + { + "epoch": 1.2652502667347034, + "grad_norm": 0.12316767871379852, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 327300 + }, + { + "epoch": 1.2652889239380867, + "grad_norm": 0.09455137699842453, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 327310 + }, + { + "epoch": 1.26532758114147, + "grad_norm": 0.10438383370637894, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 327320 + }, + { + "epoch": 1.2653662383448532, + "grad_norm": 0.10898282378911972, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 327330 + }, + { + "epoch": 1.2654048955482364, + "grad_norm": 0.11414649337530136, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 327340 + }, + { + "epoch": 1.2654435527516197, + "grad_norm": 0.1032983660697937, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 327350 + }, + { + "epoch": 1.265482209955003, + "grad_norm": 0.12052953243255615, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 327360 + }, + { + "epoch": 1.2655208671583864, + "grad_norm": 0.12255626171827316, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 327370 + }, + { + "epoch": 1.2655595243617697, + "grad_norm": 0.09772976487874985, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 327380 + }, + { + "epoch": 1.265598181565153, + "grad_norm": 0.09752926230430603, + "learning_rate": 0.002, + "loss": 2.35, + "step": 327390 + }, + { + "epoch": 1.2656368387685362, + "grad_norm": 0.11025545001029968, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 327400 + }, + { + "epoch": 1.2656754959719194, + "grad_norm": 0.11592836678028107, + "learning_rate": 0.002, + "loss": 2.3125, + "step": 327410 + }, + { + "epoch": 1.2657141531753027, + "grad_norm": 0.09760356694459915, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 327420 + }, + { + "epoch": 1.265752810378686, + "grad_norm": 0.17295199632644653, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 327430 + }, + { + "epoch": 1.2657914675820692, + "grad_norm": 0.11333099007606506, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 327440 + }, + { + "epoch": 1.2658301247854524, + "grad_norm": 0.10388067364692688, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 327450 + }, + { + "epoch": 1.2658687819888357, + "grad_norm": 0.1109703928232193, + "learning_rate": 0.002, + "loss": 2.317, + "step": 327460 + }, + { + "epoch": 1.2659074391922192, + "grad_norm": 0.11445900797843933, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 327470 + }, + { + "epoch": 1.2659460963956024, + "grad_norm": 0.11122239381074905, + "learning_rate": 0.002, + "loss": 2.349, + "step": 327480 + }, + { + "epoch": 1.2659847535989857, + "grad_norm": 0.10093493014574051, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 327490 + }, + { + "epoch": 1.266023410802369, + "grad_norm": 0.09518415480852127, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 327500 + }, + { + "epoch": 1.2660620680057522, + "grad_norm": 0.12398479133844376, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 327510 + }, + { + "epoch": 1.2661007252091354, + "grad_norm": 0.10292278230190277, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 327520 + }, + { + "epoch": 1.2661393824125187, + "grad_norm": 0.11218395084142685, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 327530 + }, + { + "epoch": 1.2661780396159021, + "grad_norm": 0.11104859411716461, + "learning_rate": 0.002, + "loss": 2.329, + "step": 327540 + }, + { + "epoch": 1.2662166968192854, + "grad_norm": 0.12359805405139923, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 327550 + }, + { + "epoch": 1.2662553540226686, + "grad_norm": 0.09395068138837814, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 327560 + }, + { + "epoch": 1.266294011226052, + "grad_norm": 0.10843432694673538, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 327570 + }, + { + "epoch": 1.2663326684294351, + "grad_norm": 0.10082303732633591, + "learning_rate": 0.002, + "loss": 2.344, + "step": 327580 + }, + { + "epoch": 1.2663713256328184, + "grad_norm": 0.09647303074598312, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 327590 + }, + { + "epoch": 1.2664099828362017, + "grad_norm": 0.11412365734577179, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 327600 + }, + { + "epoch": 1.266448640039585, + "grad_norm": 0.11117341369390488, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 327610 + }, + { + "epoch": 1.2664872972429682, + "grad_norm": 0.0973285362124443, + "learning_rate": 0.002, + "loss": 2.344, + "step": 327620 + }, + { + "epoch": 1.2665259544463514, + "grad_norm": 0.1106308177113533, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 327630 + }, + { + "epoch": 1.2665646116497349, + "grad_norm": 0.11163157969713211, + "learning_rate": 0.002, + "loss": 2.342, + "step": 327640 + }, + { + "epoch": 1.2666032688531181, + "grad_norm": 0.09332996606826782, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 327650 + }, + { + "epoch": 1.2666419260565014, + "grad_norm": 0.10091850161552429, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 327660 + }, + { + "epoch": 1.2666805832598846, + "grad_norm": 0.11225081235170364, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 327670 + }, + { + "epoch": 1.266719240463268, + "grad_norm": 0.10611289739608765, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 327680 + }, + { + "epoch": 1.2667578976666511, + "grad_norm": 0.13344308733940125, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 327690 + }, + { + "epoch": 1.2667965548700346, + "grad_norm": 0.11110425740480423, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 327700 + }, + { + "epoch": 1.2668352120734179, + "grad_norm": 0.11922081559896469, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 327710 + }, + { + "epoch": 1.2668738692768011, + "grad_norm": 0.0929272323846817, + "learning_rate": 0.002, + "loss": 2.3125, + "step": 327720 + }, + { + "epoch": 1.2669125264801844, + "grad_norm": 0.11714441329240799, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 327730 + }, + { + "epoch": 1.2669511836835676, + "grad_norm": 0.12794846296310425, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 327740 + }, + { + "epoch": 1.2669898408869509, + "grad_norm": 0.10424962639808655, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 327750 + }, + { + "epoch": 1.2670284980903341, + "grad_norm": 0.09436719119548798, + "learning_rate": 0.002, + "loss": 2.338, + "step": 327760 + }, + { + "epoch": 1.2670671552937174, + "grad_norm": 0.10538437962532043, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 327770 + }, + { + "epoch": 1.2671058124971006, + "grad_norm": 0.10082082450389862, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 327780 + }, + { + "epoch": 1.2671444697004839, + "grad_norm": 0.09229423850774765, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 327790 + }, + { + "epoch": 1.2671831269038671, + "grad_norm": 0.10747473686933517, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 327800 + }, + { + "epoch": 1.2672217841072506, + "grad_norm": 0.10845305770635605, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 327810 + }, + { + "epoch": 1.2672604413106339, + "grad_norm": 0.12466636300086975, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 327820 + }, + { + "epoch": 1.2672990985140171, + "grad_norm": 0.10115847736597061, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 327830 + }, + { + "epoch": 1.2673377557174004, + "grad_norm": 0.10708760470151901, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 327840 + }, + { + "epoch": 1.2673764129207836, + "grad_norm": 0.1159091666340828, + "learning_rate": 0.002, + "loss": 2.341, + "step": 327850 + }, + { + "epoch": 1.2674150701241669, + "grad_norm": 0.10875213146209717, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 327860 + }, + { + "epoch": 1.2674537273275504, + "grad_norm": 0.11530718207359314, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 327870 + }, + { + "epoch": 1.2674923845309336, + "grad_norm": 0.09820769727230072, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 327880 + }, + { + "epoch": 1.2675310417343169, + "grad_norm": 0.10630863159894943, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 327890 + }, + { + "epoch": 1.2675696989377, + "grad_norm": 0.11648089438676834, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 327900 + }, + { + "epoch": 1.2676083561410834, + "grad_norm": 0.10292232036590576, + "learning_rate": 0.002, + "loss": 2.318, + "step": 327910 + }, + { + "epoch": 1.2676470133444666, + "grad_norm": 0.11076568812131882, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 327920 + }, + { + "epoch": 1.2676856705478499, + "grad_norm": 0.13279521465301514, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 327930 + }, + { + "epoch": 1.2677243277512331, + "grad_norm": 0.10814601182937622, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 327940 + }, + { + "epoch": 1.2677629849546164, + "grad_norm": 0.09568917751312256, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 327950 + }, + { + "epoch": 1.2678016421579996, + "grad_norm": 0.10426489263772964, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 327960 + }, + { + "epoch": 1.2678402993613829, + "grad_norm": 0.1028430238366127, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 327970 + }, + { + "epoch": 1.2678789565647663, + "grad_norm": 0.11005204170942307, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 327980 + }, + { + "epoch": 1.2679176137681496, + "grad_norm": 0.10916189104318619, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 327990 + }, + { + "epoch": 1.2679562709715329, + "grad_norm": 0.09372225403785706, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 328000 + }, + { + "epoch": 1.267994928174916, + "grad_norm": 0.11431435495615005, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 328010 + }, + { + "epoch": 1.2680335853782994, + "grad_norm": 0.13004766404628754, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 328020 + }, + { + "epoch": 1.2680722425816826, + "grad_norm": 0.26628178358078003, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 328030 + }, + { + "epoch": 1.268110899785066, + "grad_norm": 0.12861287593841553, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 328040 + }, + { + "epoch": 1.2681495569884493, + "grad_norm": 0.11879384517669678, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 328050 + }, + { + "epoch": 1.2681882141918326, + "grad_norm": 0.3298933506011963, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 328060 + }, + { + "epoch": 1.2682268713952158, + "grad_norm": 0.10708534717559814, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 328070 + }, + { + "epoch": 1.268265528598599, + "grad_norm": 0.10920543223619461, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 328080 + }, + { + "epoch": 1.2683041858019823, + "grad_norm": 0.0907408595085144, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 328090 + }, + { + "epoch": 1.2683428430053656, + "grad_norm": 0.1118137463927269, + "learning_rate": 0.002, + "loss": 2.331, + "step": 328100 + }, + { + "epoch": 1.2683815002087488, + "grad_norm": 0.10773264616727829, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 328110 + }, + { + "epoch": 1.268420157412132, + "grad_norm": 0.10068121552467346, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 328120 + }, + { + "epoch": 1.2684588146155154, + "grad_norm": 0.10338988155126572, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 328130 + }, + { + "epoch": 1.2684974718188986, + "grad_norm": 0.09399203211069107, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 328140 + }, + { + "epoch": 1.268536129022282, + "grad_norm": 0.12288428843021393, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 328150 + }, + { + "epoch": 1.2685747862256653, + "grad_norm": 0.09742148965597153, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 328160 + }, + { + "epoch": 1.2686134434290486, + "grad_norm": 0.11509417742490768, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 328170 + }, + { + "epoch": 1.2686521006324318, + "grad_norm": 0.1266513615846634, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 328180 + }, + { + "epoch": 1.268690757835815, + "grad_norm": 0.0991605892777443, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 328190 + }, + { + "epoch": 1.2687294150391983, + "grad_norm": 0.1043912023305893, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 328200 + }, + { + "epoch": 1.2687680722425818, + "grad_norm": 0.10311584174633026, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 328210 + }, + { + "epoch": 1.268806729445965, + "grad_norm": 0.09867702424526215, + "learning_rate": 0.002, + "loss": 2.309, + "step": 328220 + }, + { + "epoch": 1.2688453866493483, + "grad_norm": 0.10184060037136078, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 328230 + }, + { + "epoch": 1.2688840438527316, + "grad_norm": 0.10069439560174942, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 328240 + }, + { + "epoch": 1.2689227010561148, + "grad_norm": 0.10947359353303909, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 328250 + }, + { + "epoch": 1.268961358259498, + "grad_norm": 0.1088777631521225, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 328260 + }, + { + "epoch": 1.2690000154628813, + "grad_norm": 0.13187672197818756, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 328270 + }, + { + "epoch": 1.2690386726662646, + "grad_norm": 0.1275816261768341, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 328280 + }, + { + "epoch": 1.2690773298696478, + "grad_norm": 0.1199779212474823, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 328290 + }, + { + "epoch": 1.269115987073031, + "grad_norm": 0.09605924040079117, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 328300 + }, + { + "epoch": 1.2691546442764143, + "grad_norm": 0.10590273141860962, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 328310 + }, + { + "epoch": 1.2691933014797978, + "grad_norm": 0.10233684629201889, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 328320 + }, + { + "epoch": 1.269231958683181, + "grad_norm": 0.1044899970293045, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 328330 + }, + { + "epoch": 1.2692706158865643, + "grad_norm": 0.09585341811180115, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 328340 + }, + { + "epoch": 1.2693092730899476, + "grad_norm": 0.11074862629175186, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 328350 + }, + { + "epoch": 1.2693479302933308, + "grad_norm": 0.11396773159503937, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 328360 + }, + { + "epoch": 1.269386587496714, + "grad_norm": 0.09954419732093811, + "learning_rate": 0.002, + "loss": 2.3108, + "step": 328370 + }, + { + "epoch": 1.2694252447000975, + "grad_norm": 0.11653383076190948, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 328380 + }, + { + "epoch": 1.2694639019034808, + "grad_norm": 0.11051646620035172, + "learning_rate": 0.002, + "loss": 2.322, + "step": 328390 + }, + { + "epoch": 1.269502559106864, + "grad_norm": 0.10264965891838074, + "learning_rate": 0.002, + "loss": 2.339, + "step": 328400 + }, + { + "epoch": 1.2695412163102473, + "grad_norm": 0.09628605097532272, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 328410 + }, + { + "epoch": 1.2695798735136306, + "grad_norm": 0.10045577585697174, + "learning_rate": 0.002, + "loss": 2.34, + "step": 328420 + }, + { + "epoch": 1.2696185307170138, + "grad_norm": 0.10564279556274414, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 328430 + }, + { + "epoch": 1.269657187920397, + "grad_norm": 0.10866489261388779, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 328440 + }, + { + "epoch": 1.2696958451237803, + "grad_norm": 0.11215133965015411, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 328450 + }, + { + "epoch": 1.2697345023271636, + "grad_norm": 0.09572611004114151, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 328460 + }, + { + "epoch": 1.2697731595305468, + "grad_norm": 0.10626961290836334, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 328470 + }, + { + "epoch": 1.26981181673393, + "grad_norm": 0.10905894637107849, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 328480 + }, + { + "epoch": 1.2698504739373135, + "grad_norm": 0.10089924186468124, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 328490 + }, + { + "epoch": 1.2698891311406968, + "grad_norm": 0.10201458632946014, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 328500 + }, + { + "epoch": 1.26992778834408, + "grad_norm": 0.10394861549139023, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 328510 + }, + { + "epoch": 1.2699664455474633, + "grad_norm": 0.12203310430049896, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 328520 + }, + { + "epoch": 1.2700051027508465, + "grad_norm": 0.11315803974866867, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 328530 + }, + { + "epoch": 1.2700437599542298, + "grad_norm": 0.09893783181905746, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 328540 + }, + { + "epoch": 1.2700824171576133, + "grad_norm": 0.10396460443735123, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 328550 + }, + { + "epoch": 1.2701210743609965, + "grad_norm": 0.10547640174627304, + "learning_rate": 0.002, + "loss": 2.327, + "step": 328560 + }, + { + "epoch": 1.2701597315643798, + "grad_norm": 0.0919446349143982, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 328570 + }, + { + "epoch": 1.270198388767763, + "grad_norm": 0.10054364055395126, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 328580 + }, + { + "epoch": 1.2702370459711463, + "grad_norm": 0.10095813870429993, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 328590 + }, + { + "epoch": 1.2702757031745295, + "grad_norm": 0.1102762222290039, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 328600 + }, + { + "epoch": 1.2703143603779128, + "grad_norm": 0.10052599012851715, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 328610 + }, + { + "epoch": 1.270353017581296, + "grad_norm": 0.1568072885274887, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 328620 + }, + { + "epoch": 1.2703916747846793, + "grad_norm": 0.10998766869306564, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 328630 + }, + { + "epoch": 1.2704303319880625, + "grad_norm": 0.10029449313879013, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 328640 + }, + { + "epoch": 1.2704689891914458, + "grad_norm": 0.10915552824735641, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 328650 + }, + { + "epoch": 1.2705076463948293, + "grad_norm": 0.13900934159755707, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 328660 + }, + { + "epoch": 1.2705463035982125, + "grad_norm": 0.09437382221221924, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 328670 + }, + { + "epoch": 1.2705849608015958, + "grad_norm": 0.0986660048365593, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 328680 + }, + { + "epoch": 1.270623618004979, + "grad_norm": 0.09324406832456589, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 328690 + }, + { + "epoch": 1.2706622752083623, + "grad_norm": 0.11582262068986893, + "learning_rate": 0.002, + "loss": 2.3082, + "step": 328700 + }, + { + "epoch": 1.2707009324117455, + "grad_norm": 0.11707904189825058, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 328710 + }, + { + "epoch": 1.270739589615129, + "grad_norm": 0.10172367841005325, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 328720 + }, + { + "epoch": 1.2707782468185123, + "grad_norm": 0.10533636063337326, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 328730 + }, + { + "epoch": 1.2708169040218955, + "grad_norm": 0.11258100718259811, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 328740 + }, + { + "epoch": 1.2708555612252788, + "grad_norm": 0.11175481230020523, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 328750 + }, + { + "epoch": 1.270894218428662, + "grad_norm": 0.1085057482123375, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 328760 + }, + { + "epoch": 1.2709328756320453, + "grad_norm": 0.10206273198127747, + "learning_rate": 0.002, + "loss": 2.3134, + "step": 328770 + }, + { + "epoch": 1.2709715328354285, + "grad_norm": 0.104701928794384, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 328780 + }, + { + "epoch": 1.2710101900388118, + "grad_norm": 0.10033353418111801, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 328790 + }, + { + "epoch": 1.271048847242195, + "grad_norm": 0.0895090103149414, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 328800 + }, + { + "epoch": 1.2710875044455783, + "grad_norm": 0.10525655001401901, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 328810 + }, + { + "epoch": 1.2711261616489617, + "grad_norm": 0.10305428504943848, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 328820 + }, + { + "epoch": 1.271164818852345, + "grad_norm": 0.10475514084100723, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 328830 + }, + { + "epoch": 1.2712034760557283, + "grad_norm": 0.10202089697122574, + "learning_rate": 0.002, + "loss": 2.344, + "step": 328840 + }, + { + "epoch": 1.2712421332591115, + "grad_norm": 0.12603822350502014, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 328850 + }, + { + "epoch": 1.2712807904624948, + "grad_norm": 0.10456045717000961, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 328860 + }, + { + "epoch": 1.271319447665878, + "grad_norm": 0.10017739981412888, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 328870 + }, + { + "epoch": 1.2713581048692613, + "grad_norm": 0.0923190712928772, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 328880 + }, + { + "epoch": 1.2713967620726447, + "grad_norm": 0.10937119275331497, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 328890 + }, + { + "epoch": 1.271435419276028, + "grad_norm": 0.10462003201246262, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 328900 + }, + { + "epoch": 1.2714740764794112, + "grad_norm": 0.10549817979335785, + "learning_rate": 0.002, + "loss": 2.3108, + "step": 328910 + }, + { + "epoch": 1.2715127336827945, + "grad_norm": 0.11557579040527344, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 328920 + }, + { + "epoch": 1.2715513908861777, + "grad_norm": 0.11169763654470444, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 328930 + }, + { + "epoch": 1.271590048089561, + "grad_norm": 0.09808123856782913, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 328940 + }, + { + "epoch": 1.2716287052929443, + "grad_norm": 0.09811786562204361, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 328950 + }, + { + "epoch": 1.2716673624963275, + "grad_norm": 0.11601688712835312, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 328960 + }, + { + "epoch": 1.2717060196997108, + "grad_norm": 0.11422158032655716, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 328970 + }, + { + "epoch": 1.271744676903094, + "grad_norm": 0.11772765219211578, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 328980 + }, + { + "epoch": 1.2717833341064775, + "grad_norm": 0.09244425594806671, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 328990 + }, + { + "epoch": 1.2718219913098607, + "grad_norm": 0.10463545471429825, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 329000 + }, + { + "epoch": 1.271860648513244, + "grad_norm": 0.1032916009426117, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 329010 + }, + { + "epoch": 1.2718993057166272, + "grad_norm": 0.12296883016824722, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 329020 + }, + { + "epoch": 1.2719379629200105, + "grad_norm": 0.09704263508319855, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 329030 + }, + { + "epoch": 1.2719766201233937, + "grad_norm": 0.11321119964122772, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 329040 + }, + { + "epoch": 1.272015277326777, + "grad_norm": 0.11545021831989288, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 329050 + }, + { + "epoch": 1.2720539345301605, + "grad_norm": 0.10558608174324036, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 329060 + }, + { + "epoch": 1.2720925917335437, + "grad_norm": 0.0944591835141182, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 329070 + }, + { + "epoch": 1.272131248936927, + "grad_norm": 0.09486759454011917, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 329080 + }, + { + "epoch": 1.2721699061403102, + "grad_norm": 0.10092812031507492, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 329090 + }, + { + "epoch": 1.2722085633436935, + "grad_norm": 0.11117656528949738, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 329100 + }, + { + "epoch": 1.2722472205470767, + "grad_norm": 0.10919707268476486, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 329110 + }, + { + "epoch": 1.27228587775046, + "grad_norm": 0.13514567911624908, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 329120 + }, + { + "epoch": 1.2723245349538432, + "grad_norm": 0.3119554817676544, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 329130 + }, + { + "epoch": 1.2723631921572265, + "grad_norm": 0.11358354985713959, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 329140 + }, + { + "epoch": 1.2724018493606097, + "grad_norm": 0.09593138098716736, + "learning_rate": 0.002, + "loss": 2.342, + "step": 329150 + }, + { + "epoch": 1.2724405065639932, + "grad_norm": 0.13451911509037018, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 329160 + }, + { + "epoch": 1.2724791637673765, + "grad_norm": 0.12064133584499359, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 329170 + }, + { + "epoch": 1.2725178209707597, + "grad_norm": 0.09411446750164032, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 329180 + }, + { + "epoch": 1.272556478174143, + "grad_norm": 0.10068316012620926, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 329190 + }, + { + "epoch": 1.2725951353775262, + "grad_norm": 0.10359273850917816, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 329200 + }, + { + "epoch": 1.2726337925809095, + "grad_norm": 0.09584148228168488, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 329210 + }, + { + "epoch": 1.2726724497842927, + "grad_norm": 0.18259383738040924, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 329220 + }, + { + "epoch": 1.2727111069876762, + "grad_norm": 0.11002141237258911, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 329230 + }, + { + "epoch": 1.2727497641910595, + "grad_norm": 0.09783057123422623, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 329240 + }, + { + "epoch": 1.2727884213944427, + "grad_norm": 0.09692111611366272, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 329250 + }, + { + "epoch": 1.272827078597826, + "grad_norm": 0.09134890139102936, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 329260 + }, + { + "epoch": 1.2728657358012092, + "grad_norm": 0.12369803339242935, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 329270 + }, + { + "epoch": 1.2729043930045925, + "grad_norm": 0.10898613184690475, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 329280 + }, + { + "epoch": 1.2729430502079757, + "grad_norm": 0.09025852382183075, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 329290 + }, + { + "epoch": 1.272981707411359, + "grad_norm": 0.09845314174890518, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 329300 + }, + { + "epoch": 1.2730203646147422, + "grad_norm": 0.09647542238235474, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 329310 + }, + { + "epoch": 1.2730590218181255, + "grad_norm": 0.09182790666818619, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 329320 + }, + { + "epoch": 1.273097679021509, + "grad_norm": 0.11160074174404144, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 329330 + }, + { + "epoch": 1.2731363362248922, + "grad_norm": 0.11283206939697266, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 329340 + }, + { + "epoch": 1.2731749934282754, + "grad_norm": 0.1080353707075119, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 329350 + }, + { + "epoch": 1.2732136506316587, + "grad_norm": 0.10062264651060104, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 329360 + }, + { + "epoch": 1.273252307835042, + "grad_norm": 0.09652829170227051, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 329370 + }, + { + "epoch": 1.2732909650384252, + "grad_norm": 0.1126071885228157, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 329380 + }, + { + "epoch": 1.2733296222418085, + "grad_norm": 0.08950099349021912, + "learning_rate": 0.002, + "loss": 2.336, + "step": 329390 + }, + { + "epoch": 1.273368279445192, + "grad_norm": 0.09760060161352158, + "learning_rate": 0.002, + "loss": 2.33, + "step": 329400 + }, + { + "epoch": 1.2734069366485752, + "grad_norm": 0.10084472596645355, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 329410 + }, + { + "epoch": 1.2734455938519584, + "grad_norm": 0.10872059315443039, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 329420 + }, + { + "epoch": 1.2734842510553417, + "grad_norm": 0.09868069738149643, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 329430 + }, + { + "epoch": 1.273522908258725, + "grad_norm": 0.11543266475200653, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 329440 + }, + { + "epoch": 1.2735615654621082, + "grad_norm": 0.22089999914169312, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 329450 + }, + { + "epoch": 1.2736002226654914, + "grad_norm": 0.0972055122256279, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 329460 + }, + { + "epoch": 1.2736388798688747, + "grad_norm": 0.0951479971408844, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 329470 + }, + { + "epoch": 1.273677537072258, + "grad_norm": 0.113665871322155, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 329480 + }, + { + "epoch": 1.2737161942756412, + "grad_norm": 0.10153911262750626, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 329490 + }, + { + "epoch": 1.2737548514790247, + "grad_norm": 0.10728565603494644, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 329500 + }, + { + "epoch": 1.273793508682408, + "grad_norm": 0.11218227446079254, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 329510 + }, + { + "epoch": 1.2738321658857912, + "grad_norm": 0.09646441042423248, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 329520 + }, + { + "epoch": 1.2738708230891744, + "grad_norm": 0.08836732059717178, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 329530 + }, + { + "epoch": 1.2739094802925577, + "grad_norm": 0.09999502450227737, + "learning_rate": 0.002, + "loss": 2.335, + "step": 329540 + }, + { + "epoch": 1.273948137495941, + "grad_norm": 0.11865074932575226, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 329550 + }, + { + "epoch": 1.2739867946993244, + "grad_norm": 0.10607121139764786, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 329560 + }, + { + "epoch": 1.2740254519027077, + "grad_norm": 0.09681159257888794, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 329570 + }, + { + "epoch": 1.274064109106091, + "grad_norm": 0.10897815227508545, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 329580 + }, + { + "epoch": 1.2741027663094742, + "grad_norm": 0.11636954545974731, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 329590 + }, + { + "epoch": 1.2741414235128574, + "grad_norm": 0.08978872001171112, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 329600 + }, + { + "epoch": 1.2741800807162407, + "grad_norm": 0.1251995861530304, + "learning_rate": 0.002, + "loss": 2.353, + "step": 329610 + }, + { + "epoch": 1.274218737919624, + "grad_norm": 0.10945021361112595, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 329620 + }, + { + "epoch": 1.2742573951230072, + "grad_norm": 0.09309005737304688, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 329630 + }, + { + "epoch": 1.2742960523263904, + "grad_norm": 0.10380406677722931, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 329640 + }, + { + "epoch": 1.2743347095297737, + "grad_norm": 0.09443452209234238, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 329650 + }, + { + "epoch": 1.274373366733157, + "grad_norm": 0.10749629884958267, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 329660 + }, + { + "epoch": 1.2744120239365404, + "grad_norm": 0.11914601922035217, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 329670 + }, + { + "epoch": 1.2744506811399237, + "grad_norm": 0.10565859824419022, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 329680 + }, + { + "epoch": 1.274489338343307, + "grad_norm": 0.10837848484516144, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 329690 + }, + { + "epoch": 1.2745279955466902, + "grad_norm": 0.11632353067398071, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 329700 + }, + { + "epoch": 1.2745666527500734, + "grad_norm": 0.08982210606336594, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 329710 + }, + { + "epoch": 1.2746053099534567, + "grad_norm": 0.11114289611577988, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 329720 + }, + { + "epoch": 1.2746439671568401, + "grad_norm": 0.09798840433359146, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 329730 + }, + { + "epoch": 1.2746826243602234, + "grad_norm": 0.1059572622179985, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 329740 + }, + { + "epoch": 1.2747212815636066, + "grad_norm": 0.09854008257389069, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 329750 + }, + { + "epoch": 1.27475993876699, + "grad_norm": 0.106520876288414, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 329760 + }, + { + "epoch": 1.2747985959703731, + "grad_norm": 0.11019045114517212, + "learning_rate": 0.002, + "loss": 2.331, + "step": 329770 + }, + { + "epoch": 1.2748372531737564, + "grad_norm": 0.11295673251152039, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 329780 + }, + { + "epoch": 1.2748759103771397, + "grad_norm": 0.10148819535970688, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 329790 + }, + { + "epoch": 1.274914567580523, + "grad_norm": 0.10899393260478973, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 329800 + }, + { + "epoch": 1.2749532247839062, + "grad_norm": 0.09473425149917603, + "learning_rate": 0.002, + "loss": 2.323, + "step": 329810 + }, + { + "epoch": 1.2749918819872894, + "grad_norm": 0.09841945767402649, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 329820 + }, + { + "epoch": 1.2750305391906727, + "grad_norm": 0.10254204273223877, + "learning_rate": 0.002, + "loss": 2.319, + "step": 329830 + }, + { + "epoch": 1.2750691963940561, + "grad_norm": 0.12272008508443832, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 329840 + }, + { + "epoch": 1.2751078535974394, + "grad_norm": 0.09451805055141449, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 329850 + }, + { + "epoch": 1.2751465108008226, + "grad_norm": 0.10965383797883987, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 329860 + }, + { + "epoch": 1.275185168004206, + "grad_norm": 0.09316661953926086, + "learning_rate": 0.002, + "loss": 2.334, + "step": 329870 + }, + { + "epoch": 1.2752238252075891, + "grad_norm": 0.10202120244503021, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 329880 + }, + { + "epoch": 1.2752624824109724, + "grad_norm": 0.11021611839532852, + "learning_rate": 0.002, + "loss": 2.3135, + "step": 329890 + }, + { + "epoch": 1.2753011396143559, + "grad_norm": 0.13178619742393494, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 329900 + }, + { + "epoch": 1.2753397968177391, + "grad_norm": 0.10314160585403442, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 329910 + }, + { + "epoch": 1.2753784540211224, + "grad_norm": 0.09276103973388672, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 329920 + }, + { + "epoch": 1.2754171112245056, + "grad_norm": 0.10184242576360703, + "learning_rate": 0.002, + "loss": 2.3049, + "step": 329930 + }, + { + "epoch": 1.2754557684278889, + "grad_norm": 0.13053973019123077, + "learning_rate": 0.002, + "loss": 2.337, + "step": 329940 + }, + { + "epoch": 1.2754944256312721, + "grad_norm": 0.09545928239822388, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 329950 + }, + { + "epoch": 1.2755330828346554, + "grad_norm": 0.14243371784687042, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 329960 + }, + { + "epoch": 1.2755717400380386, + "grad_norm": 0.09996338933706284, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 329970 + }, + { + "epoch": 1.275610397241422, + "grad_norm": 0.11278600245714188, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 329980 + }, + { + "epoch": 1.2756490544448051, + "grad_norm": 0.1257156878709793, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 329990 + }, + { + "epoch": 1.2756877116481884, + "grad_norm": 0.09659942984580994, + "learning_rate": 0.002, + "loss": 2.339, + "step": 330000 + }, + { + "epoch": 1.2757263688515719, + "grad_norm": 0.10965616255998611, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 330010 + }, + { + "epoch": 1.2757650260549551, + "grad_norm": 0.1346312016248703, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 330020 + }, + { + "epoch": 1.2758036832583384, + "grad_norm": 0.08886497467756271, + "learning_rate": 0.002, + "loss": 2.323, + "step": 330030 + }, + { + "epoch": 1.2758423404617216, + "grad_norm": 0.09994087368249893, + "learning_rate": 0.002, + "loss": 2.338, + "step": 330040 + }, + { + "epoch": 1.2758809976651049, + "grad_norm": 0.10083264112472534, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 330050 + }, + { + "epoch": 1.2759196548684881, + "grad_norm": 0.5164046883583069, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 330060 + }, + { + "epoch": 1.2759583120718716, + "grad_norm": 0.10039766877889633, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 330070 + }, + { + "epoch": 1.2759969692752549, + "grad_norm": 0.10687883198261261, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 330080 + }, + { + "epoch": 1.276035626478638, + "grad_norm": 0.09684999287128448, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 330090 + }, + { + "epoch": 1.2760742836820214, + "grad_norm": 0.09277097135782242, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 330100 + }, + { + "epoch": 1.2761129408854046, + "grad_norm": 0.10449618101119995, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 330110 + }, + { + "epoch": 1.2761515980887879, + "grad_norm": 0.10649379342794418, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 330120 + }, + { + "epoch": 1.2761902552921711, + "grad_norm": 0.10755191743373871, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 330130 + }, + { + "epoch": 1.2762289124955544, + "grad_norm": 0.10106264799833298, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 330140 + }, + { + "epoch": 1.2762675696989376, + "grad_norm": 0.10244058817625046, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 330150 + }, + { + "epoch": 1.2763062269023209, + "grad_norm": 0.10412459075450897, + "learning_rate": 0.002, + "loss": 2.333, + "step": 330160 + }, + { + "epoch": 1.2763448841057041, + "grad_norm": 0.10179536044597626, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 330170 + }, + { + "epoch": 1.2763835413090876, + "grad_norm": 0.09596909582614899, + "learning_rate": 0.002, + "loss": 2.33, + "step": 330180 + }, + { + "epoch": 1.2764221985124709, + "grad_norm": 0.11567061394453049, + "learning_rate": 0.002, + "loss": 2.327, + "step": 330190 + }, + { + "epoch": 1.276460855715854, + "grad_norm": 0.11815230548381805, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 330200 + }, + { + "epoch": 1.2764995129192374, + "grad_norm": 0.11768794804811478, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 330210 + }, + { + "epoch": 1.2765381701226206, + "grad_norm": 0.1113450825214386, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 330220 + }, + { + "epoch": 1.2765768273260039, + "grad_norm": 0.12138304114341736, + "learning_rate": 0.002, + "loss": 2.329, + "step": 330230 + }, + { + "epoch": 1.2766154845293873, + "grad_norm": 0.11253490298986435, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 330240 + }, + { + "epoch": 1.2766541417327706, + "grad_norm": 0.0988660380244255, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 330250 + }, + { + "epoch": 1.2766927989361538, + "grad_norm": 0.12407879531383514, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 330260 + }, + { + "epoch": 1.276731456139537, + "grad_norm": 0.10875300318002701, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 330270 + }, + { + "epoch": 1.2767701133429203, + "grad_norm": 0.15900826454162598, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 330280 + }, + { + "epoch": 1.2768087705463036, + "grad_norm": 0.12358444929122925, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 330290 + }, + { + "epoch": 1.2768474277496868, + "grad_norm": 0.10288117080926895, + "learning_rate": 0.002, + "loss": 2.3142, + "step": 330300 + }, + { + "epoch": 1.27688608495307, + "grad_norm": 0.12121498584747314, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 330310 + }, + { + "epoch": 1.2769247421564534, + "grad_norm": 0.10315801203250885, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 330320 + }, + { + "epoch": 1.2769633993598366, + "grad_norm": 0.10812592506408691, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 330330 + }, + { + "epoch": 1.2770020565632199, + "grad_norm": 0.09591901302337646, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 330340 + }, + { + "epoch": 1.2770407137666033, + "grad_norm": 0.11056618392467499, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 330350 + }, + { + "epoch": 1.2770793709699866, + "grad_norm": 0.10496780276298523, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 330360 + }, + { + "epoch": 1.2771180281733698, + "grad_norm": 0.10728298127651215, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 330370 + }, + { + "epoch": 1.277156685376753, + "grad_norm": 0.12340054661035538, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 330380 + }, + { + "epoch": 1.2771953425801363, + "grad_norm": 0.12097841501235962, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 330390 + }, + { + "epoch": 1.2772339997835196, + "grad_norm": 0.10657333582639694, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 330400 + }, + { + "epoch": 1.277272656986903, + "grad_norm": 0.10780542343854904, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 330410 + }, + { + "epoch": 1.2773113141902863, + "grad_norm": 0.09199956059455872, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 330420 + }, + { + "epoch": 1.2773499713936696, + "grad_norm": 0.08981581777334213, + "learning_rate": 0.002, + "loss": 2.34, + "step": 330430 + }, + { + "epoch": 1.2773886285970528, + "grad_norm": 0.11883309483528137, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 330440 + }, + { + "epoch": 1.277427285800436, + "grad_norm": 0.09634125977754593, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 330450 + }, + { + "epoch": 1.2774659430038193, + "grad_norm": 0.10400119423866272, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 330460 + }, + { + "epoch": 1.2775046002072026, + "grad_norm": 0.11077505350112915, + "learning_rate": 0.002, + "loss": 2.3138, + "step": 330470 + }, + { + "epoch": 1.2775432574105858, + "grad_norm": 0.09788201004266739, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 330480 + }, + { + "epoch": 1.277581914613969, + "grad_norm": 0.10747364163398743, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 330490 + }, + { + "epoch": 1.2776205718173523, + "grad_norm": 0.11120908707380295, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 330500 + }, + { + "epoch": 1.2776592290207356, + "grad_norm": 0.10732141137123108, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 330510 + }, + { + "epoch": 1.277697886224119, + "grad_norm": 0.09890435636043549, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 330520 + }, + { + "epoch": 1.2777365434275023, + "grad_norm": 0.09526444971561432, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 330530 + }, + { + "epoch": 1.2777752006308856, + "grad_norm": 0.13470156490802765, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 330540 + }, + { + "epoch": 1.2778138578342688, + "grad_norm": 0.10842323303222656, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 330550 + }, + { + "epoch": 1.277852515037652, + "grad_norm": 0.09326523542404175, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 330560 + }, + { + "epoch": 1.2778911722410353, + "grad_norm": 0.10812563449144363, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 330570 + }, + { + "epoch": 1.2779298294444188, + "grad_norm": 0.09856005012989044, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 330580 + }, + { + "epoch": 1.277968486647802, + "grad_norm": 0.11291167885065079, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 330590 + }, + { + "epoch": 1.2780071438511853, + "grad_norm": 0.10198131203651428, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 330600 + }, + { + "epoch": 1.2780458010545686, + "grad_norm": 0.10310892760753632, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 330610 + }, + { + "epoch": 1.2780844582579518, + "grad_norm": 0.10659323632717133, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 330620 + }, + { + "epoch": 1.278123115461335, + "grad_norm": 0.09991616010665894, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 330630 + }, + { + "epoch": 1.2781617726647183, + "grad_norm": 0.10998846590518951, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 330640 + }, + { + "epoch": 1.2782004298681016, + "grad_norm": 0.1059923768043518, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 330650 + }, + { + "epoch": 1.2782390870714848, + "grad_norm": 0.08670962601900101, + "learning_rate": 0.002, + "loss": 2.333, + "step": 330660 + }, + { + "epoch": 1.278277744274868, + "grad_norm": 0.10149190574884415, + "learning_rate": 0.002, + "loss": 2.332, + "step": 330670 + }, + { + "epoch": 1.2783164014782515, + "grad_norm": 0.10550031065940857, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 330680 + }, + { + "epoch": 1.2783550586816348, + "grad_norm": 0.100679412484169, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 330690 + }, + { + "epoch": 1.278393715885018, + "grad_norm": 0.11549176275730133, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 330700 + }, + { + "epoch": 1.2784323730884013, + "grad_norm": 0.09727973490953445, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 330710 + }, + { + "epoch": 1.2784710302917845, + "grad_norm": 0.10293383151292801, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 330720 + }, + { + "epoch": 1.2785096874951678, + "grad_norm": 0.1048823818564415, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 330730 + }, + { + "epoch": 1.278548344698551, + "grad_norm": 0.10713917762041092, + "learning_rate": 0.002, + "loss": 2.329, + "step": 330740 + }, + { + "epoch": 1.2785870019019345, + "grad_norm": 0.11875288933515549, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 330750 + }, + { + "epoch": 1.2786256591053178, + "grad_norm": 0.09954854100942612, + "learning_rate": 0.002, + "loss": 2.316, + "step": 330760 + }, + { + "epoch": 1.278664316308701, + "grad_norm": 0.09504789113998413, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 330770 + }, + { + "epoch": 1.2787029735120843, + "grad_norm": 0.1305740773677826, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 330780 + }, + { + "epoch": 1.2787416307154675, + "grad_norm": 0.10554517805576324, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 330790 + }, + { + "epoch": 1.2787802879188508, + "grad_norm": 0.1055464893579483, + "learning_rate": 0.002, + "loss": 2.34, + "step": 330800 + }, + { + "epoch": 1.278818945122234, + "grad_norm": 0.09993565827608109, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 330810 + }, + { + "epoch": 1.2788576023256173, + "grad_norm": 0.10794758796691895, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 330820 + }, + { + "epoch": 1.2788962595290005, + "grad_norm": 0.09847798198461533, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 330830 + }, + { + "epoch": 1.2789349167323838, + "grad_norm": 0.14246560633182526, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 330840 + }, + { + "epoch": 1.2789735739357673, + "grad_norm": 0.10085310786962509, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 330850 + }, + { + "epoch": 1.2790122311391505, + "grad_norm": 0.09441061317920685, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 330860 + }, + { + "epoch": 1.2790508883425338, + "grad_norm": 0.09778943657875061, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 330870 + }, + { + "epoch": 1.279089545545917, + "grad_norm": 0.09769915044307709, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 330880 + }, + { + "epoch": 1.2791282027493003, + "grad_norm": 0.10353109985589981, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 330890 + }, + { + "epoch": 1.2791668599526835, + "grad_norm": 0.11549326032400131, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 330900 + }, + { + "epoch": 1.2792055171560668, + "grad_norm": 0.09193005412817001, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 330910 + }, + { + "epoch": 1.2792441743594503, + "grad_norm": 0.09352631121873856, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 330920 + }, + { + "epoch": 1.2792828315628335, + "grad_norm": 0.10780029743909836, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 330930 + }, + { + "epoch": 1.2793214887662168, + "grad_norm": 0.09702304750680923, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 330940 + }, + { + "epoch": 1.2793601459696, + "grad_norm": 0.10021896660327911, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 330950 + }, + { + "epoch": 1.2793988031729833, + "grad_norm": 0.11453984677791595, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 330960 + }, + { + "epoch": 1.2794374603763665, + "grad_norm": 0.13235102593898773, + "learning_rate": 0.002, + "loss": 2.344, + "step": 330970 + }, + { + "epoch": 1.2794761175797498, + "grad_norm": 0.0944540798664093, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 330980 + }, + { + "epoch": 1.279514774783133, + "grad_norm": 0.12713098526000977, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 330990 + }, + { + "epoch": 1.2795534319865163, + "grad_norm": 0.09712237119674683, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 331000 + }, + { + "epoch": 1.2795920891898995, + "grad_norm": 0.11072835326194763, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 331010 + }, + { + "epoch": 1.279630746393283, + "grad_norm": 0.11048628389835358, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 331020 + }, + { + "epoch": 1.2796694035966663, + "grad_norm": 0.10861332714557648, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 331030 + }, + { + "epoch": 1.2797080608000495, + "grad_norm": 0.11873438954353333, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 331040 + }, + { + "epoch": 1.2797467180034328, + "grad_norm": 0.10168084502220154, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 331050 + }, + { + "epoch": 1.279785375206816, + "grad_norm": 0.10715948790311813, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 331060 + }, + { + "epoch": 1.2798240324101993, + "grad_norm": 0.1116722822189331, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 331070 + }, + { + "epoch": 1.2798626896135825, + "grad_norm": 0.0973399206995964, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 331080 + }, + { + "epoch": 1.279901346816966, + "grad_norm": 0.09459315240383148, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 331090 + }, + { + "epoch": 1.2799400040203492, + "grad_norm": 0.10086680203676224, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 331100 + }, + { + "epoch": 1.2799786612237325, + "grad_norm": 0.10074183344841003, + "learning_rate": 0.002, + "loss": 2.326, + "step": 331110 + }, + { + "epoch": 1.2800173184271157, + "grad_norm": 0.1126500815153122, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 331120 + }, + { + "epoch": 1.280055975630499, + "grad_norm": 0.11003128439188004, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 331130 + }, + { + "epoch": 1.2800946328338823, + "grad_norm": 0.11613652855157852, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 331140 + }, + { + "epoch": 1.2801332900372655, + "grad_norm": 0.13420990109443665, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 331150 + }, + { + "epoch": 1.2801719472406488, + "grad_norm": 0.10044151544570923, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 331160 + }, + { + "epoch": 1.280210604444032, + "grad_norm": 0.10778767615556717, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 331170 + }, + { + "epoch": 1.2802492616474153, + "grad_norm": 0.10573440790176392, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 331180 + }, + { + "epoch": 1.2802879188507987, + "grad_norm": 0.10240902006626129, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 331190 + }, + { + "epoch": 1.280326576054182, + "grad_norm": 0.11871691793203354, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 331200 + }, + { + "epoch": 1.2803652332575652, + "grad_norm": 0.14145809412002563, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 331210 + }, + { + "epoch": 1.2804038904609485, + "grad_norm": 0.10078298300504684, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 331220 + }, + { + "epoch": 1.2804425476643317, + "grad_norm": 0.11439738422632217, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 331230 + }, + { + "epoch": 1.280481204867715, + "grad_norm": 0.1064569428563118, + "learning_rate": 0.002, + "loss": 2.337, + "step": 331240 + }, + { + "epoch": 1.2805198620710982, + "grad_norm": 0.10036151111125946, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 331250 + }, + { + "epoch": 1.2805585192744817, + "grad_norm": 0.11667811125516891, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 331260 + }, + { + "epoch": 1.280597176477865, + "grad_norm": 0.1017821803689003, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 331270 + }, + { + "epoch": 1.2806358336812482, + "grad_norm": 0.1017158031463623, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 331280 + }, + { + "epoch": 1.2806744908846315, + "grad_norm": 0.09027392417192459, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 331290 + }, + { + "epoch": 1.2807131480880147, + "grad_norm": 0.1140141710639, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 331300 + }, + { + "epoch": 1.280751805291398, + "grad_norm": 0.13576845824718475, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 331310 + }, + { + "epoch": 1.2807904624947812, + "grad_norm": 0.10066722333431244, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 331320 + }, + { + "epoch": 1.2808291196981645, + "grad_norm": 0.09830858558416367, + "learning_rate": 0.002, + "loss": 2.341, + "step": 331330 + }, + { + "epoch": 1.2808677769015477, + "grad_norm": 0.09742210805416107, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 331340 + }, + { + "epoch": 1.280906434104931, + "grad_norm": 0.10384872555732727, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 331350 + }, + { + "epoch": 1.2809450913083145, + "grad_norm": 0.10708346217870712, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 331360 + }, + { + "epoch": 1.2809837485116977, + "grad_norm": 0.10111190378665924, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 331370 + }, + { + "epoch": 1.281022405715081, + "grad_norm": 0.09716185927391052, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 331380 + }, + { + "epoch": 1.2810610629184642, + "grad_norm": 0.09608956426382065, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 331390 + }, + { + "epoch": 1.2810997201218475, + "grad_norm": 0.1042938381433487, + "learning_rate": 0.002, + "loss": 2.334, + "step": 331400 + }, + { + "epoch": 1.2811383773252307, + "grad_norm": 0.09524276852607727, + "learning_rate": 0.002, + "loss": 2.338, + "step": 331410 + }, + { + "epoch": 1.2811770345286142, + "grad_norm": 0.10780641436576843, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 331420 + }, + { + "epoch": 1.2812156917319975, + "grad_norm": 0.10179249942302704, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 331430 + }, + { + "epoch": 1.2812543489353807, + "grad_norm": 0.11295023560523987, + "learning_rate": 0.002, + "loss": 2.316, + "step": 331440 + }, + { + "epoch": 1.281293006138764, + "grad_norm": 0.11664408445358276, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 331450 + }, + { + "epoch": 1.2813316633421472, + "grad_norm": 0.11424199491739273, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 331460 + }, + { + "epoch": 1.2813703205455305, + "grad_norm": 0.8250250816345215, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 331470 + }, + { + "epoch": 1.2814089777489137, + "grad_norm": 0.6171488165855408, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 331480 + }, + { + "epoch": 1.281447634952297, + "grad_norm": 0.15012137591838837, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 331490 + }, + { + "epoch": 1.2814862921556802, + "grad_norm": 0.10564377903938293, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 331500 + }, + { + "epoch": 1.2815249493590635, + "grad_norm": 0.11790092289447784, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 331510 + }, + { + "epoch": 1.2815636065624467, + "grad_norm": 0.10824461281299591, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 331520 + }, + { + "epoch": 1.2816022637658302, + "grad_norm": 0.10344535857439041, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 331530 + }, + { + "epoch": 1.2816409209692134, + "grad_norm": 0.10616878420114517, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 331540 + }, + { + "epoch": 1.2816795781725967, + "grad_norm": 0.11207808554172516, + "learning_rate": 0.002, + "loss": 2.327, + "step": 331550 + }, + { + "epoch": 1.28171823537598, + "grad_norm": 0.09932498633861542, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 331560 + }, + { + "epoch": 1.2817568925793632, + "grad_norm": 0.10903242230415344, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 331570 + }, + { + "epoch": 1.2817955497827465, + "grad_norm": 0.12127885967493057, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 331580 + }, + { + "epoch": 1.28183420698613, + "grad_norm": 0.10876268148422241, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 331590 + }, + { + "epoch": 1.2818728641895132, + "grad_norm": 0.10647932440042496, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 331600 + }, + { + "epoch": 1.2819115213928964, + "grad_norm": 0.09786685556173325, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 331610 + }, + { + "epoch": 1.2819501785962797, + "grad_norm": 0.12839925289154053, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 331620 + }, + { + "epoch": 1.281988835799663, + "grad_norm": 0.11178138852119446, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 331630 + }, + { + "epoch": 1.2820274930030462, + "grad_norm": 0.09654244035482407, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 331640 + }, + { + "epoch": 1.2820661502064294, + "grad_norm": 0.11449973285198212, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 331650 + }, + { + "epoch": 1.2821048074098127, + "grad_norm": 0.10151177644729614, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 331660 + }, + { + "epoch": 1.282143464613196, + "grad_norm": 0.09369701892137527, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 331670 + }, + { + "epoch": 1.2821821218165792, + "grad_norm": 0.09277847409248352, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 331680 + }, + { + "epoch": 1.2822207790199625, + "grad_norm": 0.11237433552742004, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 331690 + }, + { + "epoch": 1.282259436223346, + "grad_norm": 0.2749531865119934, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 331700 + }, + { + "epoch": 1.2822980934267292, + "grad_norm": 0.12759670615196228, + "learning_rate": 0.002, + "loss": 2.337, + "step": 331710 + }, + { + "epoch": 1.2823367506301124, + "grad_norm": 0.1205337718129158, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 331720 + }, + { + "epoch": 1.2823754078334957, + "grad_norm": 0.10772675275802612, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 331730 + }, + { + "epoch": 1.282414065036879, + "grad_norm": 0.1037701889872551, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 331740 + }, + { + "epoch": 1.2824527222402622, + "grad_norm": 0.10590890049934387, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 331750 + }, + { + "epoch": 1.2824913794436457, + "grad_norm": 0.09448172152042389, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 331760 + }, + { + "epoch": 1.282530036647029, + "grad_norm": 0.11490815132856369, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 331770 + }, + { + "epoch": 1.2825686938504122, + "grad_norm": 0.21631719172000885, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 331780 + }, + { + "epoch": 1.2826073510537954, + "grad_norm": 0.09326942265033722, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 331790 + }, + { + "epoch": 1.2826460082571787, + "grad_norm": 0.09836390614509583, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 331800 + }, + { + "epoch": 1.282684665460562, + "grad_norm": 0.09765587002038956, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 331810 + }, + { + "epoch": 1.2827233226639452, + "grad_norm": 0.09442969411611557, + "learning_rate": 0.002, + "loss": 2.325, + "step": 331820 + }, + { + "epoch": 1.2827619798673284, + "grad_norm": 0.12060291320085526, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 331830 + }, + { + "epoch": 1.2828006370707117, + "grad_norm": 0.10620687156915665, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 331840 + }, + { + "epoch": 1.282839294274095, + "grad_norm": 0.12601856887340546, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 331850 + }, + { + "epoch": 1.2828779514774782, + "grad_norm": 0.10232758522033691, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 331860 + }, + { + "epoch": 1.2829166086808617, + "grad_norm": 0.11093420535326004, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 331870 + }, + { + "epoch": 1.282955265884245, + "grad_norm": 0.12195949256420135, + "learning_rate": 0.002, + "loss": 2.326, + "step": 331880 + }, + { + "epoch": 1.2829939230876282, + "grad_norm": 0.1167902797460556, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 331890 + }, + { + "epoch": 1.2830325802910114, + "grad_norm": 0.09185588359832764, + "learning_rate": 0.002, + "loss": 2.343, + "step": 331900 + }, + { + "epoch": 1.2830712374943947, + "grad_norm": 0.10136355459690094, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 331910 + }, + { + "epoch": 1.283109894697778, + "grad_norm": 0.09425336867570877, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 331920 + }, + { + "epoch": 1.2831485519011614, + "grad_norm": 0.12710914015769958, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 331930 + }, + { + "epoch": 1.2831872091045446, + "grad_norm": 0.10715360939502716, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 331940 + }, + { + "epoch": 1.283225866307928, + "grad_norm": 0.1006692498922348, + "learning_rate": 0.002, + "loss": 2.331, + "step": 331950 + }, + { + "epoch": 1.2832645235113112, + "grad_norm": 0.11472304165363312, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 331960 + }, + { + "epoch": 1.2833031807146944, + "grad_norm": 0.09872464090585709, + "learning_rate": 0.002, + "loss": 2.343, + "step": 331970 + }, + { + "epoch": 1.2833418379180777, + "grad_norm": 0.11383762210607529, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 331980 + }, + { + "epoch": 1.283380495121461, + "grad_norm": 0.11117720603942871, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 331990 + }, + { + "epoch": 1.2834191523248442, + "grad_norm": 0.10939493030309677, + "learning_rate": 0.002, + "loss": 2.327, + "step": 332000 + }, + { + "epoch": 1.2834578095282274, + "grad_norm": 0.11570484936237335, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 332010 + }, + { + "epoch": 1.2834964667316107, + "grad_norm": 0.10188289731740952, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 332020 + }, + { + "epoch": 1.283535123934994, + "grad_norm": 0.10991719365119934, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 332030 + }, + { + "epoch": 1.2835737811383774, + "grad_norm": 0.10216062515974045, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 332040 + }, + { + "epoch": 1.2836124383417606, + "grad_norm": 0.10282238572835922, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 332050 + }, + { + "epoch": 1.283651095545144, + "grad_norm": 0.09967167675495148, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 332060 + }, + { + "epoch": 1.2836897527485271, + "grad_norm": 0.09352394938468933, + "learning_rate": 0.002, + "loss": 2.336, + "step": 332070 + }, + { + "epoch": 1.2837284099519104, + "grad_norm": 0.11892616003751755, + "learning_rate": 0.002, + "loss": 2.329, + "step": 332080 + }, + { + "epoch": 1.2837670671552937, + "grad_norm": 0.10310930758714676, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 332090 + }, + { + "epoch": 1.2838057243586771, + "grad_norm": 0.1289806365966797, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 332100 + }, + { + "epoch": 1.2838443815620604, + "grad_norm": 0.10785408318042755, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 332110 + }, + { + "epoch": 1.2838830387654436, + "grad_norm": 0.1182045266032219, + "learning_rate": 0.002, + "loss": 2.347, + "step": 332120 + }, + { + "epoch": 1.2839216959688269, + "grad_norm": 0.12128960341215134, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 332130 + }, + { + "epoch": 1.2839603531722101, + "grad_norm": 0.09235437959432602, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 332140 + }, + { + "epoch": 1.2839990103755934, + "grad_norm": 0.10368960350751877, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 332150 + }, + { + "epoch": 1.2840376675789766, + "grad_norm": 0.1179451122879982, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 332160 + }, + { + "epoch": 1.28407632478236, + "grad_norm": 0.11054787784814835, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 332170 + }, + { + "epoch": 1.2841149819857431, + "grad_norm": 0.09622831642627716, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 332180 + }, + { + "epoch": 1.2841536391891264, + "grad_norm": 0.1420263797044754, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 332190 + }, + { + "epoch": 1.2841922963925096, + "grad_norm": 0.10438098013401031, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 332200 + }, + { + "epoch": 1.2842309535958931, + "grad_norm": 0.11499829590320587, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 332210 + }, + { + "epoch": 1.2842696107992764, + "grad_norm": 0.11683389544487, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 332220 + }, + { + "epoch": 1.2843082680026596, + "grad_norm": 0.10560715198516846, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 332230 + }, + { + "epoch": 1.2843469252060429, + "grad_norm": 0.12091319262981415, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 332240 + }, + { + "epoch": 1.2843855824094261, + "grad_norm": 0.0979316458106041, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 332250 + }, + { + "epoch": 1.2844242396128094, + "grad_norm": 0.10224587470293045, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 332260 + }, + { + "epoch": 1.2844628968161929, + "grad_norm": 0.10661505162715912, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 332270 + }, + { + "epoch": 1.284501554019576, + "grad_norm": 0.09975156933069229, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 332280 + }, + { + "epoch": 1.2845402112229594, + "grad_norm": 0.1133033037185669, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 332290 + }, + { + "epoch": 1.2845788684263426, + "grad_norm": 0.14396722614765167, + "learning_rate": 0.002, + "loss": 2.32, + "step": 332300 + }, + { + "epoch": 1.2846175256297259, + "grad_norm": 0.10433393716812134, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 332310 + }, + { + "epoch": 1.2846561828331091, + "grad_norm": 0.09392431378364563, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 332320 + }, + { + "epoch": 1.2846948400364924, + "grad_norm": 0.09219883382320404, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 332330 + }, + { + "epoch": 1.2847334972398756, + "grad_norm": 0.09603586047887802, + "learning_rate": 0.002, + "loss": 2.323, + "step": 332340 + }, + { + "epoch": 1.2847721544432589, + "grad_norm": 0.10061074048280716, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 332350 + }, + { + "epoch": 1.2848108116466421, + "grad_norm": 0.11493312567472458, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 332360 + }, + { + "epoch": 1.2848494688500254, + "grad_norm": 0.11308170855045319, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 332370 + }, + { + "epoch": 1.2848881260534089, + "grad_norm": 0.09977343678474426, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 332380 + }, + { + "epoch": 1.284926783256792, + "grad_norm": 0.10442902892827988, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 332390 + }, + { + "epoch": 1.2849654404601754, + "grad_norm": 0.11375498026609421, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 332400 + }, + { + "epoch": 1.2850040976635586, + "grad_norm": 0.09284843504428864, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 332410 + }, + { + "epoch": 1.2850427548669419, + "grad_norm": 0.09526360780000687, + "learning_rate": 0.002, + "loss": 2.339, + "step": 332420 + }, + { + "epoch": 1.2850814120703251, + "grad_norm": 0.1074945479631424, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 332430 + }, + { + "epoch": 1.2851200692737086, + "grad_norm": 0.10130199044942856, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 332440 + }, + { + "epoch": 1.2851587264770918, + "grad_norm": 0.11674440652132034, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 332450 + }, + { + "epoch": 1.285197383680475, + "grad_norm": 0.12361260503530502, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 332460 + }, + { + "epoch": 1.2852360408838583, + "grad_norm": 0.10297805070877075, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 332470 + }, + { + "epoch": 1.2852746980872416, + "grad_norm": 0.10889725387096405, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 332480 + }, + { + "epoch": 1.2853133552906248, + "grad_norm": 0.12578794360160828, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 332490 + }, + { + "epoch": 1.285352012494008, + "grad_norm": 0.11805281788110733, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 332500 + }, + { + "epoch": 1.2853906696973914, + "grad_norm": 0.120563805103302, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 332510 + }, + { + "epoch": 1.2854293269007746, + "grad_norm": 0.11009590327739716, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 332520 + }, + { + "epoch": 1.2854679841041579, + "grad_norm": 0.10590917617082596, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 332530 + }, + { + "epoch": 1.285506641307541, + "grad_norm": 0.0980500653386116, + "learning_rate": 0.002, + "loss": 2.341, + "step": 332540 + }, + { + "epoch": 1.2855452985109246, + "grad_norm": 0.1209760531783104, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 332550 + }, + { + "epoch": 1.2855839557143078, + "grad_norm": 0.10607732832431793, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 332560 + }, + { + "epoch": 1.285622612917691, + "grad_norm": 0.11301247775554657, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 332570 + }, + { + "epoch": 1.2856612701210743, + "grad_norm": 0.12619836628437042, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 332580 + }, + { + "epoch": 1.2856999273244576, + "grad_norm": 0.09536372870206833, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 332590 + }, + { + "epoch": 1.2857385845278408, + "grad_norm": 0.12440627813339233, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 332600 + }, + { + "epoch": 1.2857772417312243, + "grad_norm": 0.12063415348529816, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 332610 + }, + { + "epoch": 1.2858158989346076, + "grad_norm": 0.10072565078735352, + "learning_rate": 0.002, + "loss": 2.335, + "step": 332620 + }, + { + "epoch": 1.2858545561379908, + "grad_norm": 0.10750167071819305, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 332630 + }, + { + "epoch": 1.285893213341374, + "grad_norm": 0.10173853486776352, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 332640 + }, + { + "epoch": 1.2859318705447573, + "grad_norm": 0.09797729551792145, + "learning_rate": 0.002, + "loss": 2.33, + "step": 332650 + }, + { + "epoch": 1.2859705277481406, + "grad_norm": 0.09454578906297684, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 332660 + }, + { + "epoch": 1.2860091849515238, + "grad_norm": 0.09492919594049454, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 332670 + }, + { + "epoch": 1.286047842154907, + "grad_norm": 0.12662118673324585, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 332680 + }, + { + "epoch": 1.2860864993582903, + "grad_norm": 0.11416533589363098, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 332690 + }, + { + "epoch": 1.2861251565616736, + "grad_norm": 0.09003262966871262, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 332700 + }, + { + "epoch": 1.286163813765057, + "grad_norm": 0.10572832822799683, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 332710 + }, + { + "epoch": 1.2862024709684403, + "grad_norm": 0.1264459639787674, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 332720 + }, + { + "epoch": 1.2862411281718236, + "grad_norm": 0.1304396539926529, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 332730 + }, + { + "epoch": 1.2862797853752068, + "grad_norm": 0.11325572431087494, + "learning_rate": 0.002, + "loss": 2.3104, + "step": 332740 + }, + { + "epoch": 1.28631844257859, + "grad_norm": 0.09685946255922318, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 332750 + }, + { + "epoch": 1.2863570997819733, + "grad_norm": 0.11727797240018845, + "learning_rate": 0.002, + "loss": 2.336, + "step": 332760 + }, + { + "epoch": 1.2863957569853566, + "grad_norm": 0.10779083520174026, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 332770 + }, + { + "epoch": 1.28643441418874, + "grad_norm": 0.1004313975572586, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 332780 + }, + { + "epoch": 1.2864730713921233, + "grad_norm": 0.1066732108592987, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 332790 + }, + { + "epoch": 1.2865117285955066, + "grad_norm": 0.09625577926635742, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 332800 + }, + { + "epoch": 1.2865503857988898, + "grad_norm": 0.09934387356042862, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 332810 + }, + { + "epoch": 1.286589043002273, + "grad_norm": 0.11568392068147659, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 332820 + }, + { + "epoch": 1.2866277002056563, + "grad_norm": 0.10015887767076492, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 332830 + }, + { + "epoch": 1.2866663574090396, + "grad_norm": 0.10513238608837128, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 332840 + }, + { + "epoch": 1.2867050146124228, + "grad_norm": 0.11645331233739853, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 332850 + }, + { + "epoch": 1.286743671815806, + "grad_norm": 0.15896683931350708, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 332860 + }, + { + "epoch": 1.2867823290191893, + "grad_norm": 0.11531778424978256, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 332870 + }, + { + "epoch": 1.2868209862225728, + "grad_norm": 0.10579390078783035, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 332880 + }, + { + "epoch": 1.286859643425956, + "grad_norm": 0.0989760085940361, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 332890 + }, + { + "epoch": 1.2868983006293393, + "grad_norm": 0.1162877306342125, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 332900 + }, + { + "epoch": 1.2869369578327226, + "grad_norm": 0.11907833814620972, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 332910 + }, + { + "epoch": 1.2869756150361058, + "grad_norm": 0.11692595481872559, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 332920 + }, + { + "epoch": 1.287014272239489, + "grad_norm": 0.0976448655128479, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 332930 + }, + { + "epoch": 1.2870529294428723, + "grad_norm": 0.10867959260940552, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 332940 + }, + { + "epoch": 1.2870915866462558, + "grad_norm": 0.10054941475391388, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 332950 + }, + { + "epoch": 1.287130243849639, + "grad_norm": 0.11905647069215775, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 332960 + }, + { + "epoch": 1.2871689010530223, + "grad_norm": 0.12290788441896439, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 332970 + }, + { + "epoch": 1.2872075582564055, + "grad_norm": 0.14153672754764557, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 332980 + }, + { + "epoch": 1.2872462154597888, + "grad_norm": 0.10869767516851425, + "learning_rate": 0.002, + "loss": 2.35, + "step": 332990 + }, + { + "epoch": 1.287284872663172, + "grad_norm": 0.1402626931667328, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 333000 + }, + { + "epoch": 1.2873235298665553, + "grad_norm": 0.11460593342781067, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 333010 + }, + { + "epoch": 1.2873621870699385, + "grad_norm": 0.11055731773376465, + "learning_rate": 0.002, + "loss": 2.334, + "step": 333020 + }, + { + "epoch": 1.2874008442733218, + "grad_norm": 0.11018706113100052, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 333030 + }, + { + "epoch": 1.287439501476705, + "grad_norm": 0.12170359492301941, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 333040 + }, + { + "epoch": 1.2874781586800885, + "grad_norm": 0.10103699564933777, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 333050 + }, + { + "epoch": 1.2875168158834718, + "grad_norm": 0.08956848829984665, + "learning_rate": 0.002, + "loss": 2.3561, + "step": 333060 + }, + { + "epoch": 1.287555473086855, + "grad_norm": 0.10570359230041504, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 333070 + }, + { + "epoch": 1.2875941302902383, + "grad_norm": 0.11920324712991714, + "learning_rate": 0.002, + "loss": 2.322, + "step": 333080 + }, + { + "epoch": 1.2876327874936215, + "grad_norm": 0.09573183208703995, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 333090 + }, + { + "epoch": 1.2876714446970048, + "grad_norm": 0.11040244251489639, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 333100 + }, + { + "epoch": 1.287710101900388, + "grad_norm": 0.09444387257099152, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 333110 + }, + { + "epoch": 1.2877487591037715, + "grad_norm": 0.1170918419957161, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 333120 + }, + { + "epoch": 1.2877874163071548, + "grad_norm": 0.10428602993488312, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 333130 + }, + { + "epoch": 1.287826073510538, + "grad_norm": 0.1396740972995758, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 333140 + }, + { + "epoch": 1.2878647307139213, + "grad_norm": 0.10158234089612961, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 333150 + }, + { + "epoch": 1.2879033879173045, + "grad_norm": 0.10186280310153961, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 333160 + }, + { + "epoch": 1.2879420451206878, + "grad_norm": 0.11232930421829224, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 333170 + }, + { + "epoch": 1.287980702324071, + "grad_norm": 0.09306978434324265, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 333180 + }, + { + "epoch": 1.2880193595274543, + "grad_norm": 0.10687271505594254, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 333190 + }, + { + "epoch": 1.2880580167308375, + "grad_norm": 0.09269829094409943, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 333200 + }, + { + "epoch": 1.2880966739342208, + "grad_norm": 0.1074671596288681, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 333210 + }, + { + "epoch": 1.2881353311376043, + "grad_norm": 0.10309260338544846, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 333220 + }, + { + "epoch": 1.2881739883409875, + "grad_norm": 0.11010892689228058, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 333230 + }, + { + "epoch": 1.2882126455443708, + "grad_norm": 0.10869075357913971, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 333240 + }, + { + "epoch": 1.288251302747754, + "grad_norm": 0.10025380551815033, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 333250 + }, + { + "epoch": 1.2882899599511373, + "grad_norm": 0.0960700735449791, + "learning_rate": 0.002, + "loss": 2.344, + "step": 333260 + }, + { + "epoch": 1.2883286171545205, + "grad_norm": 0.13028484582901, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 333270 + }, + { + "epoch": 1.2883672743579038, + "grad_norm": 0.08875899016857147, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 333280 + }, + { + "epoch": 1.2884059315612872, + "grad_norm": 0.10433296114206314, + "learning_rate": 0.002, + "loss": 2.34, + "step": 333290 + }, + { + "epoch": 1.2884445887646705, + "grad_norm": 0.09889642894268036, + "learning_rate": 0.002, + "loss": 2.319, + "step": 333300 + }, + { + "epoch": 1.2884832459680537, + "grad_norm": 0.11376189440488815, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 333310 + }, + { + "epoch": 1.288521903171437, + "grad_norm": 0.10081131011247635, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 333320 + }, + { + "epoch": 1.2885605603748203, + "grad_norm": 0.11411105841398239, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 333330 + }, + { + "epoch": 1.2885992175782035, + "grad_norm": 0.11444903165102005, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 333340 + }, + { + "epoch": 1.2886378747815868, + "grad_norm": 0.10614798218011856, + "learning_rate": 0.002, + "loss": 2.332, + "step": 333350 + }, + { + "epoch": 1.28867653198497, + "grad_norm": 0.1183219850063324, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 333360 + }, + { + "epoch": 1.2887151891883533, + "grad_norm": 0.09680161625146866, + "learning_rate": 0.002, + "loss": 2.3144, + "step": 333370 + }, + { + "epoch": 1.2887538463917365, + "grad_norm": 0.10841412842273712, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 333380 + }, + { + "epoch": 1.28879250359512, + "grad_norm": 0.08972472697496414, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 333390 + }, + { + "epoch": 1.2888311607985032, + "grad_norm": 0.12399033457040787, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 333400 + }, + { + "epoch": 1.2888698180018865, + "grad_norm": 0.12081394344568253, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 333410 + }, + { + "epoch": 1.2889084752052697, + "grad_norm": 0.09331200271844864, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 333420 + }, + { + "epoch": 1.288947132408653, + "grad_norm": 0.09348198771476746, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 333430 + }, + { + "epoch": 1.2889857896120362, + "grad_norm": 0.1317223161458969, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 333440 + }, + { + "epoch": 1.2890244468154197, + "grad_norm": 0.10621041804552078, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 333450 + }, + { + "epoch": 1.289063104018803, + "grad_norm": 0.10697954893112183, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 333460 + }, + { + "epoch": 1.2891017612221862, + "grad_norm": 0.21501240134239197, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 333470 + }, + { + "epoch": 1.2891404184255695, + "grad_norm": 0.10425000637769699, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 333480 + }, + { + "epoch": 1.2891790756289527, + "grad_norm": 0.11156164109706879, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 333490 + }, + { + "epoch": 1.289217732832336, + "grad_norm": 0.11151176691055298, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 333500 + }, + { + "epoch": 1.2892563900357192, + "grad_norm": 0.1183115541934967, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 333510 + }, + { + "epoch": 1.2892950472391025, + "grad_norm": 0.0971737876534462, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 333520 + }, + { + "epoch": 1.2893337044424857, + "grad_norm": 0.09613676369190216, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 333530 + }, + { + "epoch": 1.289372361645869, + "grad_norm": 0.10197040438652039, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 333540 + }, + { + "epoch": 1.2894110188492522, + "grad_norm": 0.10441060364246368, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 333550 + }, + { + "epoch": 1.2894496760526357, + "grad_norm": 0.10522343963384628, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 333560 + }, + { + "epoch": 1.289488333256019, + "grad_norm": 0.10223446786403656, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 333570 + }, + { + "epoch": 1.2895269904594022, + "grad_norm": 0.12261584401130676, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 333580 + }, + { + "epoch": 1.2895656476627855, + "grad_norm": 0.11205879598855972, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 333590 + }, + { + "epoch": 1.2896043048661687, + "grad_norm": 0.09977878630161285, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 333600 + }, + { + "epoch": 1.289642962069552, + "grad_norm": 0.09606704860925674, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 333610 + }, + { + "epoch": 1.2896816192729355, + "grad_norm": 0.13726183772087097, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 333620 + }, + { + "epoch": 1.2897202764763187, + "grad_norm": 0.10022982954978943, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 333630 + }, + { + "epoch": 1.289758933679702, + "grad_norm": 0.11005251854658127, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 333640 + }, + { + "epoch": 1.2897975908830852, + "grad_norm": 0.10867244750261307, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 333650 + }, + { + "epoch": 1.2898362480864685, + "grad_norm": 0.10423463582992554, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 333660 + }, + { + "epoch": 1.2898749052898517, + "grad_norm": 0.09577327966690063, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 333670 + }, + { + "epoch": 1.289913562493235, + "grad_norm": 0.12469309568405151, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 333680 + }, + { + "epoch": 1.2899522196966182, + "grad_norm": 0.11108225584030151, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 333690 + }, + { + "epoch": 1.2899908769000015, + "grad_norm": 0.09957767277956009, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 333700 + }, + { + "epoch": 1.2900295341033847, + "grad_norm": 0.1158376932144165, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 333710 + }, + { + "epoch": 1.290068191306768, + "grad_norm": 0.10757265239953995, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 333720 + }, + { + "epoch": 1.2901068485101514, + "grad_norm": 0.09862326085567474, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 333730 + }, + { + "epoch": 1.2901455057135347, + "grad_norm": 0.09710995852947235, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 333740 + }, + { + "epoch": 1.290184162916918, + "grad_norm": 0.10201451927423477, + "learning_rate": 0.002, + "loss": 2.3115, + "step": 333750 + }, + { + "epoch": 1.2902228201203012, + "grad_norm": 0.11438058316707611, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 333760 + }, + { + "epoch": 1.2902614773236845, + "grad_norm": 0.11025866121053696, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 333770 + }, + { + "epoch": 1.2903001345270677, + "grad_norm": 0.09014347195625305, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 333780 + }, + { + "epoch": 1.2903387917304512, + "grad_norm": 0.11819101125001907, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 333790 + }, + { + "epoch": 1.2903774489338344, + "grad_norm": 0.1083206906914711, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 333800 + }, + { + "epoch": 1.2904161061372177, + "grad_norm": 0.1187998354434967, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 333810 + }, + { + "epoch": 1.290454763340601, + "grad_norm": 0.1018686294555664, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 333820 + }, + { + "epoch": 1.2904934205439842, + "grad_norm": 0.09539389610290527, + "learning_rate": 0.002, + "loss": 2.332, + "step": 333830 + }, + { + "epoch": 1.2905320777473674, + "grad_norm": 0.10429718345403671, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 333840 + }, + { + "epoch": 1.2905707349507507, + "grad_norm": 0.11827939003705978, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 333850 + }, + { + "epoch": 1.290609392154134, + "grad_norm": 0.10822010040283203, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 333860 + }, + { + "epoch": 1.2906480493575172, + "grad_norm": 0.09067022800445557, + "learning_rate": 0.002, + "loss": 2.343, + "step": 333870 + }, + { + "epoch": 1.2906867065609005, + "grad_norm": 0.11707578599452972, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 333880 + }, + { + "epoch": 1.2907253637642837, + "grad_norm": 0.10048212110996246, + "learning_rate": 0.002, + "loss": 2.319, + "step": 333890 + }, + { + "epoch": 1.2907640209676672, + "grad_norm": 0.10516627132892609, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 333900 + }, + { + "epoch": 1.2908026781710504, + "grad_norm": 0.13592956960201263, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 333910 + }, + { + "epoch": 1.2908413353744337, + "grad_norm": 0.10363247990608215, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 333920 + }, + { + "epoch": 1.290879992577817, + "grad_norm": 0.10619153082370758, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 333930 + }, + { + "epoch": 1.2909186497812002, + "grad_norm": 0.10397988557815552, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 333940 + }, + { + "epoch": 1.2909573069845834, + "grad_norm": 0.1019267588853836, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 333950 + }, + { + "epoch": 1.290995964187967, + "grad_norm": 0.10218657553195953, + "learning_rate": 0.002, + "loss": 2.341, + "step": 333960 + }, + { + "epoch": 1.2910346213913502, + "grad_norm": 0.117024265229702, + "learning_rate": 0.002, + "loss": 2.351, + "step": 333970 + }, + { + "epoch": 1.2910732785947334, + "grad_norm": 0.10420398414134979, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 333980 + }, + { + "epoch": 1.2911119357981167, + "grad_norm": 0.13143710792064667, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 333990 + }, + { + "epoch": 1.2911505930015, + "grad_norm": 0.10686857998371124, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 334000 + }, + { + "epoch": 1.2911892502048832, + "grad_norm": 0.0898427739739418, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 334010 + }, + { + "epoch": 1.2912279074082664, + "grad_norm": 0.268848717212677, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 334020 + }, + { + "epoch": 1.2912665646116497, + "grad_norm": 0.11120209097862244, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 334030 + }, + { + "epoch": 1.291305221815033, + "grad_norm": 0.09789606928825378, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 334040 + }, + { + "epoch": 1.2913438790184162, + "grad_norm": 0.1309780478477478, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 334050 + }, + { + "epoch": 1.2913825362217994, + "grad_norm": 0.1025337427854538, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 334060 + }, + { + "epoch": 1.291421193425183, + "grad_norm": 0.09491181373596191, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 334070 + }, + { + "epoch": 1.2914598506285662, + "grad_norm": 0.09663745015859604, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 334080 + }, + { + "epoch": 1.2914985078319494, + "grad_norm": 0.11719124019145966, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 334090 + }, + { + "epoch": 1.2915371650353327, + "grad_norm": 0.09677215665578842, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 334100 + }, + { + "epoch": 1.291575822238716, + "grad_norm": 0.11550380289554596, + "learning_rate": 0.002, + "loss": 2.3116, + "step": 334110 + }, + { + "epoch": 1.2916144794420992, + "grad_norm": 0.1174958124756813, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 334120 + }, + { + "epoch": 1.2916531366454826, + "grad_norm": 0.1034787967801094, + "learning_rate": 0.002, + "loss": 2.335, + "step": 334130 + }, + { + "epoch": 1.291691793848866, + "grad_norm": 0.09159667044878006, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 334140 + }, + { + "epoch": 1.2917304510522492, + "grad_norm": 0.11333204805850983, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 334150 + }, + { + "epoch": 1.2917691082556324, + "grad_norm": 0.1204734668135643, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 334160 + }, + { + "epoch": 1.2918077654590157, + "grad_norm": 0.10005148500204086, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 334170 + }, + { + "epoch": 1.291846422662399, + "grad_norm": 0.10399097204208374, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 334180 + }, + { + "epoch": 1.2918850798657822, + "grad_norm": 0.10964294523000717, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 334190 + }, + { + "epoch": 1.2919237370691654, + "grad_norm": 0.11057308316230774, + "learning_rate": 0.002, + "loss": 2.329, + "step": 334200 + }, + { + "epoch": 1.2919623942725487, + "grad_norm": 0.09860649704933167, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 334210 + }, + { + "epoch": 1.292001051475932, + "grad_norm": 0.11533652245998383, + "learning_rate": 0.002, + "loss": 2.339, + "step": 334220 + }, + { + "epoch": 1.2920397086793152, + "grad_norm": 0.12342556565999985, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 334230 + }, + { + "epoch": 1.2920783658826986, + "grad_norm": 0.10310442000627518, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 334240 + }, + { + "epoch": 1.292117023086082, + "grad_norm": 0.09682980179786682, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 334250 + }, + { + "epoch": 1.2921556802894651, + "grad_norm": 0.10156537592411041, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 334260 + }, + { + "epoch": 1.2921943374928484, + "grad_norm": 0.1010320708155632, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 334270 + }, + { + "epoch": 1.2922329946962317, + "grad_norm": 0.13401281833648682, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 334280 + }, + { + "epoch": 1.292271651899615, + "grad_norm": 0.0929509848356247, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 334290 + }, + { + "epoch": 1.2923103091029984, + "grad_norm": 0.10416734218597412, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 334300 + }, + { + "epoch": 1.2923489663063816, + "grad_norm": 0.10413626581430435, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 334310 + }, + { + "epoch": 1.2923876235097649, + "grad_norm": 0.09059230238199234, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 334320 + }, + { + "epoch": 1.2924262807131481, + "grad_norm": 0.11529724299907684, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 334330 + }, + { + "epoch": 1.2924649379165314, + "grad_norm": 0.10367235541343689, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 334340 + }, + { + "epoch": 1.2925035951199146, + "grad_norm": 0.10158482193946838, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 334350 + }, + { + "epoch": 1.292542252323298, + "grad_norm": 0.10162920504808426, + "learning_rate": 0.002, + "loss": 2.324, + "step": 334360 + }, + { + "epoch": 1.2925809095266811, + "grad_norm": 0.12735851109027863, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 334370 + }, + { + "epoch": 1.2926195667300644, + "grad_norm": 0.0941648855805397, + "learning_rate": 0.002, + "loss": 2.345, + "step": 334380 + }, + { + "epoch": 1.2926582239334476, + "grad_norm": 0.10265415161848068, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 334390 + }, + { + "epoch": 1.292696881136831, + "grad_norm": 0.09607397764921188, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 334400 + }, + { + "epoch": 1.2927355383402144, + "grad_norm": 0.12596885859966278, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 334410 + }, + { + "epoch": 1.2927741955435976, + "grad_norm": 0.09151950478553772, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 334420 + }, + { + "epoch": 1.2928128527469809, + "grad_norm": 0.12431547790765762, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 334430 + }, + { + "epoch": 1.2928515099503641, + "grad_norm": 0.10649395734071732, + "learning_rate": 0.002, + "loss": 2.335, + "step": 334440 + }, + { + "epoch": 1.2928901671537474, + "grad_norm": 0.11754316091537476, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 334450 + }, + { + "epoch": 1.2929288243571306, + "grad_norm": 0.10570388287305832, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 334460 + }, + { + "epoch": 1.292967481560514, + "grad_norm": 0.09635928273200989, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 334470 + }, + { + "epoch": 1.2930061387638974, + "grad_norm": 0.11909741163253784, + "learning_rate": 0.002, + "loss": 2.33, + "step": 334480 + }, + { + "epoch": 1.2930447959672806, + "grad_norm": 0.10475486516952515, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 334490 + }, + { + "epoch": 1.2930834531706639, + "grad_norm": 0.1174573004245758, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 334500 + }, + { + "epoch": 1.2931221103740471, + "grad_norm": 0.0980861634016037, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 334510 + }, + { + "epoch": 1.2931607675774304, + "grad_norm": 0.10527966916561127, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 334520 + }, + { + "epoch": 1.2931994247808136, + "grad_norm": 0.15596070885658264, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 334530 + }, + { + "epoch": 1.2932380819841969, + "grad_norm": 0.09148496389389038, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 334540 + }, + { + "epoch": 1.2932767391875801, + "grad_norm": 0.10149691253900528, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 334550 + }, + { + "epoch": 1.2933153963909634, + "grad_norm": 0.1106388196349144, + "learning_rate": 0.002, + "loss": 2.332, + "step": 334560 + }, + { + "epoch": 1.2933540535943469, + "grad_norm": 0.10097920894622803, + "learning_rate": 0.002, + "loss": 2.3511, + "step": 334570 + }, + { + "epoch": 1.29339271079773, + "grad_norm": 0.10403983294963837, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 334580 + }, + { + "epoch": 1.2934313680011134, + "grad_norm": 0.11371996253728867, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 334590 + }, + { + "epoch": 1.2934700252044966, + "grad_norm": 0.11415140330791473, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 334600 + }, + { + "epoch": 1.2935086824078799, + "grad_norm": 0.10624424368143082, + "learning_rate": 0.002, + "loss": 2.33, + "step": 334610 + }, + { + "epoch": 1.2935473396112631, + "grad_norm": 0.1049097403883934, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 334620 + }, + { + "epoch": 1.2935859968146464, + "grad_norm": 0.09997744113206863, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 334630 + }, + { + "epoch": 1.2936246540180298, + "grad_norm": 0.11178059130907059, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 334640 + }, + { + "epoch": 1.293663311221413, + "grad_norm": 0.09644506126642227, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 334650 + }, + { + "epoch": 1.2937019684247963, + "grad_norm": 0.133858323097229, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 334660 + }, + { + "epoch": 1.2937406256281796, + "grad_norm": 0.1203598603606224, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 334670 + }, + { + "epoch": 1.2937792828315628, + "grad_norm": 0.15581074357032776, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 334680 + }, + { + "epoch": 1.293817940034946, + "grad_norm": 0.09663668274879456, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 334690 + }, + { + "epoch": 1.2938565972383294, + "grad_norm": 0.0946556106209755, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 334700 + }, + { + "epoch": 1.2938952544417126, + "grad_norm": 0.10505830496549606, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 334710 + }, + { + "epoch": 1.2939339116450959, + "grad_norm": 0.1246907189488411, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 334720 + }, + { + "epoch": 1.293972568848479, + "grad_norm": 0.10076715052127838, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 334730 + }, + { + "epoch": 1.2940112260518626, + "grad_norm": 0.09258640557527542, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 334740 + }, + { + "epoch": 1.2940498832552458, + "grad_norm": 0.11278831958770752, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 334750 + }, + { + "epoch": 1.294088540458629, + "grad_norm": 0.12063068896532059, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 334760 + }, + { + "epoch": 1.2941271976620123, + "grad_norm": 0.110850989818573, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 334770 + }, + { + "epoch": 1.2941658548653956, + "grad_norm": 0.12250538170337677, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 334780 + }, + { + "epoch": 1.2942045120687788, + "grad_norm": 0.0948261097073555, + "learning_rate": 0.002, + "loss": 2.326, + "step": 334790 + }, + { + "epoch": 1.294243169272162, + "grad_norm": 0.09137909859418869, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 334800 + }, + { + "epoch": 1.2942818264755456, + "grad_norm": 0.10054846107959747, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 334810 + }, + { + "epoch": 1.2943204836789288, + "grad_norm": 0.09631969779729843, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 334820 + }, + { + "epoch": 1.294359140882312, + "grad_norm": 0.09292064607143402, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 334830 + }, + { + "epoch": 1.2943977980856953, + "grad_norm": 0.10343699157238007, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 334840 + }, + { + "epoch": 1.2944364552890786, + "grad_norm": 0.11731761693954468, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 334850 + }, + { + "epoch": 1.2944751124924618, + "grad_norm": 0.11252184212207794, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 334860 + }, + { + "epoch": 1.294513769695845, + "grad_norm": 0.11709108203649521, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 334870 + }, + { + "epoch": 1.2945524268992283, + "grad_norm": 0.09856344014406204, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 334880 + }, + { + "epoch": 1.2945910841026116, + "grad_norm": 0.10993824899196625, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 334890 + }, + { + "epoch": 1.2946297413059948, + "grad_norm": 0.11090542376041412, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 334900 + }, + { + "epoch": 1.2946683985093783, + "grad_norm": 0.11012540757656097, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 334910 + }, + { + "epoch": 1.2947070557127616, + "grad_norm": 0.09483652561903, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 334920 + }, + { + "epoch": 1.2947457129161448, + "grad_norm": 0.09918684512376785, + "learning_rate": 0.002, + "loss": 2.338, + "step": 334930 + }, + { + "epoch": 1.294784370119528, + "grad_norm": 0.09008168429136276, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 334940 + }, + { + "epoch": 1.2948230273229113, + "grad_norm": 0.10129818320274353, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 334950 + }, + { + "epoch": 1.2948616845262946, + "grad_norm": 0.10634876042604446, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 334960 + }, + { + "epoch": 1.2949003417296778, + "grad_norm": 0.12891563773155212, + "learning_rate": 0.002, + "loss": 2.3156, + "step": 334970 + }, + { + "epoch": 1.2949389989330613, + "grad_norm": 0.10008393973112106, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 334980 + }, + { + "epoch": 1.2949776561364446, + "grad_norm": 0.1001753956079483, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 334990 + }, + { + "epoch": 1.2950163133398278, + "grad_norm": 0.1086198091506958, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 335000 + }, + { + "epoch": 1.295054970543211, + "grad_norm": 0.10994220525026321, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 335010 + }, + { + "epoch": 1.2950936277465943, + "grad_norm": 0.11280565708875656, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 335020 + }, + { + "epoch": 1.2951322849499776, + "grad_norm": 0.12463513016700745, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 335030 + }, + { + "epoch": 1.2951709421533608, + "grad_norm": 0.10185111314058304, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 335040 + }, + { + "epoch": 1.295209599356744, + "grad_norm": 0.10697565227746964, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 335050 + }, + { + "epoch": 1.2952482565601273, + "grad_norm": 0.1059790849685669, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 335060 + }, + { + "epoch": 1.2952869137635106, + "grad_norm": 0.10459061712026596, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 335070 + }, + { + "epoch": 1.295325570966894, + "grad_norm": 0.11010526120662689, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 335080 + }, + { + "epoch": 1.2953642281702773, + "grad_norm": 0.11023080348968506, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 335090 + }, + { + "epoch": 1.2954028853736606, + "grad_norm": 0.09726311266422272, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 335100 + }, + { + "epoch": 1.2954415425770438, + "grad_norm": 0.10792980343103409, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 335110 + }, + { + "epoch": 1.295480199780427, + "grad_norm": 0.09982597827911377, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 335120 + }, + { + "epoch": 1.2955188569838103, + "grad_norm": 0.0997501090168953, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 335130 + }, + { + "epoch": 1.2955575141871936, + "grad_norm": 0.12466259300708771, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 335140 + }, + { + "epoch": 1.295596171390577, + "grad_norm": 0.12011416256427765, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 335150 + }, + { + "epoch": 1.2956348285939603, + "grad_norm": 0.11512520164251328, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 335160 + }, + { + "epoch": 1.2956734857973435, + "grad_norm": 0.10447486490011215, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 335170 + }, + { + "epoch": 1.2957121430007268, + "grad_norm": 0.10611852258443832, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 335180 + }, + { + "epoch": 1.29575080020411, + "grad_norm": 0.10485371947288513, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 335190 + }, + { + "epoch": 1.2957894574074933, + "grad_norm": 0.09470439702272415, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 335200 + }, + { + "epoch": 1.2958281146108765, + "grad_norm": 0.11864740401506424, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 335210 + }, + { + "epoch": 1.2958667718142598, + "grad_norm": 0.10729487240314484, + "learning_rate": 0.002, + "loss": 2.345, + "step": 335220 + }, + { + "epoch": 1.295905429017643, + "grad_norm": 0.09711641073226929, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 335230 + }, + { + "epoch": 1.2959440862210263, + "grad_norm": 0.09695418924093246, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 335240 + }, + { + "epoch": 1.2959827434244098, + "grad_norm": 0.10560396313667297, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 335250 + }, + { + "epoch": 1.296021400627793, + "grad_norm": 0.10399089008569717, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 335260 + }, + { + "epoch": 1.2960600578311763, + "grad_norm": 0.09577060490846634, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 335270 + }, + { + "epoch": 1.2960987150345595, + "grad_norm": 0.10907510668039322, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 335280 + }, + { + "epoch": 1.2961373722379428, + "grad_norm": 0.09519508481025696, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 335290 + }, + { + "epoch": 1.296176029441326, + "grad_norm": 0.11467217653989792, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 335300 + }, + { + "epoch": 1.2962146866447095, + "grad_norm": 0.10151347517967224, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 335310 + }, + { + "epoch": 1.2962533438480928, + "grad_norm": 0.10027828067541122, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 335320 + }, + { + "epoch": 1.296292001051476, + "grad_norm": 0.11993741989135742, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 335330 + }, + { + "epoch": 1.2963306582548593, + "grad_norm": 0.1123969629406929, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 335340 + }, + { + "epoch": 1.2963693154582425, + "grad_norm": 0.1024906262755394, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 335350 + }, + { + "epoch": 1.2964079726616258, + "grad_norm": 0.10444162040948868, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 335360 + }, + { + "epoch": 1.296446629865009, + "grad_norm": 0.09288977086544037, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 335370 + }, + { + "epoch": 1.2964852870683923, + "grad_norm": 0.10308873653411865, + "learning_rate": 0.002, + "loss": 2.357, + "step": 335380 + }, + { + "epoch": 1.2965239442717755, + "grad_norm": 0.0930061861872673, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 335390 + }, + { + "epoch": 1.2965626014751588, + "grad_norm": 0.12539707124233246, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 335400 + }, + { + "epoch": 1.296601258678542, + "grad_norm": 0.10915507376194, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 335410 + }, + { + "epoch": 1.2966399158819255, + "grad_norm": 0.10099329799413681, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 335420 + }, + { + "epoch": 1.2966785730853088, + "grad_norm": 0.09893771260976791, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 335430 + }, + { + "epoch": 1.296717230288692, + "grad_norm": 0.13321642577648163, + "learning_rate": 0.002, + "loss": 2.328, + "step": 335440 + }, + { + "epoch": 1.2967558874920753, + "grad_norm": 0.10005443543195724, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 335450 + }, + { + "epoch": 1.2967945446954585, + "grad_norm": 0.11731036752462387, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 335460 + }, + { + "epoch": 1.2968332018988418, + "grad_norm": 0.11234107613563538, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 335470 + }, + { + "epoch": 1.2968718591022252, + "grad_norm": 0.12516427040100098, + "learning_rate": 0.002, + "loss": 2.326, + "step": 335480 + }, + { + "epoch": 1.2969105163056085, + "grad_norm": 0.08740191906690598, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 335490 + }, + { + "epoch": 1.2969491735089917, + "grad_norm": 0.11662085354328156, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 335500 + }, + { + "epoch": 1.296987830712375, + "grad_norm": 0.09221193194389343, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 335510 + }, + { + "epoch": 1.2970264879157583, + "grad_norm": 0.1095178946852684, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 335520 + }, + { + "epoch": 1.2970651451191415, + "grad_norm": 0.09435485303401947, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 335530 + }, + { + "epoch": 1.2971038023225248, + "grad_norm": 0.1369830220937729, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 335540 + }, + { + "epoch": 1.297142459525908, + "grad_norm": 0.12459218502044678, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 335550 + }, + { + "epoch": 1.2971811167292913, + "grad_norm": 0.11784704774618149, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 335560 + }, + { + "epoch": 1.2972197739326745, + "grad_norm": 0.09934574365615845, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 335570 + }, + { + "epoch": 1.2972584311360578, + "grad_norm": 0.09636490792036057, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 335580 + }, + { + "epoch": 1.2972970883394412, + "grad_norm": 0.10465794056653976, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 335590 + }, + { + "epoch": 1.2973357455428245, + "grad_norm": 0.11000781506299973, + "learning_rate": 0.002, + "loss": 2.321, + "step": 335600 + }, + { + "epoch": 1.2973744027462077, + "grad_norm": 0.13012675940990448, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 335610 + }, + { + "epoch": 1.297413059949591, + "grad_norm": 0.09287827461957932, + "learning_rate": 0.002, + "loss": 2.34, + "step": 335620 + }, + { + "epoch": 1.2974517171529742, + "grad_norm": 0.09919273108243942, + "learning_rate": 0.002, + "loss": 2.3572, + "step": 335630 + }, + { + "epoch": 1.2974903743563575, + "grad_norm": 0.09842280298471451, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 335640 + }, + { + "epoch": 1.297529031559741, + "grad_norm": 0.11691803485155106, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 335650 + }, + { + "epoch": 1.2975676887631242, + "grad_norm": 0.10015939176082611, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 335660 + }, + { + "epoch": 1.2976063459665075, + "grad_norm": 0.09664006531238556, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 335670 + }, + { + "epoch": 1.2976450031698907, + "grad_norm": 0.12423496693372726, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 335680 + }, + { + "epoch": 1.297683660373274, + "grad_norm": 0.10078054666519165, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 335690 + }, + { + "epoch": 1.2977223175766572, + "grad_norm": 0.11702215671539307, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 335700 + }, + { + "epoch": 1.2977609747800405, + "grad_norm": 0.09916981309652328, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 335710 + }, + { + "epoch": 1.2977996319834237, + "grad_norm": 0.11054660379886627, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 335720 + }, + { + "epoch": 1.297838289186807, + "grad_norm": 0.10521753132343292, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 335730 + }, + { + "epoch": 1.2978769463901902, + "grad_norm": 0.14257629215717316, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 335740 + }, + { + "epoch": 1.2979156035935735, + "grad_norm": 0.0905749499797821, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 335750 + }, + { + "epoch": 1.297954260796957, + "grad_norm": 0.11051960289478302, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 335760 + }, + { + "epoch": 1.2979929180003402, + "grad_norm": 0.10828401148319244, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 335770 + }, + { + "epoch": 1.2980315752037235, + "grad_norm": 0.11706371605396271, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 335780 + }, + { + "epoch": 1.2980702324071067, + "grad_norm": 0.11631658673286438, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 335790 + }, + { + "epoch": 1.29810888961049, + "grad_norm": 0.0892113521695137, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 335800 + }, + { + "epoch": 1.2981475468138732, + "grad_norm": 0.12711207568645477, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 335810 + }, + { + "epoch": 1.2981862040172567, + "grad_norm": 0.11014045774936676, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 335820 + }, + { + "epoch": 1.29822486122064, + "grad_norm": 0.09357475489377975, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 335830 + }, + { + "epoch": 1.2982635184240232, + "grad_norm": 0.09620398283004761, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 335840 + }, + { + "epoch": 1.2983021756274065, + "grad_norm": 0.10793457180261612, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 335850 + }, + { + "epoch": 1.2983408328307897, + "grad_norm": 0.13548347353935242, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 335860 + }, + { + "epoch": 1.298379490034173, + "grad_norm": 0.10772477835416794, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 335870 + }, + { + "epoch": 1.2984181472375562, + "grad_norm": 0.09605101495981216, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 335880 + }, + { + "epoch": 1.2984568044409395, + "grad_norm": 0.11341050267219543, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 335890 + }, + { + "epoch": 1.2984954616443227, + "grad_norm": 0.11545135825872421, + "learning_rate": 0.002, + "loss": 2.327, + "step": 335900 + }, + { + "epoch": 1.298534118847706, + "grad_norm": 0.12482807785272598, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 335910 + }, + { + "epoch": 1.2985727760510892, + "grad_norm": 0.08460353314876556, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 335920 + }, + { + "epoch": 1.2986114332544727, + "grad_norm": 0.28585994243621826, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 335930 + }, + { + "epoch": 1.298650090457856, + "grad_norm": 0.10742087662220001, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 335940 + }, + { + "epoch": 1.2986887476612392, + "grad_norm": 0.12978002429008484, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 335950 + }, + { + "epoch": 1.2987274048646225, + "grad_norm": 0.09802679717540741, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 335960 + }, + { + "epoch": 1.2987660620680057, + "grad_norm": 0.10356856882572174, + "learning_rate": 0.002, + "loss": 2.3159, + "step": 335970 + }, + { + "epoch": 1.298804719271389, + "grad_norm": 0.12377564609050751, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 335980 + }, + { + "epoch": 1.2988433764747724, + "grad_norm": 0.10669989138841629, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 335990 + }, + { + "epoch": 1.2988820336781557, + "grad_norm": 0.09646441042423248, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 336000 + }, + { + "epoch": 1.298920690881539, + "grad_norm": 0.13213708996772766, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 336010 + }, + { + "epoch": 1.2989593480849222, + "grad_norm": 0.0944688692688942, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 336020 + }, + { + "epoch": 1.2989980052883054, + "grad_norm": 0.10846567898988724, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 336030 + }, + { + "epoch": 1.2990366624916887, + "grad_norm": 0.09957831352949142, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 336040 + }, + { + "epoch": 1.299075319695072, + "grad_norm": 0.09659119695425034, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 336050 + }, + { + "epoch": 1.2991139768984552, + "grad_norm": 0.10422774404287338, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 336060 + }, + { + "epoch": 1.2991526341018385, + "grad_norm": 0.10652784258127213, + "learning_rate": 0.002, + "loss": 2.3156, + "step": 336070 + }, + { + "epoch": 1.2991912913052217, + "grad_norm": 0.11235243827104568, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 336080 + }, + { + "epoch": 1.299229948508605, + "grad_norm": 0.1195795089006424, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 336090 + }, + { + "epoch": 1.2992686057119884, + "grad_norm": 0.11767005175352097, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 336100 + }, + { + "epoch": 1.2993072629153717, + "grad_norm": 0.09454245865345001, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 336110 + }, + { + "epoch": 1.299345920118755, + "grad_norm": 0.12325325608253479, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 336120 + }, + { + "epoch": 1.2993845773221382, + "grad_norm": 0.10042198747396469, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 336130 + }, + { + "epoch": 1.2994232345255214, + "grad_norm": 0.10879756510257721, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 336140 + }, + { + "epoch": 1.2994618917289047, + "grad_norm": 0.09640879184007645, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 336150 + }, + { + "epoch": 1.2995005489322882, + "grad_norm": 0.1137596070766449, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 336160 + }, + { + "epoch": 1.2995392061356714, + "grad_norm": 0.10465048998594284, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 336170 + }, + { + "epoch": 1.2995778633390547, + "grad_norm": 0.1117529347538948, + "learning_rate": 0.002, + "loss": 2.32, + "step": 336180 + }, + { + "epoch": 1.299616520542438, + "grad_norm": 0.1018039882183075, + "learning_rate": 0.002, + "loss": 2.343, + "step": 336190 + }, + { + "epoch": 1.2996551777458212, + "grad_norm": 0.0903591737151146, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 336200 + }, + { + "epoch": 1.2996938349492044, + "grad_norm": 0.09142091870307922, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 336210 + }, + { + "epoch": 1.2997324921525877, + "grad_norm": 0.11007067561149597, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 336220 + }, + { + "epoch": 1.299771149355971, + "grad_norm": 0.09496650099754333, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 336230 + }, + { + "epoch": 1.2998098065593542, + "grad_norm": 0.11753443628549576, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 336240 + }, + { + "epoch": 1.2998484637627374, + "grad_norm": 0.10725142061710358, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 336250 + }, + { + "epoch": 1.2998871209661207, + "grad_norm": 0.11182443797588348, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 336260 + }, + { + "epoch": 1.2999257781695042, + "grad_norm": 0.09985292702913284, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 336270 + }, + { + "epoch": 1.2999644353728874, + "grad_norm": 0.14232194423675537, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 336280 + }, + { + "epoch": 1.3000030925762707, + "grad_norm": 0.09903427958488464, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 336290 + }, + { + "epoch": 1.300041749779654, + "grad_norm": 0.10525563359260559, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 336300 + }, + { + "epoch": 1.3000804069830372, + "grad_norm": 0.10530021786689758, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 336310 + }, + { + "epoch": 1.3001190641864204, + "grad_norm": 0.10072167217731476, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 336320 + }, + { + "epoch": 1.300157721389804, + "grad_norm": 0.11304374784231186, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 336330 + }, + { + "epoch": 1.3001963785931872, + "grad_norm": 0.11669431626796722, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 336340 + }, + { + "epoch": 1.3002350357965704, + "grad_norm": 0.11866362392902374, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 336350 + }, + { + "epoch": 1.3002736929999537, + "grad_norm": 0.09389518201351166, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 336360 + }, + { + "epoch": 1.300312350203337, + "grad_norm": 0.12181688845157623, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 336370 + }, + { + "epoch": 1.3003510074067202, + "grad_norm": 0.12622328102588654, + "learning_rate": 0.002, + "loss": 2.343, + "step": 336380 + }, + { + "epoch": 1.3003896646101034, + "grad_norm": 0.12845391035079956, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 336390 + }, + { + "epoch": 1.3004283218134867, + "grad_norm": 0.11289853602647781, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 336400 + }, + { + "epoch": 1.30046697901687, + "grad_norm": 0.12921935319900513, + "learning_rate": 0.002, + "loss": 2.317, + "step": 336410 + }, + { + "epoch": 1.3005056362202532, + "grad_norm": 0.10790617763996124, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 336420 + }, + { + "epoch": 1.3005442934236366, + "grad_norm": 0.10064290463924408, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 336430 + }, + { + "epoch": 1.30058295062702, + "grad_norm": 0.10124460607767105, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 336440 + }, + { + "epoch": 1.3006216078304031, + "grad_norm": 0.09299731999635696, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 336450 + }, + { + "epoch": 1.3006602650337864, + "grad_norm": 0.10883777588605881, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 336460 + }, + { + "epoch": 1.3006989222371697, + "grad_norm": 0.10185468941926956, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 336470 + }, + { + "epoch": 1.300737579440553, + "grad_norm": 0.1190355122089386, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 336480 + }, + { + "epoch": 1.3007762366439362, + "grad_norm": 0.1011800616979599, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 336490 + }, + { + "epoch": 1.3008148938473196, + "grad_norm": 0.08444646000862122, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 336500 + }, + { + "epoch": 1.3008535510507029, + "grad_norm": 0.10320805013179779, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 336510 + }, + { + "epoch": 1.3008922082540861, + "grad_norm": 0.11415956169366837, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 336520 + }, + { + "epoch": 1.3009308654574694, + "grad_norm": 0.1253964602947235, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 336530 + }, + { + "epoch": 1.3009695226608526, + "grad_norm": 0.11867789179086685, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 336540 + }, + { + "epoch": 1.301008179864236, + "grad_norm": 0.13367541134357452, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 336550 + }, + { + "epoch": 1.3010468370676191, + "grad_norm": 0.1060272827744484, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 336560 + }, + { + "epoch": 1.3010854942710024, + "grad_norm": 0.1167825311422348, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 336570 + }, + { + "epoch": 1.3011241514743856, + "grad_norm": 0.0996595248579979, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 336580 + }, + { + "epoch": 1.301162808677769, + "grad_norm": 0.10070031136274338, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 336590 + }, + { + "epoch": 1.3012014658811524, + "grad_norm": 0.14351944625377655, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 336600 + }, + { + "epoch": 1.3012401230845356, + "grad_norm": 0.09690338373184204, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 336610 + }, + { + "epoch": 1.3012787802879189, + "grad_norm": 0.1026904284954071, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 336620 + }, + { + "epoch": 1.3013174374913021, + "grad_norm": 0.12201692909002304, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 336630 + }, + { + "epoch": 1.3013560946946854, + "grad_norm": 0.15047767758369446, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 336640 + }, + { + "epoch": 1.3013947518980686, + "grad_norm": 0.11998075991868973, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 336650 + }, + { + "epoch": 1.3014334091014519, + "grad_norm": 0.09685704112052917, + "learning_rate": 0.002, + "loss": 2.34, + "step": 336660 + }, + { + "epoch": 1.3014720663048354, + "grad_norm": 0.08562976866960526, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 336670 + }, + { + "epoch": 1.3015107235082186, + "grad_norm": 0.10097695887088776, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 336680 + }, + { + "epoch": 1.3015493807116019, + "grad_norm": 0.1395394206047058, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 336690 + }, + { + "epoch": 1.3015880379149851, + "grad_norm": 0.097958505153656, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 336700 + }, + { + "epoch": 1.3016266951183684, + "grad_norm": 0.1106363832950592, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 336710 + }, + { + "epoch": 1.3016653523217516, + "grad_norm": 0.10885391384363174, + "learning_rate": 0.002, + "loss": 2.321, + "step": 336720 + }, + { + "epoch": 1.3017040095251349, + "grad_norm": 0.10044512897729874, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 336730 + }, + { + "epoch": 1.3017426667285181, + "grad_norm": 0.1009591743350029, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 336740 + }, + { + "epoch": 1.3017813239319014, + "grad_norm": 0.09731756895780563, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 336750 + }, + { + "epoch": 1.3018199811352846, + "grad_norm": 0.18127033114433289, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 336760 + }, + { + "epoch": 1.301858638338668, + "grad_norm": 0.1387605369091034, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 336770 + }, + { + "epoch": 1.3018972955420514, + "grad_norm": 0.13605371117591858, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 336780 + }, + { + "epoch": 1.3019359527454346, + "grad_norm": 0.09742670506238937, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 336790 + }, + { + "epoch": 1.3019746099488179, + "grad_norm": 0.11238933354616165, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 336800 + }, + { + "epoch": 1.3020132671522011, + "grad_norm": 0.1343478411436081, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 336810 + }, + { + "epoch": 1.3020519243555844, + "grad_norm": 0.11248623579740524, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 336820 + }, + { + "epoch": 1.3020905815589676, + "grad_norm": 0.10541951656341553, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 336830 + }, + { + "epoch": 1.302129238762351, + "grad_norm": 0.08968300372362137, + "learning_rate": 0.002, + "loss": 2.33, + "step": 336840 + }, + { + "epoch": 1.3021678959657343, + "grad_norm": 0.1093692034482956, + "learning_rate": 0.002, + "loss": 2.335, + "step": 336850 + }, + { + "epoch": 1.3022065531691176, + "grad_norm": 0.10313425213098526, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 336860 + }, + { + "epoch": 1.3022452103725008, + "grad_norm": 0.11104105412960052, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 336870 + }, + { + "epoch": 1.302283867575884, + "grad_norm": 0.10031123459339142, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 336880 + }, + { + "epoch": 1.3023225247792674, + "grad_norm": 0.1145625114440918, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 336890 + }, + { + "epoch": 1.3023611819826506, + "grad_norm": 0.11190888285636902, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 336900 + }, + { + "epoch": 1.3023998391860339, + "grad_norm": 0.10132209956645966, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 336910 + }, + { + "epoch": 1.302438496389417, + "grad_norm": 0.1086704432964325, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 336920 + }, + { + "epoch": 1.3024771535928004, + "grad_norm": 0.10256091505289078, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 336930 + }, + { + "epoch": 1.3025158107961838, + "grad_norm": 0.10453871637582779, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 336940 + }, + { + "epoch": 1.302554467999567, + "grad_norm": 0.11024262756109238, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 336950 + }, + { + "epoch": 1.3025931252029503, + "grad_norm": 0.09911150485277176, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 336960 + }, + { + "epoch": 1.3026317824063336, + "grad_norm": 0.09939631074666977, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 336970 + }, + { + "epoch": 1.3026704396097168, + "grad_norm": 0.10066703706979752, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 336980 + }, + { + "epoch": 1.3027090968131, + "grad_norm": 0.10721877217292786, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 336990 + }, + { + "epoch": 1.3027477540164834, + "grad_norm": 0.10138001292943954, + "learning_rate": 0.002, + "loss": 2.323, + "step": 337000 + }, + { + "epoch": 1.3027864112198668, + "grad_norm": 0.09895486384630203, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 337010 + }, + { + "epoch": 1.30282506842325, + "grad_norm": 0.1078089028596878, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 337020 + }, + { + "epoch": 1.3028637256266333, + "grad_norm": 0.1079610288143158, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 337030 + }, + { + "epoch": 1.3029023828300166, + "grad_norm": 0.1307416409254074, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 337040 + }, + { + "epoch": 1.3029410400333998, + "grad_norm": 0.09234651178121567, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 337050 + }, + { + "epoch": 1.302979697236783, + "grad_norm": 0.10930173099040985, + "learning_rate": 0.002, + "loss": 2.33, + "step": 337060 + }, + { + "epoch": 1.3030183544401663, + "grad_norm": 0.10918635129928589, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 337070 + }, + { + "epoch": 1.3030570116435496, + "grad_norm": 0.10842832177877426, + "learning_rate": 0.002, + "loss": 2.327, + "step": 337080 + }, + { + "epoch": 1.3030956688469328, + "grad_norm": 0.11361797153949738, + "learning_rate": 0.002, + "loss": 2.343, + "step": 337090 + }, + { + "epoch": 1.303134326050316, + "grad_norm": 0.09919606894254684, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 337100 + }, + { + "epoch": 1.3031729832536996, + "grad_norm": 0.12349893897771835, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 337110 + }, + { + "epoch": 1.3032116404570828, + "grad_norm": 0.10912661254405975, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 337120 + }, + { + "epoch": 1.303250297660466, + "grad_norm": 0.12357006967067719, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 337130 + }, + { + "epoch": 1.3032889548638493, + "grad_norm": 0.10051541775465012, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 337140 + }, + { + "epoch": 1.3033276120672326, + "grad_norm": 0.09484165161848068, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 337150 + }, + { + "epoch": 1.3033662692706158, + "grad_norm": 0.12610755860805511, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 337160 + }, + { + "epoch": 1.3034049264739993, + "grad_norm": 0.1031932383775711, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 337170 + }, + { + "epoch": 1.3034435836773826, + "grad_norm": 0.11469267308712006, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 337180 + }, + { + "epoch": 1.3034822408807658, + "grad_norm": 0.10891234874725342, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 337190 + }, + { + "epoch": 1.303520898084149, + "grad_norm": 0.12440503388643265, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 337200 + }, + { + "epoch": 1.3035595552875323, + "grad_norm": 0.12115427106618881, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 337210 + }, + { + "epoch": 1.3035982124909156, + "grad_norm": 0.1324635148048401, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 337220 + }, + { + "epoch": 1.3036368696942988, + "grad_norm": 0.10365882515907288, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 337230 + }, + { + "epoch": 1.303675526897682, + "grad_norm": 0.11532828211784363, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 337240 + }, + { + "epoch": 1.3037141841010653, + "grad_norm": 0.1240374743938446, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 337250 + }, + { + "epoch": 1.3037528413044486, + "grad_norm": 0.10586193203926086, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 337260 + }, + { + "epoch": 1.3037914985078318, + "grad_norm": 0.09692249447107315, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 337270 + }, + { + "epoch": 1.3038301557112153, + "grad_norm": 0.11600619554519653, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 337280 + }, + { + "epoch": 1.3038688129145986, + "grad_norm": 0.09627915918827057, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 337290 + }, + { + "epoch": 1.3039074701179818, + "grad_norm": 0.11454172432422638, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 337300 + }, + { + "epoch": 1.303946127321365, + "grad_norm": 0.09462732821702957, + "learning_rate": 0.002, + "loss": 2.3119, + "step": 337310 + }, + { + "epoch": 1.3039847845247483, + "grad_norm": 0.10958430916070938, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 337320 + }, + { + "epoch": 1.3040234417281316, + "grad_norm": 0.1141655296087265, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 337330 + }, + { + "epoch": 1.304062098931515, + "grad_norm": 0.11504334211349487, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 337340 + }, + { + "epoch": 1.3041007561348983, + "grad_norm": 0.1253875344991684, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 337350 + }, + { + "epoch": 1.3041394133382815, + "grad_norm": 0.11920283734798431, + "learning_rate": 0.002, + "loss": 2.337, + "step": 337360 + }, + { + "epoch": 1.3041780705416648, + "grad_norm": 0.1002359464764595, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 337370 + }, + { + "epoch": 1.304216727745048, + "grad_norm": 0.09919452667236328, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 337380 + }, + { + "epoch": 1.3042553849484313, + "grad_norm": 0.10681872814893723, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 337390 + }, + { + "epoch": 1.3042940421518145, + "grad_norm": 0.10820811986923218, + "learning_rate": 0.002, + "loss": 2.334, + "step": 337400 + }, + { + "epoch": 1.3043326993551978, + "grad_norm": 0.11254443228244781, + "learning_rate": 0.002, + "loss": 2.342, + "step": 337410 + }, + { + "epoch": 1.304371356558581, + "grad_norm": 0.0999123752117157, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 337420 + }, + { + "epoch": 1.3044100137619643, + "grad_norm": 0.10298678278923035, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 337430 + }, + { + "epoch": 1.3044486709653476, + "grad_norm": 0.11172180622816086, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 337440 + }, + { + "epoch": 1.304487328168731, + "grad_norm": 0.10319048166275024, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 337450 + }, + { + "epoch": 1.3045259853721143, + "grad_norm": 0.10047553479671478, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 337460 + }, + { + "epoch": 1.3045646425754975, + "grad_norm": 0.11037591099739075, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 337470 + }, + { + "epoch": 1.3046032997788808, + "grad_norm": 0.11529282480478287, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 337480 + }, + { + "epoch": 1.304641956982264, + "grad_norm": 0.10569468140602112, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 337490 + }, + { + "epoch": 1.3046806141856473, + "grad_norm": 0.10355425626039505, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 337500 + }, + { + "epoch": 1.3047192713890308, + "grad_norm": 0.102251797914505, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 337510 + }, + { + "epoch": 1.304757928592414, + "grad_norm": 0.09056451916694641, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 337520 + }, + { + "epoch": 1.3047965857957973, + "grad_norm": 0.13302507996559143, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 337530 + }, + { + "epoch": 1.3048352429991805, + "grad_norm": 0.09038243442773819, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 337540 + }, + { + "epoch": 1.3048739002025638, + "grad_norm": 0.09669879823923111, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 337550 + }, + { + "epoch": 1.304912557405947, + "grad_norm": 0.11306969076395035, + "learning_rate": 0.002, + "loss": 2.3159, + "step": 337560 + }, + { + "epoch": 1.3049512146093303, + "grad_norm": 0.11504258960485458, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 337570 + }, + { + "epoch": 1.3049898718127135, + "grad_norm": 0.1048077791929245, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 337580 + }, + { + "epoch": 1.3050285290160968, + "grad_norm": 0.09368283301591873, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 337590 + }, + { + "epoch": 1.30506718621948, + "grad_norm": 0.10178250074386597, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 337600 + }, + { + "epoch": 1.3051058434228633, + "grad_norm": 0.12559524178504944, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 337610 + }, + { + "epoch": 1.3051445006262468, + "grad_norm": 0.1250172257423401, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 337620 + }, + { + "epoch": 1.30518315782963, + "grad_norm": 0.10265842080116272, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 337630 + }, + { + "epoch": 1.3052218150330133, + "grad_norm": 0.11880707740783691, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 337640 + }, + { + "epoch": 1.3052604722363965, + "grad_norm": 0.11822383105754852, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 337650 + }, + { + "epoch": 1.3052991294397798, + "grad_norm": 0.10905078053474426, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 337660 + }, + { + "epoch": 1.305337786643163, + "grad_norm": 0.10963452607393265, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 337670 + }, + { + "epoch": 1.3053764438465465, + "grad_norm": 0.12794677913188934, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 337680 + }, + { + "epoch": 1.3054151010499297, + "grad_norm": 0.10933011770248413, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 337690 + }, + { + "epoch": 1.305453758253313, + "grad_norm": 0.10404634475708008, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 337700 + }, + { + "epoch": 1.3054924154566963, + "grad_norm": 0.11655057221651077, + "learning_rate": 0.002, + "loss": 2.3645, + "step": 337710 + }, + { + "epoch": 1.3055310726600795, + "grad_norm": 0.09879942238330841, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 337720 + }, + { + "epoch": 1.3055697298634628, + "grad_norm": 0.09191339462995529, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 337730 + }, + { + "epoch": 1.305608387066846, + "grad_norm": 0.14594559371471405, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 337740 + }, + { + "epoch": 1.3056470442702293, + "grad_norm": 0.10261886566877365, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 337750 + }, + { + "epoch": 1.3056857014736125, + "grad_norm": 0.10173392295837402, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 337760 + }, + { + "epoch": 1.3057243586769958, + "grad_norm": 0.1198262944817543, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 337770 + }, + { + "epoch": 1.305763015880379, + "grad_norm": 0.0957404300570488, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 337780 + }, + { + "epoch": 1.3058016730837625, + "grad_norm": 0.10244777798652649, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 337790 + }, + { + "epoch": 1.3058403302871457, + "grad_norm": 0.09964311122894287, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 337800 + }, + { + "epoch": 1.305878987490529, + "grad_norm": 0.1049494594335556, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 337810 + }, + { + "epoch": 1.3059176446939122, + "grad_norm": 0.09348540008068085, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 337820 + }, + { + "epoch": 1.3059563018972955, + "grad_norm": 0.12033034861087799, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 337830 + }, + { + "epoch": 1.3059949591006788, + "grad_norm": 0.10889285057783127, + "learning_rate": 0.002, + "loss": 2.3607, + "step": 337840 + }, + { + "epoch": 1.3060336163040622, + "grad_norm": 0.1031394749879837, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 337850 + }, + { + "epoch": 1.3060722735074455, + "grad_norm": 0.10475998371839523, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 337860 + }, + { + "epoch": 1.3061109307108287, + "grad_norm": 0.11214390397071838, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 337870 + }, + { + "epoch": 1.306149587914212, + "grad_norm": 0.10629909485578537, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 337880 + }, + { + "epoch": 1.3061882451175952, + "grad_norm": 0.10484512895345688, + "learning_rate": 0.002, + "loss": 2.333, + "step": 337890 + }, + { + "epoch": 1.3062269023209785, + "grad_norm": 0.10622546821832657, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 337900 + }, + { + "epoch": 1.3062655595243617, + "grad_norm": 0.11129456013441086, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 337910 + }, + { + "epoch": 1.306304216727745, + "grad_norm": 0.11849325895309448, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 337920 + }, + { + "epoch": 1.3063428739311282, + "grad_norm": 0.10488150268793106, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 337930 + }, + { + "epoch": 1.3063815311345115, + "grad_norm": 0.09617189317941666, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 337940 + }, + { + "epoch": 1.3064201883378947, + "grad_norm": 0.09484464675188065, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 337950 + }, + { + "epoch": 1.3064588455412782, + "grad_norm": 0.10381089895963669, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 337960 + }, + { + "epoch": 1.3064975027446615, + "grad_norm": 0.10935788601636887, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 337970 + }, + { + "epoch": 1.3065361599480447, + "grad_norm": 0.114654541015625, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 337980 + }, + { + "epoch": 1.306574817151428, + "grad_norm": 0.10088855028152466, + "learning_rate": 0.002, + "loss": 2.343, + "step": 337990 + }, + { + "epoch": 1.3066134743548112, + "grad_norm": 0.10404116660356522, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 338000 + }, + { + "epoch": 1.3066521315581945, + "grad_norm": 0.10011463612318039, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 338010 + }, + { + "epoch": 1.306690788761578, + "grad_norm": 0.11404065042734146, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 338020 + }, + { + "epoch": 1.3067294459649612, + "grad_norm": 0.09019742161035538, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 338030 + }, + { + "epoch": 1.3067681031683445, + "grad_norm": 0.11440113186836243, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 338040 + }, + { + "epoch": 1.3068067603717277, + "grad_norm": 0.1026352122426033, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 338050 + }, + { + "epoch": 1.306845417575111, + "grad_norm": 0.10444994270801544, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 338060 + }, + { + "epoch": 1.3068840747784942, + "grad_norm": 0.09628769010305405, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 338070 + }, + { + "epoch": 1.3069227319818775, + "grad_norm": 0.09596199542284012, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 338080 + }, + { + "epoch": 1.3069613891852607, + "grad_norm": 0.11345992237329483, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 338090 + }, + { + "epoch": 1.307000046388644, + "grad_norm": 0.12929995357990265, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 338100 + }, + { + "epoch": 1.3070387035920272, + "grad_norm": 0.1082843467593193, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 338110 + }, + { + "epoch": 1.3070773607954105, + "grad_norm": 0.42977941036224365, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 338120 + }, + { + "epoch": 1.307116017998794, + "grad_norm": 0.10232046246528625, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 338130 + }, + { + "epoch": 1.3071546752021772, + "grad_norm": 0.1078309491276741, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 338140 + }, + { + "epoch": 1.3071933324055605, + "grad_norm": 0.09055358916521072, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 338150 + }, + { + "epoch": 1.3072319896089437, + "grad_norm": 0.11110270768404007, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 338160 + }, + { + "epoch": 1.307270646812327, + "grad_norm": 0.09694137424230576, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 338170 + }, + { + "epoch": 1.3073093040157102, + "grad_norm": 0.10096246004104614, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 338180 + }, + { + "epoch": 1.3073479612190937, + "grad_norm": 0.13043995201587677, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 338190 + }, + { + "epoch": 1.307386618422477, + "grad_norm": 0.10851225256919861, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 338200 + }, + { + "epoch": 1.3074252756258602, + "grad_norm": 0.09410696476697922, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 338210 + }, + { + "epoch": 1.3074639328292434, + "grad_norm": 0.09398966282606125, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 338220 + }, + { + "epoch": 1.3075025900326267, + "grad_norm": 0.12850618362426758, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 338230 + }, + { + "epoch": 1.30754124723601, + "grad_norm": 0.10962171852588654, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 338240 + }, + { + "epoch": 1.3075799044393932, + "grad_norm": 0.09918195009231567, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 338250 + }, + { + "epoch": 1.3076185616427765, + "grad_norm": 0.1096796989440918, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 338260 + }, + { + "epoch": 1.3076572188461597, + "grad_norm": 0.11385281383991241, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 338270 + }, + { + "epoch": 1.307695876049543, + "grad_norm": 0.09344173222780228, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 338280 + }, + { + "epoch": 1.3077345332529262, + "grad_norm": 0.10489895939826965, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 338290 + }, + { + "epoch": 1.3077731904563097, + "grad_norm": 0.08784829825162888, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 338300 + }, + { + "epoch": 1.307811847659693, + "grad_norm": 0.11536280065774918, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 338310 + }, + { + "epoch": 1.3078505048630762, + "grad_norm": 0.11776147037744522, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 338320 + }, + { + "epoch": 1.3078891620664594, + "grad_norm": 0.13148784637451172, + "learning_rate": 0.002, + "loss": 2.332, + "step": 338330 + }, + { + "epoch": 1.3079278192698427, + "grad_norm": 0.10201974213123322, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 338340 + }, + { + "epoch": 1.307966476473226, + "grad_norm": 0.09577985852956772, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 338350 + }, + { + "epoch": 1.3080051336766094, + "grad_norm": 0.10251278430223465, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 338360 + }, + { + "epoch": 1.3080437908799927, + "grad_norm": 0.1167164072394371, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 338370 + }, + { + "epoch": 1.308082448083376, + "grad_norm": 0.10759881138801575, + "learning_rate": 0.002, + "loss": 2.324, + "step": 338380 + }, + { + "epoch": 1.3081211052867592, + "grad_norm": 0.09826534241437912, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 338390 + }, + { + "epoch": 1.3081597624901424, + "grad_norm": 0.10461527854204178, + "learning_rate": 0.002, + "loss": 2.34, + "step": 338400 + }, + { + "epoch": 1.3081984196935257, + "grad_norm": 0.10983319580554962, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 338410 + }, + { + "epoch": 1.308237076896909, + "grad_norm": 0.12366301566362381, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 338420 + }, + { + "epoch": 1.3082757341002922, + "grad_norm": 0.10852979123592377, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 338430 + }, + { + "epoch": 1.3083143913036754, + "grad_norm": 0.09494627267122269, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 338440 + }, + { + "epoch": 1.3083530485070587, + "grad_norm": 0.10899453610181808, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 338450 + }, + { + "epoch": 1.3083917057104422, + "grad_norm": 0.12259631603956223, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 338460 + }, + { + "epoch": 1.3084303629138254, + "grad_norm": 0.10453185439109802, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 338470 + }, + { + "epoch": 1.3084690201172087, + "grad_norm": 0.1154693216085434, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 338480 + }, + { + "epoch": 1.308507677320592, + "grad_norm": 0.10357803106307983, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 338490 + }, + { + "epoch": 1.3085463345239752, + "grad_norm": 0.0999743640422821, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 338500 + }, + { + "epoch": 1.3085849917273584, + "grad_norm": 0.10579682141542435, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 338510 + }, + { + "epoch": 1.3086236489307417, + "grad_norm": 0.09436147660017014, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 338520 + }, + { + "epoch": 1.3086623061341252, + "grad_norm": 0.09775619208812714, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 338530 + }, + { + "epoch": 1.3087009633375084, + "grad_norm": 0.12779007852077484, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 338540 + }, + { + "epoch": 1.3087396205408917, + "grad_norm": 0.08898934721946716, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 338550 + }, + { + "epoch": 1.308778277744275, + "grad_norm": 0.0936533585190773, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 338560 + }, + { + "epoch": 1.3088169349476582, + "grad_norm": 0.12484470009803772, + "learning_rate": 0.002, + "loss": 2.34, + "step": 338570 + }, + { + "epoch": 1.3088555921510414, + "grad_norm": 0.08969317376613617, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 338580 + }, + { + "epoch": 1.3088942493544247, + "grad_norm": 0.09576170891523361, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 338590 + }, + { + "epoch": 1.308932906557808, + "grad_norm": 0.11190787702798843, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 338600 + }, + { + "epoch": 1.3089715637611912, + "grad_norm": 0.11296659708023071, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 338610 + }, + { + "epoch": 1.3090102209645744, + "grad_norm": 0.09852434694766998, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 338620 + }, + { + "epoch": 1.309048878167958, + "grad_norm": 0.0959552451968193, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 338630 + }, + { + "epoch": 1.3090875353713411, + "grad_norm": 0.10912944376468658, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 338640 + }, + { + "epoch": 1.3091261925747244, + "grad_norm": 0.10973319411277771, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 338650 + }, + { + "epoch": 1.3091648497781077, + "grad_norm": 0.21840856969356537, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 338660 + }, + { + "epoch": 1.309203506981491, + "grad_norm": 0.09725868701934814, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 338670 + }, + { + "epoch": 1.3092421641848742, + "grad_norm": 0.09659942239522934, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 338680 + }, + { + "epoch": 1.3092808213882574, + "grad_norm": 0.1109233871102333, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 338690 + }, + { + "epoch": 1.3093194785916409, + "grad_norm": 0.11313378065824509, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 338700 + }, + { + "epoch": 1.3093581357950241, + "grad_norm": 0.11242232471704483, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 338710 + }, + { + "epoch": 1.3093967929984074, + "grad_norm": 0.08756279200315475, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 338720 + }, + { + "epoch": 1.3094354502017906, + "grad_norm": 0.10244888067245483, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 338730 + }, + { + "epoch": 1.309474107405174, + "grad_norm": 0.10156544297933578, + "learning_rate": 0.002, + "loss": 2.344, + "step": 338740 + }, + { + "epoch": 1.3095127646085571, + "grad_norm": 0.10466597974300385, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 338750 + }, + { + "epoch": 1.3095514218119404, + "grad_norm": 0.10462167859077454, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 338760 + }, + { + "epoch": 1.3095900790153236, + "grad_norm": 0.1474732607603073, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 338770 + }, + { + "epoch": 1.309628736218707, + "grad_norm": 0.10373160243034363, + "learning_rate": 0.002, + "loss": 2.338, + "step": 338780 + }, + { + "epoch": 1.3096673934220902, + "grad_norm": 0.08936724811792374, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 338790 + }, + { + "epoch": 1.3097060506254736, + "grad_norm": 0.09592811018228531, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 338800 + }, + { + "epoch": 1.3097447078288569, + "grad_norm": 0.1223786324262619, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 338810 + }, + { + "epoch": 1.3097833650322401, + "grad_norm": 0.10772044211626053, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 338820 + }, + { + "epoch": 1.3098220222356234, + "grad_norm": 0.111927330493927, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 338830 + }, + { + "epoch": 1.3098606794390066, + "grad_norm": 0.10590586811304092, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 338840 + }, + { + "epoch": 1.3098993366423899, + "grad_norm": 0.11565124988555908, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 338850 + }, + { + "epoch": 1.3099379938457731, + "grad_norm": 0.09856195747852325, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 338860 + }, + { + "epoch": 1.3099766510491566, + "grad_norm": 0.10407767444849014, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 338870 + }, + { + "epoch": 1.3100153082525399, + "grad_norm": 0.11623556166887283, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 338880 + }, + { + "epoch": 1.3100539654559231, + "grad_norm": 0.12146317958831787, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 338890 + }, + { + "epoch": 1.3100926226593064, + "grad_norm": 0.09300675243139267, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 338900 + }, + { + "epoch": 1.3101312798626896, + "grad_norm": 0.0998629629611969, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 338910 + }, + { + "epoch": 1.3101699370660729, + "grad_norm": 0.09744694083929062, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 338920 + }, + { + "epoch": 1.3102085942694561, + "grad_norm": 0.09870826452970505, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 338930 + }, + { + "epoch": 1.3102472514728394, + "grad_norm": 0.11773059517145157, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 338940 + }, + { + "epoch": 1.3102859086762226, + "grad_norm": 0.10237418860197067, + "learning_rate": 0.002, + "loss": 2.308, + "step": 338950 + }, + { + "epoch": 1.3103245658796059, + "grad_norm": 0.105544313788414, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 338960 + }, + { + "epoch": 1.3103632230829894, + "grad_norm": 0.10724999010562897, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 338970 + }, + { + "epoch": 1.3104018802863726, + "grad_norm": 0.09732942283153534, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 338980 + }, + { + "epoch": 1.3104405374897559, + "grad_norm": 0.10429544001817703, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 338990 + }, + { + "epoch": 1.3104791946931391, + "grad_norm": 0.10754740983247757, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 339000 + }, + { + "epoch": 1.3105178518965224, + "grad_norm": 0.09705036878585815, + "learning_rate": 0.002, + "loss": 2.318, + "step": 339010 + }, + { + "epoch": 1.3105565090999056, + "grad_norm": 0.13875411450862885, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 339020 + }, + { + "epoch": 1.3105951663032889, + "grad_norm": 0.10345365107059479, + "learning_rate": 0.002, + "loss": 2.3126, + "step": 339030 + }, + { + "epoch": 1.3106338235066723, + "grad_norm": 0.11298981308937073, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 339040 + }, + { + "epoch": 1.3106724807100556, + "grad_norm": 0.09629258513450623, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 339050 + }, + { + "epoch": 1.3107111379134389, + "grad_norm": 0.1021524965763092, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 339060 + }, + { + "epoch": 1.310749795116822, + "grad_norm": 0.10271374136209488, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 339070 + }, + { + "epoch": 1.3107884523202054, + "grad_norm": 0.12685607373714447, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 339080 + }, + { + "epoch": 1.3108271095235886, + "grad_norm": 0.1172247901558876, + "learning_rate": 0.002, + "loss": 2.337, + "step": 339090 + }, + { + "epoch": 1.3108657667269719, + "grad_norm": 0.09642387181520462, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 339100 + }, + { + "epoch": 1.3109044239303551, + "grad_norm": 0.10917683690786362, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 339110 + }, + { + "epoch": 1.3109430811337384, + "grad_norm": 0.10852925479412079, + "learning_rate": 0.002, + "loss": 2.321, + "step": 339120 + }, + { + "epoch": 1.3109817383371216, + "grad_norm": 0.1264217495918274, + "learning_rate": 0.002, + "loss": 2.323, + "step": 339130 + }, + { + "epoch": 1.311020395540505, + "grad_norm": 0.09436733275651932, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 339140 + }, + { + "epoch": 1.3110590527438883, + "grad_norm": 0.1040552482008934, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 339150 + }, + { + "epoch": 1.3110977099472716, + "grad_norm": 0.10122077912092209, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 339160 + }, + { + "epoch": 1.3111363671506548, + "grad_norm": 0.08897639065980911, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 339170 + }, + { + "epoch": 1.311175024354038, + "grad_norm": 0.1053071916103363, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 339180 + }, + { + "epoch": 1.3112136815574214, + "grad_norm": 0.11119119822978973, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 339190 + }, + { + "epoch": 1.3112523387608048, + "grad_norm": 0.101591557264328, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 339200 + }, + { + "epoch": 1.311290995964188, + "grad_norm": 0.1332840770483017, + "learning_rate": 0.002, + "loss": 2.336, + "step": 339210 + }, + { + "epoch": 1.3113296531675713, + "grad_norm": 0.1152091696858406, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 339220 + }, + { + "epoch": 1.3113683103709546, + "grad_norm": 0.12425046414136887, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 339230 + }, + { + "epoch": 1.3114069675743378, + "grad_norm": 0.09523022174835205, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 339240 + }, + { + "epoch": 1.311445624777721, + "grad_norm": 0.09999441355466843, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 339250 + }, + { + "epoch": 1.3114842819811043, + "grad_norm": 0.10136357694864273, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 339260 + }, + { + "epoch": 1.3115229391844876, + "grad_norm": 0.22549983859062195, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 339270 + }, + { + "epoch": 1.3115615963878708, + "grad_norm": 0.10386420786380768, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 339280 + }, + { + "epoch": 1.311600253591254, + "grad_norm": 0.09532750397920609, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 339290 + }, + { + "epoch": 1.3116389107946373, + "grad_norm": 0.11204537749290466, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 339300 + }, + { + "epoch": 1.3116775679980208, + "grad_norm": 0.10338576138019562, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 339310 + }, + { + "epoch": 1.311716225201404, + "grad_norm": 0.11117055267095566, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 339320 + }, + { + "epoch": 1.3117548824047873, + "grad_norm": 0.09967552870512009, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 339330 + }, + { + "epoch": 1.3117935396081706, + "grad_norm": 0.14006535708904266, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 339340 + }, + { + "epoch": 1.3118321968115538, + "grad_norm": 0.09871932864189148, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 339350 + }, + { + "epoch": 1.311870854014937, + "grad_norm": 0.09871833026409149, + "learning_rate": 0.002, + "loss": 2.3124, + "step": 339360 + }, + { + "epoch": 1.3119095112183206, + "grad_norm": 0.09466978162527084, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 339370 + }, + { + "epoch": 1.3119481684217038, + "grad_norm": 0.10804964601993561, + "learning_rate": 0.002, + "loss": 2.3115, + "step": 339380 + }, + { + "epoch": 1.311986825625087, + "grad_norm": 0.09714234620332718, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 339390 + }, + { + "epoch": 1.3120254828284703, + "grad_norm": 0.12230552732944489, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 339400 + }, + { + "epoch": 1.3120641400318536, + "grad_norm": 0.12029729783535004, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 339410 + }, + { + "epoch": 1.3121027972352368, + "grad_norm": 0.08929474651813507, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 339420 + }, + { + "epoch": 1.31214145443862, + "grad_norm": 0.11715824156999588, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 339430 + }, + { + "epoch": 1.3121801116420033, + "grad_norm": 0.10215269029140472, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 339440 + }, + { + "epoch": 1.3122187688453866, + "grad_norm": 0.10783392935991287, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 339450 + }, + { + "epoch": 1.3122574260487698, + "grad_norm": 0.0915178582072258, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 339460 + }, + { + "epoch": 1.312296083252153, + "grad_norm": 0.09194423258304596, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 339470 + }, + { + "epoch": 1.3123347404555366, + "grad_norm": 0.10409944504499435, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 339480 + }, + { + "epoch": 1.3123733976589198, + "grad_norm": 0.10948554426431656, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 339490 + }, + { + "epoch": 1.312412054862303, + "grad_norm": 0.10612896084785461, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 339500 + }, + { + "epoch": 1.3124507120656863, + "grad_norm": 0.09984373301267624, + "learning_rate": 0.002, + "loss": 2.346, + "step": 339510 + }, + { + "epoch": 1.3124893692690696, + "grad_norm": 0.12539131939411163, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 339520 + }, + { + "epoch": 1.3125280264724528, + "grad_norm": 0.12929649651050568, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 339530 + }, + { + "epoch": 1.3125666836758363, + "grad_norm": 0.11457858979701996, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 339540 + }, + { + "epoch": 1.3126053408792195, + "grad_norm": 0.11001051962375641, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 339550 + }, + { + "epoch": 1.3126439980826028, + "grad_norm": 0.09543713927268982, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 339560 + }, + { + "epoch": 1.312682655285986, + "grad_norm": 0.10295376926660538, + "learning_rate": 0.002, + "loss": 2.331, + "step": 339570 + }, + { + "epoch": 1.3127213124893693, + "grad_norm": 0.12589390575885773, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 339580 + }, + { + "epoch": 1.3127599696927525, + "grad_norm": 0.10750047862529755, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 339590 + }, + { + "epoch": 1.3127986268961358, + "grad_norm": 0.10956700891256332, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 339600 + }, + { + "epoch": 1.312837284099519, + "grad_norm": 0.09963594377040863, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 339610 + }, + { + "epoch": 1.3128759413029023, + "grad_norm": 0.11178959906101227, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 339620 + }, + { + "epoch": 1.3129145985062856, + "grad_norm": 0.09853495657444, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 339630 + }, + { + "epoch": 1.3129532557096688, + "grad_norm": 0.1059534028172493, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 339640 + }, + { + "epoch": 1.3129919129130523, + "grad_norm": 0.12822778522968292, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 339650 + }, + { + "epoch": 1.3130305701164355, + "grad_norm": 0.12121497094631195, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 339660 + }, + { + "epoch": 1.3130692273198188, + "grad_norm": 0.10710785537958145, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 339670 + }, + { + "epoch": 1.313107884523202, + "grad_norm": 0.0960579365491867, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 339680 + }, + { + "epoch": 1.3131465417265853, + "grad_norm": 0.11348490417003632, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 339690 + }, + { + "epoch": 1.3131851989299685, + "grad_norm": 0.1104402244091034, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 339700 + }, + { + "epoch": 1.313223856133352, + "grad_norm": 0.10747892409563065, + "learning_rate": 0.002, + "loss": 2.337, + "step": 339710 + }, + { + "epoch": 1.3132625133367353, + "grad_norm": 0.11154567450284958, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 339720 + }, + { + "epoch": 1.3133011705401185, + "grad_norm": 0.10602911561727524, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 339730 + }, + { + "epoch": 1.3133398277435018, + "grad_norm": 0.11539280414581299, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 339740 + }, + { + "epoch": 1.313378484946885, + "grad_norm": 0.11169593036174774, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 339750 + }, + { + "epoch": 1.3134171421502683, + "grad_norm": 0.11478446424007416, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 339760 + }, + { + "epoch": 1.3134557993536515, + "grad_norm": 0.10598297417163849, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 339770 + }, + { + "epoch": 1.3134944565570348, + "grad_norm": 0.10229714959859848, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 339780 + }, + { + "epoch": 1.313533113760418, + "grad_norm": 0.12123233079910278, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 339790 + }, + { + "epoch": 1.3135717709638013, + "grad_norm": 0.10334493219852448, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 339800 + }, + { + "epoch": 1.3136104281671845, + "grad_norm": 0.10026278346776962, + "learning_rate": 0.002, + "loss": 2.322, + "step": 339810 + }, + { + "epoch": 1.313649085370568, + "grad_norm": 0.11529593914747238, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 339820 + }, + { + "epoch": 1.3136877425739513, + "grad_norm": 0.11085256189107895, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 339830 + }, + { + "epoch": 1.3137263997773345, + "grad_norm": 0.0989978238940239, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 339840 + }, + { + "epoch": 1.3137650569807178, + "grad_norm": 0.09534777700901031, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 339850 + }, + { + "epoch": 1.313803714184101, + "grad_norm": 0.13268207013607025, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 339860 + }, + { + "epoch": 1.3138423713874843, + "grad_norm": 0.09691354632377625, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 339870 + }, + { + "epoch": 1.3138810285908677, + "grad_norm": 0.11440901458263397, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 339880 + }, + { + "epoch": 1.313919685794251, + "grad_norm": 0.09623962640762329, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 339890 + }, + { + "epoch": 1.3139583429976343, + "grad_norm": 0.09594407677650452, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 339900 + }, + { + "epoch": 1.3139970002010175, + "grad_norm": 0.12013139575719833, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 339910 + }, + { + "epoch": 1.3140356574044008, + "grad_norm": 0.09702719002962112, + "learning_rate": 0.002, + "loss": 2.327, + "step": 339920 + }, + { + "epoch": 1.314074314607784, + "grad_norm": 0.12030521035194397, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 339930 + }, + { + "epoch": 1.3141129718111673, + "grad_norm": 0.10773120820522308, + "learning_rate": 0.002, + "loss": 2.344, + "step": 339940 + }, + { + "epoch": 1.3141516290145505, + "grad_norm": 0.09532781690359116, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 339950 + }, + { + "epoch": 1.3141902862179338, + "grad_norm": 0.09022223204374313, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 339960 + }, + { + "epoch": 1.314228943421317, + "grad_norm": 0.12461259961128235, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 339970 + }, + { + "epoch": 1.3142676006247003, + "grad_norm": 0.1056964248418808, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 339980 + }, + { + "epoch": 1.3143062578280837, + "grad_norm": 0.12232209742069244, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 339990 + }, + { + "epoch": 1.314344915031467, + "grad_norm": 0.12208342552185059, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 340000 + }, + { + "epoch": 1.3143835722348503, + "grad_norm": 0.10568199306726456, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 340010 + }, + { + "epoch": 1.3144222294382335, + "grad_norm": 0.09507480263710022, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 340020 + }, + { + "epoch": 1.3144608866416168, + "grad_norm": 0.10756602883338928, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 340030 + }, + { + "epoch": 1.314499543845, + "grad_norm": 0.08802318572998047, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 340040 + }, + { + "epoch": 1.3145382010483835, + "grad_norm": 0.13032051920890808, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 340050 + }, + { + "epoch": 1.3145768582517667, + "grad_norm": 0.09277044236660004, + "learning_rate": 0.002, + "loss": 2.342, + "step": 340060 + }, + { + "epoch": 1.31461551545515, + "grad_norm": 0.1054871678352356, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 340070 + }, + { + "epoch": 1.3146541726585332, + "grad_norm": 0.11483970284461975, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 340080 + }, + { + "epoch": 1.3146928298619165, + "grad_norm": 0.09837651997804642, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 340090 + }, + { + "epoch": 1.3147314870652997, + "grad_norm": 0.1070932149887085, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 340100 + }, + { + "epoch": 1.314770144268683, + "grad_norm": 0.09720432758331299, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 340110 + }, + { + "epoch": 1.3148088014720662, + "grad_norm": 0.11132647842168808, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 340120 + }, + { + "epoch": 1.3148474586754495, + "grad_norm": 0.10063239187002182, + "learning_rate": 0.002, + "loss": 2.325, + "step": 340130 + }, + { + "epoch": 1.3148861158788328, + "grad_norm": 0.10028837621212006, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 340140 + }, + { + "epoch": 1.314924773082216, + "grad_norm": 0.10856592655181885, + "learning_rate": 0.002, + "loss": 2.337, + "step": 340150 + }, + { + "epoch": 1.3149634302855995, + "grad_norm": 0.09780581295490265, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 340160 + }, + { + "epoch": 1.3150020874889827, + "grad_norm": 0.14413388073444366, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 340170 + }, + { + "epoch": 1.315040744692366, + "grad_norm": 0.11259722709655762, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 340180 + }, + { + "epoch": 1.3150794018957492, + "grad_norm": 0.09913891553878784, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 340190 + }, + { + "epoch": 1.3151180590991325, + "grad_norm": 0.10148324817419052, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 340200 + }, + { + "epoch": 1.3151567163025157, + "grad_norm": 0.13096053898334503, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 340210 + }, + { + "epoch": 1.3151953735058992, + "grad_norm": 0.10373006761074066, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 340220 + }, + { + "epoch": 1.3152340307092825, + "grad_norm": 0.12267054617404938, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 340230 + }, + { + "epoch": 1.3152726879126657, + "grad_norm": 0.0968276634812355, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 340240 + }, + { + "epoch": 1.315311345116049, + "grad_norm": 0.1325179785490036, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 340250 + }, + { + "epoch": 1.3153500023194322, + "grad_norm": 0.12446484714746475, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 340260 + }, + { + "epoch": 1.3153886595228155, + "grad_norm": 0.11135133355855942, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 340270 + }, + { + "epoch": 1.3154273167261987, + "grad_norm": 0.1034776121377945, + "learning_rate": 0.002, + "loss": 2.335, + "step": 340280 + }, + { + "epoch": 1.315465973929582, + "grad_norm": 0.11215350031852722, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 340290 + }, + { + "epoch": 1.3155046311329652, + "grad_norm": 0.09734928607940674, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 340300 + }, + { + "epoch": 1.3155432883363485, + "grad_norm": 0.09327472746372223, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 340310 + }, + { + "epoch": 1.315581945539732, + "grad_norm": 0.12669144570827484, + "learning_rate": 0.002, + "loss": 2.325, + "step": 340320 + }, + { + "epoch": 1.3156206027431152, + "grad_norm": 0.10268600285053253, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 340330 + }, + { + "epoch": 1.3156592599464985, + "grad_norm": 0.09908057749271393, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 340340 + }, + { + "epoch": 1.3156979171498817, + "grad_norm": 0.10985678434371948, + "learning_rate": 0.002, + "loss": 2.342, + "step": 340350 + }, + { + "epoch": 1.315736574353265, + "grad_norm": 0.09564399719238281, + "learning_rate": 0.002, + "loss": 2.337, + "step": 340360 + }, + { + "epoch": 1.3157752315566482, + "grad_norm": 0.1189763993024826, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 340370 + }, + { + "epoch": 1.3158138887600315, + "grad_norm": 0.10353276133537292, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 340380 + }, + { + "epoch": 1.315852545963415, + "grad_norm": 0.09505298733711243, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 340390 + }, + { + "epoch": 1.3158912031667982, + "grad_norm": 0.10912010073661804, + "learning_rate": 0.002, + "loss": 2.34, + "step": 340400 + }, + { + "epoch": 1.3159298603701814, + "grad_norm": 0.11756809055805206, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 340410 + }, + { + "epoch": 1.3159685175735647, + "grad_norm": 0.09390808641910553, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 340420 + }, + { + "epoch": 1.316007174776948, + "grad_norm": 0.108702652156353, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 340430 + }, + { + "epoch": 1.3160458319803312, + "grad_norm": 0.09625627100467682, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 340440 + }, + { + "epoch": 1.3160844891837145, + "grad_norm": 0.09767688810825348, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 340450 + }, + { + "epoch": 1.3161231463870977, + "grad_norm": 0.11160442978143692, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 340460 + }, + { + "epoch": 1.316161803590481, + "grad_norm": 0.08791060745716095, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 340470 + }, + { + "epoch": 1.3162004607938642, + "grad_norm": 0.12832403182983398, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 340480 + }, + { + "epoch": 1.3162391179972477, + "grad_norm": 0.0986124649643898, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 340490 + }, + { + "epoch": 1.316277775200631, + "grad_norm": 0.12974348664283752, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 340500 + }, + { + "epoch": 1.3163164324040142, + "grad_norm": 0.11423316597938538, + "learning_rate": 0.002, + "loss": 2.335, + "step": 340510 + }, + { + "epoch": 1.3163550896073974, + "grad_norm": 0.1260526031255722, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 340520 + }, + { + "epoch": 1.3163937468107807, + "grad_norm": 0.10613839328289032, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 340530 + }, + { + "epoch": 1.316432404014164, + "grad_norm": 0.10647633671760559, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 340540 + }, + { + "epoch": 1.3164710612175472, + "grad_norm": 0.09708483517169952, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 340550 + }, + { + "epoch": 1.3165097184209307, + "grad_norm": 0.10543903708457947, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 340560 + }, + { + "epoch": 1.316548375624314, + "grad_norm": 0.12418416142463684, + "learning_rate": 0.002, + "loss": 2.348, + "step": 340570 + }, + { + "epoch": 1.3165870328276972, + "grad_norm": 0.09926290810108185, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 340580 + }, + { + "epoch": 1.3166256900310804, + "grad_norm": 0.11084403097629547, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 340590 + }, + { + "epoch": 1.3166643472344637, + "grad_norm": 0.10365424305200577, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 340600 + }, + { + "epoch": 1.316703004437847, + "grad_norm": 0.11727438867092133, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 340610 + }, + { + "epoch": 1.3167416616412302, + "grad_norm": 0.10081959515810013, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 340620 + }, + { + "epoch": 1.3167803188446134, + "grad_norm": 0.09295682609081268, + "learning_rate": 0.002, + "loss": 2.34, + "step": 340630 + }, + { + "epoch": 1.3168189760479967, + "grad_norm": 0.11949366331100464, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 340640 + }, + { + "epoch": 1.31685763325138, + "grad_norm": 0.09315700083971024, + "learning_rate": 0.002, + "loss": 2.33, + "step": 340650 + }, + { + "epoch": 1.3168962904547634, + "grad_norm": 0.09729575365781784, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 340660 + }, + { + "epoch": 1.3169349476581467, + "grad_norm": 0.09525522589683533, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 340670 + }, + { + "epoch": 1.31697360486153, + "grad_norm": 0.12238912284374237, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 340680 + }, + { + "epoch": 1.3170122620649132, + "grad_norm": 0.09936051815748215, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 340690 + }, + { + "epoch": 1.3170509192682964, + "grad_norm": 0.10798247903585434, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 340700 + }, + { + "epoch": 1.3170895764716797, + "grad_norm": 0.12307767570018768, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 340710 + }, + { + "epoch": 1.317128233675063, + "grad_norm": 0.11636736243963242, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 340720 + }, + { + "epoch": 1.3171668908784464, + "grad_norm": 0.12146612256765366, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 340730 + }, + { + "epoch": 1.3172055480818297, + "grad_norm": 0.09943657368421555, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 340740 + }, + { + "epoch": 1.317244205285213, + "grad_norm": 0.10115927457809448, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 340750 + }, + { + "epoch": 1.3172828624885962, + "grad_norm": 0.09202753007411957, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 340760 + }, + { + "epoch": 1.3173215196919794, + "grad_norm": 0.114414744079113, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 340770 + }, + { + "epoch": 1.3173601768953627, + "grad_norm": 0.09059332311153412, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 340780 + }, + { + "epoch": 1.317398834098746, + "grad_norm": 0.11916932463645935, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 340790 + }, + { + "epoch": 1.3174374913021292, + "grad_norm": 0.13115397095680237, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 340800 + }, + { + "epoch": 1.3174761485055124, + "grad_norm": 0.1227148175239563, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 340810 + }, + { + "epoch": 1.3175148057088957, + "grad_norm": 0.10211624950170517, + "learning_rate": 0.002, + "loss": 2.348, + "step": 340820 + }, + { + "epoch": 1.3175534629122791, + "grad_norm": 0.1126120314002037, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 340830 + }, + { + "epoch": 1.3175921201156624, + "grad_norm": 0.10091283917427063, + "learning_rate": 0.002, + "loss": 2.3166, + "step": 340840 + }, + { + "epoch": 1.3176307773190457, + "grad_norm": 0.10280703008174896, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 340850 + }, + { + "epoch": 1.317669434522429, + "grad_norm": 0.09712085127830505, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 340860 + }, + { + "epoch": 1.3177080917258122, + "grad_norm": 0.11485341191291809, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 340870 + }, + { + "epoch": 1.3177467489291954, + "grad_norm": 0.09300262480974197, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 340880 + }, + { + "epoch": 1.3177854061325787, + "grad_norm": 0.09692247211933136, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 340890 + }, + { + "epoch": 1.3178240633359621, + "grad_norm": 0.10225555300712585, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 340900 + }, + { + "epoch": 1.3178627205393454, + "grad_norm": 0.09950529783964157, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 340910 + }, + { + "epoch": 1.3179013777427286, + "grad_norm": 0.10115771740674973, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 340920 + }, + { + "epoch": 1.317940034946112, + "grad_norm": 0.11324211210012436, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 340930 + }, + { + "epoch": 1.3179786921494951, + "grad_norm": 0.12060297280550003, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 340940 + }, + { + "epoch": 1.3180173493528784, + "grad_norm": 0.10018421709537506, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 340950 + }, + { + "epoch": 1.3180560065562617, + "grad_norm": 0.10009391605854034, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 340960 + }, + { + "epoch": 1.318094663759645, + "grad_norm": 0.10127507150173187, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 340970 + }, + { + "epoch": 1.3181333209630282, + "grad_norm": 0.09859684854745865, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 340980 + }, + { + "epoch": 1.3181719781664114, + "grad_norm": 0.12744922935962677, + "learning_rate": 0.002, + "loss": 2.337, + "step": 340990 + }, + { + "epoch": 1.3182106353697949, + "grad_norm": 0.11380188912153244, + "learning_rate": 0.002, + "loss": 2.329, + "step": 341000 + }, + { + "epoch": 1.3182492925731781, + "grad_norm": 0.11590145528316498, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 341010 + }, + { + "epoch": 1.3182879497765614, + "grad_norm": 0.10371170192956924, + "learning_rate": 0.002, + "loss": 2.33, + "step": 341020 + }, + { + "epoch": 1.3183266069799446, + "grad_norm": 0.13881763815879822, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 341030 + }, + { + "epoch": 1.318365264183328, + "grad_norm": 0.09833131730556488, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 341040 + }, + { + "epoch": 1.3184039213867111, + "grad_norm": 0.10128436982631683, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 341050 + }, + { + "epoch": 1.3184425785900946, + "grad_norm": 0.10208149254322052, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 341060 + }, + { + "epoch": 1.3184812357934779, + "grad_norm": 0.10042634606361389, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 341070 + }, + { + "epoch": 1.3185198929968611, + "grad_norm": 0.10970644652843475, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 341080 + }, + { + "epoch": 1.3185585502002444, + "grad_norm": 0.08735210448503494, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 341090 + }, + { + "epoch": 1.3185972074036276, + "grad_norm": 0.0989793911576271, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 341100 + }, + { + "epoch": 1.3186358646070109, + "grad_norm": 0.10791467875242233, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 341110 + }, + { + "epoch": 1.3186745218103941, + "grad_norm": 0.11252657324075699, + "learning_rate": 0.002, + "loss": 2.336, + "step": 341120 + }, + { + "epoch": 1.3187131790137774, + "grad_norm": 0.10048529505729675, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 341130 + }, + { + "epoch": 1.3187518362171606, + "grad_norm": 0.11139585077762604, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 341140 + }, + { + "epoch": 1.3187904934205439, + "grad_norm": 0.09323547780513763, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 341150 + }, + { + "epoch": 1.3188291506239271, + "grad_norm": 0.10829438269138336, + "learning_rate": 0.002, + "loss": 2.343, + "step": 341160 + }, + { + "epoch": 1.3188678078273106, + "grad_norm": 0.09060853719711304, + "learning_rate": 0.002, + "loss": 2.328, + "step": 341170 + }, + { + "epoch": 1.3189064650306939, + "grad_norm": 0.10160361230373383, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 341180 + }, + { + "epoch": 1.3189451222340771, + "grad_norm": 0.09072021394968033, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 341190 + }, + { + "epoch": 1.3189837794374604, + "grad_norm": 0.09953247755765915, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 341200 + }, + { + "epoch": 1.3190224366408436, + "grad_norm": 0.09971699118614197, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 341210 + }, + { + "epoch": 1.3190610938442269, + "grad_norm": 0.10871617496013641, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 341220 + }, + { + "epoch": 1.3190997510476103, + "grad_norm": 0.12742142379283905, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 341230 + }, + { + "epoch": 1.3191384082509936, + "grad_norm": 0.09261079877614975, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 341240 + }, + { + "epoch": 1.3191770654543769, + "grad_norm": 0.10547822713851929, + "learning_rate": 0.002, + "loss": 2.319, + "step": 341250 + }, + { + "epoch": 1.31921572265776, + "grad_norm": 0.11054132878780365, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 341260 + }, + { + "epoch": 1.3192543798611434, + "grad_norm": 0.10952828079462051, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 341270 + }, + { + "epoch": 1.3192930370645266, + "grad_norm": 0.09914680570363998, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 341280 + }, + { + "epoch": 1.3193316942679099, + "grad_norm": 0.10533926635980606, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 341290 + }, + { + "epoch": 1.3193703514712931, + "grad_norm": 0.10060252249240875, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 341300 + }, + { + "epoch": 1.3194090086746764, + "grad_norm": 0.09601141512393951, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 341310 + }, + { + "epoch": 1.3194476658780596, + "grad_norm": 0.09400826692581177, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 341320 + }, + { + "epoch": 1.3194863230814429, + "grad_norm": 0.13698619604110718, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 341330 + }, + { + "epoch": 1.3195249802848263, + "grad_norm": 0.10330939292907715, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 341340 + }, + { + "epoch": 1.3195636374882096, + "grad_norm": 0.12004543840885162, + "learning_rate": 0.002, + "loss": 2.335, + "step": 341350 + }, + { + "epoch": 1.3196022946915928, + "grad_norm": 0.09568639099597931, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 341360 + }, + { + "epoch": 1.319640951894976, + "grad_norm": 0.10221485048532486, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 341370 + }, + { + "epoch": 1.3196796090983594, + "grad_norm": 0.12222684174776077, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 341380 + }, + { + "epoch": 1.3197182663017426, + "grad_norm": 0.1150476261973381, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 341390 + }, + { + "epoch": 1.319756923505126, + "grad_norm": 0.09879761189222336, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 341400 + }, + { + "epoch": 1.3197955807085093, + "grad_norm": 0.1274539828300476, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 341410 + }, + { + "epoch": 1.3198342379118926, + "grad_norm": 0.11528249830007553, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 341420 + }, + { + "epoch": 1.3198728951152758, + "grad_norm": 0.10103266686201096, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 341430 + }, + { + "epoch": 1.319911552318659, + "grad_norm": 0.12781168520450592, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 341440 + }, + { + "epoch": 1.3199502095220423, + "grad_norm": 0.10776611417531967, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 341450 + }, + { + "epoch": 1.3199888667254256, + "grad_norm": 0.10403729975223541, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 341460 + }, + { + "epoch": 1.3200275239288088, + "grad_norm": 0.10193544626235962, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 341470 + }, + { + "epoch": 1.320066181132192, + "grad_norm": 0.10320829600095749, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 341480 + }, + { + "epoch": 1.3201048383355753, + "grad_norm": 0.1006522998213768, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 341490 + }, + { + "epoch": 1.3201434955389586, + "grad_norm": 0.1228528618812561, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 341500 + }, + { + "epoch": 1.320182152742342, + "grad_norm": 0.09381001442670822, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 341510 + }, + { + "epoch": 1.3202208099457253, + "grad_norm": 0.0969909206032753, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 341520 + }, + { + "epoch": 1.3202594671491086, + "grad_norm": 0.10850610584020615, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 341530 + }, + { + "epoch": 1.3202981243524918, + "grad_norm": 0.10401296615600586, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 341540 + }, + { + "epoch": 1.320336781555875, + "grad_norm": 0.10126899927854538, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 341550 + }, + { + "epoch": 1.3203754387592583, + "grad_norm": 0.11133930832147598, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 341560 + }, + { + "epoch": 1.3204140959626418, + "grad_norm": 0.0961000993847847, + "learning_rate": 0.002, + "loss": 2.336, + "step": 341570 + }, + { + "epoch": 1.320452753166025, + "grad_norm": 0.10838087648153305, + "learning_rate": 0.002, + "loss": 2.34, + "step": 341580 + }, + { + "epoch": 1.3204914103694083, + "grad_norm": 0.12038574367761612, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 341590 + }, + { + "epoch": 1.3205300675727916, + "grad_norm": 0.10663647949695587, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 341600 + }, + { + "epoch": 1.3205687247761748, + "grad_norm": 0.10277918726205826, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 341610 + }, + { + "epoch": 1.320607381979558, + "grad_norm": 0.12107924371957779, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 341620 + }, + { + "epoch": 1.3206460391829413, + "grad_norm": 0.08732627332210541, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 341630 + }, + { + "epoch": 1.3206846963863246, + "grad_norm": 0.10941670089960098, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 341640 + }, + { + "epoch": 1.3207233535897078, + "grad_norm": 0.09457511454820633, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 341650 + }, + { + "epoch": 1.320762010793091, + "grad_norm": 0.09568420797586441, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 341660 + }, + { + "epoch": 1.3208006679964743, + "grad_norm": 0.10222925990819931, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 341670 + }, + { + "epoch": 1.3208393251998578, + "grad_norm": 0.10123637318611145, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 341680 + }, + { + "epoch": 1.320877982403241, + "grad_norm": 0.1014849916100502, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 341690 + }, + { + "epoch": 1.3209166396066243, + "grad_norm": 0.10959955304861069, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 341700 + }, + { + "epoch": 1.3209552968100076, + "grad_norm": 0.10240094363689423, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 341710 + }, + { + "epoch": 1.3209939540133908, + "grad_norm": 0.10184129327535629, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 341720 + }, + { + "epoch": 1.321032611216774, + "grad_norm": 0.12254176288843155, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 341730 + }, + { + "epoch": 1.3210712684201575, + "grad_norm": 0.11405491828918457, + "learning_rate": 0.002, + "loss": 2.343, + "step": 341740 + }, + { + "epoch": 1.3211099256235408, + "grad_norm": 0.1007409393787384, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 341750 + }, + { + "epoch": 1.321148582826924, + "grad_norm": 0.11579888314008713, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 341760 + }, + { + "epoch": 1.3211872400303073, + "grad_norm": 0.09673046320676804, + "learning_rate": 0.002, + "loss": 2.331, + "step": 341770 + }, + { + "epoch": 1.3212258972336905, + "grad_norm": 0.10171656310558319, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 341780 + }, + { + "epoch": 1.3212645544370738, + "grad_norm": 0.11281340569257736, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 341790 + }, + { + "epoch": 1.321303211640457, + "grad_norm": 0.0931347906589508, + "learning_rate": 0.002, + "loss": 2.325, + "step": 341800 + }, + { + "epoch": 1.3213418688438403, + "grad_norm": 0.09975960850715637, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 341810 + }, + { + "epoch": 1.3213805260472236, + "grad_norm": 0.1423596441745758, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 341820 + }, + { + "epoch": 1.3214191832506068, + "grad_norm": 0.11021801829338074, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 341830 + }, + { + "epoch": 1.32145784045399, + "grad_norm": 0.10833445191383362, + "learning_rate": 0.002, + "loss": 2.322, + "step": 341840 + }, + { + "epoch": 1.3214964976573735, + "grad_norm": 0.10257164388895035, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 341850 + }, + { + "epoch": 1.3215351548607568, + "grad_norm": 0.1051739826798439, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 341860 + }, + { + "epoch": 1.32157381206414, + "grad_norm": 0.242488294839859, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 341870 + }, + { + "epoch": 1.3216124692675233, + "grad_norm": 0.10353244841098785, + "learning_rate": 0.002, + "loss": 2.332, + "step": 341880 + }, + { + "epoch": 1.3216511264709065, + "grad_norm": 0.11375704407691956, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 341890 + }, + { + "epoch": 1.3216897836742898, + "grad_norm": 0.09590550512075424, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 341900 + }, + { + "epoch": 1.3217284408776733, + "grad_norm": 0.0975589007139206, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 341910 + }, + { + "epoch": 1.3217670980810565, + "grad_norm": 0.11673767119646072, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 341920 + }, + { + "epoch": 1.3218057552844398, + "grad_norm": 0.10108847171068192, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 341930 + }, + { + "epoch": 1.321844412487823, + "grad_norm": 0.10159999877214432, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 341940 + }, + { + "epoch": 1.3218830696912063, + "grad_norm": 0.10191165655851364, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 341950 + }, + { + "epoch": 1.3219217268945895, + "grad_norm": 0.09570711106061935, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 341960 + }, + { + "epoch": 1.3219603840979728, + "grad_norm": 0.10358276218175888, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 341970 + }, + { + "epoch": 1.321999041301356, + "grad_norm": 0.10976678878068924, + "learning_rate": 0.002, + "loss": 2.329, + "step": 341980 + }, + { + "epoch": 1.3220376985047393, + "grad_norm": 0.1106809452176094, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 341990 + }, + { + "epoch": 1.3220763557081225, + "grad_norm": 0.12253010272979736, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 342000 + }, + { + "epoch": 1.3221150129115058, + "grad_norm": 0.09194432944059372, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 342010 + }, + { + "epoch": 1.3221536701148893, + "grad_norm": 0.10456662625074387, + "learning_rate": 0.002, + "loss": 2.328, + "step": 342020 + }, + { + "epoch": 1.3221923273182725, + "grad_norm": 0.12826426327228546, + "learning_rate": 0.002, + "loss": 2.322, + "step": 342030 + }, + { + "epoch": 1.3222309845216558, + "grad_norm": 0.11366898566484451, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 342040 + }, + { + "epoch": 1.322269641725039, + "grad_norm": 0.10935889929533005, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 342050 + }, + { + "epoch": 1.3223082989284223, + "grad_norm": 0.1325126737356186, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 342060 + }, + { + "epoch": 1.3223469561318055, + "grad_norm": 0.10648973286151886, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 342070 + }, + { + "epoch": 1.322385613335189, + "grad_norm": 0.10392796248197556, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 342080 + }, + { + "epoch": 1.3224242705385723, + "grad_norm": 0.09765364974737167, + "learning_rate": 0.002, + "loss": 2.3105, + "step": 342090 + }, + { + "epoch": 1.3224629277419555, + "grad_norm": 0.09096361696720123, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 342100 + }, + { + "epoch": 1.3225015849453388, + "grad_norm": 0.12809644639492035, + "learning_rate": 0.002, + "loss": 2.325, + "step": 342110 + }, + { + "epoch": 1.322540242148722, + "grad_norm": 0.09639798104763031, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 342120 + }, + { + "epoch": 1.3225788993521053, + "grad_norm": 0.10138837993144989, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 342130 + }, + { + "epoch": 1.3226175565554885, + "grad_norm": 0.1052599549293518, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 342140 + }, + { + "epoch": 1.3226562137588718, + "grad_norm": 0.116051085293293, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 342150 + }, + { + "epoch": 1.322694870962255, + "grad_norm": 0.11769842356443405, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 342160 + }, + { + "epoch": 1.3227335281656383, + "grad_norm": 0.1063615009188652, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 342170 + }, + { + "epoch": 1.3227721853690215, + "grad_norm": 0.10278849303722382, + "learning_rate": 0.002, + "loss": 2.3129, + "step": 342180 + }, + { + "epoch": 1.322810842572405, + "grad_norm": 0.0904436707496643, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 342190 + }, + { + "epoch": 1.3228494997757883, + "grad_norm": 0.10121815651655197, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 342200 + }, + { + "epoch": 1.3228881569791715, + "grad_norm": 0.08775201439857483, + "learning_rate": 0.002, + "loss": 2.33, + "step": 342210 + }, + { + "epoch": 1.3229268141825548, + "grad_norm": 0.10893165320158005, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 342220 + }, + { + "epoch": 1.322965471385938, + "grad_norm": 0.11072231829166412, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 342230 + }, + { + "epoch": 1.3230041285893213, + "grad_norm": 0.1255996972322464, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 342240 + }, + { + "epoch": 1.3230427857927047, + "grad_norm": 0.09480229765176773, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 342250 + }, + { + "epoch": 1.323081442996088, + "grad_norm": 0.10230188071727753, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 342260 + }, + { + "epoch": 1.3231201001994712, + "grad_norm": 0.11351063847541809, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 342270 + }, + { + "epoch": 1.3231587574028545, + "grad_norm": 0.10005253553390503, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 342280 + }, + { + "epoch": 1.3231974146062377, + "grad_norm": 0.1019071415066719, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 342290 + }, + { + "epoch": 1.323236071809621, + "grad_norm": 0.10154785215854645, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 342300 + }, + { + "epoch": 1.3232747290130042, + "grad_norm": 0.13221405446529388, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 342310 + }, + { + "epoch": 1.3233133862163875, + "grad_norm": 0.09709381312131882, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 342320 + }, + { + "epoch": 1.3233520434197708, + "grad_norm": 0.09548242390155792, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 342330 + }, + { + "epoch": 1.323390700623154, + "grad_norm": 0.10958176106214523, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 342340 + }, + { + "epoch": 1.3234293578265375, + "grad_norm": 0.1257767677307129, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 342350 + }, + { + "epoch": 1.3234680150299207, + "grad_norm": 0.10943162441253662, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 342360 + }, + { + "epoch": 1.323506672233304, + "grad_norm": 0.1096360981464386, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 342370 + }, + { + "epoch": 1.3235453294366872, + "grad_norm": 0.13019192218780518, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 342380 + }, + { + "epoch": 1.3235839866400705, + "grad_norm": 0.09725780040025711, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 342390 + }, + { + "epoch": 1.3236226438434537, + "grad_norm": 0.12376907467842102, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 342400 + }, + { + "epoch": 1.323661301046837, + "grad_norm": 0.11352759599685669, + "learning_rate": 0.002, + "loss": 2.336, + "step": 342410 + }, + { + "epoch": 1.3236999582502205, + "grad_norm": 0.1032281219959259, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 342420 + }, + { + "epoch": 1.3237386154536037, + "grad_norm": 0.09839282929897308, + "learning_rate": 0.002, + "loss": 2.344, + "step": 342430 + }, + { + "epoch": 1.323777272656987, + "grad_norm": 0.11424583196640015, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 342440 + }, + { + "epoch": 1.3238159298603702, + "grad_norm": 0.11875501275062561, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 342450 + }, + { + "epoch": 1.3238545870637535, + "grad_norm": 0.1253490298986435, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 342460 + }, + { + "epoch": 1.3238932442671367, + "grad_norm": 0.11857567727565765, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 342470 + }, + { + "epoch": 1.32393190147052, + "grad_norm": 0.09765864163637161, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 342480 + }, + { + "epoch": 1.3239705586739032, + "grad_norm": 0.09970621764659882, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 342490 + }, + { + "epoch": 1.3240092158772865, + "grad_norm": 0.10559707880020142, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 342500 + }, + { + "epoch": 1.3240478730806697, + "grad_norm": 0.10606026649475098, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 342510 + }, + { + "epoch": 1.3240865302840532, + "grad_norm": 0.0962778627872467, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 342520 + }, + { + "epoch": 1.3241251874874365, + "grad_norm": 0.12174908816814423, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 342530 + }, + { + "epoch": 1.3241638446908197, + "grad_norm": 0.09986695647239685, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 342540 + }, + { + "epoch": 1.324202501894203, + "grad_norm": 0.09701001644134521, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 342550 + }, + { + "epoch": 1.3242411590975862, + "grad_norm": 0.11607201397418976, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 342560 + }, + { + "epoch": 1.3242798163009695, + "grad_norm": 0.10730637609958649, + "learning_rate": 0.002, + "loss": 2.332, + "step": 342570 + }, + { + "epoch": 1.3243184735043527, + "grad_norm": 0.11313966661691666, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 342580 + }, + { + "epoch": 1.3243571307077362, + "grad_norm": 0.1085088849067688, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 342590 + }, + { + "epoch": 1.3243957879111194, + "grad_norm": 0.10107274353504181, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 342600 + }, + { + "epoch": 1.3244344451145027, + "grad_norm": 0.09417583793401718, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 342610 + }, + { + "epoch": 1.324473102317886, + "grad_norm": 0.12345251441001892, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 342620 + }, + { + "epoch": 1.3245117595212692, + "grad_norm": 0.11787278205156326, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 342630 + }, + { + "epoch": 1.3245504167246525, + "grad_norm": 0.0970306396484375, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 342640 + }, + { + "epoch": 1.3245890739280357, + "grad_norm": 0.09616800397634506, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 342650 + }, + { + "epoch": 1.324627731131419, + "grad_norm": 0.10771799832582474, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 342660 + }, + { + "epoch": 1.3246663883348022, + "grad_norm": 0.1141650378704071, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 342670 + }, + { + "epoch": 1.3247050455381855, + "grad_norm": 0.1058678925037384, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 342680 + }, + { + "epoch": 1.324743702741569, + "grad_norm": 0.09846770018339157, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 342690 + }, + { + "epoch": 1.3247823599449522, + "grad_norm": 0.10203972458839417, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 342700 + }, + { + "epoch": 1.3248210171483354, + "grad_norm": 0.12421320378780365, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 342710 + }, + { + "epoch": 1.3248596743517187, + "grad_norm": 0.09551411122083664, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 342720 + }, + { + "epoch": 1.324898331555102, + "grad_norm": 0.10334688425064087, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 342730 + }, + { + "epoch": 1.3249369887584852, + "grad_norm": 0.0986914187669754, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 342740 + }, + { + "epoch": 1.3249756459618685, + "grad_norm": 0.0937899798154831, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 342750 + }, + { + "epoch": 1.325014303165252, + "grad_norm": 0.10014919936656952, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 342760 + }, + { + "epoch": 1.3250529603686352, + "grad_norm": 0.10135438293218613, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 342770 + }, + { + "epoch": 1.3250916175720184, + "grad_norm": 0.1364315301179886, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 342780 + }, + { + "epoch": 1.3251302747754017, + "grad_norm": 0.11440681666135788, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 342790 + }, + { + "epoch": 1.325168931978785, + "grad_norm": 0.09707643836736679, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 342800 + }, + { + "epoch": 1.3252075891821682, + "grad_norm": 0.0961620956659317, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 342810 + }, + { + "epoch": 1.3252462463855514, + "grad_norm": 0.10802191495895386, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 342820 + }, + { + "epoch": 1.3252849035889347, + "grad_norm": 0.1003083810210228, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 342830 + }, + { + "epoch": 1.325323560792318, + "grad_norm": 0.10295507311820984, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 342840 + }, + { + "epoch": 1.3253622179957012, + "grad_norm": 0.11288342624902725, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 342850 + }, + { + "epoch": 1.3254008751990847, + "grad_norm": 0.09611604362726212, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 342860 + }, + { + "epoch": 1.325439532402468, + "grad_norm": 0.09539765119552612, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 342870 + }, + { + "epoch": 1.3254781896058512, + "grad_norm": 0.15975433588027954, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 342880 + }, + { + "epoch": 1.3255168468092344, + "grad_norm": 0.11723661422729492, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 342890 + }, + { + "epoch": 1.3255555040126177, + "grad_norm": 0.09128891676664352, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 342900 + }, + { + "epoch": 1.325594161216001, + "grad_norm": 0.09708354622125626, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 342910 + }, + { + "epoch": 1.3256328184193844, + "grad_norm": 0.1222986951470375, + "learning_rate": 0.002, + "loss": 2.335, + "step": 342920 + }, + { + "epoch": 1.3256714756227677, + "grad_norm": 0.1060582846403122, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 342930 + }, + { + "epoch": 1.325710132826151, + "grad_norm": 0.08390968292951584, + "learning_rate": 0.002, + "loss": 2.3601, + "step": 342940 + }, + { + "epoch": 1.3257487900295342, + "grad_norm": 0.09373115003108978, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 342950 + }, + { + "epoch": 1.3257874472329174, + "grad_norm": 0.11386244744062424, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 342960 + }, + { + "epoch": 1.3258261044363007, + "grad_norm": 0.0926695317029953, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 342970 + }, + { + "epoch": 1.325864761639684, + "grad_norm": 0.09657523781061172, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 342980 + }, + { + "epoch": 1.3259034188430672, + "grad_norm": 0.1014002114534378, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 342990 + }, + { + "epoch": 1.3259420760464504, + "grad_norm": 0.12703995406627655, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 343000 + }, + { + "epoch": 1.3259807332498337, + "grad_norm": 0.29733362793922424, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 343010 + }, + { + "epoch": 1.326019390453217, + "grad_norm": 0.11396072059869766, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 343020 + }, + { + "epoch": 1.3260580476566004, + "grad_norm": 0.1067725121974945, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 343030 + }, + { + "epoch": 1.3260967048599837, + "grad_norm": 0.09846443682909012, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 343040 + }, + { + "epoch": 1.326135362063367, + "grad_norm": 0.1201457604765892, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 343050 + }, + { + "epoch": 1.3261740192667502, + "grad_norm": 0.09486748278141022, + "learning_rate": 0.002, + "loss": 2.354, + "step": 343060 + }, + { + "epoch": 1.3262126764701334, + "grad_norm": 0.09636746346950531, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 343070 + }, + { + "epoch": 1.3262513336735167, + "grad_norm": 0.10028290003538132, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 343080 + }, + { + "epoch": 1.3262899908769001, + "grad_norm": 0.11330897361040115, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 343090 + }, + { + "epoch": 1.3263286480802834, + "grad_norm": 0.13019509613513947, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 343100 + }, + { + "epoch": 1.3263673052836666, + "grad_norm": 0.10553409159183502, + "learning_rate": 0.002, + "loss": 2.333, + "step": 343110 + }, + { + "epoch": 1.32640596248705, + "grad_norm": 0.09633781015872955, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 343120 + }, + { + "epoch": 1.3264446196904331, + "grad_norm": 0.08768466860055923, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 343130 + }, + { + "epoch": 1.3264832768938164, + "grad_norm": 0.10579250752925873, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 343140 + }, + { + "epoch": 1.3265219340971997, + "grad_norm": 0.09313761442899704, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 343150 + }, + { + "epoch": 1.326560591300583, + "grad_norm": 0.10495753586292267, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 343160 + }, + { + "epoch": 1.3265992485039662, + "grad_norm": 0.11235906183719635, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 343170 + }, + { + "epoch": 1.3266379057073494, + "grad_norm": 0.09539853781461716, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 343180 + }, + { + "epoch": 1.3266765629107327, + "grad_norm": 0.09865597635507584, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 343190 + }, + { + "epoch": 1.3267152201141161, + "grad_norm": 0.09811379760503769, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 343200 + }, + { + "epoch": 1.3267538773174994, + "grad_norm": 0.09318213909864426, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 343210 + }, + { + "epoch": 1.3267925345208826, + "grad_norm": 0.11621811240911484, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 343220 + }, + { + "epoch": 1.326831191724266, + "grad_norm": 0.11613493412733078, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 343230 + }, + { + "epoch": 1.3268698489276491, + "grad_norm": 0.12907305359840393, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 343240 + }, + { + "epoch": 1.3269085061310324, + "grad_norm": 0.1041342243552208, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 343250 + }, + { + "epoch": 1.3269471633344159, + "grad_norm": 0.09700960665941238, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 343260 + }, + { + "epoch": 1.3269858205377991, + "grad_norm": 0.11570774763822556, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 343270 + }, + { + "epoch": 1.3270244777411824, + "grad_norm": 0.08997366577386856, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 343280 + }, + { + "epoch": 1.3270631349445656, + "grad_norm": 0.11459000408649445, + "learning_rate": 0.002, + "loss": 2.347, + "step": 343290 + }, + { + "epoch": 1.3271017921479489, + "grad_norm": 0.09992536157369614, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 343300 + }, + { + "epoch": 1.3271404493513321, + "grad_norm": 0.10141663253307343, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 343310 + }, + { + "epoch": 1.3271791065547154, + "grad_norm": 0.1016429141163826, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 343320 + }, + { + "epoch": 1.3272177637580986, + "grad_norm": 0.1101653203368187, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 343330 + }, + { + "epoch": 1.3272564209614819, + "grad_norm": 0.10410647839307785, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 343340 + }, + { + "epoch": 1.3272950781648651, + "grad_norm": 0.1013018786907196, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 343350 + }, + { + "epoch": 1.3273337353682484, + "grad_norm": 0.12418507039546967, + "learning_rate": 0.002, + "loss": 2.346, + "step": 343360 + }, + { + "epoch": 1.3273723925716319, + "grad_norm": 0.1398543268442154, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 343370 + }, + { + "epoch": 1.3274110497750151, + "grad_norm": 0.10456975549459457, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 343380 + }, + { + "epoch": 1.3274497069783984, + "grad_norm": 0.09469009190797806, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 343390 + }, + { + "epoch": 1.3274883641817816, + "grad_norm": 0.10595841705799103, + "learning_rate": 0.002, + "loss": 2.3136, + "step": 343400 + }, + { + "epoch": 1.3275270213851649, + "grad_norm": 0.10216021537780762, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 343410 + }, + { + "epoch": 1.3275656785885481, + "grad_norm": 0.11457862704992294, + "learning_rate": 0.002, + "loss": 2.33, + "step": 343420 + }, + { + "epoch": 1.3276043357919316, + "grad_norm": 0.1330697238445282, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 343430 + }, + { + "epoch": 1.3276429929953149, + "grad_norm": 0.09366831928491592, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 343440 + }, + { + "epoch": 1.327681650198698, + "grad_norm": 0.11288196593523026, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 343450 + }, + { + "epoch": 1.3277203074020814, + "grad_norm": 0.10931092500686646, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 343460 + }, + { + "epoch": 1.3277589646054646, + "grad_norm": 0.11085681617259979, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 343470 + }, + { + "epoch": 1.3277976218088479, + "grad_norm": 0.11561580002307892, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 343480 + }, + { + "epoch": 1.3278362790122311, + "grad_norm": 0.09727177768945694, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 343490 + }, + { + "epoch": 1.3278749362156144, + "grad_norm": 0.09696878492832184, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 343500 + }, + { + "epoch": 1.3279135934189976, + "grad_norm": 0.10486599802970886, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 343510 + }, + { + "epoch": 1.3279522506223809, + "grad_norm": 0.10580369830131531, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 343520 + }, + { + "epoch": 1.3279909078257641, + "grad_norm": 0.0963820368051529, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 343530 + }, + { + "epoch": 1.3280295650291476, + "grad_norm": 0.10694960504770279, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 343540 + }, + { + "epoch": 1.3280682222325308, + "grad_norm": 0.11083703488111496, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 343550 + }, + { + "epoch": 1.328106879435914, + "grad_norm": 0.11412572115659714, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 343560 + }, + { + "epoch": 1.3281455366392974, + "grad_norm": 0.09420335292816162, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 343570 + }, + { + "epoch": 1.3281841938426806, + "grad_norm": 0.1165669858455658, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 343580 + }, + { + "epoch": 1.3282228510460639, + "grad_norm": 0.09835314005613327, + "learning_rate": 0.002, + "loss": 2.337, + "step": 343590 + }, + { + "epoch": 1.3282615082494473, + "grad_norm": 0.11818283051252365, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 343600 + }, + { + "epoch": 1.3283001654528306, + "grad_norm": 0.10762883722782135, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 343610 + }, + { + "epoch": 1.3283388226562138, + "grad_norm": 0.10828974097967148, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 343620 + }, + { + "epoch": 1.328377479859597, + "grad_norm": 0.1360919326543808, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 343630 + }, + { + "epoch": 1.3284161370629803, + "grad_norm": 0.10663457214832306, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 343640 + }, + { + "epoch": 1.3284547942663636, + "grad_norm": 0.17864476144313812, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 343650 + }, + { + "epoch": 1.3284934514697468, + "grad_norm": 0.0979250967502594, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 343660 + }, + { + "epoch": 1.32853210867313, + "grad_norm": 0.09882812201976776, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 343670 + }, + { + "epoch": 1.3285707658765133, + "grad_norm": 0.09272123873233795, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 343680 + }, + { + "epoch": 1.3286094230798966, + "grad_norm": 0.10529595613479614, + "learning_rate": 0.002, + "loss": 2.3163, + "step": 343690 + }, + { + "epoch": 1.3286480802832799, + "grad_norm": 0.10470674186944962, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 343700 + }, + { + "epoch": 1.3286867374866633, + "grad_norm": 0.11703024804592133, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 343710 + }, + { + "epoch": 1.3287253946900466, + "grad_norm": 0.100809745490551, + "learning_rate": 0.002, + "loss": 2.327, + "step": 343720 + }, + { + "epoch": 1.3287640518934298, + "grad_norm": 0.1015714555978775, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 343730 + }, + { + "epoch": 1.328802709096813, + "grad_norm": 0.10203837603330612, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 343740 + }, + { + "epoch": 1.3288413663001963, + "grad_norm": 0.11030900478363037, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 343750 + }, + { + "epoch": 1.3288800235035796, + "grad_norm": 0.10859356820583344, + "learning_rate": 0.002, + "loss": 2.325, + "step": 343760 + }, + { + "epoch": 1.328918680706963, + "grad_norm": 0.10833962261676788, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 343770 + }, + { + "epoch": 1.3289573379103463, + "grad_norm": 0.10143111646175385, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 343780 + }, + { + "epoch": 1.3289959951137296, + "grad_norm": 0.1062939316034317, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 343790 + }, + { + "epoch": 1.3290346523171128, + "grad_norm": 0.16646593809127808, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 343800 + }, + { + "epoch": 1.329073309520496, + "grad_norm": 0.10195731371641159, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 343810 + }, + { + "epoch": 1.3291119667238793, + "grad_norm": 0.10865110158920288, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 343820 + }, + { + "epoch": 1.3291506239272626, + "grad_norm": 0.10576749593019485, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 343830 + }, + { + "epoch": 1.3291892811306458, + "grad_norm": 0.09654872119426727, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 343840 + }, + { + "epoch": 1.329227938334029, + "grad_norm": 0.1124534159898758, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 343850 + }, + { + "epoch": 1.3292665955374123, + "grad_norm": 0.12102707475423813, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 343860 + }, + { + "epoch": 1.3293052527407956, + "grad_norm": 0.12327728420495987, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 343870 + }, + { + "epoch": 1.329343909944179, + "grad_norm": 0.086943618953228, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 343880 + }, + { + "epoch": 1.3293825671475623, + "grad_norm": 0.09946412593126297, + "learning_rate": 0.002, + "loss": 2.333, + "step": 343890 + }, + { + "epoch": 1.3294212243509456, + "grad_norm": 0.12250570207834244, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 343900 + }, + { + "epoch": 1.3294598815543288, + "grad_norm": 0.1032809242606163, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 343910 + }, + { + "epoch": 1.329498538757712, + "grad_norm": 0.12389107048511505, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 343920 + }, + { + "epoch": 1.3295371959610953, + "grad_norm": 0.09774418920278549, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 343930 + }, + { + "epoch": 1.3295758531644788, + "grad_norm": 0.0978965312242508, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 343940 + }, + { + "epoch": 1.329614510367862, + "grad_norm": 0.09504812955856323, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 343950 + }, + { + "epoch": 1.3296531675712453, + "grad_norm": 0.10925097018480301, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 343960 + }, + { + "epoch": 1.3296918247746286, + "grad_norm": 0.11287527531385422, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 343970 + }, + { + "epoch": 1.3297304819780118, + "grad_norm": 0.09954530745744705, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 343980 + }, + { + "epoch": 1.329769139181395, + "grad_norm": 0.09281942993402481, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 343990 + }, + { + "epoch": 1.3298077963847783, + "grad_norm": 0.09409678727388382, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 344000 + }, + { + "epoch": 1.3298464535881616, + "grad_norm": 0.11386945843696594, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 344010 + }, + { + "epoch": 1.3298851107915448, + "grad_norm": 0.09691344201564789, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 344020 + }, + { + "epoch": 1.329923767994928, + "grad_norm": 0.11871121823787689, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 344030 + }, + { + "epoch": 1.3299624251983113, + "grad_norm": 0.10736346244812012, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 344040 + }, + { + "epoch": 1.3300010824016948, + "grad_norm": 0.11998474597930908, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 344050 + }, + { + "epoch": 1.330039739605078, + "grad_norm": 0.11618227511644363, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 344060 + }, + { + "epoch": 1.3300783968084613, + "grad_norm": 0.11615073680877686, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 344070 + }, + { + "epoch": 1.3301170540118445, + "grad_norm": 0.09982374310493469, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 344080 + }, + { + "epoch": 1.3301557112152278, + "grad_norm": 0.10129628330469131, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 344090 + }, + { + "epoch": 1.330194368418611, + "grad_norm": 0.10706644505262375, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 344100 + }, + { + "epoch": 1.3302330256219945, + "grad_norm": 0.11260034143924713, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 344110 + }, + { + "epoch": 1.3302716828253778, + "grad_norm": 0.11245223879814148, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 344120 + }, + { + "epoch": 1.330310340028761, + "grad_norm": 0.1035739928483963, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 344130 + }, + { + "epoch": 1.3303489972321443, + "grad_norm": 0.13902145624160767, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 344140 + }, + { + "epoch": 1.3303876544355275, + "grad_norm": 0.11603587120771408, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 344150 + }, + { + "epoch": 1.3304263116389108, + "grad_norm": 0.09771740436553955, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 344160 + }, + { + "epoch": 1.330464968842294, + "grad_norm": 0.0917857438325882, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 344170 + }, + { + "epoch": 1.3305036260456773, + "grad_norm": 0.10400768369436264, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 344180 + }, + { + "epoch": 1.3305422832490605, + "grad_norm": 0.0979352667927742, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 344190 + }, + { + "epoch": 1.3305809404524438, + "grad_norm": 0.10228806734085083, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 344200 + }, + { + "epoch": 1.3306195976558273, + "grad_norm": 0.09991668164730072, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 344210 + }, + { + "epoch": 1.3306582548592105, + "grad_norm": 0.11020371317863464, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 344220 + }, + { + "epoch": 1.3306969120625938, + "grad_norm": 0.0935186892747879, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 344230 + }, + { + "epoch": 1.330735569265977, + "grad_norm": 0.09985950589179993, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 344240 + }, + { + "epoch": 1.3307742264693603, + "grad_norm": 0.10477671027183533, + "learning_rate": 0.002, + "loss": 2.316, + "step": 344250 + }, + { + "epoch": 1.3308128836727435, + "grad_norm": 0.10817205160856247, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 344260 + }, + { + "epoch": 1.3308515408761268, + "grad_norm": 0.10950993001461029, + "learning_rate": 0.002, + "loss": 2.312, + "step": 344270 + }, + { + "epoch": 1.3308901980795103, + "grad_norm": 0.1234552189707756, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 344280 + }, + { + "epoch": 1.3309288552828935, + "grad_norm": 0.09903118014335632, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 344290 + }, + { + "epoch": 1.3309675124862768, + "grad_norm": 0.11547055095434189, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 344300 + }, + { + "epoch": 1.33100616968966, + "grad_norm": 0.10943587124347687, + "learning_rate": 0.002, + "loss": 2.342, + "step": 344310 + }, + { + "epoch": 1.3310448268930433, + "grad_norm": 0.1084182858467102, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 344320 + }, + { + "epoch": 1.3310834840964265, + "grad_norm": 0.09529057145118713, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 344330 + }, + { + "epoch": 1.3311221412998098, + "grad_norm": 0.1016959473490715, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 344340 + }, + { + "epoch": 1.331160798503193, + "grad_norm": 0.11874452978372574, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 344350 + }, + { + "epoch": 1.3311994557065763, + "grad_norm": 0.10183338075876236, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 344360 + }, + { + "epoch": 1.3312381129099595, + "grad_norm": 0.11162742972373962, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 344370 + }, + { + "epoch": 1.331276770113343, + "grad_norm": 0.10805539786815643, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 344380 + }, + { + "epoch": 1.3313154273167263, + "grad_norm": 0.11087726801633835, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 344390 + }, + { + "epoch": 1.3313540845201095, + "grad_norm": 0.10322079062461853, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 344400 + }, + { + "epoch": 1.3313927417234928, + "grad_norm": 0.10526745766401291, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 344410 + }, + { + "epoch": 1.331431398926876, + "grad_norm": 0.10890132188796997, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 344420 + }, + { + "epoch": 1.3314700561302593, + "grad_norm": 0.10265296697616577, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 344430 + }, + { + "epoch": 1.3315087133336425, + "grad_norm": 0.09728667885065079, + "learning_rate": 0.002, + "loss": 2.3113, + "step": 344440 + }, + { + "epoch": 1.331547370537026, + "grad_norm": 0.1127006933093071, + "learning_rate": 0.002, + "loss": 2.3138, + "step": 344450 + }, + { + "epoch": 1.3315860277404092, + "grad_norm": 0.12087395042181015, + "learning_rate": 0.002, + "loss": 2.3133, + "step": 344460 + }, + { + "epoch": 1.3316246849437925, + "grad_norm": 0.12616465985774994, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 344470 + }, + { + "epoch": 1.3316633421471757, + "grad_norm": 0.1002395749092102, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 344480 + }, + { + "epoch": 1.331701999350559, + "grad_norm": 0.10422340780496597, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 344490 + }, + { + "epoch": 1.3317406565539422, + "grad_norm": 0.09601067751646042, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 344500 + }, + { + "epoch": 1.3317793137573255, + "grad_norm": 0.11861871182918549, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 344510 + }, + { + "epoch": 1.3318179709607088, + "grad_norm": 0.09754236042499542, + "learning_rate": 0.002, + "loss": 2.332, + "step": 344520 + }, + { + "epoch": 1.331856628164092, + "grad_norm": 0.10452759265899658, + "learning_rate": 0.002, + "loss": 2.345, + "step": 344530 + }, + { + "epoch": 1.3318952853674753, + "grad_norm": 0.10563155263662338, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 344540 + }, + { + "epoch": 1.3319339425708587, + "grad_norm": 0.13401366770267487, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 344550 + }, + { + "epoch": 1.331972599774242, + "grad_norm": 0.10596875101327896, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 344560 + }, + { + "epoch": 1.3320112569776252, + "grad_norm": 0.11349476128816605, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 344570 + }, + { + "epoch": 1.3320499141810085, + "grad_norm": 0.1171863004565239, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 344580 + }, + { + "epoch": 1.3320885713843917, + "grad_norm": 0.09667433053255081, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 344590 + }, + { + "epoch": 1.332127228587775, + "grad_norm": 0.10653450340032578, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 344600 + }, + { + "epoch": 1.3321658857911582, + "grad_norm": 0.10144241154193878, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 344610 + }, + { + "epoch": 1.3322045429945417, + "grad_norm": 0.11736975610256195, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 344620 + }, + { + "epoch": 1.332243200197925, + "grad_norm": 0.09723929315805435, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 344630 + }, + { + "epoch": 1.3322818574013082, + "grad_norm": 0.11661922931671143, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 344640 + }, + { + "epoch": 1.3323205146046915, + "grad_norm": 0.10617439448833466, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 344650 + }, + { + "epoch": 1.3323591718080747, + "grad_norm": 0.11689261347055435, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 344660 + }, + { + "epoch": 1.332397829011458, + "grad_norm": 0.10449826717376709, + "learning_rate": 0.002, + "loss": 2.337, + "step": 344670 + }, + { + "epoch": 1.3324364862148412, + "grad_norm": 0.09332484751939774, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 344680 + }, + { + "epoch": 1.3324751434182245, + "grad_norm": 0.11841703951358795, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 344690 + }, + { + "epoch": 1.3325138006216077, + "grad_norm": 0.09929771721363068, + "learning_rate": 0.002, + "loss": 2.347, + "step": 344700 + }, + { + "epoch": 1.332552457824991, + "grad_norm": 0.1107405349612236, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 344710 + }, + { + "epoch": 1.3325911150283745, + "grad_norm": 0.11031211912631989, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 344720 + }, + { + "epoch": 1.3326297722317577, + "grad_norm": 0.1112285703420639, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 344730 + }, + { + "epoch": 1.332668429435141, + "grad_norm": 0.09748245030641556, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 344740 + }, + { + "epoch": 1.3327070866385242, + "grad_norm": 0.11218224465847015, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 344750 + }, + { + "epoch": 1.3327457438419075, + "grad_norm": 0.12697817385196686, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 344760 + }, + { + "epoch": 1.3327844010452907, + "grad_norm": 0.10008467733860016, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 344770 + }, + { + "epoch": 1.332823058248674, + "grad_norm": 0.09765961021184921, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 344780 + }, + { + "epoch": 1.3328617154520574, + "grad_norm": 0.09477970749139786, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 344790 + }, + { + "epoch": 1.3329003726554407, + "grad_norm": 0.09909243881702423, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 344800 + }, + { + "epoch": 1.332939029858824, + "grad_norm": 0.1287049949169159, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 344810 + }, + { + "epoch": 1.3329776870622072, + "grad_norm": 0.09282911568880081, + "learning_rate": 0.002, + "loss": 2.339, + "step": 344820 + }, + { + "epoch": 1.3330163442655905, + "grad_norm": 0.1277105212211609, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 344830 + }, + { + "epoch": 1.3330550014689737, + "grad_norm": 0.1023770347237587, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 344840 + }, + { + "epoch": 1.333093658672357, + "grad_norm": 0.10521621257066727, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 344850 + }, + { + "epoch": 1.3331323158757402, + "grad_norm": 0.11044228821992874, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 344860 + }, + { + "epoch": 1.3331709730791235, + "grad_norm": 0.11694236099720001, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 344870 + }, + { + "epoch": 1.3332096302825067, + "grad_norm": 0.1320260465145111, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 344880 + }, + { + "epoch": 1.3332482874858902, + "grad_norm": 0.12980304658412933, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 344890 + }, + { + "epoch": 1.3332869446892734, + "grad_norm": 0.11151394248008728, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 344900 + }, + { + "epoch": 1.3333256018926567, + "grad_norm": 0.09492821246385574, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 344910 + }, + { + "epoch": 1.33336425909604, + "grad_norm": 0.11971234530210495, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 344920 + }, + { + "epoch": 1.3334029162994232, + "grad_norm": 0.10840121656656265, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 344930 + }, + { + "epoch": 1.3334415735028065, + "grad_norm": 0.10081777721643448, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 344940 + }, + { + "epoch": 1.33348023070619, + "grad_norm": 0.09995166212320328, + "learning_rate": 0.002, + "loss": 2.337, + "step": 344950 + }, + { + "epoch": 1.3335188879095732, + "grad_norm": 0.10326774418354034, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 344960 + }, + { + "epoch": 1.3335575451129564, + "grad_norm": 0.1096605733036995, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 344970 + }, + { + "epoch": 1.3335962023163397, + "grad_norm": 0.08859793841838837, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 344980 + }, + { + "epoch": 1.333634859519723, + "grad_norm": 0.12324798107147217, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 344990 + }, + { + "epoch": 1.3336735167231062, + "grad_norm": 0.09335483610630035, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 345000 + }, + { + "epoch": 1.3337121739264894, + "grad_norm": 0.09280901402235031, + "learning_rate": 0.002, + "loss": 2.3107, + "step": 345010 + }, + { + "epoch": 1.3337508311298727, + "grad_norm": 0.10214249044656754, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 345020 + }, + { + "epoch": 1.333789488333256, + "grad_norm": 0.09952248632907867, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 345030 + }, + { + "epoch": 1.3338281455366392, + "grad_norm": 0.10830455273389816, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 345040 + }, + { + "epoch": 1.3338668027400225, + "grad_norm": 0.10797925293445587, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 345050 + }, + { + "epoch": 1.333905459943406, + "grad_norm": 0.1023314893245697, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 345060 + }, + { + "epoch": 1.3339441171467892, + "grad_norm": 0.11088632792234421, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 345070 + }, + { + "epoch": 1.3339827743501724, + "grad_norm": 0.10849376022815704, + "learning_rate": 0.002, + "loss": 2.334, + "step": 345080 + }, + { + "epoch": 1.3340214315535557, + "grad_norm": 0.10615264624357224, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 345090 + }, + { + "epoch": 1.334060088756939, + "grad_norm": 0.11973266303539276, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 345100 + }, + { + "epoch": 1.3340987459603222, + "grad_norm": 0.11139839887619019, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 345110 + }, + { + "epoch": 1.3341374031637057, + "grad_norm": 0.1239207535982132, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 345120 + }, + { + "epoch": 1.334176060367089, + "grad_norm": 0.09981207549571991, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 345130 + }, + { + "epoch": 1.3342147175704722, + "grad_norm": 0.1028282567858696, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 345140 + }, + { + "epoch": 1.3342533747738554, + "grad_norm": 0.11014251410961151, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 345150 + }, + { + "epoch": 1.3342920319772387, + "grad_norm": 0.14106076955795288, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 345160 + }, + { + "epoch": 1.334330689180622, + "grad_norm": 0.09924093633890152, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 345170 + }, + { + "epoch": 1.3343693463840052, + "grad_norm": 0.1121000200510025, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 345180 + }, + { + "epoch": 1.3344080035873884, + "grad_norm": 0.10735615342855453, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 345190 + }, + { + "epoch": 1.3344466607907717, + "grad_norm": 0.09548373520374298, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 345200 + }, + { + "epoch": 1.334485317994155, + "grad_norm": 0.10804015398025513, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 345210 + }, + { + "epoch": 1.3345239751975382, + "grad_norm": 0.11735550314188004, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 345220 + }, + { + "epoch": 1.3345626324009217, + "grad_norm": 0.10910956561565399, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 345230 + }, + { + "epoch": 1.334601289604305, + "grad_norm": 0.11945263296365738, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 345240 + }, + { + "epoch": 1.3346399468076882, + "grad_norm": 0.09623727947473526, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 345250 + }, + { + "epoch": 1.3346786040110714, + "grad_norm": 0.11667291074991226, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 345260 + }, + { + "epoch": 1.3347172612144547, + "grad_norm": 0.12302277237176895, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 345270 + }, + { + "epoch": 1.334755918417838, + "grad_norm": 0.10679807513952255, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 345280 + }, + { + "epoch": 1.3347945756212214, + "grad_norm": 0.09852226823568344, + "learning_rate": 0.002, + "loss": 2.332, + "step": 345290 + }, + { + "epoch": 1.3348332328246046, + "grad_norm": 0.10933008790016174, + "learning_rate": 0.002, + "loss": 2.328, + "step": 345300 + }, + { + "epoch": 1.334871890027988, + "grad_norm": 0.09499634057283401, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 345310 + }, + { + "epoch": 1.3349105472313711, + "grad_norm": 0.11454311013221741, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 345320 + }, + { + "epoch": 1.3349492044347544, + "grad_norm": 0.10472351312637329, + "learning_rate": 0.002, + "loss": 2.332, + "step": 345330 + }, + { + "epoch": 1.3349878616381377, + "grad_norm": 0.10407882183790207, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 345340 + }, + { + "epoch": 1.335026518841521, + "grad_norm": 0.10216553509235382, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 345350 + }, + { + "epoch": 1.3350651760449042, + "grad_norm": 0.12331422418355942, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 345360 + }, + { + "epoch": 1.3351038332482874, + "grad_norm": 0.10996781289577484, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 345370 + }, + { + "epoch": 1.3351424904516707, + "grad_norm": 0.09760157763957977, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 345380 + }, + { + "epoch": 1.335181147655054, + "grad_norm": 0.0954732596874237, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 345390 + }, + { + "epoch": 1.3352198048584374, + "grad_norm": 0.10300255566835403, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 345400 + }, + { + "epoch": 1.3352584620618206, + "grad_norm": 0.0967373326420784, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 345410 + }, + { + "epoch": 1.335297119265204, + "grad_norm": 0.110762819647789, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 345420 + }, + { + "epoch": 1.3353357764685871, + "grad_norm": 0.10032440721988678, + "learning_rate": 0.002, + "loss": 2.324, + "step": 345430 + }, + { + "epoch": 1.3353744336719704, + "grad_norm": 0.1212349459528923, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 345440 + }, + { + "epoch": 1.3354130908753536, + "grad_norm": 0.1062375158071518, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 345450 + }, + { + "epoch": 1.3354517480787371, + "grad_norm": 0.1182907298207283, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 345460 + }, + { + "epoch": 1.3354904052821204, + "grad_norm": 0.10918321460485458, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 345470 + }, + { + "epoch": 1.3355290624855036, + "grad_norm": 0.1067313551902771, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 345480 + }, + { + "epoch": 1.3355677196888869, + "grad_norm": 0.09972357749938965, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 345490 + }, + { + "epoch": 1.3356063768922701, + "grad_norm": 0.12399875372648239, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 345500 + }, + { + "epoch": 1.3356450340956534, + "grad_norm": 0.10412443429231644, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 345510 + }, + { + "epoch": 1.3356836912990366, + "grad_norm": 0.11216738820075989, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 345520 + }, + { + "epoch": 1.3357223485024199, + "grad_norm": 0.10026592761278152, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 345530 + }, + { + "epoch": 1.3357610057058031, + "grad_norm": 0.11337658017873764, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 345540 + }, + { + "epoch": 1.3357996629091864, + "grad_norm": 0.1043723002076149, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 345550 + }, + { + "epoch": 1.3358383201125696, + "grad_norm": 0.09790969640016556, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 345560 + }, + { + "epoch": 1.3358769773159531, + "grad_norm": 0.1073826476931572, + "learning_rate": 0.002, + "loss": 2.318, + "step": 345570 + }, + { + "epoch": 1.3359156345193364, + "grad_norm": 0.1073329970240593, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 345580 + }, + { + "epoch": 1.3359542917227196, + "grad_norm": 0.09542405605316162, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 345590 + }, + { + "epoch": 1.3359929489261029, + "grad_norm": 0.10292265564203262, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 345600 + }, + { + "epoch": 1.3360316061294861, + "grad_norm": 0.1295763999223709, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 345610 + }, + { + "epoch": 1.3360702633328694, + "grad_norm": 0.1814815104007721, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 345620 + }, + { + "epoch": 1.3361089205362529, + "grad_norm": 0.10305207967758179, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 345630 + }, + { + "epoch": 1.336147577739636, + "grad_norm": 0.11229586601257324, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 345640 + }, + { + "epoch": 1.3361862349430194, + "grad_norm": 0.10201216489076614, + "learning_rate": 0.002, + "loss": 2.345, + "step": 345650 + }, + { + "epoch": 1.3362248921464026, + "grad_norm": 0.10922586172819138, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 345660 + }, + { + "epoch": 1.3362635493497859, + "grad_norm": 0.09847729653120041, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 345670 + }, + { + "epoch": 1.3363022065531691, + "grad_norm": 0.10319638252258301, + "learning_rate": 0.002, + "loss": 2.332, + "step": 345680 + }, + { + "epoch": 1.3363408637565524, + "grad_norm": 0.12661710381507874, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 345690 + }, + { + "epoch": 1.3363795209599356, + "grad_norm": 0.12502607703208923, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 345700 + }, + { + "epoch": 1.3364181781633189, + "grad_norm": 0.10091539472341537, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 345710 + }, + { + "epoch": 1.3364568353667021, + "grad_norm": 0.10177405923604965, + "learning_rate": 0.002, + "loss": 2.3532, + "step": 345720 + }, + { + "epoch": 1.3364954925700854, + "grad_norm": 0.16185766458511353, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 345730 + }, + { + "epoch": 1.3365341497734688, + "grad_norm": 0.09906337410211563, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 345740 + }, + { + "epoch": 1.336572806976852, + "grad_norm": 0.12838874757289886, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 345750 + }, + { + "epoch": 1.3366114641802354, + "grad_norm": 0.10789935290813446, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 345760 + }, + { + "epoch": 1.3366501213836186, + "grad_norm": 0.09930015355348587, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 345770 + }, + { + "epoch": 1.3366887785870019, + "grad_norm": 0.10896844416856766, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 345780 + }, + { + "epoch": 1.336727435790385, + "grad_norm": 0.10427610576152802, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 345790 + }, + { + "epoch": 1.3367660929937686, + "grad_norm": 0.14572319388389587, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 345800 + }, + { + "epoch": 1.3368047501971518, + "grad_norm": 0.10233652591705322, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 345810 + }, + { + "epoch": 1.336843407400535, + "grad_norm": 0.107231505215168, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 345820 + }, + { + "epoch": 1.3368820646039183, + "grad_norm": 0.0979439839720726, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 345830 + }, + { + "epoch": 1.3369207218073016, + "grad_norm": 0.09307397156953812, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 345840 + }, + { + "epoch": 1.3369593790106848, + "grad_norm": 0.10109108686447144, + "learning_rate": 0.002, + "loss": 2.339, + "step": 345850 + }, + { + "epoch": 1.336998036214068, + "grad_norm": 0.10365600138902664, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 345860 + }, + { + "epoch": 1.3370366934174513, + "grad_norm": 0.10309231281280518, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 345870 + }, + { + "epoch": 1.3370753506208346, + "grad_norm": 0.13914698362350464, + "learning_rate": 0.002, + "loss": 2.315, + "step": 345880 + }, + { + "epoch": 1.3371140078242179, + "grad_norm": 0.1049744263291359, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 345890 + }, + { + "epoch": 1.337152665027601, + "grad_norm": 0.10597793012857437, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 345900 + }, + { + "epoch": 1.3371913222309846, + "grad_norm": 0.10046493262052536, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 345910 + }, + { + "epoch": 1.3372299794343678, + "grad_norm": 0.09887133538722992, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 345920 + }, + { + "epoch": 1.337268636637751, + "grad_norm": 0.19354459643363953, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 345930 + }, + { + "epoch": 1.3373072938411343, + "grad_norm": 0.10834994912147522, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 345940 + }, + { + "epoch": 1.3373459510445176, + "grad_norm": 0.1194472461938858, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 345950 + }, + { + "epoch": 1.3373846082479008, + "grad_norm": 0.09798796474933624, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 345960 + }, + { + "epoch": 1.3374232654512843, + "grad_norm": 0.09186162054538727, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 345970 + }, + { + "epoch": 1.3374619226546676, + "grad_norm": 0.10261370986700058, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 345980 + }, + { + "epoch": 1.3375005798580508, + "grad_norm": 0.1267152577638626, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 345990 + }, + { + "epoch": 1.337539237061434, + "grad_norm": 0.10745598375797272, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 346000 + }, + { + "epoch": 1.3375778942648173, + "grad_norm": 0.10257593542337418, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 346010 + }, + { + "epoch": 1.3376165514682006, + "grad_norm": 0.11340894550085068, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 346020 + }, + { + "epoch": 1.3376552086715838, + "grad_norm": 0.1047341376543045, + "learning_rate": 0.002, + "loss": 2.333, + "step": 346030 + }, + { + "epoch": 1.337693865874967, + "grad_norm": 0.10924689471721649, + "learning_rate": 0.002, + "loss": 2.3156, + "step": 346040 + }, + { + "epoch": 1.3377325230783503, + "grad_norm": 0.10795491933822632, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 346050 + }, + { + "epoch": 1.3377711802817336, + "grad_norm": 0.10510119795799255, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 346060 + }, + { + "epoch": 1.337809837485117, + "grad_norm": 0.10001824796199799, + "learning_rate": 0.002, + "loss": 2.3573, + "step": 346070 + }, + { + "epoch": 1.3378484946885003, + "grad_norm": 0.12009944766759872, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 346080 + }, + { + "epoch": 1.3378871518918836, + "grad_norm": 0.10906676948070526, + "learning_rate": 0.002, + "loss": 2.341, + "step": 346090 + }, + { + "epoch": 1.3379258090952668, + "grad_norm": 0.09728783369064331, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 346100 + }, + { + "epoch": 1.33796446629865, + "grad_norm": 0.10508811473846436, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 346110 + }, + { + "epoch": 1.3380031235020333, + "grad_norm": 0.09222324192523956, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 346120 + }, + { + "epoch": 1.3380417807054166, + "grad_norm": 0.10402661561965942, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 346130 + }, + { + "epoch": 1.3380804379088, + "grad_norm": 0.12433262169361115, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 346140 + }, + { + "epoch": 1.3381190951121833, + "grad_norm": 0.10922899842262268, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 346150 + }, + { + "epoch": 1.3381577523155666, + "grad_norm": 0.10978659242391586, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 346160 + }, + { + "epoch": 1.3381964095189498, + "grad_norm": 0.10603276640176773, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 346170 + }, + { + "epoch": 1.338235066722333, + "grad_norm": 0.0904986634850502, + "learning_rate": 0.002, + "loss": 2.318, + "step": 346180 + }, + { + "epoch": 1.3382737239257163, + "grad_norm": 0.10122311115264893, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 346190 + }, + { + "epoch": 1.3383123811290996, + "grad_norm": 0.09152434021234512, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 346200 + }, + { + "epoch": 1.3383510383324828, + "grad_norm": 0.1075081005692482, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 346210 + }, + { + "epoch": 1.338389695535866, + "grad_norm": 0.11105257272720337, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 346220 + }, + { + "epoch": 1.3384283527392493, + "grad_norm": 0.0925607830286026, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 346230 + }, + { + "epoch": 1.3384670099426328, + "grad_norm": 0.11009430140256882, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 346240 + }, + { + "epoch": 1.338505667146016, + "grad_norm": 0.1331329345703125, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 346250 + }, + { + "epoch": 1.3385443243493993, + "grad_norm": 0.11093542724847794, + "learning_rate": 0.002, + "loss": 2.326, + "step": 346260 + }, + { + "epoch": 1.3385829815527825, + "grad_norm": 0.08782430738210678, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 346270 + }, + { + "epoch": 1.3386216387561658, + "grad_norm": 0.10207182168960571, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 346280 + }, + { + "epoch": 1.338660295959549, + "grad_norm": 0.11096299439668655, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 346290 + }, + { + "epoch": 1.3386989531629323, + "grad_norm": 0.1043325737118721, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 346300 + }, + { + "epoch": 1.3387376103663158, + "grad_norm": 0.10775876045227051, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 346310 + }, + { + "epoch": 1.338776267569699, + "grad_norm": 0.11067583411931992, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 346320 + }, + { + "epoch": 1.3388149247730823, + "grad_norm": 0.11076585203409195, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 346330 + }, + { + "epoch": 1.3388535819764655, + "grad_norm": 0.11195888370275497, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 346340 + }, + { + "epoch": 1.3388922391798488, + "grad_norm": 0.0904572531580925, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 346350 + }, + { + "epoch": 1.338930896383232, + "grad_norm": 0.11926350742578506, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 346360 + }, + { + "epoch": 1.3389695535866153, + "grad_norm": 0.10238216817378998, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 346370 + }, + { + "epoch": 1.3390082107899985, + "grad_norm": 0.09457932412624359, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 346380 + }, + { + "epoch": 1.3390468679933818, + "grad_norm": 0.10814918577671051, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 346390 + }, + { + "epoch": 1.339085525196765, + "grad_norm": 0.09990043193101883, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 346400 + }, + { + "epoch": 1.3391241824001485, + "grad_norm": 0.12533685564994812, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 346410 + }, + { + "epoch": 1.3391628396035318, + "grad_norm": 0.10196174681186676, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 346420 + }, + { + "epoch": 1.339201496806915, + "grad_norm": 0.10151616483926773, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 346430 + }, + { + "epoch": 1.3392401540102983, + "grad_norm": 0.11178027838468552, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 346440 + }, + { + "epoch": 1.3392788112136815, + "grad_norm": 0.0982932522892952, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 346450 + }, + { + "epoch": 1.3393174684170648, + "grad_norm": 0.11447232961654663, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 346460 + }, + { + "epoch": 1.339356125620448, + "grad_norm": 0.1159837618470192, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 346470 + }, + { + "epoch": 1.3393947828238315, + "grad_norm": 0.1221732422709465, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 346480 + }, + { + "epoch": 1.3394334400272148, + "grad_norm": 0.11282023787498474, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 346490 + }, + { + "epoch": 1.339472097230598, + "grad_norm": 0.10463476926088333, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 346500 + }, + { + "epoch": 1.3395107544339813, + "grad_norm": 0.11097527295351028, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 346510 + }, + { + "epoch": 1.3395494116373645, + "grad_norm": 0.1008048802614212, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 346520 + }, + { + "epoch": 1.3395880688407478, + "grad_norm": 0.12519340217113495, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 346530 + }, + { + "epoch": 1.339626726044131, + "grad_norm": 0.13712844252586365, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 346540 + }, + { + "epoch": 1.3396653832475143, + "grad_norm": 0.11297990381717682, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 346550 + }, + { + "epoch": 1.3397040404508975, + "grad_norm": 0.09795337170362473, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 346560 + }, + { + "epoch": 1.3397426976542808, + "grad_norm": 0.09507597237825394, + "learning_rate": 0.002, + "loss": 2.344, + "step": 346570 + }, + { + "epoch": 1.3397813548576643, + "grad_norm": 0.1199960857629776, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 346580 + }, + { + "epoch": 1.3398200120610475, + "grad_norm": 0.11006709933280945, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 346590 + }, + { + "epoch": 1.3398586692644308, + "grad_norm": 0.09973582625389099, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 346600 + }, + { + "epoch": 1.339897326467814, + "grad_norm": 0.09433923661708832, + "learning_rate": 0.002, + "loss": 2.332, + "step": 346610 + }, + { + "epoch": 1.3399359836711973, + "grad_norm": 0.10537783801555634, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 346620 + }, + { + "epoch": 1.3399746408745805, + "grad_norm": 0.10902126133441925, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 346630 + }, + { + "epoch": 1.3400132980779638, + "grad_norm": 0.1410796344280243, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 346640 + }, + { + "epoch": 1.3400519552813472, + "grad_norm": 0.09471646696329117, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 346650 + }, + { + "epoch": 1.3400906124847305, + "grad_norm": 0.09297826886177063, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 346660 + }, + { + "epoch": 1.3401292696881137, + "grad_norm": 0.09805559366941452, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 346670 + }, + { + "epoch": 1.340167926891497, + "grad_norm": 0.1065869852900505, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 346680 + }, + { + "epoch": 1.3402065840948802, + "grad_norm": 0.1044825091958046, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 346690 + }, + { + "epoch": 1.3402452412982635, + "grad_norm": 0.12405242770910263, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 346700 + }, + { + "epoch": 1.3402838985016468, + "grad_norm": 0.096046082675457, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 346710 + }, + { + "epoch": 1.34032255570503, + "grad_norm": 0.11131946742534637, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 346720 + }, + { + "epoch": 1.3403612129084133, + "grad_norm": 0.10490655899047852, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 346730 + }, + { + "epoch": 1.3403998701117965, + "grad_norm": 0.09529917687177658, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 346740 + }, + { + "epoch": 1.34043852731518, + "grad_norm": 0.09577424079179764, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 346750 + }, + { + "epoch": 1.3404771845185632, + "grad_norm": 0.10369036346673965, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 346760 + }, + { + "epoch": 1.3405158417219465, + "grad_norm": 0.09698975831270218, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 346770 + }, + { + "epoch": 1.3405544989253297, + "grad_norm": 0.11344336718320847, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 346780 + }, + { + "epoch": 1.340593156128713, + "grad_norm": 0.09168387204408646, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 346790 + }, + { + "epoch": 1.3406318133320962, + "grad_norm": 0.09688182920217514, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 346800 + }, + { + "epoch": 1.3406704705354797, + "grad_norm": 0.11499480158090591, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 346810 + }, + { + "epoch": 1.340709127738863, + "grad_norm": 0.10256528854370117, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 346820 + }, + { + "epoch": 1.3407477849422462, + "grad_norm": 0.11224600672721863, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 346830 + }, + { + "epoch": 1.3407864421456295, + "grad_norm": 0.11144499480724335, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 346840 + }, + { + "epoch": 1.3408250993490127, + "grad_norm": 0.09368912875652313, + "learning_rate": 0.002, + "loss": 2.34, + "step": 346850 + }, + { + "epoch": 1.340863756552396, + "grad_norm": 0.10852344334125519, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 346860 + }, + { + "epoch": 1.3409024137557792, + "grad_norm": 0.09394946694374084, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 346870 + }, + { + "epoch": 1.3409410709591625, + "grad_norm": 0.10553434491157532, + "learning_rate": 0.002, + "loss": 2.318, + "step": 346880 + }, + { + "epoch": 1.3409797281625457, + "grad_norm": 0.15650472044944763, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 346890 + }, + { + "epoch": 1.341018385365929, + "grad_norm": 0.10130909085273743, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 346900 + }, + { + "epoch": 1.3410570425693122, + "grad_norm": 0.109117291867733, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 346910 + }, + { + "epoch": 1.3410956997726957, + "grad_norm": 0.11375482380390167, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 346920 + }, + { + "epoch": 1.341134356976079, + "grad_norm": 0.13961325585842133, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 346930 + }, + { + "epoch": 1.3411730141794622, + "grad_norm": 0.10730651766061783, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 346940 + }, + { + "epoch": 1.3412116713828455, + "grad_norm": 0.10354288667440414, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 346950 + }, + { + "epoch": 1.3412503285862287, + "grad_norm": 0.10738546401262283, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 346960 + }, + { + "epoch": 1.341288985789612, + "grad_norm": 0.10498213022947311, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 346970 + }, + { + "epoch": 1.3413276429929955, + "grad_norm": 0.0919991061091423, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 346980 + }, + { + "epoch": 1.3413663001963787, + "grad_norm": 0.1167188286781311, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 346990 + }, + { + "epoch": 1.341404957399762, + "grad_norm": 0.11551283299922943, + "learning_rate": 0.002, + "loss": 2.327, + "step": 347000 + }, + { + "epoch": 1.3414436146031452, + "grad_norm": 0.11017212271690369, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 347010 + }, + { + "epoch": 1.3414822718065285, + "grad_norm": 0.1336277276277542, + "learning_rate": 0.002, + "loss": 2.322, + "step": 347020 + }, + { + "epoch": 1.3415209290099117, + "grad_norm": 0.09945376962423325, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 347030 + }, + { + "epoch": 1.341559586213295, + "grad_norm": 0.0978284403681755, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 347040 + }, + { + "epoch": 1.3415982434166782, + "grad_norm": 0.11005248874425888, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 347050 + }, + { + "epoch": 1.3416369006200615, + "grad_norm": 0.11907824128866196, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 347060 + }, + { + "epoch": 1.3416755578234447, + "grad_norm": 0.10550861805677414, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 347070 + }, + { + "epoch": 1.341714215026828, + "grad_norm": 0.0961516946554184, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 347080 + }, + { + "epoch": 1.3417528722302114, + "grad_norm": 0.12996280193328857, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 347090 + }, + { + "epoch": 1.3417915294335947, + "grad_norm": 0.1039133295416832, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 347100 + }, + { + "epoch": 1.341830186636978, + "grad_norm": 0.09067199379205704, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 347110 + }, + { + "epoch": 1.3418688438403612, + "grad_norm": 0.12177203595638275, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 347120 + }, + { + "epoch": 1.3419075010437445, + "grad_norm": 0.12148578464984894, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 347130 + }, + { + "epoch": 1.3419461582471277, + "grad_norm": 0.10434716939926147, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 347140 + }, + { + "epoch": 1.3419848154505112, + "grad_norm": 0.10731692612171173, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 347150 + }, + { + "epoch": 1.3420234726538944, + "grad_norm": 0.0981302484869957, + "learning_rate": 0.002, + "loss": 2.33, + "step": 347160 + }, + { + "epoch": 1.3420621298572777, + "grad_norm": 0.09808428585529327, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 347170 + }, + { + "epoch": 1.342100787060661, + "grad_norm": 0.1706089824438095, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 347180 + }, + { + "epoch": 1.3421394442640442, + "grad_norm": 0.12091352045536041, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 347190 + }, + { + "epoch": 1.3421781014674274, + "grad_norm": 0.10970164835453033, + "learning_rate": 0.002, + "loss": 2.333, + "step": 347200 + }, + { + "epoch": 1.3422167586708107, + "grad_norm": 0.08676803857088089, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 347210 + }, + { + "epoch": 1.342255415874194, + "grad_norm": 0.14231640100479126, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 347220 + }, + { + "epoch": 1.3422940730775772, + "grad_norm": 0.0979641005396843, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 347230 + }, + { + "epoch": 1.3423327302809605, + "grad_norm": 0.10979917645454407, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 347240 + }, + { + "epoch": 1.3423713874843437, + "grad_norm": 0.10588161647319794, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 347250 + }, + { + "epoch": 1.3424100446877272, + "grad_norm": 0.10137680917978287, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 347260 + }, + { + "epoch": 1.3424487018911104, + "grad_norm": 0.10275550186634064, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 347270 + }, + { + "epoch": 1.3424873590944937, + "grad_norm": 0.10416662693023682, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 347280 + }, + { + "epoch": 1.342526016297877, + "grad_norm": 0.1014779582619667, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 347290 + }, + { + "epoch": 1.3425646735012602, + "grad_norm": 0.10870113223791122, + "learning_rate": 0.002, + "loss": 2.323, + "step": 347300 + }, + { + "epoch": 1.3426033307046434, + "grad_norm": 0.11813866347074509, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 347310 + }, + { + "epoch": 1.342641987908027, + "grad_norm": 0.10769974440336227, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 347320 + }, + { + "epoch": 1.3426806451114102, + "grad_norm": 0.10103698074817657, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 347330 + }, + { + "epoch": 1.3427193023147934, + "grad_norm": 0.12572821974754333, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 347340 + }, + { + "epoch": 1.3427579595181767, + "grad_norm": 0.11496661603450775, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 347350 + }, + { + "epoch": 1.34279661672156, + "grad_norm": 0.09949234873056412, + "learning_rate": 0.002, + "loss": 2.349, + "step": 347360 + }, + { + "epoch": 1.3428352739249432, + "grad_norm": 0.099911630153656, + "learning_rate": 0.002, + "loss": 2.329, + "step": 347370 + }, + { + "epoch": 1.3428739311283264, + "grad_norm": 0.10400547832250595, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 347380 + }, + { + "epoch": 1.3429125883317097, + "grad_norm": 0.10007713735103607, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 347390 + }, + { + "epoch": 1.342951245535093, + "grad_norm": 0.10954345762729645, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 347400 + }, + { + "epoch": 1.3429899027384762, + "grad_norm": 0.09668856859207153, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 347410 + }, + { + "epoch": 1.3430285599418594, + "grad_norm": 0.12628354132175446, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 347420 + }, + { + "epoch": 1.343067217145243, + "grad_norm": 0.1271989941596985, + "learning_rate": 0.002, + "loss": 2.331, + "step": 347430 + }, + { + "epoch": 1.3431058743486262, + "grad_norm": 0.1027301475405693, + "learning_rate": 0.002, + "loss": 2.334, + "step": 347440 + }, + { + "epoch": 1.3431445315520094, + "grad_norm": 0.10065386444330215, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 347450 + }, + { + "epoch": 1.3431831887553927, + "grad_norm": 0.10471511632204056, + "learning_rate": 0.002, + "loss": 2.332, + "step": 347460 + }, + { + "epoch": 1.343221845958776, + "grad_norm": 0.10855276137590408, + "learning_rate": 0.002, + "loss": 2.348, + "step": 347470 + }, + { + "epoch": 1.3432605031621592, + "grad_norm": 0.11760521680116653, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 347480 + }, + { + "epoch": 1.3432991603655426, + "grad_norm": 0.10220196098089218, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 347490 + }, + { + "epoch": 1.343337817568926, + "grad_norm": 0.10600744187831879, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 347500 + }, + { + "epoch": 1.3433764747723091, + "grad_norm": 0.09794536978006363, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 347510 + }, + { + "epoch": 1.3434151319756924, + "grad_norm": 0.09697246551513672, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 347520 + }, + { + "epoch": 1.3434537891790757, + "grad_norm": 0.10189753025770187, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 347530 + }, + { + "epoch": 1.343492446382459, + "grad_norm": 0.11046060919761658, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 347540 + }, + { + "epoch": 1.3435311035858422, + "grad_norm": 0.1047743633389473, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 347550 + }, + { + "epoch": 1.3435697607892254, + "grad_norm": 0.10901220142841339, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 347560 + }, + { + "epoch": 1.3436084179926087, + "grad_norm": 0.10370327532291412, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 347570 + }, + { + "epoch": 1.343647075195992, + "grad_norm": 0.12953785061836243, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 347580 + }, + { + "epoch": 1.3436857323993752, + "grad_norm": 0.10009057819843292, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 347590 + }, + { + "epoch": 1.3437243896027586, + "grad_norm": 0.10040712356567383, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 347600 + }, + { + "epoch": 1.343763046806142, + "grad_norm": 0.10058463364839554, + "learning_rate": 0.002, + "loss": 2.34, + "step": 347610 + }, + { + "epoch": 1.3438017040095251, + "grad_norm": 0.10221666097640991, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 347620 + }, + { + "epoch": 1.3438403612129084, + "grad_norm": 0.09885133802890778, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 347630 + }, + { + "epoch": 1.3438790184162916, + "grad_norm": 0.08774641901254654, + "learning_rate": 0.002, + "loss": 2.346, + "step": 347640 + }, + { + "epoch": 1.343917675619675, + "grad_norm": 0.11479109525680542, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 347650 + }, + { + "epoch": 1.3439563328230584, + "grad_norm": 0.09676390886306763, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 347660 + }, + { + "epoch": 1.3439949900264416, + "grad_norm": 0.08994947373867035, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 347670 + }, + { + "epoch": 1.3440336472298249, + "grad_norm": 0.09621430188417435, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 347680 + }, + { + "epoch": 1.3440723044332081, + "grad_norm": 0.10796340554952621, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 347690 + }, + { + "epoch": 1.3441109616365914, + "grad_norm": 0.15210728347301483, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 347700 + }, + { + "epoch": 1.3441496188399746, + "grad_norm": 0.12892857193946838, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 347710 + }, + { + "epoch": 1.3441882760433579, + "grad_norm": 0.10418952256441116, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 347720 + }, + { + "epoch": 1.3442269332467411, + "grad_norm": 0.11700917780399323, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 347730 + }, + { + "epoch": 1.3442655904501244, + "grad_norm": 0.09763418138027191, + "learning_rate": 0.002, + "loss": 2.329, + "step": 347740 + }, + { + "epoch": 1.3443042476535076, + "grad_norm": 0.10042642056941986, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 347750 + }, + { + "epoch": 1.344342904856891, + "grad_norm": 0.10885453969240189, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 347760 + }, + { + "epoch": 1.3443815620602744, + "grad_norm": 0.09971655905246735, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 347770 + }, + { + "epoch": 1.3444202192636576, + "grad_norm": 0.08985363692045212, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 347780 + }, + { + "epoch": 1.3444588764670409, + "grad_norm": 0.12392828613519669, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 347790 + }, + { + "epoch": 1.3444975336704241, + "grad_norm": 0.11634746938943863, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 347800 + }, + { + "epoch": 1.3445361908738074, + "grad_norm": 0.12065248936414719, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 347810 + }, + { + "epoch": 1.3445748480771906, + "grad_norm": 0.10127338021993637, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 347820 + }, + { + "epoch": 1.344613505280574, + "grad_norm": 0.10347548872232437, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 347830 + }, + { + "epoch": 1.3446521624839574, + "grad_norm": 0.11170519143342972, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 347840 + }, + { + "epoch": 1.3446908196873406, + "grad_norm": 0.09464278817176819, + "learning_rate": 0.002, + "loss": 2.3517, + "step": 347850 + }, + { + "epoch": 1.3447294768907239, + "grad_norm": 0.10744038224220276, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 347860 + }, + { + "epoch": 1.3447681340941071, + "grad_norm": 0.1069311797618866, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 347870 + }, + { + "epoch": 1.3448067912974904, + "grad_norm": 0.11059171706438065, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 347880 + }, + { + "epoch": 1.3448454485008736, + "grad_norm": 0.09943494945764542, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 347890 + }, + { + "epoch": 1.3448841057042569, + "grad_norm": 0.10599261522293091, + "learning_rate": 0.002, + "loss": 2.333, + "step": 347900 + }, + { + "epoch": 1.3449227629076401, + "grad_norm": 0.10283409804105759, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 347910 + }, + { + "epoch": 1.3449614201110234, + "grad_norm": 0.1135038286447525, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 347920 + }, + { + "epoch": 1.3450000773144066, + "grad_norm": 0.11863948404788971, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 347930 + }, + { + "epoch": 1.34503873451779, + "grad_norm": 0.10357358306646347, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 347940 + }, + { + "epoch": 1.3450773917211734, + "grad_norm": 0.12235188484191895, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 347950 + }, + { + "epoch": 1.3451160489245566, + "grad_norm": 0.11046276986598969, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 347960 + }, + { + "epoch": 1.3451547061279399, + "grad_norm": 0.11126643419265747, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 347970 + }, + { + "epoch": 1.345193363331323, + "grad_norm": 0.10441526770591736, + "learning_rate": 0.002, + "loss": 2.3106, + "step": 347980 + }, + { + "epoch": 1.3452320205347064, + "grad_norm": 0.11165712773799896, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 347990 + }, + { + "epoch": 1.3452706777380898, + "grad_norm": 0.10283760726451874, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 348000 + }, + { + "epoch": 1.345309334941473, + "grad_norm": 0.12190571427345276, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 348010 + }, + { + "epoch": 1.3453479921448563, + "grad_norm": 0.10616656392812729, + "learning_rate": 0.002, + "loss": 2.327, + "step": 348020 + }, + { + "epoch": 1.3453866493482396, + "grad_norm": 0.11338208615779877, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 348030 + }, + { + "epoch": 1.3454253065516228, + "grad_norm": 0.1024947240948677, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 348040 + }, + { + "epoch": 1.345463963755006, + "grad_norm": 0.13262341916561127, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 348050 + }, + { + "epoch": 1.3455026209583894, + "grad_norm": 0.10618232935667038, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 348060 + }, + { + "epoch": 1.3455412781617726, + "grad_norm": 0.10304050147533417, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 348070 + }, + { + "epoch": 1.3455799353651559, + "grad_norm": 0.0965445265173912, + "learning_rate": 0.002, + "loss": 2.3547, + "step": 348080 + }, + { + "epoch": 1.345618592568539, + "grad_norm": 0.11161847412586212, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 348090 + }, + { + "epoch": 1.3456572497719226, + "grad_norm": 0.1187560185790062, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 348100 + }, + { + "epoch": 1.3456959069753058, + "grad_norm": 0.09679316729307175, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 348110 + }, + { + "epoch": 1.345734564178689, + "grad_norm": 0.12807625532150269, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 348120 + }, + { + "epoch": 1.3457732213820723, + "grad_norm": 0.10582450777292252, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 348130 + }, + { + "epoch": 1.3458118785854556, + "grad_norm": 0.10744981467723846, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 348140 + }, + { + "epoch": 1.3458505357888388, + "grad_norm": 0.10922669619321823, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 348150 + }, + { + "epoch": 1.345889192992222, + "grad_norm": 0.10119405388832092, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 348160 + }, + { + "epoch": 1.3459278501956056, + "grad_norm": 0.11563913524150848, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 348170 + }, + { + "epoch": 1.3459665073989888, + "grad_norm": 0.10904233902692795, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 348180 + }, + { + "epoch": 1.346005164602372, + "grad_norm": 0.11013088375329971, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 348190 + }, + { + "epoch": 1.3460438218057553, + "grad_norm": 0.09150010347366333, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 348200 + }, + { + "epoch": 1.3460824790091386, + "grad_norm": 0.1301971673965454, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 348210 + }, + { + "epoch": 1.3461211362125218, + "grad_norm": 0.10532008856534958, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 348220 + }, + { + "epoch": 1.346159793415905, + "grad_norm": 0.10974858701229095, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 348230 + }, + { + "epoch": 1.3461984506192883, + "grad_norm": 0.16881871223449707, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 348240 + }, + { + "epoch": 1.3462371078226716, + "grad_norm": 0.0989382266998291, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 348250 + }, + { + "epoch": 1.3462757650260548, + "grad_norm": 0.09359890222549438, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 348260 + }, + { + "epoch": 1.3463144222294383, + "grad_norm": 0.12183345854282379, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 348270 + }, + { + "epoch": 1.3463530794328216, + "grad_norm": 0.12346727401018143, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 348280 + }, + { + "epoch": 1.3463917366362048, + "grad_norm": 0.10327605903148651, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 348290 + }, + { + "epoch": 1.346430393839588, + "grad_norm": 0.10280732810497284, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 348300 + }, + { + "epoch": 1.3464690510429713, + "grad_norm": 0.11782421916723251, + "learning_rate": 0.002, + "loss": 2.335, + "step": 348310 + }, + { + "epoch": 1.3465077082463546, + "grad_norm": 0.09593365341424942, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 348320 + }, + { + "epoch": 1.3465463654497378, + "grad_norm": 0.09912115335464478, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 348330 + }, + { + "epoch": 1.3465850226531213, + "grad_norm": 0.10711175948381424, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 348340 + }, + { + "epoch": 1.3466236798565046, + "grad_norm": 0.11354491114616394, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 348350 + }, + { + "epoch": 1.3466623370598878, + "grad_norm": 0.11319012194871902, + "learning_rate": 0.002, + "loss": 2.3163, + "step": 348360 + }, + { + "epoch": 1.346700994263271, + "grad_norm": 0.1476483941078186, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 348370 + }, + { + "epoch": 1.3467396514666543, + "grad_norm": 0.10306962579488754, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 348380 + }, + { + "epoch": 1.3467783086700376, + "grad_norm": 0.10173012316226959, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 348390 + }, + { + "epoch": 1.3468169658734208, + "grad_norm": 0.0901515930891037, + "learning_rate": 0.002, + "loss": 2.323, + "step": 348400 + }, + { + "epoch": 1.346855623076804, + "grad_norm": 0.11575210839509964, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 348410 + }, + { + "epoch": 1.3468942802801873, + "grad_norm": 0.11052270233631134, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 348420 + }, + { + "epoch": 1.3469329374835706, + "grad_norm": 0.1113486960530281, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 348430 + }, + { + "epoch": 1.346971594686954, + "grad_norm": 0.10080469399690628, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 348440 + }, + { + "epoch": 1.3470102518903373, + "grad_norm": 0.09699404239654541, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 348450 + }, + { + "epoch": 1.3470489090937205, + "grad_norm": 0.09992402046918869, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 348460 + }, + { + "epoch": 1.3470875662971038, + "grad_norm": 0.10376543551683426, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 348470 + }, + { + "epoch": 1.347126223500487, + "grad_norm": 0.11510486155748367, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 348480 + }, + { + "epoch": 1.3471648807038703, + "grad_norm": 0.11083221435546875, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 348490 + }, + { + "epoch": 1.3472035379072536, + "grad_norm": 0.12963175773620605, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 348500 + }, + { + "epoch": 1.347242195110637, + "grad_norm": 0.1083407923579216, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 348510 + }, + { + "epoch": 1.3472808523140203, + "grad_norm": 0.11508948355913162, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 348520 + }, + { + "epoch": 1.3473195095174035, + "grad_norm": 0.09246627986431122, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 348530 + }, + { + "epoch": 1.3473581667207868, + "grad_norm": 0.10896582156419754, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 348540 + }, + { + "epoch": 1.34739682392417, + "grad_norm": 0.10738394409418106, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 348550 + }, + { + "epoch": 1.3474354811275533, + "grad_norm": 0.09998328238725662, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 348560 + }, + { + "epoch": 1.3474741383309365, + "grad_norm": 0.10204174369573593, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 348570 + }, + { + "epoch": 1.3475127955343198, + "grad_norm": 0.11803248524665833, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 348580 + }, + { + "epoch": 1.347551452737703, + "grad_norm": 0.10468433797359467, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 348590 + }, + { + "epoch": 1.3475901099410863, + "grad_norm": 0.11109494417905807, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 348600 + }, + { + "epoch": 1.3476287671444698, + "grad_norm": 0.1056119054555893, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 348610 + }, + { + "epoch": 1.347667424347853, + "grad_norm": 0.10069143772125244, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 348620 + }, + { + "epoch": 1.3477060815512363, + "grad_norm": 0.12363746762275696, + "learning_rate": 0.002, + "loss": 2.323, + "step": 348630 + }, + { + "epoch": 1.3477447387546195, + "grad_norm": 0.11203087866306305, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 348640 + }, + { + "epoch": 1.3477833959580028, + "grad_norm": 0.09826168417930603, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 348650 + }, + { + "epoch": 1.347822053161386, + "grad_norm": 0.10257137566804886, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 348660 + }, + { + "epoch": 1.3478607103647695, + "grad_norm": 0.10058474540710449, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 348670 + }, + { + "epoch": 1.3478993675681528, + "grad_norm": 0.10483745485544205, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 348680 + }, + { + "epoch": 1.347938024771536, + "grad_norm": 0.12295182794332504, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 348690 + }, + { + "epoch": 1.3479766819749193, + "grad_norm": 0.12495746463537216, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 348700 + }, + { + "epoch": 1.3480153391783025, + "grad_norm": 0.10037694126367569, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 348710 + }, + { + "epoch": 1.3480539963816858, + "grad_norm": 0.10300444066524506, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 348720 + }, + { + "epoch": 1.348092653585069, + "grad_norm": 0.11531723290681839, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 348730 + }, + { + "epoch": 1.3481313107884523, + "grad_norm": 0.09703969955444336, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 348740 + }, + { + "epoch": 1.3481699679918355, + "grad_norm": 0.11398639529943466, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 348750 + }, + { + "epoch": 1.3482086251952188, + "grad_norm": 0.09538202732801437, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 348760 + }, + { + "epoch": 1.348247282398602, + "grad_norm": 0.10117063671350479, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 348770 + }, + { + "epoch": 1.3482859396019855, + "grad_norm": 0.1012919619679451, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 348780 + }, + { + "epoch": 1.3483245968053688, + "grad_norm": 0.10244598984718323, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 348790 + }, + { + "epoch": 1.348363254008752, + "grad_norm": 0.11705554276704788, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 348800 + }, + { + "epoch": 1.3484019112121353, + "grad_norm": 0.10573650896549225, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 348810 + }, + { + "epoch": 1.3484405684155185, + "grad_norm": 0.16497986018657684, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 348820 + }, + { + "epoch": 1.3484792256189018, + "grad_norm": 0.09472975134849548, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 348830 + }, + { + "epoch": 1.3485178828222852, + "grad_norm": 0.12757426500320435, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 348840 + }, + { + "epoch": 1.3485565400256685, + "grad_norm": 0.10717236250638962, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 348850 + }, + { + "epoch": 1.3485951972290517, + "grad_norm": 0.09877166152000427, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 348860 + }, + { + "epoch": 1.348633854432435, + "grad_norm": 0.12250571697950363, + "learning_rate": 0.002, + "loss": 2.344, + "step": 348870 + }, + { + "epoch": 1.3486725116358182, + "grad_norm": 0.12091425806283951, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 348880 + }, + { + "epoch": 1.3487111688392015, + "grad_norm": 0.1172761395573616, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 348890 + }, + { + "epoch": 1.3487498260425848, + "grad_norm": 0.10755904018878937, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 348900 + }, + { + "epoch": 1.348788483245968, + "grad_norm": 0.10510231554508209, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 348910 + }, + { + "epoch": 1.3488271404493513, + "grad_norm": 0.15903323888778687, + "learning_rate": 0.002, + "loss": 2.337, + "step": 348920 + }, + { + "epoch": 1.3488657976527345, + "grad_norm": 0.11088036000728607, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 348930 + }, + { + "epoch": 1.3489044548561178, + "grad_norm": 0.16186174750328064, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 348940 + }, + { + "epoch": 1.3489431120595012, + "grad_norm": 0.09969766438007355, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 348950 + }, + { + "epoch": 1.3489817692628845, + "grad_norm": 0.10213224589824677, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 348960 + }, + { + "epoch": 1.3490204264662677, + "grad_norm": 0.10633454471826553, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 348970 + }, + { + "epoch": 1.349059083669651, + "grad_norm": 0.09949535876512527, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 348980 + }, + { + "epoch": 1.3490977408730342, + "grad_norm": 0.10401295125484467, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 348990 + }, + { + "epoch": 1.3491363980764175, + "grad_norm": 0.1099301129579544, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 349000 + }, + { + "epoch": 1.349175055279801, + "grad_norm": 0.1048002764582634, + "learning_rate": 0.002, + "loss": 2.34, + "step": 349010 + }, + { + "epoch": 1.3492137124831842, + "grad_norm": 0.14082182943820953, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 349020 + }, + { + "epoch": 1.3492523696865675, + "grad_norm": 0.09485018253326416, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 349030 + }, + { + "epoch": 1.3492910268899507, + "grad_norm": 0.11179543286561966, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 349040 + }, + { + "epoch": 1.349329684093334, + "grad_norm": 0.1056070551276207, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 349050 + }, + { + "epoch": 1.3493683412967172, + "grad_norm": 0.09375675767660141, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 349060 + }, + { + "epoch": 1.3494069985001005, + "grad_norm": 0.09856487810611725, + "learning_rate": 0.002, + "loss": 2.323, + "step": 349070 + }, + { + "epoch": 1.3494456557034837, + "grad_norm": 0.10716529190540314, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 349080 + }, + { + "epoch": 1.349484312906867, + "grad_norm": 0.09925325959920883, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 349090 + }, + { + "epoch": 1.3495229701102502, + "grad_norm": 0.10733172297477722, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 349100 + }, + { + "epoch": 1.3495616273136335, + "grad_norm": 0.11140187084674835, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 349110 + }, + { + "epoch": 1.349600284517017, + "grad_norm": 0.12716157734394073, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 349120 + }, + { + "epoch": 1.3496389417204002, + "grad_norm": 0.11385093629360199, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 349130 + }, + { + "epoch": 1.3496775989237835, + "grad_norm": 0.10158204287290573, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 349140 + }, + { + "epoch": 1.3497162561271667, + "grad_norm": 0.11852376908063889, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 349150 + }, + { + "epoch": 1.34975491333055, + "grad_norm": 0.10581693798303604, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 349160 + }, + { + "epoch": 1.3497935705339332, + "grad_norm": 0.10468754917383194, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 349170 + }, + { + "epoch": 1.3498322277373167, + "grad_norm": 0.09875061362981796, + "learning_rate": 0.002, + "loss": 2.332, + "step": 349180 + }, + { + "epoch": 1.3498708849407, + "grad_norm": 0.11517211049795151, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 349190 + }, + { + "epoch": 1.3499095421440832, + "grad_norm": 0.10867352038621902, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 349200 + }, + { + "epoch": 1.3499481993474665, + "grad_norm": 0.09441282600164413, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 349210 + }, + { + "epoch": 1.3499868565508497, + "grad_norm": 0.1236996278166771, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 349220 + }, + { + "epoch": 1.350025513754233, + "grad_norm": 0.09415584057569504, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 349230 + }, + { + "epoch": 1.3500641709576162, + "grad_norm": 0.08901593089103699, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 349240 + }, + { + "epoch": 1.3501028281609995, + "grad_norm": 0.10068287700414658, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 349250 + }, + { + "epoch": 1.3501414853643827, + "grad_norm": 0.09829511493444443, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 349260 + }, + { + "epoch": 1.350180142567766, + "grad_norm": 0.10522589832544327, + "learning_rate": 0.002, + "loss": 2.33, + "step": 349270 + }, + { + "epoch": 1.3502187997711492, + "grad_norm": 0.1001722514629364, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 349280 + }, + { + "epoch": 1.3502574569745327, + "grad_norm": 0.11357532441616058, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 349290 + }, + { + "epoch": 1.350296114177916, + "grad_norm": 0.10763426870107651, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 349300 + }, + { + "epoch": 1.3503347713812992, + "grad_norm": 0.12531541287899017, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 349310 + }, + { + "epoch": 1.3503734285846825, + "grad_norm": 0.09431840479373932, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 349320 + }, + { + "epoch": 1.3504120857880657, + "grad_norm": 0.10716231912374496, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 349330 + }, + { + "epoch": 1.350450742991449, + "grad_norm": 0.13337059319019318, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 349340 + }, + { + "epoch": 1.3504894001948324, + "grad_norm": 0.09658301621675491, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 349350 + }, + { + "epoch": 1.3505280573982157, + "grad_norm": 0.11998984217643738, + "learning_rate": 0.002, + "loss": 2.348, + "step": 349360 + }, + { + "epoch": 1.350566714601599, + "grad_norm": 0.22490796446800232, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 349370 + }, + { + "epoch": 1.3506053718049822, + "grad_norm": 0.12542197108268738, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 349380 + }, + { + "epoch": 1.3506440290083654, + "grad_norm": 0.101040780544281, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 349390 + }, + { + "epoch": 1.3506826862117487, + "grad_norm": 0.1157328262925148, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 349400 + }, + { + "epoch": 1.350721343415132, + "grad_norm": 0.10763093084096909, + "learning_rate": 0.002, + "loss": 2.332, + "step": 349410 + }, + { + "epoch": 1.3507600006185152, + "grad_norm": 0.10246003419160843, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 349420 + }, + { + "epoch": 1.3507986578218985, + "grad_norm": 0.09681911766529083, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 349430 + }, + { + "epoch": 1.3508373150252817, + "grad_norm": 0.10902806371450424, + "learning_rate": 0.002, + "loss": 2.34, + "step": 349440 + }, + { + "epoch": 1.350875972228665, + "grad_norm": 0.08912651985883713, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 349450 + }, + { + "epoch": 1.3509146294320484, + "grad_norm": 0.11318572610616684, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 349460 + }, + { + "epoch": 1.3509532866354317, + "grad_norm": 0.12583884596824646, + "learning_rate": 0.002, + "loss": 2.321, + "step": 349470 + }, + { + "epoch": 1.350991943838815, + "grad_norm": 0.10462572425603867, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 349480 + }, + { + "epoch": 1.3510306010421982, + "grad_norm": 0.1022852286696434, + "learning_rate": 0.002, + "loss": 2.3612, + "step": 349490 + }, + { + "epoch": 1.3510692582455814, + "grad_norm": 0.11671693623065948, + "learning_rate": 0.002, + "loss": 2.3121, + "step": 349500 + }, + { + "epoch": 1.3511079154489647, + "grad_norm": 0.10456795245409012, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 349510 + }, + { + "epoch": 1.3511465726523482, + "grad_norm": 0.10208845138549805, + "learning_rate": 0.002, + "loss": 2.345, + "step": 349520 + }, + { + "epoch": 1.3511852298557314, + "grad_norm": 0.09636218845844269, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 349530 + }, + { + "epoch": 1.3512238870591147, + "grad_norm": 0.11306154727935791, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 349540 + }, + { + "epoch": 1.351262544262498, + "grad_norm": 0.21951696276664734, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 349550 + }, + { + "epoch": 1.3513012014658812, + "grad_norm": 0.11239375919103622, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 349560 + }, + { + "epoch": 1.3513398586692644, + "grad_norm": 0.10407473891973495, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 349570 + }, + { + "epoch": 1.3513785158726477, + "grad_norm": 0.09843426942825317, + "learning_rate": 0.002, + "loss": 2.338, + "step": 349580 + }, + { + "epoch": 1.351417173076031, + "grad_norm": 0.1171356588602066, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 349590 + }, + { + "epoch": 1.3514558302794142, + "grad_norm": 0.10250692814588547, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 349600 + }, + { + "epoch": 1.3514944874827974, + "grad_norm": 0.12931481003761292, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 349610 + }, + { + "epoch": 1.3515331446861807, + "grad_norm": 0.11812961846590042, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 349620 + }, + { + "epoch": 1.3515718018895642, + "grad_norm": 0.09838510304689407, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 349630 + }, + { + "epoch": 1.3516104590929474, + "grad_norm": 0.08915498852729797, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 349640 + }, + { + "epoch": 1.3516491162963307, + "grad_norm": 0.11878800392150879, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 349650 + }, + { + "epoch": 1.351687773499714, + "grad_norm": 0.10394832491874695, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 349660 + }, + { + "epoch": 1.3517264307030972, + "grad_norm": 0.12385991215705872, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 349670 + }, + { + "epoch": 1.3517650879064804, + "grad_norm": 0.1123647391796112, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 349680 + }, + { + "epoch": 1.351803745109864, + "grad_norm": 0.10797573626041412, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 349690 + }, + { + "epoch": 1.3518424023132471, + "grad_norm": 0.10810215026140213, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 349700 + }, + { + "epoch": 1.3518810595166304, + "grad_norm": 0.08901609480381012, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 349710 + }, + { + "epoch": 1.3519197167200137, + "grad_norm": 0.11496555805206299, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 349720 + }, + { + "epoch": 1.351958373923397, + "grad_norm": 0.10802195221185684, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 349730 + }, + { + "epoch": 1.3519970311267802, + "grad_norm": 0.11436771601438522, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 349740 + }, + { + "epoch": 1.3520356883301634, + "grad_norm": 0.09937600791454315, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 349750 + }, + { + "epoch": 1.3520743455335467, + "grad_norm": 0.09579189121723175, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 349760 + }, + { + "epoch": 1.35211300273693, + "grad_norm": 0.11161386221647263, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 349770 + }, + { + "epoch": 1.3521516599403132, + "grad_norm": 0.10039962083101273, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 349780 + }, + { + "epoch": 1.3521903171436964, + "grad_norm": 0.11212499439716339, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 349790 + }, + { + "epoch": 1.35222897434708, + "grad_norm": 0.11101280152797699, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 349800 + }, + { + "epoch": 1.3522676315504631, + "grad_norm": 0.10846508294343948, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 349810 + }, + { + "epoch": 1.3523062887538464, + "grad_norm": 0.09751371294260025, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 349820 + }, + { + "epoch": 1.3523449459572296, + "grad_norm": 0.09648773819208145, + "learning_rate": 0.002, + "loss": 2.342, + "step": 349830 + }, + { + "epoch": 1.352383603160613, + "grad_norm": 0.08779910206794739, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 349840 + }, + { + "epoch": 1.3524222603639962, + "grad_norm": 0.10542741417884827, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 349850 + }, + { + "epoch": 1.3524609175673796, + "grad_norm": 0.09809595346450806, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 349860 + }, + { + "epoch": 1.3524995747707629, + "grad_norm": 0.11002374440431595, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 349870 + }, + { + "epoch": 1.3525382319741461, + "grad_norm": 0.11683285981416702, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 349880 + }, + { + "epoch": 1.3525768891775294, + "grad_norm": 0.6189273595809937, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 349890 + }, + { + "epoch": 1.3526155463809126, + "grad_norm": 0.12348686903715134, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 349900 + }, + { + "epoch": 1.3526542035842959, + "grad_norm": 0.11133924126625061, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 349910 + }, + { + "epoch": 1.3526928607876791, + "grad_norm": 0.13373364508152008, + "learning_rate": 0.002, + "loss": 2.34, + "step": 349920 + }, + { + "epoch": 1.3527315179910624, + "grad_norm": 0.10956840217113495, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 349930 + }, + { + "epoch": 1.3527701751944456, + "grad_norm": 0.09929367899894714, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 349940 + }, + { + "epoch": 1.352808832397829, + "grad_norm": 0.12658318877220154, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 349950 + }, + { + "epoch": 1.3528474896012124, + "grad_norm": 0.09387831389904022, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 349960 + }, + { + "epoch": 1.3528861468045956, + "grad_norm": 0.09798414260149002, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 349970 + }, + { + "epoch": 1.3529248040079789, + "grad_norm": 0.11518450081348419, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 349980 + }, + { + "epoch": 1.3529634612113621, + "grad_norm": 0.10312693566083908, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 349990 + }, + { + "epoch": 1.3530021184147454, + "grad_norm": 0.10474686324596405, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 350000 + }, + { + "epoch": 1.3530407756181286, + "grad_norm": 0.11365187913179398, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 350010 + }, + { + "epoch": 1.3530794328215119, + "grad_norm": 0.12246767431497574, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 350020 + }, + { + "epoch": 1.3531180900248954, + "grad_norm": 0.09271487593650818, + "learning_rate": 0.002, + "loss": 2.347, + "step": 350030 + }, + { + "epoch": 1.3531567472282786, + "grad_norm": 0.11318147927522659, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 350040 + }, + { + "epoch": 1.3531954044316619, + "grad_norm": 0.11305706202983856, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 350050 + }, + { + "epoch": 1.3532340616350451, + "grad_norm": 0.08688071370124817, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 350060 + }, + { + "epoch": 1.3532727188384284, + "grad_norm": 0.09745533764362335, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 350070 + }, + { + "epoch": 1.3533113760418116, + "grad_norm": 0.09462425112724304, + "learning_rate": 0.002, + "loss": 2.336, + "step": 350080 + }, + { + "epoch": 1.3533500332451949, + "grad_norm": 0.09649679064750671, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 350090 + }, + { + "epoch": 1.3533886904485781, + "grad_norm": 0.09767809510231018, + "learning_rate": 0.002, + "loss": 2.344, + "step": 350100 + }, + { + "epoch": 1.3534273476519614, + "grad_norm": 0.09438911825418472, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 350110 + }, + { + "epoch": 1.3534660048553446, + "grad_norm": 0.10685234516859055, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 350120 + }, + { + "epoch": 1.353504662058728, + "grad_norm": 0.10139115899801254, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 350130 + }, + { + "epoch": 1.3535433192621114, + "grad_norm": 0.09849409013986588, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 350140 + }, + { + "epoch": 1.3535819764654946, + "grad_norm": 0.09482110291719437, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 350150 + }, + { + "epoch": 1.3536206336688779, + "grad_norm": 0.12369535118341446, + "learning_rate": 0.002, + "loss": 2.338, + "step": 350160 + }, + { + "epoch": 1.3536592908722611, + "grad_norm": 0.11014200001955032, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 350170 + }, + { + "epoch": 1.3536979480756444, + "grad_norm": 0.10463564097881317, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 350180 + }, + { + "epoch": 1.3537366052790276, + "grad_norm": 0.09717349708080292, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 350190 + }, + { + "epoch": 1.353775262482411, + "grad_norm": 0.09717091917991638, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 350200 + }, + { + "epoch": 1.3538139196857943, + "grad_norm": 0.09814690798521042, + "learning_rate": 0.002, + "loss": 2.335, + "step": 350210 + }, + { + "epoch": 1.3538525768891776, + "grad_norm": 0.11123622953891754, + "learning_rate": 0.002, + "loss": 2.3142, + "step": 350220 + }, + { + "epoch": 1.3538912340925608, + "grad_norm": 0.11225827038288116, + "learning_rate": 0.002, + "loss": 2.34, + "step": 350230 + }, + { + "epoch": 1.353929891295944, + "grad_norm": 0.09374217689037323, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 350240 + }, + { + "epoch": 1.3539685484993274, + "grad_norm": 0.1104871854186058, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 350250 + }, + { + "epoch": 1.3540072057027106, + "grad_norm": 0.09894196689128876, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 350260 + }, + { + "epoch": 1.3540458629060939, + "grad_norm": 0.12607525289058685, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 350270 + }, + { + "epoch": 1.354084520109477, + "grad_norm": 0.11109902709722519, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 350280 + }, + { + "epoch": 1.3541231773128604, + "grad_norm": 0.10130560398101807, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 350290 + }, + { + "epoch": 1.3541618345162438, + "grad_norm": 0.09814395010471344, + "learning_rate": 0.002, + "loss": 2.332, + "step": 350300 + }, + { + "epoch": 1.354200491719627, + "grad_norm": 0.10805349797010422, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 350310 + }, + { + "epoch": 1.3542391489230103, + "grad_norm": 0.09742667526006699, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 350320 + }, + { + "epoch": 1.3542778061263936, + "grad_norm": 0.10429896414279938, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 350330 + }, + { + "epoch": 1.3543164633297768, + "grad_norm": 0.09611230343580246, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 350340 + }, + { + "epoch": 1.35435512053316, + "grad_norm": 0.23421809077262878, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 350350 + }, + { + "epoch": 1.3543937777365433, + "grad_norm": 0.10822147876024246, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 350360 + }, + { + "epoch": 1.3544324349399268, + "grad_norm": 0.10089278966188431, + "learning_rate": 0.002, + "loss": 2.335, + "step": 350370 + }, + { + "epoch": 1.35447109214331, + "grad_norm": 0.11248739063739777, + "learning_rate": 0.002, + "loss": 2.363, + "step": 350380 + }, + { + "epoch": 1.3545097493466933, + "grad_norm": 0.12073154747486115, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 350390 + }, + { + "epoch": 1.3545484065500766, + "grad_norm": 0.10004567354917526, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 350400 + }, + { + "epoch": 1.3545870637534598, + "grad_norm": 0.12040378898382187, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 350410 + }, + { + "epoch": 1.354625720956843, + "grad_norm": 0.09634535014629364, + "learning_rate": 0.002, + "loss": 2.338, + "step": 350420 + }, + { + "epoch": 1.3546643781602263, + "grad_norm": 0.10254807770252228, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 350430 + }, + { + "epoch": 1.3547030353636096, + "grad_norm": 0.11771095544099808, + "learning_rate": 0.002, + "loss": 2.324, + "step": 350440 + }, + { + "epoch": 1.3547416925669928, + "grad_norm": 0.10953339189291, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 350450 + }, + { + "epoch": 1.354780349770376, + "grad_norm": 0.12906228005886078, + "learning_rate": 0.002, + "loss": 2.332, + "step": 350460 + }, + { + "epoch": 1.3548190069737596, + "grad_norm": 0.10269488394260406, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 350470 + }, + { + "epoch": 1.3548576641771428, + "grad_norm": 0.10697735846042633, + "learning_rate": 0.002, + "loss": 2.346, + "step": 350480 + }, + { + "epoch": 1.354896321380526, + "grad_norm": 0.09774033725261688, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 350490 + }, + { + "epoch": 1.3549349785839093, + "grad_norm": 0.1603395640850067, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 350500 + }, + { + "epoch": 1.3549736357872926, + "grad_norm": 0.12372355908155441, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 350510 + }, + { + "epoch": 1.3550122929906758, + "grad_norm": 0.11275654286146164, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 350520 + }, + { + "epoch": 1.355050950194059, + "grad_norm": 0.09468483924865723, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 350530 + }, + { + "epoch": 1.3550896073974426, + "grad_norm": 0.10514874756336212, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 350540 + }, + { + "epoch": 1.3551282646008258, + "grad_norm": 0.10283022373914719, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 350550 + }, + { + "epoch": 1.355166921804209, + "grad_norm": 0.09229005128145218, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 350560 + }, + { + "epoch": 1.3552055790075923, + "grad_norm": 0.14236804842948914, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 350570 + }, + { + "epoch": 1.3552442362109756, + "grad_norm": 0.09927023947238922, + "learning_rate": 0.002, + "loss": 2.322, + "step": 350580 + }, + { + "epoch": 1.3552828934143588, + "grad_norm": 0.09392958134412766, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 350590 + }, + { + "epoch": 1.355321550617742, + "grad_norm": 0.11411365866661072, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 350600 + }, + { + "epoch": 1.3553602078211253, + "grad_norm": 0.09241783618927002, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 350610 + }, + { + "epoch": 1.3553988650245086, + "grad_norm": 0.08613024652004242, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 350620 + }, + { + "epoch": 1.3554375222278918, + "grad_norm": 0.10598557442426682, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 350630 + }, + { + "epoch": 1.3554761794312753, + "grad_norm": 0.09892724454402924, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 350640 + }, + { + "epoch": 1.3555148366346585, + "grad_norm": 0.09699424356222153, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 350650 + }, + { + "epoch": 1.3555534938380418, + "grad_norm": 0.11693083494901657, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 350660 + }, + { + "epoch": 1.355592151041425, + "grad_norm": 0.09478957206010818, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 350670 + }, + { + "epoch": 1.3556308082448083, + "grad_norm": 0.10289830714464188, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 350680 + }, + { + "epoch": 1.3556694654481916, + "grad_norm": 0.09411660581827164, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 350690 + }, + { + "epoch": 1.355708122651575, + "grad_norm": 0.1026243194937706, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 350700 + }, + { + "epoch": 1.3557467798549583, + "grad_norm": 0.11500430107116699, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 350710 + }, + { + "epoch": 1.3557854370583415, + "grad_norm": 0.105681873857975, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 350720 + }, + { + "epoch": 1.3558240942617248, + "grad_norm": 0.08912233263254166, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 350730 + }, + { + "epoch": 1.355862751465108, + "grad_norm": 0.0909724161028862, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 350740 + }, + { + "epoch": 1.3559014086684913, + "grad_norm": 0.10319610685110092, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 350750 + }, + { + "epoch": 1.3559400658718745, + "grad_norm": 0.10236437618732452, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 350760 + }, + { + "epoch": 1.3559787230752578, + "grad_norm": 0.09389732778072357, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 350770 + }, + { + "epoch": 1.356017380278641, + "grad_norm": 0.1131930947303772, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 350780 + }, + { + "epoch": 1.3560560374820243, + "grad_norm": 0.10111220926046371, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 350790 + }, + { + "epoch": 1.3560946946854076, + "grad_norm": 0.09880499541759491, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 350800 + }, + { + "epoch": 1.356133351888791, + "grad_norm": 0.09543951600790024, + "learning_rate": 0.002, + "loss": 2.3104, + "step": 350810 + }, + { + "epoch": 1.3561720090921743, + "grad_norm": 0.10903646051883698, + "learning_rate": 0.002, + "loss": 2.326, + "step": 350820 + }, + { + "epoch": 1.3562106662955575, + "grad_norm": 0.09331028163433075, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 350830 + }, + { + "epoch": 1.3562493234989408, + "grad_norm": 0.13966050744056702, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 350840 + }, + { + "epoch": 1.356287980702324, + "grad_norm": 0.09700989723205566, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 350850 + }, + { + "epoch": 1.3563266379057073, + "grad_norm": 0.0926334485411644, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 350860 + }, + { + "epoch": 1.3563652951090908, + "grad_norm": 0.09399153292179108, + "learning_rate": 0.002, + "loss": 2.338, + "step": 350870 + }, + { + "epoch": 1.356403952312474, + "grad_norm": 0.1077730804681778, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 350880 + }, + { + "epoch": 1.3564426095158573, + "grad_norm": 0.1043611541390419, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 350890 + }, + { + "epoch": 1.3564812667192405, + "grad_norm": 0.10705311596393585, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 350900 + }, + { + "epoch": 1.3565199239226238, + "grad_norm": 0.11739810556173325, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 350910 + }, + { + "epoch": 1.356558581126007, + "grad_norm": 0.10188135504722595, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 350920 + }, + { + "epoch": 1.3565972383293903, + "grad_norm": 0.11405863612890244, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 350930 + }, + { + "epoch": 1.3566358955327735, + "grad_norm": 0.10311403125524521, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 350940 + }, + { + "epoch": 1.3566745527361568, + "grad_norm": 0.11608370393514633, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 350950 + }, + { + "epoch": 1.35671320993954, + "grad_norm": 0.10479841381311417, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 350960 + }, + { + "epoch": 1.3567518671429233, + "grad_norm": 0.1082431823015213, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 350970 + }, + { + "epoch": 1.3567905243463068, + "grad_norm": 0.12166009098291397, + "learning_rate": 0.002, + "loss": 2.3499, + "step": 350980 + }, + { + "epoch": 1.35682918154969, + "grad_norm": 0.10090559720993042, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 350990 + }, + { + "epoch": 1.3568678387530733, + "grad_norm": 0.11029312014579773, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 351000 + }, + { + "epoch": 1.3569064959564565, + "grad_norm": 0.11490152031183243, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 351010 + }, + { + "epoch": 1.3569451531598398, + "grad_norm": 0.10307451337575912, + "learning_rate": 0.002, + "loss": 2.326, + "step": 351020 + }, + { + "epoch": 1.356983810363223, + "grad_norm": 0.14087946712970734, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 351030 + }, + { + "epoch": 1.3570224675666065, + "grad_norm": 0.09875258058309555, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 351040 + }, + { + "epoch": 1.3570611247699897, + "grad_norm": 0.10351992398500443, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 351050 + }, + { + "epoch": 1.357099781973373, + "grad_norm": 0.10319630056619644, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 351060 + }, + { + "epoch": 1.3571384391767563, + "grad_norm": 0.12239965796470642, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 351070 + }, + { + "epoch": 1.3571770963801395, + "grad_norm": 0.10036638379096985, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 351080 + }, + { + "epoch": 1.3572157535835228, + "grad_norm": 0.13001009821891785, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 351090 + }, + { + "epoch": 1.357254410786906, + "grad_norm": 0.12365800142288208, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 351100 + }, + { + "epoch": 1.3572930679902893, + "grad_norm": 0.10206878185272217, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 351110 + }, + { + "epoch": 1.3573317251936725, + "grad_norm": 0.11229734122753143, + "learning_rate": 0.002, + "loss": 2.325, + "step": 351120 + }, + { + "epoch": 1.3573703823970558, + "grad_norm": 0.1121644452214241, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 351130 + }, + { + "epoch": 1.357409039600439, + "grad_norm": 0.10506831109523773, + "learning_rate": 0.002, + "loss": 2.346, + "step": 351140 + }, + { + "epoch": 1.3574476968038225, + "grad_norm": 0.10747390240430832, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 351150 + }, + { + "epoch": 1.3574863540072057, + "grad_norm": 0.10540571063756943, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 351160 + }, + { + "epoch": 1.357525011210589, + "grad_norm": 0.1473851203918457, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 351170 + }, + { + "epoch": 1.3575636684139722, + "grad_norm": 0.101223424077034, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 351180 + }, + { + "epoch": 1.3576023256173555, + "grad_norm": 0.09359554946422577, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 351190 + }, + { + "epoch": 1.3576409828207388, + "grad_norm": 0.1014396995306015, + "learning_rate": 0.002, + "loss": 2.331, + "step": 351200 + }, + { + "epoch": 1.3576796400241222, + "grad_norm": 0.11498954892158508, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 351210 + }, + { + "epoch": 1.3577182972275055, + "grad_norm": 0.11524534970521927, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 351220 + }, + { + "epoch": 1.3577569544308887, + "grad_norm": 0.13116922974586487, + "learning_rate": 0.002, + "loss": 2.321, + "step": 351230 + }, + { + "epoch": 1.357795611634272, + "grad_norm": 0.09713736921548843, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 351240 + }, + { + "epoch": 1.3578342688376552, + "grad_norm": 0.09525876492261887, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 351250 + }, + { + "epoch": 1.3578729260410385, + "grad_norm": 0.12362363934516907, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 351260 + }, + { + "epoch": 1.3579115832444217, + "grad_norm": 0.1071556881070137, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 351270 + }, + { + "epoch": 1.357950240447805, + "grad_norm": 0.09683693200349808, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 351280 + }, + { + "epoch": 1.3579888976511882, + "grad_norm": 0.10740751773118973, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 351290 + }, + { + "epoch": 1.3580275548545715, + "grad_norm": 0.09097153693437576, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 351300 + }, + { + "epoch": 1.3580662120579547, + "grad_norm": 0.1114463284611702, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 351310 + }, + { + "epoch": 1.3581048692613382, + "grad_norm": 0.14950144290924072, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 351320 + }, + { + "epoch": 1.3581435264647215, + "grad_norm": 0.08972033858299255, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 351330 + }, + { + "epoch": 1.3581821836681047, + "grad_norm": 0.1413266509771347, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 351340 + }, + { + "epoch": 1.358220840871488, + "grad_norm": 0.11101408302783966, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 351350 + }, + { + "epoch": 1.3582594980748712, + "grad_norm": 0.09727130085229874, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 351360 + }, + { + "epoch": 1.3582981552782545, + "grad_norm": 0.11083702743053436, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 351370 + }, + { + "epoch": 1.358336812481638, + "grad_norm": 0.09823834896087646, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 351380 + }, + { + "epoch": 1.3583754696850212, + "grad_norm": 0.11632708460092545, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 351390 + }, + { + "epoch": 1.3584141268884045, + "grad_norm": 0.13511040806770325, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 351400 + }, + { + "epoch": 1.3584527840917877, + "grad_norm": 0.10550139844417572, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 351410 + }, + { + "epoch": 1.358491441295171, + "grad_norm": 0.0951557382941246, + "learning_rate": 0.002, + "loss": 2.33, + "step": 351420 + }, + { + "epoch": 1.3585300984985542, + "grad_norm": 0.10970769822597504, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 351430 + }, + { + "epoch": 1.3585687557019375, + "grad_norm": 0.11074388772249222, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 351440 + }, + { + "epoch": 1.3586074129053207, + "grad_norm": 0.08945836871862411, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 351450 + }, + { + "epoch": 1.358646070108704, + "grad_norm": 0.10373366624116898, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 351460 + }, + { + "epoch": 1.3586847273120872, + "grad_norm": 0.09923812747001648, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 351470 + }, + { + "epoch": 1.3587233845154705, + "grad_norm": 0.11788978427648544, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 351480 + }, + { + "epoch": 1.358762041718854, + "grad_norm": 0.11119283735752106, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 351490 + }, + { + "epoch": 1.3588006989222372, + "grad_norm": 0.09925679862499237, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 351500 + }, + { + "epoch": 1.3588393561256205, + "grad_norm": 0.10013158619403839, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 351510 + }, + { + "epoch": 1.3588780133290037, + "grad_norm": 0.11524486541748047, + "learning_rate": 0.002, + "loss": 2.34, + "step": 351520 + }, + { + "epoch": 1.358916670532387, + "grad_norm": 0.10622724145650864, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 351530 + }, + { + "epoch": 1.3589553277357702, + "grad_norm": 0.09704194217920303, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 351540 + }, + { + "epoch": 1.3589939849391537, + "grad_norm": 0.11838492751121521, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 351550 + }, + { + "epoch": 1.359032642142537, + "grad_norm": 0.0970839112997055, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 351560 + }, + { + "epoch": 1.3590712993459202, + "grad_norm": 0.10759337991476059, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 351570 + }, + { + "epoch": 1.3591099565493034, + "grad_norm": 0.0967593565583229, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 351580 + }, + { + "epoch": 1.3591486137526867, + "grad_norm": 0.10393026471138, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 351590 + }, + { + "epoch": 1.35918727095607, + "grad_norm": 0.10174359381198883, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 351600 + }, + { + "epoch": 1.3592259281594532, + "grad_norm": 0.09679939597845078, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 351610 + }, + { + "epoch": 1.3592645853628365, + "grad_norm": 0.10922277718782425, + "learning_rate": 0.002, + "loss": 2.329, + "step": 351620 + }, + { + "epoch": 1.3593032425662197, + "grad_norm": 0.14478425681591034, + "learning_rate": 0.002, + "loss": 2.332, + "step": 351630 + }, + { + "epoch": 1.359341899769603, + "grad_norm": 0.10015156120061874, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 351640 + }, + { + "epoch": 1.3593805569729862, + "grad_norm": 0.12160390615463257, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 351650 + }, + { + "epoch": 1.3594192141763697, + "grad_norm": 0.10180576890707016, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 351660 + }, + { + "epoch": 1.359457871379753, + "grad_norm": 0.10145292431116104, + "learning_rate": 0.002, + "loss": 2.331, + "step": 351670 + }, + { + "epoch": 1.3594965285831362, + "grad_norm": 0.14017429947853088, + "learning_rate": 0.002, + "loss": 2.338, + "step": 351680 + }, + { + "epoch": 1.3595351857865194, + "grad_norm": 0.10480217635631561, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 351690 + }, + { + "epoch": 1.3595738429899027, + "grad_norm": 0.0997604951262474, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 351700 + }, + { + "epoch": 1.359612500193286, + "grad_norm": 0.10736975818872452, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 351710 + }, + { + "epoch": 1.3596511573966694, + "grad_norm": 0.09866006672382355, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 351720 + }, + { + "epoch": 1.3596898146000527, + "grad_norm": 0.10628994554281235, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 351730 + }, + { + "epoch": 1.359728471803436, + "grad_norm": 0.11439824104309082, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 351740 + }, + { + "epoch": 1.3597671290068192, + "grad_norm": 0.10647710412740707, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 351750 + }, + { + "epoch": 1.3598057862102024, + "grad_norm": 0.09515728056430817, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 351760 + }, + { + "epoch": 1.3598444434135857, + "grad_norm": 0.1239086389541626, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 351770 + }, + { + "epoch": 1.359883100616969, + "grad_norm": 0.10595386475324631, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 351780 + }, + { + "epoch": 1.3599217578203522, + "grad_norm": 0.09424883872270584, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 351790 + }, + { + "epoch": 1.3599604150237354, + "grad_norm": 0.10692436248064041, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 351800 + }, + { + "epoch": 1.3599990722271187, + "grad_norm": 0.10175547748804092, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 351810 + }, + { + "epoch": 1.3600377294305022, + "grad_norm": 0.09597337990999222, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 351820 + }, + { + "epoch": 1.3600763866338854, + "grad_norm": 0.12366979569196701, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 351830 + }, + { + "epoch": 1.3601150438372687, + "grad_norm": 0.0925799235701561, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 351840 + }, + { + "epoch": 1.360153701040652, + "grad_norm": 0.10603862255811691, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 351850 + }, + { + "epoch": 1.3601923582440352, + "grad_norm": 0.12175677716732025, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 351860 + }, + { + "epoch": 1.3602310154474184, + "grad_norm": 0.10555287450551987, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 351870 + }, + { + "epoch": 1.3602696726508017, + "grad_norm": 0.10692015290260315, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 351880 + }, + { + "epoch": 1.3603083298541851, + "grad_norm": 0.10687336325645447, + "learning_rate": 0.002, + "loss": 2.332, + "step": 351890 + }, + { + "epoch": 1.3603469870575684, + "grad_norm": 0.10911957919597626, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 351900 + }, + { + "epoch": 1.3603856442609517, + "grad_norm": 0.13761015236377716, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 351910 + }, + { + "epoch": 1.360424301464335, + "grad_norm": 0.108570896089077, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 351920 + }, + { + "epoch": 1.3604629586677182, + "grad_norm": 0.10664588212966919, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 351930 + }, + { + "epoch": 1.3605016158711014, + "grad_norm": 0.11082687228918076, + "learning_rate": 0.002, + "loss": 2.347, + "step": 351940 + }, + { + "epoch": 1.3605402730744847, + "grad_norm": 0.11756327748298645, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 351950 + }, + { + "epoch": 1.360578930277868, + "grad_norm": 0.10725127905607224, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 351960 + }, + { + "epoch": 1.3606175874812512, + "grad_norm": 0.1116841658949852, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 351970 + }, + { + "epoch": 1.3606562446846344, + "grad_norm": 0.10605955868959427, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 351980 + }, + { + "epoch": 1.360694901888018, + "grad_norm": 0.10080024600028992, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 351990 + }, + { + "epoch": 1.3607335590914011, + "grad_norm": 0.12175345420837402, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 352000 + }, + { + "epoch": 1.3607722162947844, + "grad_norm": 0.10977376252412796, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 352010 + }, + { + "epoch": 1.3608108734981677, + "grad_norm": 0.17455539107322693, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 352020 + }, + { + "epoch": 1.360849530701551, + "grad_norm": 0.10920149087905884, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 352030 + }, + { + "epoch": 1.3608881879049342, + "grad_norm": 0.09943852573633194, + "learning_rate": 0.002, + "loss": 2.317, + "step": 352040 + }, + { + "epoch": 1.3609268451083174, + "grad_norm": 0.09689683467149734, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 352050 + }, + { + "epoch": 1.3609655023117009, + "grad_norm": 0.13160747289657593, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 352060 + }, + { + "epoch": 1.3610041595150841, + "grad_norm": 0.11040982604026794, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 352070 + }, + { + "epoch": 1.3610428167184674, + "grad_norm": 0.10374343395233154, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 352080 + }, + { + "epoch": 1.3610814739218506, + "grad_norm": 0.10358106344938278, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 352090 + }, + { + "epoch": 1.361120131125234, + "grad_norm": 0.1339714080095291, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 352100 + }, + { + "epoch": 1.3611587883286171, + "grad_norm": 0.12838685512542725, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 352110 + }, + { + "epoch": 1.3611974455320004, + "grad_norm": 0.11431363970041275, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 352120 + }, + { + "epoch": 1.3612361027353836, + "grad_norm": 0.10388395935297012, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 352130 + }, + { + "epoch": 1.361274759938767, + "grad_norm": 0.10466761142015457, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 352140 + }, + { + "epoch": 1.3613134171421502, + "grad_norm": 0.10688184946775436, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 352150 + }, + { + "epoch": 1.3613520743455336, + "grad_norm": 0.10324890166521072, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 352160 + }, + { + "epoch": 1.3613907315489169, + "grad_norm": 0.10610228776931763, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 352170 + }, + { + "epoch": 1.3614293887523001, + "grad_norm": 0.12272738665342331, + "learning_rate": 0.002, + "loss": 2.33, + "step": 352180 + }, + { + "epoch": 1.3614680459556834, + "grad_norm": 0.18759821355342865, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 352190 + }, + { + "epoch": 1.3615067031590666, + "grad_norm": 0.10147456079721451, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 352200 + }, + { + "epoch": 1.3615453603624499, + "grad_norm": 0.10508518666028976, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 352210 + }, + { + "epoch": 1.3615840175658331, + "grad_norm": 0.10307451337575912, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 352220 + }, + { + "epoch": 1.3616226747692166, + "grad_norm": 0.11710530519485474, + "learning_rate": 0.002, + "loss": 2.321, + "step": 352230 + }, + { + "epoch": 1.3616613319725999, + "grad_norm": 0.10675527155399323, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 352240 + }, + { + "epoch": 1.3616999891759831, + "grad_norm": 0.09699858725070953, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 352250 + }, + { + "epoch": 1.3617386463793664, + "grad_norm": 0.11721307784318924, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 352260 + }, + { + "epoch": 1.3617773035827496, + "grad_norm": 0.10319364070892334, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 352270 + }, + { + "epoch": 1.3618159607861329, + "grad_norm": 0.09261713922023773, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 352280 + }, + { + "epoch": 1.3618546179895161, + "grad_norm": 0.09714782238006592, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 352290 + }, + { + "epoch": 1.3618932751928994, + "grad_norm": 0.11221813410520554, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 352300 + }, + { + "epoch": 1.3619319323962826, + "grad_norm": 0.10760895907878876, + "learning_rate": 0.002, + "loss": 2.3123, + "step": 352310 + }, + { + "epoch": 1.3619705895996659, + "grad_norm": 0.10258577764034271, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 352320 + }, + { + "epoch": 1.3620092468030494, + "grad_norm": 0.11032426357269287, + "learning_rate": 0.002, + "loss": 2.3552, + "step": 352330 + }, + { + "epoch": 1.3620479040064326, + "grad_norm": 0.10629115253686905, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 352340 + }, + { + "epoch": 1.3620865612098159, + "grad_norm": 0.11356248706579208, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 352350 + }, + { + "epoch": 1.3621252184131991, + "grad_norm": 0.1022658571600914, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 352360 + }, + { + "epoch": 1.3621638756165824, + "grad_norm": 0.13350942730903625, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 352370 + }, + { + "epoch": 1.3622025328199656, + "grad_norm": 0.10261652618646622, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 352380 + }, + { + "epoch": 1.3622411900233489, + "grad_norm": 0.1132517158985138, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 352390 + }, + { + "epoch": 1.3622798472267323, + "grad_norm": 0.09861195087432861, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 352400 + }, + { + "epoch": 1.3623185044301156, + "grad_norm": 0.10845063626766205, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 352410 + }, + { + "epoch": 1.3623571616334988, + "grad_norm": 0.13102145493030548, + "learning_rate": 0.002, + "loss": 2.329, + "step": 352420 + }, + { + "epoch": 1.362395818836882, + "grad_norm": 0.1068115234375, + "learning_rate": 0.002, + "loss": 2.335, + "step": 352430 + }, + { + "epoch": 1.3624344760402654, + "grad_norm": 0.10941077768802643, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 352440 + }, + { + "epoch": 1.3624731332436486, + "grad_norm": 0.11071078479290009, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 352450 + }, + { + "epoch": 1.3625117904470319, + "grad_norm": 0.1088944599032402, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 352460 + }, + { + "epoch": 1.362550447650415, + "grad_norm": 0.09720829129219055, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 352470 + }, + { + "epoch": 1.3625891048537984, + "grad_norm": 0.18038299679756165, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 352480 + }, + { + "epoch": 1.3626277620571816, + "grad_norm": 0.1162460446357727, + "learning_rate": 0.002, + "loss": 2.332, + "step": 352490 + }, + { + "epoch": 1.362666419260565, + "grad_norm": 0.1099858358502388, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 352500 + }, + { + "epoch": 1.3627050764639483, + "grad_norm": 0.11105599254369736, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 352510 + }, + { + "epoch": 1.3627437336673316, + "grad_norm": 0.1036272943019867, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 352520 + }, + { + "epoch": 1.3627823908707148, + "grad_norm": 0.10909338295459747, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 352530 + }, + { + "epoch": 1.362821048074098, + "grad_norm": 0.12408467382192612, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 352540 + }, + { + "epoch": 1.3628597052774813, + "grad_norm": 0.11634545773267746, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 352550 + }, + { + "epoch": 1.3628983624808648, + "grad_norm": 0.10679116100072861, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 352560 + }, + { + "epoch": 1.362937019684248, + "grad_norm": 0.09850242733955383, + "learning_rate": 0.002, + "loss": 2.342, + "step": 352570 + }, + { + "epoch": 1.3629756768876313, + "grad_norm": 0.1210622563958168, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 352580 + }, + { + "epoch": 1.3630143340910146, + "grad_norm": 0.11602368950843811, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 352590 + }, + { + "epoch": 1.3630529912943978, + "grad_norm": 0.09763062745332718, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 352600 + }, + { + "epoch": 1.363091648497781, + "grad_norm": 0.09681380540132523, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 352610 + }, + { + "epoch": 1.3631303057011643, + "grad_norm": 0.10678580403327942, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 352620 + }, + { + "epoch": 1.3631689629045476, + "grad_norm": 0.10410351306200027, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 352630 + }, + { + "epoch": 1.3632076201079308, + "grad_norm": 0.10169476270675659, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 352640 + }, + { + "epoch": 1.363246277311314, + "grad_norm": 0.11437112092971802, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 352650 + }, + { + "epoch": 1.3632849345146973, + "grad_norm": 0.1199316531419754, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 352660 + }, + { + "epoch": 1.3633235917180808, + "grad_norm": 0.09013929218053818, + "learning_rate": 0.002, + "loss": 2.33, + "step": 352670 + }, + { + "epoch": 1.363362248921464, + "grad_norm": 0.09707517921924591, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 352680 + }, + { + "epoch": 1.3634009061248473, + "grad_norm": 0.11999980360269547, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 352690 + }, + { + "epoch": 1.3634395633282306, + "grad_norm": 0.1064855232834816, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 352700 + }, + { + "epoch": 1.3634782205316138, + "grad_norm": 0.11436594277620316, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 352710 + }, + { + "epoch": 1.363516877734997, + "grad_norm": 0.10475486516952515, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 352720 + }, + { + "epoch": 1.3635555349383806, + "grad_norm": 0.10481761395931244, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 352730 + }, + { + "epoch": 1.3635941921417638, + "grad_norm": 0.09346312284469604, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 352740 + }, + { + "epoch": 1.363632849345147, + "grad_norm": 0.10701143741607666, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 352750 + }, + { + "epoch": 1.3636715065485303, + "grad_norm": 0.10135741531848907, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 352760 + }, + { + "epoch": 1.3637101637519136, + "grad_norm": 0.13325202465057373, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 352770 + }, + { + "epoch": 1.3637488209552968, + "grad_norm": 0.09188321977853775, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 352780 + }, + { + "epoch": 1.36378747815868, + "grad_norm": 0.10058906674385071, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 352790 + }, + { + "epoch": 1.3638261353620633, + "grad_norm": 0.09714401513338089, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 352800 + }, + { + "epoch": 1.3638647925654466, + "grad_norm": 0.11655130237340927, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 352810 + }, + { + "epoch": 1.3639034497688298, + "grad_norm": 0.1096164807677269, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 352820 + }, + { + "epoch": 1.363942106972213, + "grad_norm": 0.10694403201341629, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 352830 + }, + { + "epoch": 1.3639807641755965, + "grad_norm": 0.1013718917965889, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 352840 + }, + { + "epoch": 1.3640194213789798, + "grad_norm": 0.10866200178861618, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 352850 + }, + { + "epoch": 1.364058078582363, + "grad_norm": 0.10821247845888138, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 352860 + }, + { + "epoch": 1.3640967357857463, + "grad_norm": 0.09890243411064148, + "learning_rate": 0.002, + "loss": 2.35, + "step": 352870 + }, + { + "epoch": 1.3641353929891296, + "grad_norm": 0.11495185643434525, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 352880 + }, + { + "epoch": 1.3641740501925128, + "grad_norm": 0.09019946306943893, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 352890 + }, + { + "epoch": 1.3642127073958963, + "grad_norm": 0.11374910175800323, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 352900 + }, + { + "epoch": 1.3642513645992795, + "grad_norm": 0.10232894122600555, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 352910 + }, + { + "epoch": 1.3642900218026628, + "grad_norm": 0.10844322293996811, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 352920 + }, + { + "epoch": 1.364328679006046, + "grad_norm": 0.10210633277893066, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 352930 + }, + { + "epoch": 1.3643673362094293, + "grad_norm": 0.09644097834825516, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 352940 + }, + { + "epoch": 1.3644059934128125, + "grad_norm": 0.10927636176347733, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 352950 + }, + { + "epoch": 1.3644446506161958, + "grad_norm": 0.10157135128974915, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 352960 + }, + { + "epoch": 1.364483307819579, + "grad_norm": 0.11467833817005157, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 352970 + }, + { + "epoch": 1.3645219650229623, + "grad_norm": 0.1262151449918747, + "learning_rate": 0.002, + "loss": 2.3549, + "step": 352980 + }, + { + "epoch": 1.3645606222263456, + "grad_norm": 0.10918152332305908, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 352990 + }, + { + "epoch": 1.3645992794297288, + "grad_norm": 0.09364178776741028, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 353000 + }, + { + "epoch": 1.3646379366331123, + "grad_norm": 0.11354406923055649, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 353010 + }, + { + "epoch": 1.3646765938364955, + "grad_norm": 0.10014007240533829, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 353020 + }, + { + "epoch": 1.3647152510398788, + "grad_norm": 0.11784714460372925, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 353030 + }, + { + "epoch": 1.364753908243262, + "grad_norm": 0.10475286841392517, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 353040 + }, + { + "epoch": 1.3647925654466453, + "grad_norm": 0.1111367717385292, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 353050 + }, + { + "epoch": 1.3648312226500285, + "grad_norm": 0.10144095867872238, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 353060 + }, + { + "epoch": 1.364869879853412, + "grad_norm": 0.10360066592693329, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 353070 + }, + { + "epoch": 1.3649085370567953, + "grad_norm": 0.11523858457803726, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 353080 + }, + { + "epoch": 1.3649471942601785, + "grad_norm": 0.09359196573495865, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 353090 + }, + { + "epoch": 1.3649858514635618, + "grad_norm": 0.11894936114549637, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 353100 + }, + { + "epoch": 1.365024508666945, + "grad_norm": 0.09457987546920776, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 353110 + }, + { + "epoch": 1.3650631658703283, + "grad_norm": 0.1078084260225296, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 353120 + }, + { + "epoch": 1.3651018230737115, + "grad_norm": 0.09385760128498077, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 353130 + }, + { + "epoch": 1.3651404802770948, + "grad_norm": 0.0927666574716568, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 353140 + }, + { + "epoch": 1.365179137480478, + "grad_norm": 0.1224551871418953, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 353150 + }, + { + "epoch": 1.3652177946838613, + "grad_norm": 0.0971512421965599, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 353160 + }, + { + "epoch": 1.3652564518872445, + "grad_norm": 0.1582522988319397, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 353170 + }, + { + "epoch": 1.365295109090628, + "grad_norm": 0.11045419424772263, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 353180 + }, + { + "epoch": 1.3653337662940113, + "grad_norm": 0.09811891615390778, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 353190 + }, + { + "epoch": 1.3653724234973945, + "grad_norm": 0.12702874839305878, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 353200 + }, + { + "epoch": 1.3654110807007778, + "grad_norm": 0.09151419252157211, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 353210 + }, + { + "epoch": 1.365449737904161, + "grad_norm": 0.157108873128891, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 353220 + }, + { + "epoch": 1.3654883951075443, + "grad_norm": 0.44005656242370605, + "learning_rate": 0.002, + "loss": 2.337, + "step": 353230 + }, + { + "epoch": 1.3655270523109277, + "grad_norm": 0.11164695769548416, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 353240 + }, + { + "epoch": 1.365565709514311, + "grad_norm": 0.12131067365407944, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 353250 + }, + { + "epoch": 1.3656043667176943, + "grad_norm": 0.09036579728126526, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 353260 + }, + { + "epoch": 1.3656430239210775, + "grad_norm": 0.11555355042219162, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 353270 + }, + { + "epoch": 1.3656816811244608, + "grad_norm": 0.09683941304683685, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 353280 + }, + { + "epoch": 1.365720338327844, + "grad_norm": 0.09810633212327957, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 353290 + }, + { + "epoch": 1.3657589955312273, + "grad_norm": 0.1062944084405899, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 353300 + }, + { + "epoch": 1.3657976527346105, + "grad_norm": 0.1608039289712906, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 353310 + }, + { + "epoch": 1.3658363099379938, + "grad_norm": 0.10838613659143448, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 353320 + }, + { + "epoch": 1.365874967141377, + "grad_norm": 0.12178441137075424, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 353330 + }, + { + "epoch": 1.3659136243447603, + "grad_norm": 0.11368858814239502, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 353340 + }, + { + "epoch": 1.3659522815481437, + "grad_norm": 0.09420602023601532, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 353350 + }, + { + "epoch": 1.365990938751527, + "grad_norm": 0.11776195466518402, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 353360 + }, + { + "epoch": 1.3660295959549102, + "grad_norm": 0.10489777475595474, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 353370 + }, + { + "epoch": 1.3660682531582935, + "grad_norm": 0.11508101224899292, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 353380 + }, + { + "epoch": 1.3661069103616768, + "grad_norm": 0.1090180054306984, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 353390 + }, + { + "epoch": 1.36614556756506, + "grad_norm": 0.10367222130298615, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 353400 + }, + { + "epoch": 1.3661842247684435, + "grad_norm": 0.10160892456769943, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 353410 + }, + { + "epoch": 1.3662228819718267, + "grad_norm": 0.1016288623213768, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 353420 + }, + { + "epoch": 1.36626153917521, + "grad_norm": 0.09615586698055267, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 353430 + }, + { + "epoch": 1.3663001963785932, + "grad_norm": 0.09929027408361435, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 353440 + }, + { + "epoch": 1.3663388535819765, + "grad_norm": 0.11279615014791489, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 353450 + }, + { + "epoch": 1.3663775107853597, + "grad_norm": 0.10219208896160126, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 353460 + }, + { + "epoch": 1.366416167988743, + "grad_norm": 0.10220801830291748, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 353470 + }, + { + "epoch": 1.3664548251921262, + "grad_norm": 0.10270463675260544, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 353480 + }, + { + "epoch": 1.3664934823955095, + "grad_norm": 0.10466257482767105, + "learning_rate": 0.002, + "loss": 2.347, + "step": 353490 + }, + { + "epoch": 1.3665321395988927, + "grad_norm": 0.09947629272937775, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 353500 + }, + { + "epoch": 1.366570796802276, + "grad_norm": 0.09577259421348572, + "learning_rate": 0.002, + "loss": 2.322, + "step": 353510 + }, + { + "epoch": 1.3666094540056595, + "grad_norm": 0.0886072888970375, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 353520 + }, + { + "epoch": 1.3666481112090427, + "grad_norm": 0.11046717315912247, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 353530 + }, + { + "epoch": 1.366686768412426, + "grad_norm": 0.10688751935958862, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 353540 + }, + { + "epoch": 1.3667254256158092, + "grad_norm": 0.0929543748497963, + "learning_rate": 0.002, + "loss": 2.335, + "step": 353550 + }, + { + "epoch": 1.3667640828191925, + "grad_norm": 0.09754712879657745, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 353560 + }, + { + "epoch": 1.3668027400225757, + "grad_norm": 0.11045172810554504, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 353570 + }, + { + "epoch": 1.3668413972259592, + "grad_norm": 0.10240747779607773, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 353580 + }, + { + "epoch": 1.3668800544293425, + "grad_norm": 0.11092318594455719, + "learning_rate": 0.002, + "loss": 2.349, + "step": 353590 + }, + { + "epoch": 1.3669187116327257, + "grad_norm": 0.10332795232534409, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 353600 + }, + { + "epoch": 1.366957368836109, + "grad_norm": 0.10368195176124573, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 353610 + }, + { + "epoch": 1.3669960260394922, + "grad_norm": 0.11492601782083511, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 353620 + }, + { + "epoch": 1.3670346832428755, + "grad_norm": 0.10416862368583679, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 353630 + }, + { + "epoch": 1.3670733404462587, + "grad_norm": 0.10875711590051651, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 353640 + }, + { + "epoch": 1.367111997649642, + "grad_norm": 0.09621744602918625, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 353650 + }, + { + "epoch": 1.3671506548530252, + "grad_norm": 0.10564836859703064, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 353660 + }, + { + "epoch": 1.3671893120564085, + "grad_norm": 0.12837018072605133, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 353670 + }, + { + "epoch": 1.3672279692597917, + "grad_norm": 0.10778024792671204, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 353680 + }, + { + "epoch": 1.3672666264631752, + "grad_norm": 0.11407023668289185, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 353690 + }, + { + "epoch": 1.3673052836665585, + "grad_norm": 0.12429900467395782, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 353700 + }, + { + "epoch": 1.3673439408699417, + "grad_norm": 0.09666662663221359, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 353710 + }, + { + "epoch": 1.367382598073325, + "grad_norm": 0.10815172642469406, + "learning_rate": 0.002, + "loss": 2.3107, + "step": 353720 + }, + { + "epoch": 1.3674212552767082, + "grad_norm": 0.10770297050476074, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 353730 + }, + { + "epoch": 1.3674599124800915, + "grad_norm": 0.10925965011119843, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 353740 + }, + { + "epoch": 1.367498569683475, + "grad_norm": 0.114189013838768, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 353750 + }, + { + "epoch": 1.3675372268868582, + "grad_norm": 0.11431930959224701, + "learning_rate": 0.002, + "loss": 2.328, + "step": 353760 + }, + { + "epoch": 1.3675758840902414, + "grad_norm": 0.1838698387145996, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 353770 + }, + { + "epoch": 1.3676145412936247, + "grad_norm": 0.10204703360795975, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 353780 + }, + { + "epoch": 1.367653198497008, + "grad_norm": 0.10525187104940414, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 353790 + }, + { + "epoch": 1.3676918557003912, + "grad_norm": 0.11447043716907501, + "learning_rate": 0.002, + "loss": 2.33, + "step": 353800 + }, + { + "epoch": 1.3677305129037745, + "grad_norm": 0.10773241519927979, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 353810 + }, + { + "epoch": 1.3677691701071577, + "grad_norm": 0.11622647196054459, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 353820 + }, + { + "epoch": 1.367807827310541, + "grad_norm": 0.13911838829517365, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 353830 + }, + { + "epoch": 1.3678464845139242, + "grad_norm": 0.11007165908813477, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 353840 + }, + { + "epoch": 1.3678851417173077, + "grad_norm": 0.09723573178052902, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 353850 + }, + { + "epoch": 1.367923798920691, + "grad_norm": 0.10949946939945221, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 353860 + }, + { + "epoch": 1.3679624561240742, + "grad_norm": 0.09649864584207535, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 353870 + }, + { + "epoch": 1.3680011133274574, + "grad_norm": 0.09300088882446289, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 353880 + }, + { + "epoch": 1.3680397705308407, + "grad_norm": 0.11034957319498062, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 353890 + }, + { + "epoch": 1.368078427734224, + "grad_norm": 0.10465884208679199, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 353900 + }, + { + "epoch": 1.3681170849376072, + "grad_norm": 0.12185883522033691, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 353910 + }, + { + "epoch": 1.3681557421409907, + "grad_norm": 0.08666864782571793, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 353920 + }, + { + "epoch": 1.368194399344374, + "grad_norm": 0.11534685641527176, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 353930 + }, + { + "epoch": 1.3682330565477572, + "grad_norm": 0.12451288104057312, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 353940 + }, + { + "epoch": 1.3682717137511404, + "grad_norm": 0.10793226212263107, + "learning_rate": 0.002, + "loss": 2.334, + "step": 353950 + }, + { + "epoch": 1.3683103709545237, + "grad_norm": 0.09760882705450058, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 353960 + }, + { + "epoch": 1.368349028157907, + "grad_norm": 0.09534558653831482, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 353970 + }, + { + "epoch": 1.3683876853612902, + "grad_norm": 0.10450071841478348, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 353980 + }, + { + "epoch": 1.3684263425646734, + "grad_norm": 0.09383483231067657, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 353990 + }, + { + "epoch": 1.3684649997680567, + "grad_norm": 0.11411093175411224, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 354000 + }, + { + "epoch": 1.36850365697144, + "grad_norm": 0.10227649658918381, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 354010 + }, + { + "epoch": 1.3685423141748234, + "grad_norm": 0.10927411168813705, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 354020 + }, + { + "epoch": 1.3685809713782067, + "grad_norm": 0.14527519047260284, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 354030 + }, + { + "epoch": 1.36861962858159, + "grad_norm": 0.10132671892642975, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 354040 + }, + { + "epoch": 1.3686582857849732, + "grad_norm": 0.10364679992198944, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 354050 + }, + { + "epoch": 1.3686969429883564, + "grad_norm": 0.09082487225532532, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 354060 + }, + { + "epoch": 1.3687356001917397, + "grad_norm": 0.10917646437883377, + "learning_rate": 0.002, + "loss": 2.341, + "step": 354070 + }, + { + "epoch": 1.368774257395123, + "grad_norm": 0.11109345406293869, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 354080 + }, + { + "epoch": 1.3688129145985064, + "grad_norm": 0.10947670042514801, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 354090 + }, + { + "epoch": 1.3688515718018897, + "grad_norm": 0.20453643798828125, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 354100 + }, + { + "epoch": 1.368890229005273, + "grad_norm": 0.260464608669281, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 354110 + }, + { + "epoch": 1.3689288862086562, + "grad_norm": 0.10048062354326248, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 354120 + }, + { + "epoch": 1.3689675434120394, + "grad_norm": 0.09973052889108658, + "learning_rate": 0.002, + "loss": 2.347, + "step": 354130 + }, + { + "epoch": 1.3690062006154227, + "grad_norm": 0.10623440146446228, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 354140 + }, + { + "epoch": 1.369044857818806, + "grad_norm": 0.10756231099367142, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 354150 + }, + { + "epoch": 1.3690835150221892, + "grad_norm": 0.09614838659763336, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 354160 + }, + { + "epoch": 1.3691221722255724, + "grad_norm": 0.099722720682621, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 354170 + }, + { + "epoch": 1.3691608294289557, + "grad_norm": 0.10293753445148468, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 354180 + }, + { + "epoch": 1.3691994866323391, + "grad_norm": 0.09563116729259491, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 354190 + }, + { + "epoch": 1.3692381438357224, + "grad_norm": 0.10133104026317596, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 354200 + }, + { + "epoch": 1.3692768010391057, + "grad_norm": 0.13343344628810883, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 354210 + }, + { + "epoch": 1.369315458242489, + "grad_norm": 0.09180284291505814, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 354220 + }, + { + "epoch": 1.3693541154458722, + "grad_norm": 0.11000876873731613, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 354230 + }, + { + "epoch": 1.3693927726492554, + "grad_norm": 0.1341230273246765, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 354240 + }, + { + "epoch": 1.3694314298526387, + "grad_norm": 0.09811849147081375, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 354250 + }, + { + "epoch": 1.3694700870560221, + "grad_norm": 0.11047971248626709, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 354260 + }, + { + "epoch": 1.3695087442594054, + "grad_norm": 0.09250407665967941, + "learning_rate": 0.002, + "loss": 2.332, + "step": 354270 + }, + { + "epoch": 1.3695474014627886, + "grad_norm": 0.11461371183395386, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 354280 + }, + { + "epoch": 1.369586058666172, + "grad_norm": 0.10704251378774643, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 354290 + }, + { + "epoch": 1.3696247158695551, + "grad_norm": 0.10264694690704346, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 354300 + }, + { + "epoch": 1.3696633730729384, + "grad_norm": 0.10288636386394501, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 354310 + }, + { + "epoch": 1.3697020302763216, + "grad_norm": 0.10215355455875397, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 354320 + }, + { + "epoch": 1.369740687479705, + "grad_norm": 0.1196942999958992, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 354330 + }, + { + "epoch": 1.3697793446830882, + "grad_norm": 0.11092463880777359, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 354340 + }, + { + "epoch": 1.3698180018864714, + "grad_norm": 0.11861385405063629, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 354350 + }, + { + "epoch": 1.3698566590898549, + "grad_norm": 0.1024026945233345, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 354360 + }, + { + "epoch": 1.3698953162932381, + "grad_norm": 0.11804378032684326, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 354370 + }, + { + "epoch": 1.3699339734966214, + "grad_norm": 0.11109177023172379, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 354380 + }, + { + "epoch": 1.3699726307000046, + "grad_norm": 0.1200912669301033, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 354390 + }, + { + "epoch": 1.3700112879033879, + "grad_norm": 0.09660663455724716, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 354400 + }, + { + "epoch": 1.3700499451067711, + "grad_norm": 0.11022187024354935, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 354410 + }, + { + "epoch": 1.3700886023101546, + "grad_norm": 0.10194243490695953, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 354420 + }, + { + "epoch": 1.3701272595135379, + "grad_norm": 0.10244203358888626, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 354430 + }, + { + "epoch": 1.3701659167169211, + "grad_norm": 0.10079459100961685, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 354440 + }, + { + "epoch": 1.3702045739203044, + "grad_norm": 0.09805863350629807, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 354450 + }, + { + "epoch": 1.3702432311236876, + "grad_norm": 0.0914018452167511, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 354460 + }, + { + "epoch": 1.3702818883270709, + "grad_norm": 0.09827131032943726, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 354470 + }, + { + "epoch": 1.3703205455304541, + "grad_norm": 0.10076434910297394, + "learning_rate": 0.002, + "loss": 2.333, + "step": 354480 + }, + { + "epoch": 1.3703592027338374, + "grad_norm": 0.10175915062427521, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 354490 + }, + { + "epoch": 1.3703978599372206, + "grad_norm": 0.1084328219294548, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 354500 + }, + { + "epoch": 1.3704365171406039, + "grad_norm": 0.10804764926433563, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 354510 + }, + { + "epoch": 1.3704751743439871, + "grad_norm": 0.10715720057487488, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 354520 + }, + { + "epoch": 1.3705138315473706, + "grad_norm": 0.10957840830087662, + "learning_rate": 0.002, + "loss": 2.328, + "step": 354530 + }, + { + "epoch": 1.3705524887507539, + "grad_norm": 0.09168438613414764, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 354540 + }, + { + "epoch": 1.3705911459541371, + "grad_norm": 0.10639364272356033, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 354550 + }, + { + "epoch": 1.3706298031575204, + "grad_norm": 0.09193290770053864, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 354560 + }, + { + "epoch": 1.3706684603609036, + "grad_norm": 0.11018170416355133, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 354570 + }, + { + "epoch": 1.3707071175642869, + "grad_norm": 0.15490789711475372, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 354580 + }, + { + "epoch": 1.3707457747676703, + "grad_norm": 0.09622285515069962, + "learning_rate": 0.002, + "loss": 2.323, + "step": 354590 + }, + { + "epoch": 1.3707844319710536, + "grad_norm": 0.0934017077088356, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 354600 + }, + { + "epoch": 1.3708230891744368, + "grad_norm": 0.09973679482936859, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 354610 + }, + { + "epoch": 1.37086174637782, + "grad_norm": 0.09377430379390717, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 354620 + }, + { + "epoch": 1.3709004035812034, + "grad_norm": 0.10160693526268005, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 354630 + }, + { + "epoch": 1.3709390607845866, + "grad_norm": 0.11587730795145035, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 354640 + }, + { + "epoch": 1.3709777179879699, + "grad_norm": 0.10981699824333191, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 354650 + }, + { + "epoch": 1.371016375191353, + "grad_norm": 0.11354506760835648, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 354660 + }, + { + "epoch": 1.3710550323947364, + "grad_norm": 0.09859279543161392, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 354670 + }, + { + "epoch": 1.3710936895981196, + "grad_norm": 0.10488973557949066, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 354680 + }, + { + "epoch": 1.3711323468015029, + "grad_norm": 0.09859029203653336, + "learning_rate": 0.002, + "loss": 2.326, + "step": 354690 + }, + { + "epoch": 1.3711710040048863, + "grad_norm": 0.11239251494407654, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 354700 + }, + { + "epoch": 1.3712096612082696, + "grad_norm": 0.11284992843866348, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 354710 + }, + { + "epoch": 1.3712483184116528, + "grad_norm": 0.12570716440677643, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 354720 + }, + { + "epoch": 1.371286975615036, + "grad_norm": 0.11355306953191757, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 354730 + }, + { + "epoch": 1.3713256328184193, + "grad_norm": 0.14632190763950348, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 354740 + }, + { + "epoch": 1.3713642900218026, + "grad_norm": 0.11878865957260132, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 354750 + }, + { + "epoch": 1.371402947225186, + "grad_norm": 0.11377181857824326, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 354760 + }, + { + "epoch": 1.3714416044285693, + "grad_norm": 0.11107166111469269, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 354770 + }, + { + "epoch": 1.3714802616319526, + "grad_norm": 0.11419668048620224, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 354780 + }, + { + "epoch": 1.3715189188353358, + "grad_norm": 0.09458251297473907, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 354790 + }, + { + "epoch": 1.371557576038719, + "grad_norm": 0.10594692826271057, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 354800 + }, + { + "epoch": 1.3715962332421023, + "grad_norm": 0.10339496284723282, + "learning_rate": 0.002, + "loss": 2.33, + "step": 354810 + }, + { + "epoch": 1.3716348904454856, + "grad_norm": 0.09302469342947006, + "learning_rate": 0.002, + "loss": 2.342, + "step": 354820 + }, + { + "epoch": 1.3716735476488688, + "grad_norm": 0.08864486217498779, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 354830 + }, + { + "epoch": 1.371712204852252, + "grad_norm": 0.11280500888824463, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 354840 + }, + { + "epoch": 1.3717508620556353, + "grad_norm": 0.09763745963573456, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 354850 + }, + { + "epoch": 1.3717895192590186, + "grad_norm": 0.11580676585435867, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 354860 + }, + { + "epoch": 1.371828176462402, + "grad_norm": 0.10815642029047012, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 354870 + }, + { + "epoch": 1.3718668336657853, + "grad_norm": 0.14305159449577332, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 354880 + }, + { + "epoch": 1.3719054908691686, + "grad_norm": 0.10197746008634567, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 354890 + }, + { + "epoch": 1.3719441480725518, + "grad_norm": 0.14328710734844208, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 354900 + }, + { + "epoch": 1.371982805275935, + "grad_norm": 0.11703839153051376, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 354910 + }, + { + "epoch": 1.3720214624793183, + "grad_norm": 0.11150151491165161, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 354920 + }, + { + "epoch": 1.3720601196827018, + "grad_norm": 0.11460588872432709, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 354930 + }, + { + "epoch": 1.372098776886085, + "grad_norm": 0.09653277695178986, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 354940 + }, + { + "epoch": 1.3721374340894683, + "grad_norm": 0.2815849781036377, + "learning_rate": 0.002, + "loss": 2.331, + "step": 354950 + }, + { + "epoch": 1.3721760912928516, + "grad_norm": 0.13114270567893982, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 354960 + }, + { + "epoch": 1.3722147484962348, + "grad_norm": 0.09608525037765503, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 354970 + }, + { + "epoch": 1.372253405699618, + "grad_norm": 0.11423958837985992, + "learning_rate": 0.002, + "loss": 2.3567, + "step": 354980 + }, + { + "epoch": 1.3722920629030013, + "grad_norm": 0.11959332227706909, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 354990 + }, + { + "epoch": 1.3723307201063846, + "grad_norm": 0.09793872386217117, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 355000 + }, + { + "epoch": 1.3723693773097678, + "grad_norm": 0.09575875848531723, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 355010 + }, + { + "epoch": 1.372408034513151, + "grad_norm": 0.1274711787700653, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 355020 + }, + { + "epoch": 1.3724466917165343, + "grad_norm": 0.10284113138914108, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 355030 + }, + { + "epoch": 1.3724853489199178, + "grad_norm": 0.12274546921253204, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 355040 + }, + { + "epoch": 1.372524006123301, + "grad_norm": 0.1329374462366104, + "learning_rate": 0.002, + "loss": 2.34, + "step": 355050 + }, + { + "epoch": 1.3725626633266843, + "grad_norm": 0.11847500503063202, + "learning_rate": 0.002, + "loss": 2.338, + "step": 355060 + }, + { + "epoch": 1.3726013205300676, + "grad_norm": 0.13184745609760284, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 355070 + }, + { + "epoch": 1.3726399777334508, + "grad_norm": 0.10280844569206238, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 355080 + }, + { + "epoch": 1.372678634936834, + "grad_norm": 0.10201731324195862, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 355090 + }, + { + "epoch": 1.3727172921402175, + "grad_norm": 0.09594844281673431, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 355100 + }, + { + "epoch": 1.3727559493436008, + "grad_norm": 0.1228349506855011, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 355110 + }, + { + "epoch": 1.372794606546984, + "grad_norm": 0.09336409717798233, + "learning_rate": 0.002, + "loss": 2.3163, + "step": 355120 + }, + { + "epoch": 1.3728332637503673, + "grad_norm": 0.1344430148601532, + "learning_rate": 0.002, + "loss": 2.329, + "step": 355130 + }, + { + "epoch": 1.3728719209537505, + "grad_norm": 0.1103283166885376, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 355140 + }, + { + "epoch": 1.3729105781571338, + "grad_norm": 0.10305920988321304, + "learning_rate": 0.002, + "loss": 2.323, + "step": 355150 + }, + { + "epoch": 1.372949235360517, + "grad_norm": 0.12979035079479218, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 355160 + }, + { + "epoch": 1.3729878925639003, + "grad_norm": 0.10871124267578125, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 355170 + }, + { + "epoch": 1.3730265497672836, + "grad_norm": 0.09896957129240036, + "learning_rate": 0.002, + "loss": 2.33, + "step": 355180 + }, + { + "epoch": 1.3730652069706668, + "grad_norm": 0.09285487234592438, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 355190 + }, + { + "epoch": 1.37310386417405, + "grad_norm": 0.09617581963539124, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 355200 + }, + { + "epoch": 1.3731425213774335, + "grad_norm": 0.11290497332811356, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 355210 + }, + { + "epoch": 1.3731811785808168, + "grad_norm": 0.11208027601242065, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 355220 + }, + { + "epoch": 1.3732198357842, + "grad_norm": 0.10715988278388977, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 355230 + }, + { + "epoch": 1.3732584929875833, + "grad_norm": 0.11387352645397186, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 355240 + }, + { + "epoch": 1.3732971501909665, + "grad_norm": 0.09642688184976578, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 355250 + }, + { + "epoch": 1.3733358073943498, + "grad_norm": 0.09048770368099213, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 355260 + }, + { + "epoch": 1.3733744645977333, + "grad_norm": 0.0994555652141571, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 355270 + }, + { + "epoch": 1.3734131218011165, + "grad_norm": 0.10325551778078079, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 355280 + }, + { + "epoch": 1.3734517790044998, + "grad_norm": 0.1034950464963913, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 355290 + }, + { + "epoch": 1.373490436207883, + "grad_norm": 0.10025890916585922, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 355300 + }, + { + "epoch": 1.3735290934112663, + "grad_norm": 0.09707777202129364, + "learning_rate": 0.002, + "loss": 2.333, + "step": 355310 + }, + { + "epoch": 1.3735677506146495, + "grad_norm": 0.11595097929239273, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 355320 + }, + { + "epoch": 1.3736064078180328, + "grad_norm": 0.09489137679338455, + "learning_rate": 0.002, + "loss": 2.325, + "step": 355330 + }, + { + "epoch": 1.373645065021416, + "grad_norm": 0.09553955495357513, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 355340 + }, + { + "epoch": 1.3736837222247993, + "grad_norm": 0.1255011111497879, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 355350 + }, + { + "epoch": 1.3737223794281825, + "grad_norm": 0.11157847940921783, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 355360 + }, + { + "epoch": 1.3737610366315658, + "grad_norm": 0.19698411226272583, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 355370 + }, + { + "epoch": 1.3737996938349493, + "grad_norm": 0.10712388902902603, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 355380 + }, + { + "epoch": 1.3738383510383325, + "grad_norm": 0.11198710650205612, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 355390 + }, + { + "epoch": 1.3738770082417158, + "grad_norm": 0.0910465344786644, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 355400 + }, + { + "epoch": 1.373915665445099, + "grad_norm": 0.11695914715528488, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 355410 + }, + { + "epoch": 1.3739543226484823, + "grad_norm": 0.09684840589761734, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 355420 + }, + { + "epoch": 1.3739929798518655, + "grad_norm": 0.0928514152765274, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 355430 + }, + { + "epoch": 1.374031637055249, + "grad_norm": 0.11242695897817612, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 355440 + }, + { + "epoch": 1.3740702942586323, + "grad_norm": 0.11720636487007141, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 355450 + }, + { + "epoch": 1.3741089514620155, + "grad_norm": 0.11983829736709595, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 355460 + }, + { + "epoch": 1.3741476086653988, + "grad_norm": 0.09531054645776749, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 355470 + }, + { + "epoch": 1.374186265868782, + "grad_norm": 0.0979180783033371, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 355480 + }, + { + "epoch": 1.3742249230721653, + "grad_norm": 0.1277099996805191, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 355490 + }, + { + "epoch": 1.3742635802755485, + "grad_norm": 0.09821344912052155, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 355500 + }, + { + "epoch": 1.3743022374789318, + "grad_norm": 0.11369414627552032, + "learning_rate": 0.002, + "loss": 2.338, + "step": 355510 + }, + { + "epoch": 1.374340894682315, + "grad_norm": 0.10324325412511826, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 355520 + }, + { + "epoch": 1.3743795518856983, + "grad_norm": 0.11418458819389343, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 355530 + }, + { + "epoch": 1.3744182090890815, + "grad_norm": 0.09707921743392944, + "learning_rate": 0.002, + "loss": 2.333, + "step": 355540 + }, + { + "epoch": 1.374456866292465, + "grad_norm": 0.10141601413488388, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 355550 + }, + { + "epoch": 1.3744955234958482, + "grad_norm": 0.10970473289489746, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 355560 + }, + { + "epoch": 1.3745341806992315, + "grad_norm": 0.11045132577419281, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 355570 + }, + { + "epoch": 1.3745728379026148, + "grad_norm": 0.09881190955638885, + "learning_rate": 0.002, + "loss": 2.328, + "step": 355580 + }, + { + "epoch": 1.374611495105998, + "grad_norm": 0.09053805470466614, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 355590 + }, + { + "epoch": 1.3746501523093813, + "grad_norm": 0.10699243098497391, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 355600 + }, + { + "epoch": 1.3746888095127647, + "grad_norm": 0.11124745011329651, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 355610 + }, + { + "epoch": 1.374727466716148, + "grad_norm": 0.12792594730854034, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 355620 + }, + { + "epoch": 1.3747661239195312, + "grad_norm": 0.10298825055360794, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 355630 + }, + { + "epoch": 1.3748047811229145, + "grad_norm": 0.08798182010650635, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 355640 + }, + { + "epoch": 1.3748434383262977, + "grad_norm": 0.09169424325227737, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 355650 + }, + { + "epoch": 1.374882095529681, + "grad_norm": 0.11178061366081238, + "learning_rate": 0.002, + "loss": 2.325, + "step": 355660 + }, + { + "epoch": 1.3749207527330642, + "grad_norm": 0.17487318813800812, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 355670 + }, + { + "epoch": 1.3749594099364475, + "grad_norm": 0.09531193971633911, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 355680 + }, + { + "epoch": 1.3749980671398307, + "grad_norm": 0.10043787211179733, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 355690 + }, + { + "epoch": 1.375036724343214, + "grad_norm": 0.09354715794324875, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 355700 + }, + { + "epoch": 1.3750753815465975, + "grad_norm": 0.10695581138134003, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 355710 + }, + { + "epoch": 1.3751140387499807, + "grad_norm": 0.10047696530818939, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 355720 + }, + { + "epoch": 1.375152695953364, + "grad_norm": 0.10591744631528854, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 355730 + }, + { + "epoch": 1.3751913531567472, + "grad_norm": 0.098115935921669, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 355740 + }, + { + "epoch": 1.3752300103601305, + "grad_norm": 0.11012642085552216, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 355750 + }, + { + "epoch": 1.3752686675635137, + "grad_norm": 0.10438597947359085, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 355760 + }, + { + "epoch": 1.375307324766897, + "grad_norm": 0.09538904577493668, + "learning_rate": 0.002, + "loss": 2.32, + "step": 355770 + }, + { + "epoch": 1.3753459819702805, + "grad_norm": 0.09819392114877701, + "learning_rate": 0.002, + "loss": 2.346, + "step": 355780 + }, + { + "epoch": 1.3753846391736637, + "grad_norm": 0.12182353436946869, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 355790 + }, + { + "epoch": 1.375423296377047, + "grad_norm": 0.11237728595733643, + "learning_rate": 0.002, + "loss": 2.3097, + "step": 355800 + }, + { + "epoch": 1.3754619535804302, + "grad_norm": 0.09839800745248795, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 355810 + }, + { + "epoch": 1.3755006107838135, + "grad_norm": 0.0951833724975586, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 355820 + }, + { + "epoch": 1.3755392679871967, + "grad_norm": 0.0866539478302002, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 355830 + }, + { + "epoch": 1.37557792519058, + "grad_norm": 0.10665132105350494, + "learning_rate": 0.002, + "loss": 2.3616, + "step": 355840 + }, + { + "epoch": 1.3756165823939632, + "grad_norm": 0.10727176815271378, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 355850 + }, + { + "epoch": 1.3756552395973465, + "grad_norm": 0.09651970863342285, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 355860 + }, + { + "epoch": 1.3756938968007297, + "grad_norm": 0.10916649550199509, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 355870 + }, + { + "epoch": 1.3757325540041132, + "grad_norm": 0.1125960648059845, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 355880 + }, + { + "epoch": 1.3757712112074965, + "grad_norm": 0.09366412460803986, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 355890 + }, + { + "epoch": 1.3758098684108797, + "grad_norm": 0.09899480640888214, + "learning_rate": 0.002, + "loss": 2.3124, + "step": 355900 + }, + { + "epoch": 1.375848525614263, + "grad_norm": 0.09413845092058182, + "learning_rate": 0.002, + "loss": 2.328, + "step": 355910 + }, + { + "epoch": 1.3758871828176462, + "grad_norm": 0.10501842945814133, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 355920 + }, + { + "epoch": 1.3759258400210295, + "grad_norm": 0.10373161733150482, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 355930 + }, + { + "epoch": 1.3759644972244127, + "grad_norm": 0.0969945639371872, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 355940 + }, + { + "epoch": 1.3760031544277962, + "grad_norm": 0.11098074913024902, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 355950 + }, + { + "epoch": 1.3760418116311794, + "grad_norm": 0.1038602814078331, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 355960 + }, + { + "epoch": 1.3760804688345627, + "grad_norm": 0.10629703104496002, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 355970 + }, + { + "epoch": 1.376119126037946, + "grad_norm": 0.10306701809167862, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 355980 + }, + { + "epoch": 1.3761577832413292, + "grad_norm": 0.11155600100755692, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 355990 + }, + { + "epoch": 1.3761964404447125, + "grad_norm": 0.10721419006586075, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 356000 + }, + { + "epoch": 1.3762350976480957, + "grad_norm": 0.11784236878156662, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 356010 + }, + { + "epoch": 1.376273754851479, + "grad_norm": 0.10429581254720688, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 356020 + }, + { + "epoch": 1.3763124120548622, + "grad_norm": 0.10380349308252335, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 356030 + }, + { + "epoch": 1.3763510692582455, + "grad_norm": 0.09363957494497299, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 356040 + }, + { + "epoch": 1.376389726461629, + "grad_norm": 0.10458462685346603, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 356050 + }, + { + "epoch": 1.3764283836650122, + "grad_norm": 0.12209296226501465, + "learning_rate": 0.002, + "loss": 2.337, + "step": 356060 + }, + { + "epoch": 1.3764670408683954, + "grad_norm": 0.08871419727802277, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 356070 + }, + { + "epoch": 1.3765056980717787, + "grad_norm": 0.11016503721475601, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 356080 + }, + { + "epoch": 1.376544355275162, + "grad_norm": 0.10212699323892593, + "learning_rate": 0.002, + "loss": 2.345, + "step": 356090 + }, + { + "epoch": 1.3765830124785452, + "grad_norm": 0.13530485332012177, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 356100 + }, + { + "epoch": 1.3766216696819285, + "grad_norm": 0.09179184585809708, + "learning_rate": 0.002, + "loss": 2.302, + "step": 356110 + }, + { + "epoch": 1.376660326885312, + "grad_norm": 0.09778521209955215, + "learning_rate": 0.002, + "loss": 2.344, + "step": 356120 + }, + { + "epoch": 1.3766989840886952, + "grad_norm": 0.10612337291240692, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 356130 + }, + { + "epoch": 1.3767376412920784, + "grad_norm": 0.11075197905302048, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 356140 + }, + { + "epoch": 1.3767762984954617, + "grad_norm": 0.11034604161977768, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 356150 + }, + { + "epoch": 1.376814955698845, + "grad_norm": 0.11376018822193146, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 356160 + }, + { + "epoch": 1.3768536129022282, + "grad_norm": 0.0910055935382843, + "learning_rate": 0.002, + "loss": 2.342, + "step": 356170 + }, + { + "epoch": 1.3768922701056114, + "grad_norm": 0.09447870403528214, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 356180 + }, + { + "epoch": 1.3769309273089947, + "grad_norm": 0.10973922908306122, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 356190 + }, + { + "epoch": 1.376969584512378, + "grad_norm": 0.11534488201141357, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 356200 + }, + { + "epoch": 1.3770082417157612, + "grad_norm": 0.11993777006864548, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 356210 + }, + { + "epoch": 1.3770468989191447, + "grad_norm": 0.10797804594039917, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 356220 + }, + { + "epoch": 1.377085556122528, + "grad_norm": 0.09470128268003464, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 356230 + }, + { + "epoch": 1.3771242133259112, + "grad_norm": 0.107524573802948, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 356240 + }, + { + "epoch": 1.3771628705292944, + "grad_norm": 0.09606486558914185, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 356250 + }, + { + "epoch": 1.3772015277326777, + "grad_norm": 0.12199854850769043, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 356260 + }, + { + "epoch": 1.377240184936061, + "grad_norm": 0.0933234840631485, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 356270 + }, + { + "epoch": 1.3772788421394442, + "grad_norm": 0.09945408999919891, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 356280 + }, + { + "epoch": 1.3773174993428277, + "grad_norm": 0.1054026335477829, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 356290 + }, + { + "epoch": 1.377356156546211, + "grad_norm": 0.34597858786582947, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 356300 + }, + { + "epoch": 1.3773948137495942, + "grad_norm": 0.1390652060508728, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 356310 + }, + { + "epoch": 1.3774334709529774, + "grad_norm": 0.12162581086158752, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 356320 + }, + { + "epoch": 1.3774721281563607, + "grad_norm": 0.09069560468196869, + "learning_rate": 0.002, + "loss": 2.3152, + "step": 356330 + }, + { + "epoch": 1.377510785359744, + "grad_norm": 0.11227056384086609, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 356340 + }, + { + "epoch": 1.3775494425631272, + "grad_norm": 0.11093819886445999, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 356350 + }, + { + "epoch": 1.3775880997665104, + "grad_norm": 0.09435505419969559, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 356360 + }, + { + "epoch": 1.3776267569698937, + "grad_norm": 0.10290578752756119, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 356370 + }, + { + "epoch": 1.377665414173277, + "grad_norm": 0.12918859720230103, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 356380 + }, + { + "epoch": 1.3777040713766604, + "grad_norm": 0.09759011119604111, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 356390 + }, + { + "epoch": 1.3777427285800437, + "grad_norm": 0.10140181332826614, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 356400 + }, + { + "epoch": 1.377781385783427, + "grad_norm": 0.0912579819560051, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 356410 + }, + { + "epoch": 1.3778200429868102, + "grad_norm": 0.10252673923969269, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 356420 + }, + { + "epoch": 1.3778587001901934, + "grad_norm": 0.11278831213712692, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 356430 + }, + { + "epoch": 1.3778973573935767, + "grad_norm": 0.1057935580611229, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 356440 + }, + { + "epoch": 1.3779360145969601, + "grad_norm": 0.09558992087841034, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 356450 + }, + { + "epoch": 1.3779746718003434, + "grad_norm": 0.09526700526475906, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 356460 + }, + { + "epoch": 1.3780133290037266, + "grad_norm": 0.10227543860673904, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 356470 + }, + { + "epoch": 1.37805198620711, + "grad_norm": 0.09496919810771942, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 356480 + }, + { + "epoch": 1.3780906434104931, + "grad_norm": 0.09326247125864029, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 356490 + }, + { + "epoch": 1.3781293006138764, + "grad_norm": 0.11264954507350922, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 356500 + }, + { + "epoch": 1.3781679578172596, + "grad_norm": 0.10654882341623306, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 356510 + }, + { + "epoch": 1.378206615020643, + "grad_norm": 0.08847260475158691, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 356520 + }, + { + "epoch": 1.3782452722240262, + "grad_norm": 0.14755184948444366, + "learning_rate": 0.002, + "loss": 2.318, + "step": 356530 + }, + { + "epoch": 1.3782839294274094, + "grad_norm": 0.1040208637714386, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 356540 + }, + { + "epoch": 1.3783225866307927, + "grad_norm": 0.08862719684839249, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 356550 + }, + { + "epoch": 1.3783612438341761, + "grad_norm": 0.10133068263530731, + "learning_rate": 0.002, + "loss": 2.348, + "step": 356560 + }, + { + "epoch": 1.3783999010375594, + "grad_norm": 0.09677974134683609, + "learning_rate": 0.002, + "loss": 2.337, + "step": 356570 + }, + { + "epoch": 1.3784385582409426, + "grad_norm": 0.12741930782794952, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 356580 + }, + { + "epoch": 1.3784772154443259, + "grad_norm": 0.12969206273555756, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 356590 + }, + { + "epoch": 1.3785158726477091, + "grad_norm": 0.10940320789813995, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 356600 + }, + { + "epoch": 1.3785545298510924, + "grad_norm": 0.1125948429107666, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 356610 + }, + { + "epoch": 1.3785931870544759, + "grad_norm": 0.10383554548025131, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 356620 + }, + { + "epoch": 1.3786318442578591, + "grad_norm": 0.100620336830616, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 356630 + }, + { + "epoch": 1.3786705014612424, + "grad_norm": 0.1173454001545906, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 356640 + }, + { + "epoch": 1.3787091586646256, + "grad_norm": 0.11233677715063095, + "learning_rate": 0.002, + "loss": 2.341, + "step": 356650 + }, + { + "epoch": 1.3787478158680089, + "grad_norm": 0.09890108555555344, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 356660 + }, + { + "epoch": 1.3787864730713921, + "grad_norm": 0.10544534772634506, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 356670 + }, + { + "epoch": 1.3788251302747754, + "grad_norm": 0.10439340025186539, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 356680 + }, + { + "epoch": 1.3788637874781586, + "grad_norm": 0.11262357234954834, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 356690 + }, + { + "epoch": 1.3789024446815419, + "grad_norm": 0.09836552292108536, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 356700 + }, + { + "epoch": 1.3789411018849251, + "grad_norm": 0.11344776302576065, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 356710 + }, + { + "epoch": 1.3789797590883084, + "grad_norm": 0.09220697730779648, + "learning_rate": 0.002, + "loss": 2.347, + "step": 356720 + }, + { + "epoch": 1.3790184162916919, + "grad_norm": 0.09780165553092957, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 356730 + }, + { + "epoch": 1.3790570734950751, + "grad_norm": 0.09286107122898102, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 356740 + }, + { + "epoch": 1.3790957306984584, + "grad_norm": 0.1060047447681427, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 356750 + }, + { + "epoch": 1.3791343879018416, + "grad_norm": 0.11789479851722717, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 356760 + }, + { + "epoch": 1.3791730451052249, + "grad_norm": 0.09493661671876907, + "learning_rate": 0.002, + "loss": 2.3534, + "step": 356770 + }, + { + "epoch": 1.3792117023086081, + "grad_norm": 0.13432522118091583, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 356780 + }, + { + "epoch": 1.3792503595119916, + "grad_norm": 0.10457032173871994, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 356790 + }, + { + "epoch": 1.3792890167153748, + "grad_norm": 0.12527017295360565, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 356800 + }, + { + "epoch": 1.379327673918758, + "grad_norm": 0.09906395524740219, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 356810 + }, + { + "epoch": 1.3793663311221414, + "grad_norm": 0.10826694965362549, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 356820 + }, + { + "epoch": 1.3794049883255246, + "grad_norm": 0.10286567360162735, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 356830 + }, + { + "epoch": 1.3794436455289079, + "grad_norm": 0.11156069487333298, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 356840 + }, + { + "epoch": 1.379482302732291, + "grad_norm": 0.10198958963155746, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 356850 + }, + { + "epoch": 1.3795209599356744, + "grad_norm": 0.08753270655870438, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 356860 + }, + { + "epoch": 1.3795596171390576, + "grad_norm": 0.11631825566291809, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 356870 + }, + { + "epoch": 1.3795982743424409, + "grad_norm": 0.0976443886756897, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 356880 + }, + { + "epoch": 1.3796369315458241, + "grad_norm": 0.09814205765724182, + "learning_rate": 0.002, + "loss": 2.337, + "step": 356890 + }, + { + "epoch": 1.3796755887492076, + "grad_norm": 0.10800322145223618, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 356900 + }, + { + "epoch": 1.3797142459525908, + "grad_norm": 0.09989839792251587, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 356910 + }, + { + "epoch": 1.379752903155974, + "grad_norm": 0.10988453030586243, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 356920 + }, + { + "epoch": 1.3797915603593573, + "grad_norm": 0.14615663886070251, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 356930 + }, + { + "epoch": 1.3798302175627406, + "grad_norm": 0.09930089116096497, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 356940 + }, + { + "epoch": 1.3798688747661239, + "grad_norm": 0.10417861491441727, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 356950 + }, + { + "epoch": 1.3799075319695073, + "grad_norm": 0.1246800571680069, + "learning_rate": 0.002, + "loss": 2.3546, + "step": 356960 + }, + { + "epoch": 1.3799461891728906, + "grad_norm": 0.1527402251958847, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 356970 + }, + { + "epoch": 1.3799848463762738, + "grad_norm": 0.1109328642487526, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 356980 + }, + { + "epoch": 1.380023503579657, + "grad_norm": 0.09175754338502884, + "learning_rate": 0.002, + "loss": 2.334, + "step": 356990 + }, + { + "epoch": 1.3800621607830403, + "grad_norm": 0.10926374047994614, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 357000 + }, + { + "epoch": 1.3801008179864236, + "grad_norm": 0.10283466428518295, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 357010 + }, + { + "epoch": 1.3801394751898068, + "grad_norm": 0.10054881870746613, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 357020 + }, + { + "epoch": 1.38017813239319, + "grad_norm": 0.10364895313978195, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 357030 + }, + { + "epoch": 1.3802167895965733, + "grad_norm": 0.10115136951208115, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 357040 + }, + { + "epoch": 1.3802554467999566, + "grad_norm": 0.10140955448150635, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 357050 + }, + { + "epoch": 1.3802941040033399, + "grad_norm": 0.11943522095680237, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 357060 + }, + { + "epoch": 1.3803327612067233, + "grad_norm": 0.10424049943685532, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 357070 + }, + { + "epoch": 1.3803714184101066, + "grad_norm": 0.1278669834136963, + "learning_rate": 0.002, + "loss": 2.3128, + "step": 357080 + }, + { + "epoch": 1.3804100756134898, + "grad_norm": 0.11559665203094482, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 357090 + }, + { + "epoch": 1.380448732816873, + "grad_norm": 0.10717646032571793, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 357100 + }, + { + "epoch": 1.3804873900202563, + "grad_norm": 0.0966046005487442, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 357110 + }, + { + "epoch": 1.3805260472236396, + "grad_norm": 0.09488651901483536, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 357120 + }, + { + "epoch": 1.380564704427023, + "grad_norm": 0.09918989986181259, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 357130 + }, + { + "epoch": 1.3806033616304063, + "grad_norm": 0.12172359973192215, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 357140 + }, + { + "epoch": 1.3806420188337896, + "grad_norm": 0.10451477020978928, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 357150 + }, + { + "epoch": 1.3806806760371728, + "grad_norm": 0.10900402814149857, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 357160 + }, + { + "epoch": 1.380719333240556, + "grad_norm": 0.10119115561246872, + "learning_rate": 0.002, + "loss": 2.349, + "step": 357170 + }, + { + "epoch": 1.3807579904439393, + "grad_norm": 0.10229839384555817, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 357180 + }, + { + "epoch": 1.3807966476473226, + "grad_norm": 0.11427944153547287, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 357190 + }, + { + "epoch": 1.3808353048507058, + "grad_norm": 0.11322027444839478, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 357200 + }, + { + "epoch": 1.380873962054089, + "grad_norm": 0.10636843740940094, + "learning_rate": 0.002, + "loss": 2.344, + "step": 357210 + }, + { + "epoch": 1.3809126192574723, + "grad_norm": 0.09347525238990784, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 357220 + }, + { + "epoch": 1.3809512764608556, + "grad_norm": 0.11774121224880219, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 357230 + }, + { + "epoch": 1.380989933664239, + "grad_norm": 0.0972491055727005, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 357240 + }, + { + "epoch": 1.3810285908676223, + "grad_norm": 0.10950019210577011, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 357250 + }, + { + "epoch": 1.3810672480710056, + "grad_norm": 0.09386036545038223, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 357260 + }, + { + "epoch": 1.3811059052743888, + "grad_norm": 0.10270935297012329, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 357270 + }, + { + "epoch": 1.381144562477772, + "grad_norm": 0.11999564617872238, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 357280 + }, + { + "epoch": 1.3811832196811553, + "grad_norm": 0.09688630700111389, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 357290 + }, + { + "epoch": 1.3812218768845388, + "grad_norm": 0.10284639149904251, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 357300 + }, + { + "epoch": 1.381260534087922, + "grad_norm": 0.09171400219202042, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 357310 + }, + { + "epoch": 1.3812991912913053, + "grad_norm": 0.09951532632112503, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 357320 + }, + { + "epoch": 1.3813378484946885, + "grad_norm": 0.08959772437810898, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 357330 + }, + { + "epoch": 1.3813765056980718, + "grad_norm": 0.12968987226486206, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 357340 + }, + { + "epoch": 1.381415162901455, + "grad_norm": 0.13226859271526337, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 357350 + }, + { + "epoch": 1.3814538201048383, + "grad_norm": 0.11036866158246994, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 357360 + }, + { + "epoch": 1.3814924773082216, + "grad_norm": 0.10080908238887787, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 357370 + }, + { + "epoch": 1.3815311345116048, + "grad_norm": 0.10770279169082642, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 357380 + }, + { + "epoch": 1.381569791714988, + "grad_norm": 0.12061317265033722, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 357390 + }, + { + "epoch": 1.3816084489183713, + "grad_norm": 0.09101912379264832, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 357400 + }, + { + "epoch": 1.3816471061217548, + "grad_norm": 0.11742638051509857, + "learning_rate": 0.002, + "loss": 2.348, + "step": 357410 + }, + { + "epoch": 1.381685763325138, + "grad_norm": 0.10257943719625473, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 357420 + }, + { + "epoch": 1.3817244205285213, + "grad_norm": 0.10319139808416367, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 357430 + }, + { + "epoch": 1.3817630777319045, + "grad_norm": 0.09902940690517426, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 357440 + }, + { + "epoch": 1.3818017349352878, + "grad_norm": 0.125919371843338, + "learning_rate": 0.002, + "loss": 2.327, + "step": 357450 + }, + { + "epoch": 1.381840392138671, + "grad_norm": 0.10221952944993973, + "learning_rate": 0.002, + "loss": 2.3128, + "step": 357460 + }, + { + "epoch": 1.3818790493420545, + "grad_norm": 0.10029791295528412, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 357470 + }, + { + "epoch": 1.3819177065454378, + "grad_norm": 0.1684672236442566, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 357480 + }, + { + "epoch": 1.381956363748821, + "grad_norm": 0.10184746235609055, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 357490 + }, + { + "epoch": 1.3819950209522043, + "grad_norm": 0.10539903491735458, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 357500 + }, + { + "epoch": 1.3820336781555875, + "grad_norm": 0.10304230451583862, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 357510 + }, + { + "epoch": 1.3820723353589708, + "grad_norm": 0.11429215967655182, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 357520 + }, + { + "epoch": 1.382110992562354, + "grad_norm": 0.08892321586608887, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 357530 + }, + { + "epoch": 1.3821496497657373, + "grad_norm": 0.08578456193208694, + "learning_rate": 0.002, + "loss": 2.338, + "step": 357540 + }, + { + "epoch": 1.3821883069691205, + "grad_norm": 0.09519366919994354, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 357550 + }, + { + "epoch": 1.3822269641725038, + "grad_norm": 0.08658157289028168, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 357560 + }, + { + "epoch": 1.3822656213758873, + "grad_norm": 0.1024087518453598, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 357570 + }, + { + "epoch": 1.3823042785792705, + "grad_norm": 0.11436939984560013, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 357580 + }, + { + "epoch": 1.3823429357826538, + "grad_norm": 0.11564888060092926, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 357590 + }, + { + "epoch": 1.382381592986037, + "grad_norm": 0.10661925375461578, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 357600 + }, + { + "epoch": 1.3824202501894203, + "grad_norm": 0.11697540432214737, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 357610 + }, + { + "epoch": 1.3824589073928035, + "grad_norm": 0.10966644436120987, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 357620 + }, + { + "epoch": 1.3824975645961868, + "grad_norm": 0.11463762074708939, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 357630 + }, + { + "epoch": 1.3825362217995703, + "grad_norm": 0.09555447101593018, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 357640 + }, + { + "epoch": 1.3825748790029535, + "grad_norm": 0.13319911062717438, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 357650 + }, + { + "epoch": 1.3826135362063368, + "grad_norm": 0.1469419002532959, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 357660 + }, + { + "epoch": 1.38265219340972, + "grad_norm": 0.11750228703022003, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 357670 + }, + { + "epoch": 1.3826908506131033, + "grad_norm": 0.1029818207025528, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 357680 + }, + { + "epoch": 1.3827295078164865, + "grad_norm": 0.1653258204460144, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 357690 + }, + { + "epoch": 1.3827681650198698, + "grad_norm": 0.09607890993356705, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 357700 + }, + { + "epoch": 1.382806822223253, + "grad_norm": 0.09096881747245789, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 357710 + }, + { + "epoch": 1.3828454794266363, + "grad_norm": 0.10659847408533096, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 357720 + }, + { + "epoch": 1.3828841366300195, + "grad_norm": 0.0992022231221199, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 357730 + }, + { + "epoch": 1.382922793833403, + "grad_norm": 0.09695616364479065, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 357740 + }, + { + "epoch": 1.3829614510367862, + "grad_norm": 0.08763127774000168, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 357750 + }, + { + "epoch": 1.3830001082401695, + "grad_norm": 0.09782769531011581, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 357760 + }, + { + "epoch": 1.3830387654435528, + "grad_norm": 0.1507205367088318, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 357770 + }, + { + "epoch": 1.383077422646936, + "grad_norm": 0.10128536820411682, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 357780 + }, + { + "epoch": 1.3831160798503193, + "grad_norm": 0.09506537765264511, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 357790 + }, + { + "epoch": 1.3831547370537025, + "grad_norm": 0.09646282345056534, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 357800 + }, + { + "epoch": 1.383193394257086, + "grad_norm": 0.10690893232822418, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 357810 + }, + { + "epoch": 1.3832320514604692, + "grad_norm": 0.11854461580514908, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 357820 + }, + { + "epoch": 1.3832707086638525, + "grad_norm": 0.11158133298158646, + "learning_rate": 0.002, + "loss": 2.336, + "step": 357830 + }, + { + "epoch": 1.3833093658672357, + "grad_norm": 0.09612828493118286, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 357840 + }, + { + "epoch": 1.383348023070619, + "grad_norm": 0.10760582238435745, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 357850 + }, + { + "epoch": 1.3833866802740022, + "grad_norm": 0.12931786477565765, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 357860 + }, + { + "epoch": 1.3834253374773855, + "grad_norm": 0.10564881563186646, + "learning_rate": 0.002, + "loss": 2.341, + "step": 357870 + }, + { + "epoch": 1.3834639946807687, + "grad_norm": 0.10905144363641739, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 357880 + }, + { + "epoch": 1.383502651884152, + "grad_norm": 0.09472287446260452, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 357890 + }, + { + "epoch": 1.3835413090875353, + "grad_norm": 0.1405135542154312, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 357900 + }, + { + "epoch": 1.3835799662909187, + "grad_norm": 0.11076802015304565, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 357910 + }, + { + "epoch": 1.383618623494302, + "grad_norm": 0.10980372130870819, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 357920 + }, + { + "epoch": 1.3836572806976852, + "grad_norm": 0.1058148592710495, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 357930 + }, + { + "epoch": 1.3836959379010685, + "grad_norm": 0.09596095979213715, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 357940 + }, + { + "epoch": 1.3837345951044517, + "grad_norm": 0.11825399100780487, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 357950 + }, + { + "epoch": 1.383773252307835, + "grad_norm": 0.10999801754951477, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 357960 + }, + { + "epoch": 1.3838119095112182, + "grad_norm": 0.10320445150136948, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 357970 + }, + { + "epoch": 1.3838505667146017, + "grad_norm": 0.12310822308063507, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 357980 + }, + { + "epoch": 1.383889223917985, + "grad_norm": 0.104934923350811, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 357990 + }, + { + "epoch": 1.3839278811213682, + "grad_norm": 0.10744967311620712, + "learning_rate": 0.002, + "loss": 2.3574, + "step": 358000 + }, + { + "epoch": 1.3839665383247515, + "grad_norm": 0.11633177101612091, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 358010 + }, + { + "epoch": 1.3840051955281347, + "grad_norm": 0.12066573649644852, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 358020 + }, + { + "epoch": 1.384043852731518, + "grad_norm": 0.12387596070766449, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 358030 + }, + { + "epoch": 1.3840825099349012, + "grad_norm": 0.10241378098726273, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 358040 + }, + { + "epoch": 1.3841211671382845, + "grad_norm": 0.09486771374940872, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 358050 + }, + { + "epoch": 1.3841598243416677, + "grad_norm": 0.11558888107538223, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 358060 + }, + { + "epoch": 1.384198481545051, + "grad_norm": 0.10875887423753738, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 358070 + }, + { + "epoch": 1.3842371387484345, + "grad_norm": 0.11380153149366379, + "learning_rate": 0.002, + "loss": 2.343, + "step": 358080 + }, + { + "epoch": 1.3842757959518177, + "grad_norm": 0.11291273683309555, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 358090 + }, + { + "epoch": 1.384314453155201, + "grad_norm": 0.12594982981681824, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 358100 + }, + { + "epoch": 1.3843531103585842, + "grad_norm": 0.10149511694908142, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 358110 + }, + { + "epoch": 1.3843917675619675, + "grad_norm": 0.09986498206853867, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 358120 + }, + { + "epoch": 1.3844304247653507, + "grad_norm": 0.09570245444774628, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 358130 + }, + { + "epoch": 1.384469081968734, + "grad_norm": 0.11640851199626923, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 358140 + }, + { + "epoch": 1.3845077391721174, + "grad_norm": 0.11998428404331207, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 358150 + }, + { + "epoch": 1.3845463963755007, + "grad_norm": 0.10228558629751205, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 358160 + }, + { + "epoch": 1.384585053578884, + "grad_norm": 0.09900269657373428, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 358170 + }, + { + "epoch": 1.3846237107822672, + "grad_norm": 0.10694219172000885, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 358180 + }, + { + "epoch": 1.3846623679856505, + "grad_norm": 0.0999547615647316, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 358190 + }, + { + "epoch": 1.3847010251890337, + "grad_norm": 0.11348982900381088, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 358200 + }, + { + "epoch": 1.384739682392417, + "grad_norm": 0.09742119908332825, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 358210 + }, + { + "epoch": 1.3847783395958002, + "grad_norm": 0.11021881550550461, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 358220 + }, + { + "epoch": 1.3848169967991835, + "grad_norm": 0.09835400432348251, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 358230 + }, + { + "epoch": 1.3848556540025667, + "grad_norm": 0.10762475430965424, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 358240 + }, + { + "epoch": 1.3848943112059502, + "grad_norm": 0.0981389731168747, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 358250 + }, + { + "epoch": 1.3849329684093334, + "grad_norm": 0.12566789984703064, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 358260 + }, + { + "epoch": 1.3849716256127167, + "grad_norm": 0.10963409394025803, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 358270 + }, + { + "epoch": 1.3850102828161, + "grad_norm": 0.10804201662540436, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 358280 + }, + { + "epoch": 1.3850489400194832, + "grad_norm": 0.10372433066368103, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 358290 + }, + { + "epoch": 1.3850875972228665, + "grad_norm": 0.10548718273639679, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 358300 + }, + { + "epoch": 1.38512625442625, + "grad_norm": 0.09937731176614761, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 358310 + }, + { + "epoch": 1.3851649116296332, + "grad_norm": 0.10259462893009186, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 358320 + }, + { + "epoch": 1.3852035688330164, + "grad_norm": 0.10795610398054123, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 358330 + }, + { + "epoch": 1.3852422260363997, + "grad_norm": 0.0990600511431694, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 358340 + }, + { + "epoch": 1.385280883239783, + "grad_norm": 0.10372160375118256, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 358350 + }, + { + "epoch": 1.3853195404431662, + "grad_norm": 0.10864154994487762, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 358360 + }, + { + "epoch": 1.3853581976465494, + "grad_norm": 0.10648924112319946, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 358370 + }, + { + "epoch": 1.3853968548499327, + "grad_norm": 0.1119183599948883, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 358380 + }, + { + "epoch": 1.385435512053316, + "grad_norm": 0.525481104850769, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 358390 + }, + { + "epoch": 1.3854741692566992, + "grad_norm": 0.1384838968515396, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 358400 + }, + { + "epoch": 1.3855128264600824, + "grad_norm": 0.09210643172264099, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 358410 + }, + { + "epoch": 1.385551483663466, + "grad_norm": 0.08706945180892944, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 358420 + }, + { + "epoch": 1.3855901408668492, + "grad_norm": 0.10544677823781967, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 358430 + }, + { + "epoch": 1.3856287980702324, + "grad_norm": 0.12038781493902206, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 358440 + }, + { + "epoch": 1.3856674552736157, + "grad_norm": 0.10919291526079178, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 358450 + }, + { + "epoch": 1.385706112476999, + "grad_norm": 0.10312280058860779, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 358460 + }, + { + "epoch": 1.3857447696803822, + "grad_norm": 0.09364982694387436, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 358470 + }, + { + "epoch": 1.3857834268837657, + "grad_norm": 0.12228646129369736, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 358480 + }, + { + "epoch": 1.385822084087149, + "grad_norm": 0.09567874670028687, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 358490 + }, + { + "epoch": 1.3858607412905322, + "grad_norm": 0.1218157559633255, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 358500 + }, + { + "epoch": 1.3858993984939154, + "grad_norm": 0.12315016239881516, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 358510 + }, + { + "epoch": 1.3859380556972987, + "grad_norm": 0.11362304538488388, + "learning_rate": 0.002, + "loss": 2.341, + "step": 358520 + }, + { + "epoch": 1.385976712900682, + "grad_norm": 0.10522307455539703, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 358530 + }, + { + "epoch": 1.3860153701040652, + "grad_norm": 0.09985428303480148, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 358540 + }, + { + "epoch": 1.3860540273074484, + "grad_norm": 0.10328204929828644, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 358550 + }, + { + "epoch": 1.3860926845108317, + "grad_norm": 0.10569297522306442, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 358560 + }, + { + "epoch": 1.386131341714215, + "grad_norm": 0.10771090537309647, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 358570 + }, + { + "epoch": 1.3861699989175982, + "grad_norm": 0.11412060260772705, + "learning_rate": 0.002, + "loss": 2.3132, + "step": 358580 + }, + { + "epoch": 1.3862086561209817, + "grad_norm": 0.10503178834915161, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 358590 + }, + { + "epoch": 1.386247313324365, + "grad_norm": 0.10429660230875015, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 358600 + }, + { + "epoch": 1.3862859705277482, + "grad_norm": 0.09722950309515, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 358610 + }, + { + "epoch": 1.3863246277311314, + "grad_norm": 0.10863056033849716, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 358620 + }, + { + "epoch": 1.3863632849345147, + "grad_norm": 0.09604120999574661, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 358630 + }, + { + "epoch": 1.386401942137898, + "grad_norm": 0.12591049075126648, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 358640 + }, + { + "epoch": 1.3864405993412814, + "grad_norm": 0.12707343697547913, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 358650 + }, + { + "epoch": 1.3864792565446646, + "grad_norm": 0.09873824566602707, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 358660 + }, + { + "epoch": 1.386517913748048, + "grad_norm": 0.1484958827495575, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 358670 + }, + { + "epoch": 1.3865565709514311, + "grad_norm": 0.10666509717702866, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 358680 + }, + { + "epoch": 1.3865952281548144, + "grad_norm": 0.1105133444070816, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 358690 + }, + { + "epoch": 1.3866338853581976, + "grad_norm": 0.0936323031783104, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 358700 + }, + { + "epoch": 1.386672542561581, + "grad_norm": 0.09979099780321121, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 358710 + }, + { + "epoch": 1.3867111997649642, + "grad_norm": 0.0981900617480278, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 358720 + }, + { + "epoch": 1.3867498569683474, + "grad_norm": 0.11693184822797775, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 358730 + }, + { + "epoch": 1.3867885141717307, + "grad_norm": 0.10722755640745163, + "learning_rate": 0.002, + "loss": 2.328, + "step": 358740 + }, + { + "epoch": 1.386827171375114, + "grad_norm": 0.10744725167751312, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 358750 + }, + { + "epoch": 1.3868658285784974, + "grad_norm": 0.09629779309034348, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 358760 + }, + { + "epoch": 1.3869044857818806, + "grad_norm": 0.09576866030693054, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 358770 + }, + { + "epoch": 1.3869431429852639, + "grad_norm": 0.11830944567918777, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 358780 + }, + { + "epoch": 1.3869818001886471, + "grad_norm": 0.1023031547665596, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 358790 + }, + { + "epoch": 1.3870204573920304, + "grad_norm": 0.09811899065971375, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 358800 + }, + { + "epoch": 1.3870591145954136, + "grad_norm": 0.11076534539461136, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 358810 + }, + { + "epoch": 1.3870977717987971, + "grad_norm": 0.11109668761491776, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 358820 + }, + { + "epoch": 1.3871364290021804, + "grad_norm": 0.10413765162229538, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 358830 + }, + { + "epoch": 1.3871750862055636, + "grad_norm": 0.11741053313016891, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 358840 + }, + { + "epoch": 1.3872137434089469, + "grad_norm": 0.09895078837871552, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 358850 + }, + { + "epoch": 1.3872524006123301, + "grad_norm": 0.10803353041410446, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 358860 + }, + { + "epoch": 1.3872910578157134, + "grad_norm": 0.16418716311454773, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 358870 + }, + { + "epoch": 1.3873297150190966, + "grad_norm": 0.11566440016031265, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 358880 + }, + { + "epoch": 1.3873683722224799, + "grad_norm": 0.0987687036395073, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 358890 + }, + { + "epoch": 1.3874070294258631, + "grad_norm": 0.09825720638036728, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 358900 + }, + { + "epoch": 1.3874456866292464, + "grad_norm": 0.12824301421642303, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 358910 + }, + { + "epoch": 1.3874843438326296, + "grad_norm": 0.11537756025791168, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 358920 + }, + { + "epoch": 1.3875230010360131, + "grad_norm": 0.09872827678918839, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 358930 + }, + { + "epoch": 1.3875616582393964, + "grad_norm": 0.10614890605211258, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 358940 + }, + { + "epoch": 1.3876003154427796, + "grad_norm": 0.1058836579322815, + "learning_rate": 0.002, + "loss": 2.339, + "step": 358950 + }, + { + "epoch": 1.3876389726461629, + "grad_norm": 0.36678752303123474, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 358960 + }, + { + "epoch": 1.3876776298495461, + "grad_norm": 0.12122151255607605, + "learning_rate": 0.002, + "loss": 2.324, + "step": 358970 + }, + { + "epoch": 1.3877162870529294, + "grad_norm": 0.12098393589258194, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 358980 + }, + { + "epoch": 1.3877549442563129, + "grad_norm": 0.10260053724050522, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 358990 + }, + { + "epoch": 1.387793601459696, + "grad_norm": 0.1203232929110527, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 359000 + }, + { + "epoch": 1.3878322586630794, + "grad_norm": 0.10592683404684067, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 359010 + }, + { + "epoch": 1.3878709158664626, + "grad_norm": 0.09129156172275543, + "learning_rate": 0.002, + "loss": 2.338, + "step": 359020 + }, + { + "epoch": 1.3879095730698459, + "grad_norm": 0.11400771886110306, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 359030 + }, + { + "epoch": 1.387948230273229, + "grad_norm": 0.10435612499713898, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 359040 + }, + { + "epoch": 1.3879868874766124, + "grad_norm": 0.11092580854892731, + "learning_rate": 0.002, + "loss": 2.335, + "step": 359050 + }, + { + "epoch": 1.3880255446799956, + "grad_norm": 0.10389309376478195, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 359060 + }, + { + "epoch": 1.3880642018833789, + "grad_norm": 0.11955828219652176, + "learning_rate": 0.002, + "loss": 2.332, + "step": 359070 + }, + { + "epoch": 1.3881028590867621, + "grad_norm": 0.0991957038640976, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 359080 + }, + { + "epoch": 1.3881415162901454, + "grad_norm": 0.11172988265752792, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 359090 + }, + { + "epoch": 1.3881801734935288, + "grad_norm": 0.10796627402305603, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 359100 + }, + { + "epoch": 1.388218830696912, + "grad_norm": 0.0978311151266098, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 359110 + }, + { + "epoch": 1.3882574879002954, + "grad_norm": 0.0951458290219307, + "learning_rate": 0.002, + "loss": 2.3105, + "step": 359120 + }, + { + "epoch": 1.3882961451036786, + "grad_norm": 0.1115274578332901, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 359130 + }, + { + "epoch": 1.3883348023070619, + "grad_norm": 0.10567060858011246, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 359140 + }, + { + "epoch": 1.388373459510445, + "grad_norm": 0.09773063659667969, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 359150 + }, + { + "epoch": 1.3884121167138286, + "grad_norm": 0.10530774295330048, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 359160 + }, + { + "epoch": 1.3884507739172118, + "grad_norm": 0.11766993254423141, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 359170 + }, + { + "epoch": 1.388489431120595, + "grad_norm": 0.10456059873104095, + "learning_rate": 0.002, + "loss": 2.316, + "step": 359180 + }, + { + "epoch": 1.3885280883239783, + "grad_norm": 0.11992094665765762, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 359190 + }, + { + "epoch": 1.3885667455273616, + "grad_norm": 0.10693403333425522, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 359200 + }, + { + "epoch": 1.3886054027307448, + "grad_norm": 0.11005581170320511, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 359210 + }, + { + "epoch": 1.388644059934128, + "grad_norm": 0.08628341555595398, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 359220 + }, + { + "epoch": 1.3886827171375113, + "grad_norm": 0.09540165960788727, + "learning_rate": 0.002, + "loss": 2.342, + "step": 359230 + }, + { + "epoch": 1.3887213743408946, + "grad_norm": 0.10182127356529236, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 359240 + }, + { + "epoch": 1.3887600315442779, + "grad_norm": 0.08753172308206558, + "learning_rate": 0.002, + "loss": 2.326, + "step": 359250 + }, + { + "epoch": 1.388798688747661, + "grad_norm": 0.0912495106458664, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 359260 + }, + { + "epoch": 1.3888373459510446, + "grad_norm": 0.1422264277935028, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 359270 + }, + { + "epoch": 1.3888760031544278, + "grad_norm": 0.0967712327837944, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 359280 + }, + { + "epoch": 1.388914660357811, + "grad_norm": 0.11819314956665039, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 359290 + }, + { + "epoch": 1.3889533175611943, + "grad_norm": 0.1206083670258522, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 359300 + }, + { + "epoch": 1.3889919747645776, + "grad_norm": 0.11072078347206116, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 359310 + }, + { + "epoch": 1.3890306319679608, + "grad_norm": 0.10273478180170059, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 359320 + }, + { + "epoch": 1.3890692891713443, + "grad_norm": 0.09189434349536896, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 359330 + }, + { + "epoch": 1.3891079463747276, + "grad_norm": 0.09685275703668594, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 359340 + }, + { + "epoch": 1.3891466035781108, + "grad_norm": 0.0961252748966217, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 359350 + }, + { + "epoch": 1.389185260781494, + "grad_norm": 0.10798119753599167, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 359360 + }, + { + "epoch": 1.3892239179848773, + "grad_norm": 0.13294091820716858, + "learning_rate": 0.002, + "loss": 2.338, + "step": 359370 + }, + { + "epoch": 1.3892625751882606, + "grad_norm": 0.10411449521780014, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 359380 + }, + { + "epoch": 1.3893012323916438, + "grad_norm": 0.09397763758897781, + "learning_rate": 0.002, + "loss": 2.347, + "step": 359390 + }, + { + "epoch": 1.389339889595027, + "grad_norm": 0.11398205906152725, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 359400 + }, + { + "epoch": 1.3893785467984103, + "grad_norm": 0.11523891240358353, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 359410 + }, + { + "epoch": 1.3894172040017936, + "grad_norm": 0.10378719866275787, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 359420 + }, + { + "epoch": 1.3894558612051768, + "grad_norm": 0.10406482219696045, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 359430 + }, + { + "epoch": 1.3894945184085603, + "grad_norm": 0.0997418463230133, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 359440 + }, + { + "epoch": 1.3895331756119436, + "grad_norm": 0.10954898595809937, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 359450 + }, + { + "epoch": 1.3895718328153268, + "grad_norm": 0.1074637845158577, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 359460 + }, + { + "epoch": 1.38961049001871, + "grad_norm": 0.09442642331123352, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 359470 + }, + { + "epoch": 1.3896491472220933, + "grad_norm": 0.1044730693101883, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 359480 + }, + { + "epoch": 1.3896878044254766, + "grad_norm": 0.1017158254981041, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 359490 + }, + { + "epoch": 1.38972646162886, + "grad_norm": 0.09050361067056656, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 359500 + }, + { + "epoch": 1.3897651188322433, + "grad_norm": 0.11728296428918839, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 359510 + }, + { + "epoch": 1.3898037760356265, + "grad_norm": 0.13631664216518402, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 359520 + }, + { + "epoch": 1.3898424332390098, + "grad_norm": 0.11105327308177948, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 359530 + }, + { + "epoch": 1.389881090442393, + "grad_norm": 0.10104014724493027, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 359540 + }, + { + "epoch": 1.3899197476457763, + "grad_norm": 0.10071936994791031, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 359550 + }, + { + "epoch": 1.3899584048491596, + "grad_norm": 0.10410552471876144, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 359560 + }, + { + "epoch": 1.3899970620525428, + "grad_norm": 0.10601193457841873, + "learning_rate": 0.002, + "loss": 2.341, + "step": 359570 + }, + { + "epoch": 1.390035719255926, + "grad_norm": 0.09772384911775589, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 359580 + }, + { + "epoch": 1.3900743764593093, + "grad_norm": 0.095488540828228, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 359590 + }, + { + "epoch": 1.3901130336626928, + "grad_norm": 0.119688019156456, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 359600 + }, + { + "epoch": 1.390151690866076, + "grad_norm": 0.11805914342403412, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 359610 + }, + { + "epoch": 1.3901903480694593, + "grad_norm": 0.08788260817527771, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 359620 + }, + { + "epoch": 1.3902290052728425, + "grad_norm": 0.09670418500900269, + "learning_rate": 0.002, + "loss": 2.3595, + "step": 359630 + }, + { + "epoch": 1.3902676624762258, + "grad_norm": 0.09463132172822952, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 359640 + }, + { + "epoch": 1.390306319679609, + "grad_norm": 0.12268111854791641, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 359650 + }, + { + "epoch": 1.3903449768829923, + "grad_norm": 0.09824145585298538, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 359660 + }, + { + "epoch": 1.3903836340863758, + "grad_norm": 0.09898674488067627, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 359670 + }, + { + "epoch": 1.390422291289759, + "grad_norm": 0.1264234334230423, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 359680 + }, + { + "epoch": 1.3904609484931423, + "grad_norm": 0.10478637367486954, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 359690 + }, + { + "epoch": 1.3904996056965255, + "grad_norm": 0.10049188882112503, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 359700 + }, + { + "epoch": 1.3905382628999088, + "grad_norm": 0.11678900569677353, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 359710 + }, + { + "epoch": 1.390576920103292, + "grad_norm": 0.09387555718421936, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 359720 + }, + { + "epoch": 1.3906155773066753, + "grad_norm": 0.13321635127067566, + "learning_rate": 0.002, + "loss": 2.322, + "step": 359730 + }, + { + "epoch": 1.3906542345100585, + "grad_norm": 0.11627347767353058, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 359740 + }, + { + "epoch": 1.3906928917134418, + "grad_norm": 0.23211270570755005, + "learning_rate": 0.002, + "loss": 2.339, + "step": 359750 + }, + { + "epoch": 1.390731548916825, + "grad_norm": 0.10206664353609085, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 359760 + }, + { + "epoch": 1.3907702061202085, + "grad_norm": 0.09255781769752502, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 359770 + }, + { + "epoch": 1.3908088633235918, + "grad_norm": 0.10072731226682663, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 359780 + }, + { + "epoch": 1.390847520526975, + "grad_norm": 0.09593206644058228, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 359790 + }, + { + "epoch": 1.3908861777303583, + "grad_norm": 0.1118595078587532, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 359800 + }, + { + "epoch": 1.3909248349337415, + "grad_norm": 0.10689254850149155, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 359810 + }, + { + "epoch": 1.3909634921371248, + "grad_norm": 0.10757198929786682, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 359820 + }, + { + "epoch": 1.391002149340508, + "grad_norm": 0.09484315663576126, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 359830 + }, + { + "epoch": 1.3910408065438915, + "grad_norm": 0.10556916147470474, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 359840 + }, + { + "epoch": 1.3910794637472748, + "grad_norm": 0.09785880893468857, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 359850 + }, + { + "epoch": 1.391118120950658, + "grad_norm": 0.12440207600593567, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 359860 + }, + { + "epoch": 1.3911567781540413, + "grad_norm": 0.1069926843047142, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 359870 + }, + { + "epoch": 1.3911954353574245, + "grad_norm": 0.10705050081014633, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 359880 + }, + { + "epoch": 1.3912340925608078, + "grad_norm": 0.09222240746021271, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 359890 + }, + { + "epoch": 1.391272749764191, + "grad_norm": 0.10790204256772995, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 359900 + }, + { + "epoch": 1.3913114069675743, + "grad_norm": 0.12623713910579681, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 359910 + }, + { + "epoch": 1.3913500641709575, + "grad_norm": 0.10312226414680481, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 359920 + }, + { + "epoch": 1.3913887213743408, + "grad_norm": 0.0920531153678894, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 359930 + }, + { + "epoch": 1.3914273785777242, + "grad_norm": 0.10971745103597641, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 359940 + }, + { + "epoch": 1.3914660357811075, + "grad_norm": 0.1152036041021347, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 359950 + }, + { + "epoch": 1.3915046929844908, + "grad_norm": 0.09976490586996078, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 359960 + }, + { + "epoch": 1.391543350187874, + "grad_norm": 0.10785773396492004, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 359970 + }, + { + "epoch": 1.3915820073912573, + "grad_norm": 0.09680186957120895, + "learning_rate": 0.002, + "loss": 2.34, + "step": 359980 + }, + { + "epoch": 1.3916206645946405, + "grad_norm": 0.11459063738584518, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 359990 + }, + { + "epoch": 1.3916593217980238, + "grad_norm": 0.09679178148508072, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 360000 + }, + { + "epoch": 1.3916979790014072, + "grad_norm": 0.14078864455223083, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 360010 + }, + { + "epoch": 1.3917366362047905, + "grad_norm": 0.10744940489530563, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 360020 + }, + { + "epoch": 1.3917752934081737, + "grad_norm": 0.10858958959579468, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 360030 + }, + { + "epoch": 1.391813950611557, + "grad_norm": 0.11119197309017181, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 360040 + }, + { + "epoch": 1.3918526078149402, + "grad_norm": 0.09662743657827377, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 360050 + }, + { + "epoch": 1.3918912650183235, + "grad_norm": 0.10182948410511017, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 360060 + }, + { + "epoch": 1.3919299222217068, + "grad_norm": 0.10580704361200333, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 360070 + }, + { + "epoch": 1.39196857942509, + "grad_norm": 0.09505116194486618, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 360080 + }, + { + "epoch": 1.3920072366284733, + "grad_norm": 0.10502129048109055, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 360090 + }, + { + "epoch": 1.3920458938318565, + "grad_norm": 0.11738190054893494, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 360100 + }, + { + "epoch": 1.39208455103524, + "grad_norm": 0.09682303667068481, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 360110 + }, + { + "epoch": 1.3921232082386232, + "grad_norm": 0.10412992537021637, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 360120 + }, + { + "epoch": 1.3921618654420065, + "grad_norm": 0.08938931673765182, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 360130 + }, + { + "epoch": 1.3922005226453897, + "grad_norm": 0.10273377597332001, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 360140 + }, + { + "epoch": 1.392239179848773, + "grad_norm": 0.11520109325647354, + "learning_rate": 0.002, + "loss": 2.3136, + "step": 360150 + }, + { + "epoch": 1.3922778370521562, + "grad_norm": 0.10236816108226776, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 360160 + }, + { + "epoch": 1.3923164942555395, + "grad_norm": 0.10257626324892044, + "learning_rate": 0.002, + "loss": 2.357, + "step": 360170 + }, + { + "epoch": 1.392355151458923, + "grad_norm": 0.09983783960342407, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 360180 + }, + { + "epoch": 1.3923938086623062, + "grad_norm": 0.10540879517793655, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 360190 + }, + { + "epoch": 1.3924324658656895, + "grad_norm": 0.1019587367773056, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 360200 + }, + { + "epoch": 1.3924711230690727, + "grad_norm": 0.11383510380983353, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 360210 + }, + { + "epoch": 1.392509780272456, + "grad_norm": 0.10606590658426285, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 360220 + }, + { + "epoch": 1.3925484374758392, + "grad_norm": 0.1148306354880333, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 360230 + }, + { + "epoch": 1.3925870946792225, + "grad_norm": 0.10644423961639404, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 360240 + }, + { + "epoch": 1.3926257518826057, + "grad_norm": 0.117405965924263, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 360250 + }, + { + "epoch": 1.392664409085989, + "grad_norm": 0.12969310581684113, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 360260 + }, + { + "epoch": 1.3927030662893722, + "grad_norm": 0.10582044720649719, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 360270 + }, + { + "epoch": 1.3927417234927557, + "grad_norm": 0.1080719605088234, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 360280 + }, + { + "epoch": 1.392780380696139, + "grad_norm": 0.11880721896886826, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 360290 + }, + { + "epoch": 1.3928190378995222, + "grad_norm": 0.0958956629037857, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 360300 + }, + { + "epoch": 1.3928576951029055, + "grad_norm": 0.11437147110700607, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 360310 + }, + { + "epoch": 1.3928963523062887, + "grad_norm": 0.09938332438468933, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 360320 + }, + { + "epoch": 1.392935009509672, + "grad_norm": 0.10863950103521347, + "learning_rate": 0.002, + "loss": 2.334, + "step": 360330 + }, + { + "epoch": 1.3929736667130554, + "grad_norm": 0.1936277449131012, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 360340 + }, + { + "epoch": 1.3930123239164387, + "grad_norm": 0.09787117689847946, + "learning_rate": 0.002, + "loss": 2.334, + "step": 360350 + }, + { + "epoch": 1.393050981119822, + "grad_norm": 0.09849102050065994, + "learning_rate": 0.002, + "loss": 2.314, + "step": 360360 + }, + { + "epoch": 1.3930896383232052, + "grad_norm": 0.10864541679620743, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 360370 + }, + { + "epoch": 1.3931282955265885, + "grad_norm": 0.11145168542861938, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 360380 + }, + { + "epoch": 1.3931669527299717, + "grad_norm": 0.13048399984836578, + "learning_rate": 0.002, + "loss": 2.32, + "step": 360390 + }, + { + "epoch": 1.393205609933355, + "grad_norm": 0.0969235748052597, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 360400 + }, + { + "epoch": 1.3932442671367382, + "grad_norm": 0.10593166947364807, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 360410 + }, + { + "epoch": 1.3932829243401215, + "grad_norm": 0.11622758954763412, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 360420 + }, + { + "epoch": 1.3933215815435047, + "grad_norm": 0.11350737512111664, + "learning_rate": 0.002, + "loss": 2.325, + "step": 360430 + }, + { + "epoch": 1.393360238746888, + "grad_norm": 0.10533896088600159, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 360440 + }, + { + "epoch": 1.3933988959502714, + "grad_norm": 0.11156805604696274, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 360450 + }, + { + "epoch": 1.3934375531536547, + "grad_norm": 0.09442062675952911, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 360460 + }, + { + "epoch": 1.393476210357038, + "grad_norm": 0.13392575085163116, + "learning_rate": 0.002, + "loss": 2.338, + "step": 360470 + }, + { + "epoch": 1.3935148675604212, + "grad_norm": 0.10841104388237, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 360480 + }, + { + "epoch": 1.3935535247638045, + "grad_norm": 0.11690452694892883, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 360490 + }, + { + "epoch": 1.3935921819671877, + "grad_norm": 0.10026668012142181, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 360500 + }, + { + "epoch": 1.3936308391705712, + "grad_norm": 0.11332149803638458, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 360510 + }, + { + "epoch": 1.3936694963739544, + "grad_norm": 0.10418102890253067, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 360520 + }, + { + "epoch": 1.3937081535773377, + "grad_norm": 0.10783516615629196, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 360530 + }, + { + "epoch": 1.393746810780721, + "grad_norm": 0.11106953024864197, + "learning_rate": 0.002, + "loss": 2.3112, + "step": 360540 + }, + { + "epoch": 1.3937854679841042, + "grad_norm": 0.11125322431325912, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 360550 + }, + { + "epoch": 1.3938241251874874, + "grad_norm": 0.11771386861801147, + "learning_rate": 0.002, + "loss": 2.342, + "step": 360560 + }, + { + "epoch": 1.3938627823908707, + "grad_norm": 0.10649606585502625, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 360570 + }, + { + "epoch": 1.393901439594254, + "grad_norm": 0.10038311779499054, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 360580 + }, + { + "epoch": 1.3939400967976372, + "grad_norm": 0.1158284842967987, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 360590 + }, + { + "epoch": 1.3939787540010204, + "grad_norm": 0.10317362844944, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 360600 + }, + { + "epoch": 1.3940174112044037, + "grad_norm": 0.10186164081096649, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 360610 + }, + { + "epoch": 1.3940560684077872, + "grad_norm": 0.09978730976581573, + "learning_rate": 0.002, + "loss": 2.34, + "step": 360620 + }, + { + "epoch": 1.3940947256111704, + "grad_norm": 0.10873579978942871, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 360630 + }, + { + "epoch": 1.3941333828145537, + "grad_norm": 0.09863824397325516, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 360640 + }, + { + "epoch": 1.394172040017937, + "grad_norm": 0.12052670121192932, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 360650 + }, + { + "epoch": 1.3942106972213202, + "grad_norm": 0.12854629755020142, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 360660 + }, + { + "epoch": 1.3942493544247034, + "grad_norm": 0.09690230339765549, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 360670 + }, + { + "epoch": 1.394288011628087, + "grad_norm": 0.10591220110654831, + "learning_rate": 0.002, + "loss": 2.325, + "step": 360680 + }, + { + "epoch": 1.3943266688314702, + "grad_norm": 0.09987891465425491, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 360690 + }, + { + "epoch": 1.3943653260348534, + "grad_norm": 0.1403607428073883, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 360700 + }, + { + "epoch": 1.3944039832382367, + "grad_norm": 0.10875146090984344, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 360710 + }, + { + "epoch": 1.39444264044162, + "grad_norm": 0.10704370588064194, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 360720 + }, + { + "epoch": 1.3944812976450032, + "grad_norm": 0.09713513404130936, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 360730 + }, + { + "epoch": 1.3945199548483864, + "grad_norm": 0.12021619081497192, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 360740 + }, + { + "epoch": 1.3945586120517697, + "grad_norm": 0.10152607411146164, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 360750 + }, + { + "epoch": 1.394597269255153, + "grad_norm": 0.11064693331718445, + "learning_rate": 0.002, + "loss": 2.3507, + "step": 360760 + }, + { + "epoch": 1.3946359264585362, + "grad_norm": 0.11342901736497879, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 360770 + }, + { + "epoch": 1.3946745836619194, + "grad_norm": 0.10161430388689041, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 360780 + }, + { + "epoch": 1.394713240865303, + "grad_norm": 0.09243433177471161, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 360790 + }, + { + "epoch": 1.3947518980686862, + "grad_norm": 0.12722088396549225, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 360800 + }, + { + "epoch": 1.3947905552720694, + "grad_norm": 0.10662361234426498, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 360810 + }, + { + "epoch": 1.3948292124754527, + "grad_norm": 0.0985378846526146, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 360820 + }, + { + "epoch": 1.394867869678836, + "grad_norm": 0.10576797276735306, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 360830 + }, + { + "epoch": 1.3949065268822192, + "grad_norm": 0.10594084113836288, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 360840 + }, + { + "epoch": 1.3949451840856026, + "grad_norm": 0.09589492529630661, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 360850 + }, + { + "epoch": 1.394983841288986, + "grad_norm": 0.1067117303609848, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 360860 + }, + { + "epoch": 1.3950224984923691, + "grad_norm": 0.10901029407978058, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 360870 + }, + { + "epoch": 1.3950611556957524, + "grad_norm": 0.11935501545667648, + "learning_rate": 0.002, + "loss": 2.326, + "step": 360880 + }, + { + "epoch": 1.3950998128991356, + "grad_norm": 0.12973760068416595, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 360890 + }, + { + "epoch": 1.395138470102519, + "grad_norm": 0.10274084657430649, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 360900 + }, + { + "epoch": 1.3951771273059022, + "grad_norm": 0.11378609389066696, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 360910 + }, + { + "epoch": 1.3952157845092854, + "grad_norm": 0.12882862985134125, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 360920 + }, + { + "epoch": 1.3952544417126687, + "grad_norm": 0.10811634361743927, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 360930 + }, + { + "epoch": 1.395293098916052, + "grad_norm": 0.0853014886379242, + "learning_rate": 0.002, + "loss": 2.341, + "step": 360940 + }, + { + "epoch": 1.3953317561194352, + "grad_norm": 0.09347040951251984, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 360950 + }, + { + "epoch": 1.3953704133228186, + "grad_norm": 0.10422652214765549, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 360960 + }, + { + "epoch": 1.395409070526202, + "grad_norm": 0.09387336671352386, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 360970 + }, + { + "epoch": 1.3954477277295851, + "grad_norm": 0.09877435117959976, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 360980 + }, + { + "epoch": 1.3954863849329684, + "grad_norm": 0.10496794432401657, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 360990 + }, + { + "epoch": 1.3955250421363516, + "grad_norm": 0.1177574172616005, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 361000 + }, + { + "epoch": 1.395563699339735, + "grad_norm": 0.12681037187576294, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 361010 + }, + { + "epoch": 1.3956023565431184, + "grad_norm": 0.11275091767311096, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 361020 + }, + { + "epoch": 1.3956410137465016, + "grad_norm": 0.10215546935796738, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 361030 + }, + { + "epoch": 1.3956796709498849, + "grad_norm": 0.09558103233575821, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 361040 + }, + { + "epoch": 1.3957183281532681, + "grad_norm": 0.11078792810440063, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 361050 + }, + { + "epoch": 1.3957569853566514, + "grad_norm": 0.10702776163816452, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 361060 + }, + { + "epoch": 1.3957956425600346, + "grad_norm": 0.14717505872249603, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 361070 + }, + { + "epoch": 1.3958342997634179, + "grad_norm": 0.09539902955293655, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 361080 + }, + { + "epoch": 1.3958729569668011, + "grad_norm": 0.10105487704277039, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 361090 + }, + { + "epoch": 1.3959116141701844, + "grad_norm": 0.10328929871320724, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 361100 + }, + { + "epoch": 1.3959502713735676, + "grad_norm": 0.10702311247587204, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 361110 + }, + { + "epoch": 1.395988928576951, + "grad_norm": 0.10122741758823395, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 361120 + }, + { + "epoch": 1.3960275857803344, + "grad_norm": 0.10583388060331345, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 361130 + }, + { + "epoch": 1.3960662429837176, + "grad_norm": 0.10187561064958572, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 361140 + }, + { + "epoch": 1.3961049001871009, + "grad_norm": 0.096242755651474, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 361150 + }, + { + "epoch": 1.3961435573904841, + "grad_norm": 0.1273539960384369, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 361160 + }, + { + "epoch": 1.3961822145938674, + "grad_norm": 0.12333947420120239, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 361170 + }, + { + "epoch": 1.3962208717972506, + "grad_norm": 0.09781967103481293, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 361180 + }, + { + "epoch": 1.396259529000634, + "grad_norm": 0.11135595291852951, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 361190 + }, + { + "epoch": 1.3962981862040174, + "grad_norm": 0.1160992905497551, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 361200 + }, + { + "epoch": 1.3963368434074006, + "grad_norm": 0.10552875697612762, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 361210 + }, + { + "epoch": 1.3963755006107839, + "grad_norm": 0.11450695246458054, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 361220 + }, + { + "epoch": 1.3964141578141671, + "grad_norm": 0.10146085172891617, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 361230 + }, + { + "epoch": 1.3964528150175504, + "grad_norm": 0.09327413886785507, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 361240 + }, + { + "epoch": 1.3964914722209336, + "grad_norm": 0.13081274926662445, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 361250 + }, + { + "epoch": 1.3965301294243169, + "grad_norm": 0.10812999308109283, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 361260 + }, + { + "epoch": 1.3965687866277001, + "grad_norm": 0.1000206246972084, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 361270 + }, + { + "epoch": 1.3966074438310834, + "grad_norm": 0.09798294305801392, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 361280 + }, + { + "epoch": 1.3966461010344666, + "grad_norm": 0.09942685812711716, + "learning_rate": 0.002, + "loss": 2.336, + "step": 361290 + }, + { + "epoch": 1.39668475823785, + "grad_norm": 0.09794165939092636, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 361300 + }, + { + "epoch": 1.3967234154412334, + "grad_norm": 0.1085113063454628, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 361310 + }, + { + "epoch": 1.3967620726446166, + "grad_norm": 0.09627203643321991, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 361320 + }, + { + "epoch": 1.3968007298479999, + "grad_norm": 0.09349404275417328, + "learning_rate": 0.002, + "loss": 2.34, + "step": 361330 + }, + { + "epoch": 1.396839387051383, + "grad_norm": 0.12713338434696198, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 361340 + }, + { + "epoch": 1.3968780442547664, + "grad_norm": 0.10971616953611374, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 361350 + }, + { + "epoch": 1.3969167014581498, + "grad_norm": 0.1190105527639389, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 361360 + }, + { + "epoch": 1.396955358661533, + "grad_norm": 0.10451462119817734, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 361370 + }, + { + "epoch": 1.3969940158649163, + "grad_norm": 0.1128426343202591, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 361380 + }, + { + "epoch": 1.3970326730682996, + "grad_norm": 0.12514308094978333, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 361390 + }, + { + "epoch": 1.3970713302716828, + "grad_norm": 0.10585001111030579, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 361400 + }, + { + "epoch": 1.397109987475066, + "grad_norm": 0.12662208080291748, + "learning_rate": 0.002, + "loss": 2.323, + "step": 361410 + }, + { + "epoch": 1.3971486446784493, + "grad_norm": 0.08943864703178406, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 361420 + }, + { + "epoch": 1.3971873018818326, + "grad_norm": 0.09058155119419098, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 361430 + }, + { + "epoch": 1.3972259590852159, + "grad_norm": 0.1279940903186798, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 361440 + }, + { + "epoch": 1.397264616288599, + "grad_norm": 0.10139375925064087, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 361450 + }, + { + "epoch": 1.3973032734919826, + "grad_norm": 0.10489026457071304, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 361460 + }, + { + "epoch": 1.3973419306953658, + "grad_norm": 0.09946117550134659, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 361470 + }, + { + "epoch": 1.397380587898749, + "grad_norm": 0.10231003165245056, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 361480 + }, + { + "epoch": 1.3974192451021323, + "grad_norm": 0.10625860840082169, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 361490 + }, + { + "epoch": 1.3974579023055156, + "grad_norm": 0.09422457218170166, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 361500 + }, + { + "epoch": 1.3974965595088988, + "grad_norm": 0.12820343673229218, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 361510 + }, + { + "epoch": 1.397535216712282, + "grad_norm": 0.11814692616462708, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 361520 + }, + { + "epoch": 1.3975738739156656, + "grad_norm": 0.09457693994045258, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 361530 + }, + { + "epoch": 1.3976125311190488, + "grad_norm": 0.11984442174434662, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 361540 + }, + { + "epoch": 1.397651188322432, + "grad_norm": 0.11762730032205582, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 361550 + }, + { + "epoch": 1.3976898455258153, + "grad_norm": 0.10839294642210007, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 361560 + }, + { + "epoch": 1.3977285027291986, + "grad_norm": 0.09241145104169846, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 361570 + }, + { + "epoch": 1.3977671599325818, + "grad_norm": 0.10676850378513336, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 361580 + }, + { + "epoch": 1.397805817135965, + "grad_norm": 0.10594607889652252, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 361590 + }, + { + "epoch": 1.3978444743393483, + "grad_norm": 0.27866330742836, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 361600 + }, + { + "epoch": 1.3978831315427316, + "grad_norm": 0.09651079773902893, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 361610 + }, + { + "epoch": 1.3979217887461148, + "grad_norm": 0.13887152075767517, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 361620 + }, + { + "epoch": 1.3979604459494983, + "grad_norm": 0.11219456791877747, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 361630 + }, + { + "epoch": 1.3979991031528816, + "grad_norm": 0.12171217054128647, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 361640 + }, + { + "epoch": 1.3980377603562648, + "grad_norm": 0.09833841770887375, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 361650 + }, + { + "epoch": 1.398076417559648, + "grad_norm": 0.10252264142036438, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 361660 + }, + { + "epoch": 1.3981150747630313, + "grad_norm": 0.11962525546550751, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 361670 + }, + { + "epoch": 1.3981537319664146, + "grad_norm": 0.10162299871444702, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 361680 + }, + { + "epoch": 1.3981923891697978, + "grad_norm": 0.0998658686876297, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 361690 + }, + { + "epoch": 1.3982310463731813, + "grad_norm": 0.10175598412752151, + "learning_rate": 0.002, + "loss": 2.336, + "step": 361700 + }, + { + "epoch": 1.3982697035765645, + "grad_norm": 0.09900136291980743, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 361710 + }, + { + "epoch": 1.3983083607799478, + "grad_norm": 0.10916583985090256, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 361720 + }, + { + "epoch": 1.398347017983331, + "grad_norm": 0.12456495314836502, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 361730 + }, + { + "epoch": 1.3983856751867143, + "grad_norm": 0.100282222032547, + "learning_rate": 0.002, + "loss": 2.314, + "step": 361740 + }, + { + "epoch": 1.3984243323900976, + "grad_norm": 0.11090005934238434, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 361750 + }, + { + "epoch": 1.3984629895934808, + "grad_norm": 0.11740148812532425, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 361760 + }, + { + "epoch": 1.398501646796864, + "grad_norm": 0.12703080475330353, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 361770 + }, + { + "epoch": 1.3985403040002473, + "grad_norm": 0.11310010403394699, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 361780 + }, + { + "epoch": 1.3985789612036306, + "grad_norm": 0.11828218400478363, + "learning_rate": 0.002, + "loss": 2.3113, + "step": 361790 + }, + { + "epoch": 1.398617618407014, + "grad_norm": 0.115929014980793, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 361800 + }, + { + "epoch": 1.3986562756103973, + "grad_norm": 0.11614751070737839, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 361810 + }, + { + "epoch": 1.3986949328137805, + "grad_norm": 0.09835601598024368, + "learning_rate": 0.002, + "loss": 2.342, + "step": 361820 + }, + { + "epoch": 1.3987335900171638, + "grad_norm": 0.10991106182336807, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 361830 + }, + { + "epoch": 1.398772247220547, + "grad_norm": 0.10826396942138672, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 361840 + }, + { + "epoch": 1.3988109044239303, + "grad_norm": 0.11880354583263397, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 361850 + }, + { + "epoch": 1.3988495616273136, + "grad_norm": 0.09766586124897003, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 361860 + }, + { + "epoch": 1.398888218830697, + "grad_norm": 0.11801117658615112, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 361870 + }, + { + "epoch": 1.3989268760340803, + "grad_norm": 0.11416234076023102, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 361880 + }, + { + "epoch": 1.3989655332374635, + "grad_norm": 0.09224196523427963, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 361890 + }, + { + "epoch": 1.3990041904408468, + "grad_norm": 0.11558035016059875, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 361900 + }, + { + "epoch": 1.39904284764423, + "grad_norm": 0.10817205905914307, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 361910 + }, + { + "epoch": 1.3990815048476133, + "grad_norm": 0.10724747180938721, + "learning_rate": 0.002, + "loss": 2.333, + "step": 361920 + }, + { + "epoch": 1.3991201620509965, + "grad_norm": 0.6057680249214172, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 361930 + }, + { + "epoch": 1.3991588192543798, + "grad_norm": 0.09721146523952484, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 361940 + }, + { + "epoch": 1.399197476457763, + "grad_norm": 0.11369055509567261, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 361950 + }, + { + "epoch": 1.3992361336611463, + "grad_norm": 0.10831263661384583, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 361960 + }, + { + "epoch": 1.3992747908645298, + "grad_norm": 0.09343111515045166, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 361970 + }, + { + "epoch": 1.399313448067913, + "grad_norm": 0.09698399901390076, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 361980 + }, + { + "epoch": 1.3993521052712963, + "grad_norm": 0.1151181012392044, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 361990 + }, + { + "epoch": 1.3993907624746795, + "grad_norm": 0.10255439579486847, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 362000 + }, + { + "epoch": 1.3994294196780628, + "grad_norm": 0.10207153111696243, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 362010 + }, + { + "epoch": 1.399468076881446, + "grad_norm": 0.11824337393045425, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 362020 + }, + { + "epoch": 1.3995067340848293, + "grad_norm": 0.10297773778438568, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 362030 + }, + { + "epoch": 1.3995453912882128, + "grad_norm": 0.1008480042219162, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 362040 + }, + { + "epoch": 1.399584048491596, + "grad_norm": 0.11089113354682922, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 362050 + }, + { + "epoch": 1.3996227056949793, + "grad_norm": 0.11389949172735214, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 362060 + }, + { + "epoch": 1.3996613628983625, + "grad_norm": 0.10151943564414978, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 362070 + }, + { + "epoch": 1.3997000201017458, + "grad_norm": 0.10547202080488205, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 362080 + }, + { + "epoch": 1.399738677305129, + "grad_norm": 0.12970007956027985, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 362090 + }, + { + "epoch": 1.3997773345085123, + "grad_norm": 0.09537054598331451, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 362100 + }, + { + "epoch": 1.3998159917118955, + "grad_norm": 0.10798255354166031, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 362110 + }, + { + "epoch": 1.3998546489152788, + "grad_norm": 0.12179353088140488, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 362120 + }, + { + "epoch": 1.399893306118662, + "grad_norm": 0.1268961876630783, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 362130 + }, + { + "epoch": 1.3999319633220455, + "grad_norm": 0.09666255861520767, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 362140 + }, + { + "epoch": 1.3999706205254288, + "grad_norm": 0.10677926242351532, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 362150 + }, + { + "epoch": 1.400009277728812, + "grad_norm": 0.13625779747962952, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 362160 + }, + { + "epoch": 1.4000479349321953, + "grad_norm": 0.10409170389175415, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 362170 + }, + { + "epoch": 1.4000865921355785, + "grad_norm": 0.12508094310760498, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 362180 + }, + { + "epoch": 1.4001252493389618, + "grad_norm": 0.12312871962785721, + "learning_rate": 0.002, + "loss": 2.3046, + "step": 362190 + }, + { + "epoch": 1.4001639065423452, + "grad_norm": 0.1070362851023674, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 362200 + }, + { + "epoch": 1.4002025637457285, + "grad_norm": 0.10340160876512527, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 362210 + }, + { + "epoch": 1.4002412209491117, + "grad_norm": 0.11799286305904388, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 362220 + }, + { + "epoch": 1.400279878152495, + "grad_norm": 0.10353390127420425, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 362230 + }, + { + "epoch": 1.4003185353558782, + "grad_norm": 0.10391611605882645, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 362240 + }, + { + "epoch": 1.4003571925592615, + "grad_norm": 0.09352357685565948, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 362250 + }, + { + "epoch": 1.4003958497626448, + "grad_norm": 0.10108699649572372, + "learning_rate": 0.002, + "loss": 2.331, + "step": 362260 + }, + { + "epoch": 1.400434506966028, + "grad_norm": 0.1228623315691948, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 362270 + }, + { + "epoch": 1.4004731641694113, + "grad_norm": 0.09886107593774796, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 362280 + }, + { + "epoch": 1.4005118213727945, + "grad_norm": 0.1335236132144928, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 362290 + }, + { + "epoch": 1.4005504785761778, + "grad_norm": 0.10030423104763031, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 362300 + }, + { + "epoch": 1.4005891357795612, + "grad_norm": 0.1142667829990387, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 362310 + }, + { + "epoch": 1.4006277929829445, + "grad_norm": 0.10299000144004822, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 362320 + }, + { + "epoch": 1.4006664501863277, + "grad_norm": 0.11370020359754562, + "learning_rate": 0.002, + "loss": 2.335, + "step": 362330 + }, + { + "epoch": 1.400705107389711, + "grad_norm": 0.10238875448703766, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 362340 + }, + { + "epoch": 1.4007437645930942, + "grad_norm": 0.09951310604810715, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 362350 + }, + { + "epoch": 1.4007824217964775, + "grad_norm": 0.10246714949607849, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 362360 + }, + { + "epoch": 1.400821078999861, + "grad_norm": 0.11719667911529541, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 362370 + }, + { + "epoch": 1.4008597362032442, + "grad_norm": 0.10228227823972702, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 362380 + }, + { + "epoch": 1.4008983934066275, + "grad_norm": 0.10831063240766525, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 362390 + }, + { + "epoch": 1.4009370506100107, + "grad_norm": 0.09752275049686432, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 362400 + }, + { + "epoch": 1.400975707813394, + "grad_norm": 0.11090794950723648, + "learning_rate": 0.002, + "loss": 2.345, + "step": 362410 + }, + { + "epoch": 1.4010143650167772, + "grad_norm": 0.09280040860176086, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 362420 + }, + { + "epoch": 1.4010530222201605, + "grad_norm": 0.10434489697217941, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 362430 + }, + { + "epoch": 1.4010916794235437, + "grad_norm": 0.10599362105131149, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 362440 + }, + { + "epoch": 1.401130336626927, + "grad_norm": 0.12312032282352448, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 362450 + }, + { + "epoch": 1.4011689938303102, + "grad_norm": 0.12890659272670746, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 362460 + }, + { + "epoch": 1.4012076510336935, + "grad_norm": 0.09986308217048645, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 362470 + }, + { + "epoch": 1.401246308237077, + "grad_norm": 0.09984946250915527, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 362480 + }, + { + "epoch": 1.4012849654404602, + "grad_norm": 0.109229676425457, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 362490 + }, + { + "epoch": 1.4013236226438435, + "grad_norm": 0.12256049364805222, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 362500 + }, + { + "epoch": 1.4013622798472267, + "grad_norm": 0.10545402020215988, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 362510 + }, + { + "epoch": 1.40140093705061, + "grad_norm": 0.09840114414691925, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 362520 + }, + { + "epoch": 1.4014395942539932, + "grad_norm": 0.09764132648706436, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 362530 + }, + { + "epoch": 1.4014782514573767, + "grad_norm": 0.09643712639808655, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 362540 + }, + { + "epoch": 1.40151690866076, + "grad_norm": 0.10782516002655029, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 362550 + }, + { + "epoch": 1.4015555658641432, + "grad_norm": 0.09751346707344055, + "learning_rate": 0.002, + "loss": 2.315, + "step": 362560 + }, + { + "epoch": 1.4015942230675265, + "grad_norm": 0.13143301010131836, + "learning_rate": 0.002, + "loss": 2.3149, + "step": 362570 + }, + { + "epoch": 1.4016328802709097, + "grad_norm": 0.10341697186231613, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 362580 + }, + { + "epoch": 1.401671537474293, + "grad_norm": 0.09820285439491272, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 362590 + }, + { + "epoch": 1.4017101946776762, + "grad_norm": 0.0969579741358757, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 362600 + }, + { + "epoch": 1.4017488518810595, + "grad_norm": 0.11445106565952301, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 362610 + }, + { + "epoch": 1.4017875090844427, + "grad_norm": 0.1103585809469223, + "learning_rate": 0.002, + "loss": 2.3548, + "step": 362620 + }, + { + "epoch": 1.401826166287826, + "grad_norm": 0.09358467161655426, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 362630 + }, + { + "epoch": 1.4018648234912092, + "grad_norm": 0.09893258661031723, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 362640 + }, + { + "epoch": 1.4019034806945927, + "grad_norm": 0.09271512925624847, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 362650 + }, + { + "epoch": 1.401942137897976, + "grad_norm": 0.10333232581615448, + "learning_rate": 0.002, + "loss": 2.323, + "step": 362660 + }, + { + "epoch": 1.4019807951013592, + "grad_norm": 0.10539565235376358, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 362670 + }, + { + "epoch": 1.4020194523047425, + "grad_norm": 0.10242871195077896, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 362680 + }, + { + "epoch": 1.4020581095081257, + "grad_norm": 0.09406136721372604, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 362690 + }, + { + "epoch": 1.402096766711509, + "grad_norm": 0.10209919512271881, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 362700 + }, + { + "epoch": 1.4021354239148924, + "grad_norm": 0.11888878792524338, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 362710 + }, + { + "epoch": 1.4021740811182757, + "grad_norm": 0.10515199601650238, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 362720 + }, + { + "epoch": 1.402212738321659, + "grad_norm": 0.11428892612457275, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 362730 + }, + { + "epoch": 1.4022513955250422, + "grad_norm": 0.12660294771194458, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 362740 + }, + { + "epoch": 1.4022900527284254, + "grad_norm": 0.0967639684677124, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 362750 + }, + { + "epoch": 1.4023287099318087, + "grad_norm": 0.10571729391813278, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 362760 + }, + { + "epoch": 1.402367367135192, + "grad_norm": 0.12200155854225159, + "learning_rate": 0.002, + "loss": 2.342, + "step": 362770 + }, + { + "epoch": 1.4024060243385752, + "grad_norm": 0.10133996605873108, + "learning_rate": 0.002, + "loss": 2.333, + "step": 362780 + }, + { + "epoch": 1.4024446815419584, + "grad_norm": 0.10742209106683731, + "learning_rate": 0.002, + "loss": 2.344, + "step": 362790 + }, + { + "epoch": 1.4024833387453417, + "grad_norm": 0.10386163741350174, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 362800 + }, + { + "epoch": 1.402521995948725, + "grad_norm": 0.1222449317574501, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 362810 + }, + { + "epoch": 1.4025606531521084, + "grad_norm": 0.10506697744131088, + "learning_rate": 0.002, + "loss": 2.3131, + "step": 362820 + }, + { + "epoch": 1.4025993103554917, + "grad_norm": 0.10943559557199478, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 362830 + }, + { + "epoch": 1.402637967558875, + "grad_norm": 0.1146540567278862, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 362840 + }, + { + "epoch": 1.4026766247622582, + "grad_norm": 0.10004156082868576, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 362850 + }, + { + "epoch": 1.4027152819656414, + "grad_norm": 0.11623933911323547, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 362860 + }, + { + "epoch": 1.4027539391690247, + "grad_norm": 0.12447261810302734, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 362870 + }, + { + "epoch": 1.4027925963724082, + "grad_norm": 0.11130033433437347, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 362880 + }, + { + "epoch": 1.4028312535757914, + "grad_norm": 0.10304639488458633, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 362890 + }, + { + "epoch": 1.4028699107791747, + "grad_norm": 0.1037617176771164, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 362900 + }, + { + "epoch": 1.402908567982558, + "grad_norm": 0.09812625497579575, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 362910 + }, + { + "epoch": 1.4029472251859412, + "grad_norm": 0.12238926440477371, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 362920 + }, + { + "epoch": 1.4029858823893244, + "grad_norm": 0.09530317038297653, + "learning_rate": 0.002, + "loss": 2.33, + "step": 362930 + }, + { + "epoch": 1.4030245395927077, + "grad_norm": 0.0930962935090065, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 362940 + }, + { + "epoch": 1.403063196796091, + "grad_norm": 0.1084480956196785, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 362950 + }, + { + "epoch": 1.4031018539994742, + "grad_norm": 0.10741405189037323, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 362960 + }, + { + "epoch": 1.4031405112028574, + "grad_norm": 0.1136438176035881, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 362970 + }, + { + "epoch": 1.4031791684062407, + "grad_norm": 0.09555595368146896, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 362980 + }, + { + "epoch": 1.4032178256096242, + "grad_norm": 0.10760748386383057, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 362990 + }, + { + "epoch": 1.4032564828130074, + "grad_norm": 0.09433700144290924, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 363000 + }, + { + "epoch": 1.4032951400163907, + "grad_norm": 0.1083793193101883, + "learning_rate": 0.002, + "loss": 2.334, + "step": 363010 + }, + { + "epoch": 1.403333797219774, + "grad_norm": 0.09169959276914597, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 363020 + }, + { + "epoch": 1.4033724544231572, + "grad_norm": 0.10351970046758652, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 363030 + }, + { + "epoch": 1.4034111116265404, + "grad_norm": 0.09821586310863495, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 363040 + }, + { + "epoch": 1.403449768829924, + "grad_norm": 0.12562578916549683, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 363050 + }, + { + "epoch": 1.4034884260333071, + "grad_norm": 0.10481049865484238, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 363060 + }, + { + "epoch": 1.4035270832366904, + "grad_norm": 0.4016679525375366, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 363070 + }, + { + "epoch": 1.4035657404400737, + "grad_norm": 0.1034378632903099, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 363080 + }, + { + "epoch": 1.403604397643457, + "grad_norm": 0.1017390713095665, + "learning_rate": 0.002, + "loss": 2.326, + "step": 363090 + }, + { + "epoch": 1.4036430548468402, + "grad_norm": 0.1351192742586136, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 363100 + }, + { + "epoch": 1.4036817120502234, + "grad_norm": 0.1087784543633461, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 363110 + }, + { + "epoch": 1.4037203692536067, + "grad_norm": 0.11712558567523956, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 363120 + }, + { + "epoch": 1.40375902645699, + "grad_norm": 0.1175539493560791, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 363130 + }, + { + "epoch": 1.4037976836603732, + "grad_norm": 0.10677841305732727, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 363140 + }, + { + "epoch": 1.4038363408637564, + "grad_norm": 0.1184573546051979, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 363150 + }, + { + "epoch": 1.40387499806714, + "grad_norm": 0.12204091250896454, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 363160 + }, + { + "epoch": 1.4039136552705231, + "grad_norm": 0.11024865508079529, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 363170 + }, + { + "epoch": 1.4039523124739064, + "grad_norm": 0.1106250137090683, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 363180 + }, + { + "epoch": 1.4039909696772896, + "grad_norm": 0.09033829718828201, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 363190 + }, + { + "epoch": 1.404029626880673, + "grad_norm": 0.12094565480947495, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 363200 + }, + { + "epoch": 1.4040682840840562, + "grad_norm": 0.09226465225219727, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 363210 + }, + { + "epoch": 1.4041069412874396, + "grad_norm": 0.14574606716632843, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 363220 + }, + { + "epoch": 1.4041455984908229, + "grad_norm": 0.11849888414144516, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 363230 + }, + { + "epoch": 1.4041842556942061, + "grad_norm": 0.12022794783115387, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 363240 + }, + { + "epoch": 1.4042229128975894, + "grad_norm": 0.10561049729585648, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 363250 + }, + { + "epoch": 1.4042615701009726, + "grad_norm": 0.10209091007709503, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 363260 + }, + { + "epoch": 1.4043002273043559, + "grad_norm": 0.1343676745891571, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 363270 + }, + { + "epoch": 1.4043388845077391, + "grad_norm": 0.11447706818580627, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 363280 + }, + { + "epoch": 1.4043775417111224, + "grad_norm": 0.09987994283437729, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 363290 + }, + { + "epoch": 1.4044161989145056, + "grad_norm": 0.11488515883684158, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 363300 + }, + { + "epoch": 1.404454856117889, + "grad_norm": 0.09707038849592209, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 363310 + }, + { + "epoch": 1.4044935133212724, + "grad_norm": 0.10826553404331207, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 363320 + }, + { + "epoch": 1.4045321705246556, + "grad_norm": 0.09534227848052979, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 363330 + }, + { + "epoch": 1.4045708277280389, + "grad_norm": 0.11304795742034912, + "learning_rate": 0.002, + "loss": 2.336, + "step": 363340 + }, + { + "epoch": 1.4046094849314221, + "grad_norm": 0.09416774660348892, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 363350 + }, + { + "epoch": 1.4046481421348054, + "grad_norm": 0.10979488492012024, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 363360 + }, + { + "epoch": 1.4046867993381886, + "grad_norm": 0.10304318368434906, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 363370 + }, + { + "epoch": 1.4047254565415719, + "grad_norm": 0.09270067512989044, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 363380 + }, + { + "epoch": 1.4047641137449554, + "grad_norm": 0.11495853215456009, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 363390 + }, + { + "epoch": 1.4048027709483386, + "grad_norm": 0.31204378604888916, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 363400 + }, + { + "epoch": 1.4048414281517219, + "grad_norm": 0.09133657068014145, + "learning_rate": 0.002, + "loss": 2.3533, + "step": 363410 + }, + { + "epoch": 1.4048800853551051, + "grad_norm": 0.09207748621702194, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 363420 + }, + { + "epoch": 1.4049187425584884, + "grad_norm": 0.10387468338012695, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 363430 + }, + { + "epoch": 1.4049573997618716, + "grad_norm": 0.4840529263019562, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 363440 + }, + { + "epoch": 1.4049960569652549, + "grad_norm": 0.10210797935724258, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 363450 + }, + { + "epoch": 1.4050347141686381, + "grad_norm": 0.10467180609703064, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 363460 + }, + { + "epoch": 1.4050733713720214, + "grad_norm": 0.08897180110216141, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 363470 + }, + { + "epoch": 1.4051120285754046, + "grad_norm": 0.0998954102396965, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 363480 + }, + { + "epoch": 1.405150685778788, + "grad_norm": 0.09620607644319534, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 363490 + }, + { + "epoch": 1.4051893429821714, + "grad_norm": 0.11393487453460693, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 363500 + }, + { + "epoch": 1.4052280001855546, + "grad_norm": 0.10956080257892609, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 363510 + }, + { + "epoch": 1.4052666573889379, + "grad_norm": 0.1093173399567604, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 363520 + }, + { + "epoch": 1.405305314592321, + "grad_norm": 0.12005335092544556, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 363530 + }, + { + "epoch": 1.4053439717957044, + "grad_norm": 0.11257109045982361, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 363540 + }, + { + "epoch": 1.4053826289990876, + "grad_norm": 0.11301475763320923, + "learning_rate": 0.002, + "loss": 2.3455, + "step": 363550 + }, + { + "epoch": 1.405421286202471, + "grad_norm": 0.11234022676944733, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 363560 + }, + { + "epoch": 1.4054599434058543, + "grad_norm": 0.10995227843523026, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 363570 + }, + { + "epoch": 1.4054986006092376, + "grad_norm": 0.10490180552005768, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 363580 + }, + { + "epoch": 1.4055372578126208, + "grad_norm": 0.10406752675771713, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 363590 + }, + { + "epoch": 1.405575915016004, + "grad_norm": 0.14566640555858612, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 363600 + }, + { + "epoch": 1.4056145722193873, + "grad_norm": 0.10529456287622452, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 363610 + }, + { + "epoch": 1.4056532294227706, + "grad_norm": 0.09102689474821091, + "learning_rate": 0.002, + "loss": 2.338, + "step": 363620 + }, + { + "epoch": 1.4056918866261539, + "grad_norm": 0.1229463443160057, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 363630 + }, + { + "epoch": 1.405730543829537, + "grad_norm": 0.12026394158601761, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 363640 + }, + { + "epoch": 1.4057692010329204, + "grad_norm": 0.10644323378801346, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 363650 + }, + { + "epoch": 1.4058078582363038, + "grad_norm": 0.16524867713451385, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 363660 + }, + { + "epoch": 1.405846515439687, + "grad_norm": 0.09728322923183441, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 363670 + }, + { + "epoch": 1.4058851726430703, + "grad_norm": 0.098272904753685, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 363680 + }, + { + "epoch": 1.4059238298464536, + "grad_norm": 0.11414486169815063, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 363690 + }, + { + "epoch": 1.4059624870498368, + "grad_norm": 0.09679889678955078, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 363700 + }, + { + "epoch": 1.40600114425322, + "grad_norm": 0.09830894321203232, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 363710 + }, + { + "epoch": 1.4060398014566033, + "grad_norm": 0.10768172889947891, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 363720 + }, + { + "epoch": 1.4060784586599868, + "grad_norm": 0.09654033929109573, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 363730 + }, + { + "epoch": 1.40611711586337, + "grad_norm": 0.11248601973056793, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 363740 + }, + { + "epoch": 1.4061557730667533, + "grad_norm": 0.10043682903051376, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 363750 + }, + { + "epoch": 1.4061944302701366, + "grad_norm": 0.1022840142250061, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 363760 + }, + { + "epoch": 1.4062330874735198, + "grad_norm": 0.09865662455558777, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 363770 + }, + { + "epoch": 1.406271744676903, + "grad_norm": 0.10069121420383453, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 363780 + }, + { + "epoch": 1.4063104018802863, + "grad_norm": 0.10596741735935211, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 363790 + }, + { + "epoch": 1.4063490590836696, + "grad_norm": 0.09834664314985275, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 363800 + }, + { + "epoch": 1.4063877162870528, + "grad_norm": 0.1031440794467926, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 363810 + }, + { + "epoch": 1.406426373490436, + "grad_norm": 0.10697752237319946, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 363820 + }, + { + "epoch": 1.4064650306938196, + "grad_norm": 0.10356352478265762, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 363830 + }, + { + "epoch": 1.4065036878972028, + "grad_norm": 0.09256229549646378, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 363840 + }, + { + "epoch": 1.406542345100586, + "grad_norm": 0.09180346131324768, + "learning_rate": 0.002, + "loss": 2.336, + "step": 363850 + }, + { + "epoch": 1.4065810023039693, + "grad_norm": 0.1671026051044464, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 363860 + }, + { + "epoch": 1.4066196595073526, + "grad_norm": 0.13014864921569824, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 363870 + }, + { + "epoch": 1.4066583167107358, + "grad_norm": 0.08843419700860977, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 363880 + }, + { + "epoch": 1.406696973914119, + "grad_norm": 0.097466379404068, + "learning_rate": 0.002, + "loss": 2.337, + "step": 363890 + }, + { + "epoch": 1.4067356311175025, + "grad_norm": 0.10403325408697128, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 363900 + }, + { + "epoch": 1.4067742883208858, + "grad_norm": 0.09658373147249222, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 363910 + }, + { + "epoch": 1.406812945524269, + "grad_norm": 0.09964865446090698, + "learning_rate": 0.002, + "loss": 2.318, + "step": 363920 + }, + { + "epoch": 1.4068516027276523, + "grad_norm": 0.11245810985565186, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 363930 + }, + { + "epoch": 1.4068902599310356, + "grad_norm": 0.09797230362892151, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 363940 + }, + { + "epoch": 1.4069289171344188, + "grad_norm": 0.10533495247364044, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 363950 + }, + { + "epoch": 1.406967574337802, + "grad_norm": 0.1286928802728653, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 363960 + }, + { + "epoch": 1.4070062315411853, + "grad_norm": 0.10884454101324081, + "learning_rate": 0.002, + "loss": 2.333, + "step": 363970 + }, + { + "epoch": 1.4070448887445686, + "grad_norm": 0.1721695214509964, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 363980 + }, + { + "epoch": 1.4070835459479518, + "grad_norm": 0.08932375907897949, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 363990 + }, + { + "epoch": 1.4071222031513353, + "grad_norm": 0.13959693908691406, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 364000 + }, + { + "epoch": 1.4071608603547185, + "grad_norm": 0.0952187329530716, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 364010 + }, + { + "epoch": 1.4071995175581018, + "grad_norm": 0.12597648799419403, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 364020 + }, + { + "epoch": 1.407238174761485, + "grad_norm": 0.10271839797496796, + "learning_rate": 0.002, + "loss": 2.313, + "step": 364030 + }, + { + "epoch": 1.4072768319648683, + "grad_norm": 0.10194234549999237, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 364040 + }, + { + "epoch": 1.4073154891682516, + "grad_norm": 0.11540882289409637, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 364050 + }, + { + "epoch": 1.407354146371635, + "grad_norm": 0.10244659334421158, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 364060 + }, + { + "epoch": 1.4073928035750183, + "grad_norm": 0.09166279435157776, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 364070 + }, + { + "epoch": 1.4074314607784015, + "grad_norm": 0.09806598722934723, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 364080 + }, + { + "epoch": 1.4074701179817848, + "grad_norm": 0.108644999563694, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 364090 + }, + { + "epoch": 1.407508775185168, + "grad_norm": 0.11653917282819748, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 364100 + }, + { + "epoch": 1.4075474323885513, + "grad_norm": 0.13030320405960083, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 364110 + }, + { + "epoch": 1.4075860895919345, + "grad_norm": 0.11179164052009583, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 364120 + }, + { + "epoch": 1.4076247467953178, + "grad_norm": 0.11357536166906357, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 364130 + }, + { + "epoch": 1.407663403998701, + "grad_norm": 0.11446142196655273, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 364140 + }, + { + "epoch": 1.4077020612020843, + "grad_norm": 0.11989977210760117, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 364150 + }, + { + "epoch": 1.4077407184054676, + "grad_norm": 0.11088665574789047, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 364160 + }, + { + "epoch": 1.407779375608851, + "grad_norm": 0.10767869651317596, + "learning_rate": 0.002, + "loss": 2.344, + "step": 364170 + }, + { + "epoch": 1.4078180328122343, + "grad_norm": 0.1473172903060913, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 364180 + }, + { + "epoch": 1.4078566900156175, + "grad_norm": 0.11636603623628616, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 364190 + }, + { + "epoch": 1.4078953472190008, + "grad_norm": 0.11040837317705154, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 364200 + }, + { + "epoch": 1.407934004422384, + "grad_norm": 0.1069677472114563, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 364210 + }, + { + "epoch": 1.4079726616257673, + "grad_norm": 0.11505875736474991, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 364220 + }, + { + "epoch": 1.4080113188291508, + "grad_norm": 0.10425962507724762, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 364230 + }, + { + "epoch": 1.408049976032534, + "grad_norm": 0.11092174798250198, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 364240 + }, + { + "epoch": 1.4080886332359173, + "grad_norm": 0.11644835770130157, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 364250 + }, + { + "epoch": 1.4081272904393005, + "grad_norm": 0.0934944674372673, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 364260 + }, + { + "epoch": 1.4081659476426838, + "grad_norm": 0.09497929364442825, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 364270 + }, + { + "epoch": 1.408204604846067, + "grad_norm": 0.11320208758115768, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 364280 + }, + { + "epoch": 1.4082432620494503, + "grad_norm": 0.09991753101348877, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 364290 + }, + { + "epoch": 1.4082819192528335, + "grad_norm": 0.0956939086318016, + "learning_rate": 0.002, + "loss": 2.331, + "step": 364300 + }, + { + "epoch": 1.4083205764562168, + "grad_norm": 0.11093162000179291, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 364310 + }, + { + "epoch": 1.4083592336596, + "grad_norm": 0.11404040455818176, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 364320 + }, + { + "epoch": 1.4083978908629833, + "grad_norm": 0.1058291420340538, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 364330 + }, + { + "epoch": 1.4084365480663668, + "grad_norm": 0.1125858873128891, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 364340 + }, + { + "epoch": 1.40847520526975, + "grad_norm": 0.1192534789443016, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 364350 + }, + { + "epoch": 1.4085138624731333, + "grad_norm": 0.09552609175443649, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 364360 + }, + { + "epoch": 1.4085525196765165, + "grad_norm": 0.11429965496063232, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 364370 + }, + { + "epoch": 1.4085911768798998, + "grad_norm": 0.14036065340042114, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 364380 + }, + { + "epoch": 1.408629834083283, + "grad_norm": 0.11110293120145798, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 364390 + }, + { + "epoch": 1.4086684912866665, + "grad_norm": 0.10643504559993744, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 364400 + }, + { + "epoch": 1.4087071484900497, + "grad_norm": 0.11138386279344559, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 364410 + }, + { + "epoch": 1.408745805693433, + "grad_norm": 0.09993134438991547, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 364420 + }, + { + "epoch": 1.4087844628968162, + "grad_norm": 0.09777616709470749, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 364430 + }, + { + "epoch": 1.4088231201001995, + "grad_norm": 0.12403468787670135, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 364440 + }, + { + "epoch": 1.4088617773035828, + "grad_norm": 0.09901606291532516, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 364450 + }, + { + "epoch": 1.408900434506966, + "grad_norm": 0.11089719086885452, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 364460 + }, + { + "epoch": 1.4089390917103493, + "grad_norm": 0.09899328649044037, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 364470 + }, + { + "epoch": 1.4089777489137325, + "grad_norm": 0.11358582228422165, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 364480 + }, + { + "epoch": 1.4090164061171158, + "grad_norm": 0.13312110304832458, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 364490 + }, + { + "epoch": 1.409055063320499, + "grad_norm": 0.11911635845899582, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 364500 + }, + { + "epoch": 1.4090937205238825, + "grad_norm": 0.12836365401744843, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 364510 + }, + { + "epoch": 1.4091323777272657, + "grad_norm": 0.0932023897767067, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 364520 + }, + { + "epoch": 1.409171034930649, + "grad_norm": 0.09835109859704971, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 364530 + }, + { + "epoch": 1.4092096921340322, + "grad_norm": 0.11520157009363174, + "learning_rate": 0.002, + "loss": 2.332, + "step": 364540 + }, + { + "epoch": 1.4092483493374155, + "grad_norm": 0.09594057500362396, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 364550 + }, + { + "epoch": 1.4092870065407987, + "grad_norm": 0.1021113321185112, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 364560 + }, + { + "epoch": 1.4093256637441822, + "grad_norm": 0.11449400335550308, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 364570 + }, + { + "epoch": 1.4093643209475655, + "grad_norm": 0.0967157632112503, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 364580 + }, + { + "epoch": 1.4094029781509487, + "grad_norm": 0.235229030251503, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 364590 + }, + { + "epoch": 1.409441635354332, + "grad_norm": 0.11157149076461792, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 364600 + }, + { + "epoch": 1.4094802925577152, + "grad_norm": 0.10770371556282043, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 364610 + }, + { + "epoch": 1.4095189497610985, + "grad_norm": 0.10761409252882004, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 364620 + }, + { + "epoch": 1.4095576069644817, + "grad_norm": 0.12079044431447983, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 364630 + }, + { + "epoch": 1.409596264167865, + "grad_norm": 0.1049729436635971, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 364640 + }, + { + "epoch": 1.4096349213712482, + "grad_norm": 0.1049463227391243, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 364650 + }, + { + "epoch": 1.4096735785746315, + "grad_norm": 0.10356732457876205, + "learning_rate": 0.002, + "loss": 2.317, + "step": 364660 + }, + { + "epoch": 1.4097122357780147, + "grad_norm": 0.11769746989011765, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 364670 + }, + { + "epoch": 1.4097508929813982, + "grad_norm": 0.34086525440216064, + "learning_rate": 0.002, + "loss": 2.327, + "step": 364680 + }, + { + "epoch": 1.4097895501847815, + "grad_norm": 0.11214648932218552, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 364690 + }, + { + "epoch": 1.4098282073881647, + "grad_norm": 0.10398946702480316, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 364700 + }, + { + "epoch": 1.409866864591548, + "grad_norm": 0.10326844453811646, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 364710 + }, + { + "epoch": 1.4099055217949312, + "grad_norm": 0.11874198168516159, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 364720 + }, + { + "epoch": 1.4099441789983145, + "grad_norm": 0.1078546792268753, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 364730 + }, + { + "epoch": 1.409982836201698, + "grad_norm": 0.11323689669370651, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 364740 + }, + { + "epoch": 1.4100214934050812, + "grad_norm": 0.10939283668994904, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 364750 + }, + { + "epoch": 1.4100601506084645, + "grad_norm": 0.11744766682386398, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 364760 + }, + { + "epoch": 1.4100988078118477, + "grad_norm": 0.11937874555587769, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 364770 + }, + { + "epoch": 1.410137465015231, + "grad_norm": 0.1032233014702797, + "learning_rate": 0.002, + "loss": 2.3582, + "step": 364780 + }, + { + "epoch": 1.4101761222186142, + "grad_norm": 0.10637333244085312, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 364790 + }, + { + "epoch": 1.4102147794219975, + "grad_norm": 0.13863548636436462, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 364800 + }, + { + "epoch": 1.4102534366253807, + "grad_norm": 0.1211773008108139, + "learning_rate": 0.002, + "loss": 2.338, + "step": 364810 + }, + { + "epoch": 1.410292093828764, + "grad_norm": 0.10527975112199783, + "learning_rate": 0.002, + "loss": 2.321, + "step": 364820 + }, + { + "epoch": 1.4103307510321472, + "grad_norm": 0.10959915816783905, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 364830 + }, + { + "epoch": 1.4103694082355305, + "grad_norm": 0.0984559953212738, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 364840 + }, + { + "epoch": 1.410408065438914, + "grad_norm": 0.13312171399593353, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 364850 + }, + { + "epoch": 1.4104467226422972, + "grad_norm": 0.11278527230024338, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 364860 + }, + { + "epoch": 1.4104853798456805, + "grad_norm": 0.10411836206912994, + "learning_rate": 0.002, + "loss": 2.322, + "step": 364870 + }, + { + "epoch": 1.4105240370490637, + "grad_norm": 0.10412270575761795, + "learning_rate": 0.002, + "loss": 2.324, + "step": 364880 + }, + { + "epoch": 1.410562694252447, + "grad_norm": 0.10608673840761185, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 364890 + }, + { + "epoch": 1.4106013514558302, + "grad_norm": 0.12006331980228424, + "learning_rate": 0.002, + "loss": 2.324, + "step": 364900 + }, + { + "epoch": 1.4106400086592137, + "grad_norm": 0.09856969118118286, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 364910 + }, + { + "epoch": 1.410678665862597, + "grad_norm": 0.1013568788766861, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 364920 + }, + { + "epoch": 1.4107173230659802, + "grad_norm": 0.14561229944229126, + "learning_rate": 0.002, + "loss": 2.333, + "step": 364930 + }, + { + "epoch": 1.4107559802693634, + "grad_norm": 0.12932388484477997, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 364940 + }, + { + "epoch": 1.4107946374727467, + "grad_norm": 0.10552596300840378, + "learning_rate": 0.002, + "loss": 2.331, + "step": 364950 + }, + { + "epoch": 1.41083329467613, + "grad_norm": 0.1043313518166542, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 364960 + }, + { + "epoch": 1.4108719518795132, + "grad_norm": 0.09932465851306915, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 364970 + }, + { + "epoch": 1.4109106090828964, + "grad_norm": 0.08606306463479996, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 364980 + }, + { + "epoch": 1.4109492662862797, + "grad_norm": 0.09718270599842072, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 364990 + }, + { + "epoch": 1.410987923489663, + "grad_norm": 0.10544393956661224, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 365000 + }, + { + "epoch": 1.4110265806930462, + "grad_norm": 0.10633678734302521, + "learning_rate": 0.002, + "loss": 2.321, + "step": 365010 + }, + { + "epoch": 1.4110652378964297, + "grad_norm": 0.09697526693344116, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 365020 + }, + { + "epoch": 1.411103895099813, + "grad_norm": 0.1041555181145668, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 365030 + }, + { + "epoch": 1.4111425523031962, + "grad_norm": 0.1068888008594513, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 365040 + }, + { + "epoch": 1.4111812095065794, + "grad_norm": 0.10143444687128067, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 365050 + }, + { + "epoch": 1.4112198667099627, + "grad_norm": 0.09093233197927475, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 365060 + }, + { + "epoch": 1.411258523913346, + "grad_norm": 0.1119607537984848, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 365070 + }, + { + "epoch": 1.4112971811167294, + "grad_norm": 0.09111730009317398, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 365080 + }, + { + "epoch": 1.4113358383201127, + "grad_norm": 0.10968484729528427, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 365090 + }, + { + "epoch": 1.411374495523496, + "grad_norm": 0.1032860204577446, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 365100 + }, + { + "epoch": 1.4114131527268792, + "grad_norm": 0.12258035689592361, + "learning_rate": 0.002, + "loss": 2.3098, + "step": 365110 + }, + { + "epoch": 1.4114518099302624, + "grad_norm": 0.09528249502182007, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 365120 + }, + { + "epoch": 1.4114904671336457, + "grad_norm": 0.10955754667520523, + "learning_rate": 0.002, + "loss": 2.3565, + "step": 365130 + }, + { + "epoch": 1.411529124337029, + "grad_norm": 0.10430938005447388, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 365140 + }, + { + "epoch": 1.4115677815404122, + "grad_norm": 0.1117718517780304, + "learning_rate": 0.002, + "loss": 2.325, + "step": 365150 + }, + { + "epoch": 1.4116064387437954, + "grad_norm": 0.08946974575519562, + "learning_rate": 0.002, + "loss": 2.34, + "step": 365160 + }, + { + "epoch": 1.4116450959471787, + "grad_norm": 0.13333655893802643, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 365170 + }, + { + "epoch": 1.411683753150562, + "grad_norm": 0.09432690590620041, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 365180 + }, + { + "epoch": 1.4117224103539454, + "grad_norm": 0.09220901131629944, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 365190 + }, + { + "epoch": 1.4117610675573287, + "grad_norm": 0.1282465159893036, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 365200 + }, + { + "epoch": 1.411799724760712, + "grad_norm": 0.0940774604678154, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 365210 + }, + { + "epoch": 1.4118383819640952, + "grad_norm": 0.09490504115819931, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 365220 + }, + { + "epoch": 1.4118770391674784, + "grad_norm": 0.107475645840168, + "learning_rate": 0.002, + "loss": 2.3064, + "step": 365230 + }, + { + "epoch": 1.4119156963708617, + "grad_norm": 0.11461813002824783, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 365240 + }, + { + "epoch": 1.4119543535742451, + "grad_norm": 0.10359986126422882, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 365250 + }, + { + "epoch": 1.4119930107776284, + "grad_norm": 0.10789109021425247, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 365260 + }, + { + "epoch": 1.4120316679810117, + "grad_norm": 0.10059541463851929, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 365270 + }, + { + "epoch": 1.412070325184395, + "grad_norm": 0.10454394668340683, + "learning_rate": 0.002, + "loss": 2.317, + "step": 365280 + }, + { + "epoch": 1.4121089823877782, + "grad_norm": 0.10084279626607895, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 365290 + }, + { + "epoch": 1.4121476395911614, + "grad_norm": 0.10830729454755783, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 365300 + }, + { + "epoch": 1.4121862967945447, + "grad_norm": 0.08894408494234085, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 365310 + }, + { + "epoch": 1.412224953997928, + "grad_norm": 0.10965152084827423, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 365320 + }, + { + "epoch": 1.4122636112013112, + "grad_norm": 0.09731363505125046, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 365330 + }, + { + "epoch": 1.4123022684046944, + "grad_norm": 0.10241921246051788, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 365340 + }, + { + "epoch": 1.412340925608078, + "grad_norm": 0.09518715739250183, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 365350 + }, + { + "epoch": 1.4123795828114611, + "grad_norm": 0.10591299831867218, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 365360 + }, + { + "epoch": 1.4124182400148444, + "grad_norm": 0.10860461741685867, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 365370 + }, + { + "epoch": 1.4124568972182276, + "grad_norm": 0.1118435487151146, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 365380 + }, + { + "epoch": 1.412495554421611, + "grad_norm": 0.10134666413068771, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 365390 + }, + { + "epoch": 1.4125342116249942, + "grad_norm": 0.12348821014165878, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 365400 + }, + { + "epoch": 1.4125728688283774, + "grad_norm": 0.099838025867939, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 365410 + }, + { + "epoch": 1.4126115260317609, + "grad_norm": 0.11787385493516922, + "learning_rate": 0.002, + "loss": 2.329, + "step": 365420 + }, + { + "epoch": 1.4126501832351441, + "grad_norm": 0.12467779964208603, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 365430 + }, + { + "epoch": 1.4126888404385274, + "grad_norm": 0.12273652106523514, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 365440 + }, + { + "epoch": 1.4127274976419106, + "grad_norm": 0.0944012925028801, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 365450 + }, + { + "epoch": 1.4127661548452939, + "grad_norm": 0.09575633704662323, + "learning_rate": 0.002, + "loss": 2.323, + "step": 365460 + }, + { + "epoch": 1.4128048120486771, + "grad_norm": 0.09414394199848175, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 365470 + }, + { + "epoch": 1.4128434692520604, + "grad_norm": 0.10007159411907196, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 365480 + }, + { + "epoch": 1.4128821264554436, + "grad_norm": 0.10673514753580093, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 365490 + }, + { + "epoch": 1.412920783658827, + "grad_norm": 0.09196875244379044, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 365500 + }, + { + "epoch": 1.4129594408622101, + "grad_norm": 0.09712628275156021, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 365510 + }, + { + "epoch": 1.4129980980655936, + "grad_norm": 0.09691095352172852, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 365520 + }, + { + "epoch": 1.4130367552689769, + "grad_norm": 0.1034843698143959, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 365530 + }, + { + "epoch": 1.4130754124723601, + "grad_norm": 0.12032202631235123, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 365540 + }, + { + "epoch": 1.4131140696757434, + "grad_norm": 0.11367922276258469, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 365550 + }, + { + "epoch": 1.4131527268791266, + "grad_norm": 0.10583038628101349, + "learning_rate": 0.002, + "loss": 2.333, + "step": 365560 + }, + { + "epoch": 1.4131913840825099, + "grad_norm": 0.11696499586105347, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 365570 + }, + { + "epoch": 1.4132300412858931, + "grad_norm": 0.1028081476688385, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 365580 + }, + { + "epoch": 1.4132686984892766, + "grad_norm": 0.09909705817699432, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 365590 + }, + { + "epoch": 1.4133073556926599, + "grad_norm": 0.09184190630912781, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 365600 + }, + { + "epoch": 1.4133460128960431, + "grad_norm": 0.10988624393939972, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 365610 + }, + { + "epoch": 1.4133846700994264, + "grad_norm": 0.10959646850824356, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 365620 + }, + { + "epoch": 1.4134233273028096, + "grad_norm": 0.11385467648506165, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 365630 + }, + { + "epoch": 1.4134619845061929, + "grad_norm": 0.11543765664100647, + "learning_rate": 0.002, + "loss": 2.327, + "step": 365640 + }, + { + "epoch": 1.4135006417095761, + "grad_norm": 0.0983387678861618, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 365650 + }, + { + "epoch": 1.4135392989129594, + "grad_norm": 0.10942315310239792, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 365660 + }, + { + "epoch": 1.4135779561163426, + "grad_norm": 0.09957767277956009, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 365670 + }, + { + "epoch": 1.4136166133197259, + "grad_norm": 0.11526834964752197, + "learning_rate": 0.002, + "loss": 2.3163, + "step": 365680 + }, + { + "epoch": 1.4136552705231094, + "grad_norm": 0.10145354270935059, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 365690 + }, + { + "epoch": 1.4136939277264926, + "grad_norm": 0.09600527584552765, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 365700 + }, + { + "epoch": 1.4137325849298759, + "grad_norm": 0.0961616262793541, + "learning_rate": 0.002, + "loss": 2.3479, + "step": 365710 + }, + { + "epoch": 1.413771242133259, + "grad_norm": 0.12369763851165771, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 365720 + }, + { + "epoch": 1.4138098993366424, + "grad_norm": 0.10327655076980591, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 365730 + }, + { + "epoch": 1.4138485565400256, + "grad_norm": 0.10974455624818802, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 365740 + }, + { + "epoch": 1.4138872137434089, + "grad_norm": 0.09836733341217041, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 365750 + }, + { + "epoch": 1.4139258709467923, + "grad_norm": 0.09874507784843445, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 365760 + }, + { + "epoch": 1.4139645281501756, + "grad_norm": 0.11693933606147766, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 365770 + }, + { + "epoch": 1.4140031853535588, + "grad_norm": 0.08909876644611359, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 365780 + }, + { + "epoch": 1.414041842556942, + "grad_norm": 0.1115279570221901, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 365790 + }, + { + "epoch": 1.4140804997603253, + "grad_norm": 0.10595174878835678, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 365800 + }, + { + "epoch": 1.4141191569637086, + "grad_norm": 0.11007455736398697, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 365810 + }, + { + "epoch": 1.4141578141670919, + "grad_norm": 0.10507000237703323, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 365820 + }, + { + "epoch": 1.414196471370475, + "grad_norm": 0.09129108488559723, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 365830 + }, + { + "epoch": 1.4142351285738584, + "grad_norm": 0.10857604444026947, + "learning_rate": 0.002, + "loss": 2.329, + "step": 365840 + }, + { + "epoch": 1.4142737857772416, + "grad_norm": 0.13335253298282623, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 365850 + }, + { + "epoch": 1.414312442980625, + "grad_norm": 0.1100061908364296, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 365860 + }, + { + "epoch": 1.4143511001840083, + "grad_norm": 0.11533210426568985, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 365870 + }, + { + "epoch": 1.4143897573873916, + "grad_norm": 0.12122216075658798, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 365880 + }, + { + "epoch": 1.4144284145907748, + "grad_norm": 0.11135326325893402, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 365890 + }, + { + "epoch": 1.414467071794158, + "grad_norm": 0.09590534120798111, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 365900 + }, + { + "epoch": 1.4145057289975413, + "grad_norm": 0.11739147454500198, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 365910 + }, + { + "epoch": 1.4145443862009246, + "grad_norm": 0.09624744206666946, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 365920 + }, + { + "epoch": 1.414583043404308, + "grad_norm": 0.12246581166982651, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 365930 + }, + { + "epoch": 1.4146217006076913, + "grad_norm": 0.11366311460733414, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 365940 + }, + { + "epoch": 1.4146603578110746, + "grad_norm": 0.1130039170384407, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 365950 + }, + { + "epoch": 1.4146990150144578, + "grad_norm": 0.1092824786901474, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 365960 + }, + { + "epoch": 1.414737672217841, + "grad_norm": 0.10728099197149277, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 365970 + }, + { + "epoch": 1.4147763294212243, + "grad_norm": 0.12284889072179794, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 365980 + }, + { + "epoch": 1.4148149866246076, + "grad_norm": 0.11151456832885742, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 365990 + }, + { + "epoch": 1.4148536438279908, + "grad_norm": 0.10078704357147217, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 366000 + }, + { + "epoch": 1.414892301031374, + "grad_norm": 0.11053545773029327, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 366010 + }, + { + "epoch": 1.4149309582347573, + "grad_norm": 0.10661806166172028, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 366020 + }, + { + "epoch": 1.4149696154381408, + "grad_norm": 0.1044822558760643, + "learning_rate": 0.002, + "loss": 2.342, + "step": 366030 + }, + { + "epoch": 1.415008272641524, + "grad_norm": 0.09812510758638382, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 366040 + }, + { + "epoch": 1.4150469298449073, + "grad_norm": 0.10488618165254593, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 366050 + }, + { + "epoch": 1.4150855870482906, + "grad_norm": 0.10980476438999176, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 366060 + }, + { + "epoch": 1.4151242442516738, + "grad_norm": 0.10456784814596176, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 366070 + }, + { + "epoch": 1.415162901455057, + "grad_norm": 0.09372668713331223, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 366080 + }, + { + "epoch": 1.4152015586584406, + "grad_norm": 0.12072702497243881, + "learning_rate": 0.002, + "loss": 2.3133, + "step": 366090 + }, + { + "epoch": 1.4152402158618238, + "grad_norm": 0.10739337652921677, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 366100 + }, + { + "epoch": 1.415278873065207, + "grad_norm": 0.11193457990884781, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 366110 + }, + { + "epoch": 1.4153175302685903, + "grad_norm": 0.12541808187961578, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 366120 + }, + { + "epoch": 1.4153561874719736, + "grad_norm": 0.09714280813932419, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 366130 + }, + { + "epoch": 1.4153948446753568, + "grad_norm": 0.1137126013636589, + "learning_rate": 0.002, + "loss": 2.326, + "step": 366140 + }, + { + "epoch": 1.41543350187874, + "grad_norm": 0.1105848103761673, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 366150 + }, + { + "epoch": 1.4154721590821233, + "grad_norm": 0.09818024933338165, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 366160 + }, + { + "epoch": 1.4155108162855066, + "grad_norm": 0.12252239137887955, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 366170 + }, + { + "epoch": 1.4155494734888898, + "grad_norm": 0.10770494490861893, + "learning_rate": 0.002, + "loss": 2.342, + "step": 366180 + }, + { + "epoch": 1.415588130692273, + "grad_norm": 0.09204906225204468, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 366190 + }, + { + "epoch": 1.4156267878956565, + "grad_norm": 0.10548800230026245, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 366200 + }, + { + "epoch": 1.4156654450990398, + "grad_norm": 0.12471897155046463, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 366210 + }, + { + "epoch": 1.415704102302423, + "grad_norm": 0.11144419759511948, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 366220 + }, + { + "epoch": 1.4157427595058063, + "grad_norm": 0.09332024306058884, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 366230 + }, + { + "epoch": 1.4157814167091896, + "grad_norm": 0.11328937113285065, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 366240 + }, + { + "epoch": 1.4158200739125728, + "grad_norm": 0.10417918860912323, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 366250 + }, + { + "epoch": 1.4158587311159563, + "grad_norm": 0.0939640998840332, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 366260 + }, + { + "epoch": 1.4158973883193395, + "grad_norm": 0.10409481823444366, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 366270 + }, + { + "epoch": 1.4159360455227228, + "grad_norm": 0.1057303249835968, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 366280 + }, + { + "epoch": 1.415974702726106, + "grad_norm": 0.11792177706956863, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 366290 + }, + { + "epoch": 1.4160133599294893, + "grad_norm": 0.10138144344091415, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 366300 + }, + { + "epoch": 1.4160520171328725, + "grad_norm": 0.0995168387889862, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 366310 + }, + { + "epoch": 1.4160906743362558, + "grad_norm": 0.1047981008887291, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 366320 + }, + { + "epoch": 1.416129331539639, + "grad_norm": 0.2975539565086365, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 366330 + }, + { + "epoch": 1.4161679887430223, + "grad_norm": 0.09719040989875793, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 366340 + }, + { + "epoch": 1.4162066459464056, + "grad_norm": 0.10597871243953705, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 366350 + }, + { + "epoch": 1.4162453031497888, + "grad_norm": 0.132241353392601, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 366360 + }, + { + "epoch": 1.4162839603531723, + "grad_norm": 0.09745777398347855, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 366370 + }, + { + "epoch": 1.4163226175565555, + "grad_norm": 0.09836073219776154, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 366380 + }, + { + "epoch": 1.4163612747599388, + "grad_norm": 0.12061022222042084, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 366390 + }, + { + "epoch": 1.416399931963322, + "grad_norm": 0.11182274669408798, + "learning_rate": 0.002, + "loss": 2.332, + "step": 366400 + }, + { + "epoch": 1.4164385891667053, + "grad_norm": 0.10241756588220596, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 366410 + }, + { + "epoch": 1.4164772463700885, + "grad_norm": 0.10544509440660477, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 366420 + }, + { + "epoch": 1.416515903573472, + "grad_norm": 0.10424435138702393, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 366430 + }, + { + "epoch": 1.4165545607768553, + "grad_norm": 0.10981172323226929, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 366440 + }, + { + "epoch": 1.4165932179802385, + "grad_norm": 0.11870953440666199, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 366450 + }, + { + "epoch": 1.4166318751836218, + "grad_norm": 0.1339559555053711, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 366460 + }, + { + "epoch": 1.416670532387005, + "grad_norm": 0.09286986291408539, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 366470 + }, + { + "epoch": 1.4167091895903883, + "grad_norm": 0.11012223362922668, + "learning_rate": 0.002, + "loss": 2.338, + "step": 366480 + }, + { + "epoch": 1.4167478467937715, + "grad_norm": 0.21809683740139008, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 366490 + }, + { + "epoch": 1.4167865039971548, + "grad_norm": 0.09550125151872635, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 366500 + }, + { + "epoch": 1.416825161200538, + "grad_norm": 0.21449320018291473, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 366510 + }, + { + "epoch": 1.4168638184039213, + "grad_norm": 0.10646815598011017, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 366520 + }, + { + "epoch": 1.4169024756073045, + "grad_norm": 0.1135336235165596, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 366530 + }, + { + "epoch": 1.416941132810688, + "grad_norm": 0.09614226967096329, + "learning_rate": 0.002, + "loss": 2.337, + "step": 366540 + }, + { + "epoch": 1.4169797900140713, + "grad_norm": 0.11009273678064346, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 366550 + }, + { + "epoch": 1.4170184472174545, + "grad_norm": 0.10821156948804855, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 366560 + }, + { + "epoch": 1.4170571044208378, + "grad_norm": 0.09521789848804474, + "learning_rate": 0.002, + "loss": 2.3585, + "step": 366570 + }, + { + "epoch": 1.417095761624221, + "grad_norm": 0.10291599482297897, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 366580 + }, + { + "epoch": 1.4171344188276043, + "grad_norm": 0.10520762950181961, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 366590 + }, + { + "epoch": 1.4171730760309877, + "grad_norm": 0.09714417904615402, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 366600 + }, + { + "epoch": 1.417211733234371, + "grad_norm": 0.1105012446641922, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 366610 + }, + { + "epoch": 1.4172503904377542, + "grad_norm": 0.10851927101612091, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 366620 + }, + { + "epoch": 1.4172890476411375, + "grad_norm": 0.09911957383155823, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 366630 + }, + { + "epoch": 1.4173277048445208, + "grad_norm": 0.09575625509023666, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 366640 + }, + { + "epoch": 1.417366362047904, + "grad_norm": 0.10080332309007645, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 366650 + }, + { + "epoch": 1.4174050192512873, + "grad_norm": 0.10602813214063644, + "learning_rate": 0.002, + "loss": 2.348, + "step": 366660 + }, + { + "epoch": 1.4174436764546705, + "grad_norm": 0.11299441754817963, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 366670 + }, + { + "epoch": 1.4174823336580538, + "grad_norm": 0.13318519294261932, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 366680 + }, + { + "epoch": 1.417520990861437, + "grad_norm": 0.10320179164409637, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 366690 + }, + { + "epoch": 1.4175596480648203, + "grad_norm": 0.09384647756814957, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 366700 + }, + { + "epoch": 1.4175983052682037, + "grad_norm": 0.3169427812099457, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 366710 + }, + { + "epoch": 1.417636962471587, + "grad_norm": 0.11232533305883408, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 366720 + }, + { + "epoch": 1.4176756196749702, + "grad_norm": 0.10294479131698608, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 366730 + }, + { + "epoch": 1.4177142768783535, + "grad_norm": 0.12189421057701111, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 366740 + }, + { + "epoch": 1.4177529340817367, + "grad_norm": 0.10026071965694427, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 366750 + }, + { + "epoch": 1.41779159128512, + "grad_norm": 0.10128407925367355, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 366760 + }, + { + "epoch": 1.4178302484885035, + "grad_norm": 0.11087165772914886, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 366770 + }, + { + "epoch": 1.4178689056918867, + "grad_norm": 0.10710074752569199, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 366780 + }, + { + "epoch": 1.41790756289527, + "grad_norm": 0.1305713802576065, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 366790 + }, + { + "epoch": 1.4179462200986532, + "grad_norm": 0.10092976689338684, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 366800 + }, + { + "epoch": 1.4179848773020365, + "grad_norm": 0.10149747133255005, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 366810 + }, + { + "epoch": 1.4180235345054197, + "grad_norm": 0.09356102347373962, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 366820 + }, + { + "epoch": 1.418062191708803, + "grad_norm": 0.09790142625570297, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 366830 + }, + { + "epoch": 1.4181008489121862, + "grad_norm": 0.11745724827051163, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 366840 + }, + { + "epoch": 1.4181395061155695, + "grad_norm": 0.08513977378606796, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 366850 + }, + { + "epoch": 1.4181781633189527, + "grad_norm": 0.09998525679111481, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 366860 + }, + { + "epoch": 1.418216820522336, + "grad_norm": 0.10445107519626617, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 366870 + }, + { + "epoch": 1.4182554777257195, + "grad_norm": 0.09004156291484833, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 366880 + }, + { + "epoch": 1.4182941349291027, + "grad_norm": 0.1079174131155014, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 366890 + }, + { + "epoch": 1.418332792132486, + "grad_norm": 0.10222215950489044, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 366900 + }, + { + "epoch": 1.4183714493358692, + "grad_norm": 0.10205639153718948, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 366910 + }, + { + "epoch": 1.4184101065392525, + "grad_norm": 0.13565658032894135, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 366920 + }, + { + "epoch": 1.4184487637426357, + "grad_norm": 0.10180047899484634, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 366930 + }, + { + "epoch": 1.4184874209460192, + "grad_norm": 0.1170867383480072, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 366940 + }, + { + "epoch": 1.4185260781494025, + "grad_norm": 0.11132314056158066, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 366950 + }, + { + "epoch": 1.4185647353527857, + "grad_norm": 0.11016213893890381, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 366960 + }, + { + "epoch": 1.418603392556169, + "grad_norm": 0.10189254581928253, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 366970 + }, + { + "epoch": 1.4186420497595522, + "grad_norm": 0.10096590965986252, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 366980 + }, + { + "epoch": 1.4186807069629355, + "grad_norm": 0.10366704314947128, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 366990 + }, + { + "epoch": 1.4187193641663187, + "grad_norm": 0.10980086028575897, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 367000 + }, + { + "epoch": 1.418758021369702, + "grad_norm": 0.1058797687292099, + "learning_rate": 0.002, + "loss": 2.334, + "step": 367010 + }, + { + "epoch": 1.4187966785730852, + "grad_norm": 0.11351893842220306, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 367020 + }, + { + "epoch": 1.4188353357764685, + "grad_norm": 0.10458218306303024, + "learning_rate": 0.002, + "loss": 2.328, + "step": 367030 + }, + { + "epoch": 1.4188739929798517, + "grad_norm": 0.11217319220304489, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 367040 + }, + { + "epoch": 1.4189126501832352, + "grad_norm": 0.09893601387739182, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 367050 + }, + { + "epoch": 1.4189513073866185, + "grad_norm": 0.1001359298825264, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 367060 + }, + { + "epoch": 1.4189899645900017, + "grad_norm": 0.12899377942085266, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 367070 + }, + { + "epoch": 1.419028621793385, + "grad_norm": 0.1014057919383049, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 367080 + }, + { + "epoch": 1.4190672789967682, + "grad_norm": 0.10943603515625, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 367090 + }, + { + "epoch": 1.4191059362001515, + "grad_norm": 0.09798979014158249, + "learning_rate": 0.002, + "loss": 2.3554, + "step": 367100 + }, + { + "epoch": 1.419144593403535, + "grad_norm": 0.11311440914869308, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 367110 + }, + { + "epoch": 1.4191832506069182, + "grad_norm": 0.098808154463768, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 367120 + }, + { + "epoch": 1.4192219078103014, + "grad_norm": 0.10621762275695801, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 367130 + }, + { + "epoch": 1.4192605650136847, + "grad_norm": 0.11025074124336243, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 367140 + }, + { + "epoch": 1.419299222217068, + "grad_norm": 0.1080571860074997, + "learning_rate": 0.002, + "loss": 2.349, + "step": 367150 + }, + { + "epoch": 1.4193378794204512, + "grad_norm": 0.14338891208171844, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 367160 + }, + { + "epoch": 1.4193765366238345, + "grad_norm": 0.10458295047283173, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 367170 + }, + { + "epoch": 1.4194151938272177, + "grad_norm": 0.1376844346523285, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 367180 + }, + { + "epoch": 1.419453851030601, + "grad_norm": 0.11458317935466766, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 367190 + }, + { + "epoch": 1.4194925082339842, + "grad_norm": 0.13006827235221863, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 367200 + }, + { + "epoch": 1.4195311654373677, + "grad_norm": 0.10229130834341049, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 367210 + }, + { + "epoch": 1.419569822640751, + "grad_norm": 0.20018444955348969, + "learning_rate": 0.002, + "loss": 2.3135, + "step": 367220 + }, + { + "epoch": 1.4196084798441342, + "grad_norm": 0.09971259534358978, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 367230 + }, + { + "epoch": 1.4196471370475174, + "grad_norm": 0.08571271598339081, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 367240 + }, + { + "epoch": 1.4196857942509007, + "grad_norm": 0.1437324583530426, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 367250 + }, + { + "epoch": 1.419724451454284, + "grad_norm": 0.09790416061878204, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 367260 + }, + { + "epoch": 1.4197631086576672, + "grad_norm": 0.09812484681606293, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 367270 + }, + { + "epoch": 1.4198017658610507, + "grad_norm": 0.09241976588964462, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 367280 + }, + { + "epoch": 1.419840423064434, + "grad_norm": 0.12094639241695404, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 367290 + }, + { + "epoch": 1.4198790802678172, + "grad_norm": 0.13233411312103271, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 367300 + }, + { + "epoch": 1.4199177374712004, + "grad_norm": 0.09673107415437698, + "learning_rate": 0.002, + "loss": 2.344, + "step": 367310 + }, + { + "epoch": 1.4199563946745837, + "grad_norm": 0.11193081736564636, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 367320 + }, + { + "epoch": 1.419995051877967, + "grad_norm": 0.09532983601093292, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 367330 + }, + { + "epoch": 1.4200337090813502, + "grad_norm": 0.09918580204248428, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 367340 + }, + { + "epoch": 1.4200723662847334, + "grad_norm": 0.11038313806056976, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 367350 + }, + { + "epoch": 1.4201110234881167, + "grad_norm": 0.0948074534535408, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 367360 + }, + { + "epoch": 1.4201496806915, + "grad_norm": 0.09734196960926056, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 367370 + }, + { + "epoch": 1.4201883378948834, + "grad_norm": 0.10987944900989532, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 367380 + }, + { + "epoch": 1.4202269950982667, + "grad_norm": 0.11198877543210983, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 367390 + }, + { + "epoch": 1.42026565230165, + "grad_norm": 0.10422197729349136, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 367400 + }, + { + "epoch": 1.4203043095050332, + "grad_norm": 0.10399855673313141, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 367410 + }, + { + "epoch": 1.4203429667084164, + "grad_norm": 0.09684939682483673, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 367420 + }, + { + "epoch": 1.4203816239117997, + "grad_norm": 0.12432610988616943, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 367430 + }, + { + "epoch": 1.420420281115183, + "grad_norm": 0.10199954360723495, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 367440 + }, + { + "epoch": 1.4204589383185664, + "grad_norm": 0.09740272164344788, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 367450 + }, + { + "epoch": 1.4204975955219497, + "grad_norm": 0.10742583125829697, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 367460 + }, + { + "epoch": 1.420536252725333, + "grad_norm": 0.11151610314846039, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 367470 + }, + { + "epoch": 1.4205749099287162, + "grad_norm": 0.08460313826799393, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 367480 + }, + { + "epoch": 1.4206135671320994, + "grad_norm": 0.10881251096725464, + "learning_rate": 0.002, + "loss": 2.3129, + "step": 367490 + }, + { + "epoch": 1.4206522243354827, + "grad_norm": 0.1039658933877945, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 367500 + }, + { + "epoch": 1.420690881538866, + "grad_norm": 0.09391934424638748, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 367510 + }, + { + "epoch": 1.4207295387422492, + "grad_norm": 0.10123565047979355, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 367520 + }, + { + "epoch": 1.4207681959456324, + "grad_norm": 0.10684125870466232, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 367530 + }, + { + "epoch": 1.4208068531490157, + "grad_norm": 0.10670426487922668, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 367540 + }, + { + "epoch": 1.4208455103523991, + "grad_norm": 0.10557490587234497, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 367550 + }, + { + "epoch": 1.4208841675557824, + "grad_norm": 0.08854088187217712, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 367560 + }, + { + "epoch": 1.4209228247591656, + "grad_norm": 0.10692249983549118, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 367570 + }, + { + "epoch": 1.420961481962549, + "grad_norm": 0.10752949863672256, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 367580 + }, + { + "epoch": 1.4210001391659322, + "grad_norm": 0.12726660072803497, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 367590 + }, + { + "epoch": 1.4210387963693154, + "grad_norm": 0.10740122944116592, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 367600 + }, + { + "epoch": 1.4210774535726987, + "grad_norm": 0.129148468375206, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 367610 + }, + { + "epoch": 1.4211161107760821, + "grad_norm": 0.10967687517404556, + "learning_rate": 0.002, + "loss": 2.321, + "step": 367620 + }, + { + "epoch": 1.4211547679794654, + "grad_norm": 0.09522423148155212, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 367630 + }, + { + "epoch": 1.4211934251828486, + "grad_norm": 0.10406163334846497, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 367640 + }, + { + "epoch": 1.4212320823862319, + "grad_norm": 0.09662437438964844, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 367650 + }, + { + "epoch": 1.4212707395896151, + "grad_norm": 0.09962788969278336, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 367660 + }, + { + "epoch": 1.4213093967929984, + "grad_norm": 0.0924009308218956, + "learning_rate": 0.002, + "loss": 2.3166, + "step": 367670 + }, + { + "epoch": 1.4213480539963816, + "grad_norm": 0.09442176669836044, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 367680 + }, + { + "epoch": 1.421386711199765, + "grad_norm": 0.11521480232477188, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 367690 + }, + { + "epoch": 1.4214253684031481, + "grad_norm": 0.10343587398529053, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 367700 + }, + { + "epoch": 1.4214640256065314, + "grad_norm": 0.10477244853973389, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 367710 + }, + { + "epoch": 1.4215026828099149, + "grad_norm": 0.1124812439084053, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 367720 + }, + { + "epoch": 1.4215413400132981, + "grad_norm": 0.11153688281774521, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 367730 + }, + { + "epoch": 1.4215799972166814, + "grad_norm": 0.09929315000772476, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 367740 + }, + { + "epoch": 1.4216186544200646, + "grad_norm": 0.10471302270889282, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 367750 + }, + { + "epoch": 1.4216573116234479, + "grad_norm": 0.10565510392189026, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 367760 + }, + { + "epoch": 1.4216959688268311, + "grad_norm": 0.12525208294391632, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 367770 + }, + { + "epoch": 1.4217346260302144, + "grad_norm": 0.10688397288322449, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 367780 + }, + { + "epoch": 1.4217732832335979, + "grad_norm": 0.0988677367568016, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 367790 + }, + { + "epoch": 1.4218119404369811, + "grad_norm": 0.10281901806592941, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 367800 + }, + { + "epoch": 1.4218505976403644, + "grad_norm": 0.10552622377872467, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 367810 + }, + { + "epoch": 1.4218892548437476, + "grad_norm": 0.12458661198616028, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 367820 + }, + { + "epoch": 1.4219279120471309, + "grad_norm": 0.11455637961626053, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 367830 + }, + { + "epoch": 1.4219665692505141, + "grad_norm": 0.08602002263069153, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 367840 + }, + { + "epoch": 1.4220052264538974, + "grad_norm": 0.10147720575332642, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 367850 + }, + { + "epoch": 1.4220438836572806, + "grad_norm": 0.10182800889015198, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 367860 + }, + { + "epoch": 1.4220825408606639, + "grad_norm": 0.10629668831825256, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 367870 + }, + { + "epoch": 1.4221211980640471, + "grad_norm": 0.09327349066734314, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 367880 + }, + { + "epoch": 1.4221598552674306, + "grad_norm": 0.11980682611465454, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 367890 + }, + { + "epoch": 1.4221985124708139, + "grad_norm": 0.09699162095785141, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 367900 + }, + { + "epoch": 1.422237169674197, + "grad_norm": 0.10681085288524628, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 367910 + }, + { + "epoch": 1.4222758268775804, + "grad_norm": 0.09521599113941193, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 367920 + }, + { + "epoch": 1.4223144840809636, + "grad_norm": 0.10524185746908188, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 367930 + }, + { + "epoch": 1.4223531412843469, + "grad_norm": 0.0984361320734024, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 367940 + }, + { + "epoch": 1.4223917984877303, + "grad_norm": 0.0989803597331047, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 367950 + }, + { + "epoch": 1.4224304556911136, + "grad_norm": 0.11651834100484848, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 367960 + }, + { + "epoch": 1.4224691128944968, + "grad_norm": 0.09913109242916107, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 367970 + }, + { + "epoch": 1.42250777009788, + "grad_norm": 0.12161785364151001, + "learning_rate": 0.002, + "loss": 2.328, + "step": 367980 + }, + { + "epoch": 1.4225464273012633, + "grad_norm": 0.1221100389957428, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 367990 + }, + { + "epoch": 1.4225850845046466, + "grad_norm": 0.10555516183376312, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 368000 + }, + { + "epoch": 1.4226237417080299, + "grad_norm": 0.21151964366436005, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 368010 + }, + { + "epoch": 1.422662398911413, + "grad_norm": 0.0941983163356781, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 368020 + }, + { + "epoch": 1.4227010561147964, + "grad_norm": 0.10658541321754456, + "learning_rate": 0.002, + "loss": 2.333, + "step": 368030 + }, + { + "epoch": 1.4227397133181796, + "grad_norm": 0.1082150936126709, + "learning_rate": 0.002, + "loss": 2.3111, + "step": 368040 + }, + { + "epoch": 1.4227783705215629, + "grad_norm": 0.10653243958950043, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 368050 + }, + { + "epoch": 1.4228170277249463, + "grad_norm": 0.08912888169288635, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 368060 + }, + { + "epoch": 1.4228556849283296, + "grad_norm": 0.09850385040044785, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 368070 + }, + { + "epoch": 1.4228943421317128, + "grad_norm": 0.09942353516817093, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 368080 + }, + { + "epoch": 1.422932999335096, + "grad_norm": 0.13291941583156586, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 368090 + }, + { + "epoch": 1.4229716565384793, + "grad_norm": 0.10999099165201187, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 368100 + }, + { + "epoch": 1.4230103137418626, + "grad_norm": 0.10195674747228622, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 368110 + }, + { + "epoch": 1.423048970945246, + "grad_norm": 0.12106695771217346, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 368120 + }, + { + "epoch": 1.4230876281486293, + "grad_norm": 0.09300248324871063, + "learning_rate": 0.002, + "loss": 2.329, + "step": 368130 + }, + { + "epoch": 1.4231262853520126, + "grad_norm": 0.11889013648033142, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 368140 + }, + { + "epoch": 1.4231649425553958, + "grad_norm": 0.10581057518720627, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 368150 + }, + { + "epoch": 1.423203599758779, + "grad_norm": 0.14475677907466888, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 368160 + }, + { + "epoch": 1.4232422569621623, + "grad_norm": 0.10251044481992722, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 368170 + }, + { + "epoch": 1.4232809141655456, + "grad_norm": 0.10213552415370941, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 368180 + }, + { + "epoch": 1.4233195713689288, + "grad_norm": 0.09757188707590103, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 368190 + }, + { + "epoch": 1.423358228572312, + "grad_norm": 0.11053182929754257, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 368200 + }, + { + "epoch": 1.4233968857756953, + "grad_norm": 0.08805803209543228, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 368210 + }, + { + "epoch": 1.4234355429790786, + "grad_norm": 0.11464353650808334, + "learning_rate": 0.002, + "loss": 2.309, + "step": 368220 + }, + { + "epoch": 1.423474200182462, + "grad_norm": 0.10245181620121002, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 368230 + }, + { + "epoch": 1.4235128573858453, + "grad_norm": 0.09253382682800293, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 368240 + }, + { + "epoch": 1.4235515145892286, + "grad_norm": 0.08895239233970642, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 368250 + }, + { + "epoch": 1.4235901717926118, + "grad_norm": 0.12682317197322845, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 368260 + }, + { + "epoch": 1.423628828995995, + "grad_norm": 0.10514622181653976, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 368270 + }, + { + "epoch": 1.4236674861993783, + "grad_norm": 0.10735808312892914, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 368280 + }, + { + "epoch": 1.4237061434027618, + "grad_norm": 0.09588049352169037, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 368290 + }, + { + "epoch": 1.423744800606145, + "grad_norm": 0.10734907537698746, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 368300 + }, + { + "epoch": 1.4237834578095283, + "grad_norm": 0.09214717894792557, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 368310 + }, + { + "epoch": 1.4238221150129116, + "grad_norm": 0.08925966918468475, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 368320 + }, + { + "epoch": 1.4238607722162948, + "grad_norm": 0.11620984971523285, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 368330 + }, + { + "epoch": 1.423899429419678, + "grad_norm": 0.13195545971393585, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 368340 + }, + { + "epoch": 1.4239380866230613, + "grad_norm": 0.10492658615112305, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 368350 + }, + { + "epoch": 1.4239767438264446, + "grad_norm": 0.1065715104341507, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 368360 + }, + { + "epoch": 1.4240154010298278, + "grad_norm": 0.09849032759666443, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 368370 + }, + { + "epoch": 1.424054058233211, + "grad_norm": 0.10577639192342758, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 368380 + }, + { + "epoch": 1.4240927154365943, + "grad_norm": 0.12574061751365662, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 368390 + }, + { + "epoch": 1.4241313726399778, + "grad_norm": 0.11132027953863144, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 368400 + }, + { + "epoch": 1.424170029843361, + "grad_norm": 0.10425229370594025, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 368410 + }, + { + "epoch": 1.4242086870467443, + "grad_norm": 0.09737608581781387, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 368420 + }, + { + "epoch": 1.4242473442501276, + "grad_norm": 0.11113197356462479, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 368430 + }, + { + "epoch": 1.4242860014535108, + "grad_norm": 0.10532024502754211, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 368440 + }, + { + "epoch": 1.424324658656894, + "grad_norm": 0.09628026187419891, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 368450 + }, + { + "epoch": 1.4243633158602775, + "grad_norm": 0.1088952049612999, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 368460 + }, + { + "epoch": 1.4244019730636608, + "grad_norm": 0.12860263884067535, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 368470 + }, + { + "epoch": 1.424440630267044, + "grad_norm": 0.10668892413377762, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 368480 + }, + { + "epoch": 1.4244792874704273, + "grad_norm": 0.1027596965432167, + "learning_rate": 0.002, + "loss": 2.323, + "step": 368490 + }, + { + "epoch": 1.4245179446738105, + "grad_norm": 0.12930242717266083, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 368500 + }, + { + "epoch": 1.4245566018771938, + "grad_norm": 0.10408979654312134, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 368510 + }, + { + "epoch": 1.424595259080577, + "grad_norm": 0.09692550450563431, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 368520 + }, + { + "epoch": 1.4246339162839603, + "grad_norm": 0.10290974378585815, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 368530 + }, + { + "epoch": 1.4246725734873436, + "grad_norm": 0.08995725214481354, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 368540 + }, + { + "epoch": 1.4247112306907268, + "grad_norm": 0.09780704975128174, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 368550 + }, + { + "epoch": 1.42474988789411, + "grad_norm": 0.10348577052354813, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 368560 + }, + { + "epoch": 1.4247885450974935, + "grad_norm": 0.10124948620796204, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 368570 + }, + { + "epoch": 1.4248272023008768, + "grad_norm": 0.11615990102291107, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 368580 + }, + { + "epoch": 1.42486585950426, + "grad_norm": 0.11272797733545303, + "learning_rate": 0.002, + "loss": 2.32, + "step": 368590 + }, + { + "epoch": 1.4249045167076433, + "grad_norm": 0.09842094779014587, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 368600 + }, + { + "epoch": 1.4249431739110265, + "grad_norm": 0.11287661641836166, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 368610 + }, + { + "epoch": 1.4249818311144098, + "grad_norm": 0.11041462421417236, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 368620 + }, + { + "epoch": 1.4250204883177933, + "grad_norm": 0.1251877099275589, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 368630 + }, + { + "epoch": 1.4250591455211765, + "grad_norm": 0.0997016504406929, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 368640 + }, + { + "epoch": 1.4250978027245598, + "grad_norm": 0.16072382032871246, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 368650 + }, + { + "epoch": 1.425136459927943, + "grad_norm": 0.09736277163028717, + "learning_rate": 0.002, + "loss": 2.3119, + "step": 368660 + }, + { + "epoch": 1.4251751171313263, + "grad_norm": 0.10369211435317993, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 368670 + }, + { + "epoch": 1.4252137743347095, + "grad_norm": 0.09280867129564285, + "learning_rate": 0.002, + "loss": 2.344, + "step": 368680 + }, + { + "epoch": 1.4252524315380928, + "grad_norm": 0.09761892259120941, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 368690 + }, + { + "epoch": 1.425291088741476, + "grad_norm": 0.13509036600589752, + "learning_rate": 0.002, + "loss": 2.347, + "step": 368700 + }, + { + "epoch": 1.4253297459448593, + "grad_norm": 0.09694381058216095, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 368710 + }, + { + "epoch": 1.4253684031482425, + "grad_norm": 0.11534082889556885, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 368720 + }, + { + "epoch": 1.4254070603516258, + "grad_norm": 0.09117257595062256, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 368730 + }, + { + "epoch": 1.4254457175550093, + "grad_norm": 0.10641550272703171, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 368740 + }, + { + "epoch": 1.4254843747583925, + "grad_norm": 0.13109582662582397, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 368750 + }, + { + "epoch": 1.4255230319617758, + "grad_norm": 0.09147099405527115, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 368760 + }, + { + "epoch": 1.425561689165159, + "grad_norm": 0.12726427614688873, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 368770 + }, + { + "epoch": 1.4256003463685423, + "grad_norm": 0.10484074056148529, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 368780 + }, + { + "epoch": 1.4256390035719255, + "grad_norm": 0.09867366403341293, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 368790 + }, + { + "epoch": 1.425677660775309, + "grad_norm": 0.10545188188552856, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 368800 + }, + { + "epoch": 1.4257163179786922, + "grad_norm": 0.11329960078001022, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 368810 + }, + { + "epoch": 1.4257549751820755, + "grad_norm": 0.1431681513786316, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 368820 + }, + { + "epoch": 1.4257936323854588, + "grad_norm": 0.4852242171764374, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 368830 + }, + { + "epoch": 1.425832289588842, + "grad_norm": 0.10883507877588272, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 368840 + }, + { + "epoch": 1.4258709467922253, + "grad_norm": 0.08958674222230911, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 368850 + }, + { + "epoch": 1.4259096039956085, + "grad_norm": 0.11914104223251343, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 368860 + }, + { + "epoch": 1.4259482611989918, + "grad_norm": 0.08615414053201675, + "learning_rate": 0.002, + "loss": 2.322, + "step": 368870 + }, + { + "epoch": 1.425986918402375, + "grad_norm": 0.09670282155275345, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 368880 + }, + { + "epoch": 1.4260255756057583, + "grad_norm": 0.0999981239438057, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 368890 + }, + { + "epoch": 1.4260642328091415, + "grad_norm": 0.09854158759117126, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 368900 + }, + { + "epoch": 1.426102890012525, + "grad_norm": 0.11261072754859924, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 368910 + }, + { + "epoch": 1.4261415472159082, + "grad_norm": 0.09439534693956375, + "learning_rate": 0.002, + "loss": 2.332, + "step": 368920 + }, + { + "epoch": 1.4261802044192915, + "grad_norm": 0.10984434932470322, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 368930 + }, + { + "epoch": 1.4262188616226747, + "grad_norm": 0.1016111746430397, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 368940 + }, + { + "epoch": 1.426257518826058, + "grad_norm": 0.0948866456747055, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 368950 + }, + { + "epoch": 1.4262961760294413, + "grad_norm": 0.08846337348222733, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 368960 + }, + { + "epoch": 1.4263348332328247, + "grad_norm": 0.11411886662244797, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 368970 + }, + { + "epoch": 1.426373490436208, + "grad_norm": 0.09665101021528244, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 368980 + }, + { + "epoch": 1.4264121476395912, + "grad_norm": 0.10688138008117676, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 368990 + }, + { + "epoch": 1.4264508048429745, + "grad_norm": 0.11031637340784073, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 369000 + }, + { + "epoch": 1.4264894620463577, + "grad_norm": 0.14109115302562714, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 369010 + }, + { + "epoch": 1.426528119249741, + "grad_norm": 0.11171339452266693, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 369020 + }, + { + "epoch": 1.4265667764531242, + "grad_norm": 0.12456446141004562, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 369030 + }, + { + "epoch": 1.4266054336565075, + "grad_norm": 0.10644245147705078, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 369040 + }, + { + "epoch": 1.4266440908598907, + "grad_norm": 0.11632377654314041, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 369050 + }, + { + "epoch": 1.426682748063274, + "grad_norm": 0.09376497566699982, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 369060 + }, + { + "epoch": 1.4267214052666575, + "grad_norm": 0.1096901223063469, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 369070 + }, + { + "epoch": 1.4267600624700407, + "grad_norm": 0.09440205991268158, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 369080 + }, + { + "epoch": 1.426798719673424, + "grad_norm": 0.10393696278333664, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 369090 + }, + { + "epoch": 1.4268373768768072, + "grad_norm": 0.0952950194478035, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 369100 + }, + { + "epoch": 1.4268760340801905, + "grad_norm": 0.09243932366371155, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 369110 + }, + { + "epoch": 1.4269146912835737, + "grad_norm": 0.10410625487565994, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 369120 + }, + { + "epoch": 1.426953348486957, + "grad_norm": 0.09936773777008057, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 369130 + }, + { + "epoch": 1.4269920056903405, + "grad_norm": 0.10040454566478729, + "learning_rate": 0.002, + "loss": 2.344, + "step": 369140 + }, + { + "epoch": 1.4270306628937237, + "grad_norm": 0.11334496736526489, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 369150 + }, + { + "epoch": 1.427069320097107, + "grad_norm": 0.09408103674650192, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 369160 + }, + { + "epoch": 1.4271079773004902, + "grad_norm": 0.09497067332267761, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 369170 + }, + { + "epoch": 1.4271466345038735, + "grad_norm": 0.10084699094295502, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 369180 + }, + { + "epoch": 1.4271852917072567, + "grad_norm": 0.1148439347743988, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 369190 + }, + { + "epoch": 1.42722394891064, + "grad_norm": 0.11306999623775482, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 369200 + }, + { + "epoch": 1.4272626061140232, + "grad_norm": 0.10049940645694733, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 369210 + }, + { + "epoch": 1.4273012633174065, + "grad_norm": 0.10067114233970642, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 369220 + }, + { + "epoch": 1.4273399205207897, + "grad_norm": 0.10959344357252121, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 369230 + }, + { + "epoch": 1.4273785777241732, + "grad_norm": 0.12859250605106354, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 369240 + }, + { + "epoch": 1.4274172349275565, + "grad_norm": 0.099395252764225, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 369250 + }, + { + "epoch": 1.4274558921309397, + "grad_norm": 0.09150033444166183, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 369260 + }, + { + "epoch": 1.427494549334323, + "grad_norm": 0.11286135017871857, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 369270 + }, + { + "epoch": 1.4275332065377062, + "grad_norm": 0.11032780259847641, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 369280 + }, + { + "epoch": 1.4275718637410895, + "grad_norm": 0.0984359011054039, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 369290 + }, + { + "epoch": 1.4276105209444727, + "grad_norm": 0.10086100548505783, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 369300 + }, + { + "epoch": 1.4276491781478562, + "grad_norm": 0.09905387461185455, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 369310 + }, + { + "epoch": 1.4276878353512394, + "grad_norm": 0.09202684462070465, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 369320 + }, + { + "epoch": 1.4277264925546227, + "grad_norm": 0.11295720189809799, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 369330 + }, + { + "epoch": 1.427765149758006, + "grad_norm": 0.13472798466682434, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 369340 + }, + { + "epoch": 1.4278038069613892, + "grad_norm": 0.1097302958369255, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 369350 + }, + { + "epoch": 1.4278424641647725, + "grad_norm": 0.1095571219921112, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 369360 + }, + { + "epoch": 1.4278811213681557, + "grad_norm": 0.09615956991910934, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 369370 + }, + { + "epoch": 1.427919778571539, + "grad_norm": 0.20836985111236572, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 369380 + }, + { + "epoch": 1.4279584357749222, + "grad_norm": 0.09848025441169739, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 369390 + }, + { + "epoch": 1.4279970929783055, + "grad_norm": 0.09631534665822983, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 369400 + }, + { + "epoch": 1.428035750181689, + "grad_norm": 0.12165029346942902, + "learning_rate": 0.002, + "loss": 2.324, + "step": 369410 + }, + { + "epoch": 1.4280744073850722, + "grad_norm": 0.11290930956602097, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 369420 + }, + { + "epoch": 1.4281130645884554, + "grad_norm": 0.10189209133386612, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 369430 + }, + { + "epoch": 1.4281517217918387, + "grad_norm": 0.1195901483297348, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 369440 + }, + { + "epoch": 1.428190378995222, + "grad_norm": 0.1144580990076065, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 369450 + }, + { + "epoch": 1.4282290361986052, + "grad_norm": 0.10400616377592087, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 369460 + }, + { + "epoch": 1.4282676934019884, + "grad_norm": 0.09329728782176971, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 369470 + }, + { + "epoch": 1.428306350605372, + "grad_norm": 0.09955257922410965, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 369480 + }, + { + "epoch": 1.4283450078087552, + "grad_norm": 0.11704539507627487, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 369490 + }, + { + "epoch": 1.4283836650121384, + "grad_norm": 0.1107201874256134, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 369500 + }, + { + "epoch": 1.4284223222155217, + "grad_norm": 0.09703484922647476, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 369510 + }, + { + "epoch": 1.428460979418905, + "grad_norm": 0.11836137622594833, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 369520 + }, + { + "epoch": 1.4284996366222882, + "grad_norm": 0.10974926501512527, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 369530 + }, + { + "epoch": 1.4285382938256714, + "grad_norm": 0.12903998792171478, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 369540 + }, + { + "epoch": 1.4285769510290547, + "grad_norm": 0.09003593027591705, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 369550 + }, + { + "epoch": 1.428615608232438, + "grad_norm": 0.11508678644895554, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 369560 + }, + { + "epoch": 1.4286542654358212, + "grad_norm": 0.11840968579053879, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 369570 + }, + { + "epoch": 1.4286929226392047, + "grad_norm": 0.10051307082176208, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 369580 + }, + { + "epoch": 1.428731579842588, + "grad_norm": 0.1256880760192871, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 369590 + }, + { + "epoch": 1.4287702370459712, + "grad_norm": 0.11354075372219086, + "learning_rate": 0.002, + "loss": 2.35, + "step": 369600 + }, + { + "epoch": 1.4288088942493544, + "grad_norm": 0.11274783313274384, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 369610 + }, + { + "epoch": 1.4288475514527377, + "grad_norm": 0.10234702378511429, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 369620 + }, + { + "epoch": 1.428886208656121, + "grad_norm": 0.12185943126678467, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 369630 + }, + { + "epoch": 1.4289248658595042, + "grad_norm": 0.12358567118644714, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 369640 + }, + { + "epoch": 1.4289635230628877, + "grad_norm": 0.10948533564805984, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 369650 + }, + { + "epoch": 1.429002180266271, + "grad_norm": 0.09104648977518082, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 369660 + }, + { + "epoch": 1.4290408374696542, + "grad_norm": 0.10487092286348343, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 369670 + }, + { + "epoch": 1.4290794946730374, + "grad_norm": 0.10738363116979599, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 369680 + }, + { + "epoch": 1.4291181518764207, + "grad_norm": 0.1259232610464096, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 369690 + }, + { + "epoch": 1.429156809079804, + "grad_norm": 0.09970113635063171, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 369700 + }, + { + "epoch": 1.4291954662831872, + "grad_norm": 0.10582879930734634, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 369710 + }, + { + "epoch": 1.4292341234865704, + "grad_norm": 0.10421253740787506, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 369720 + }, + { + "epoch": 1.4292727806899537, + "grad_norm": 0.10977336764335632, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 369730 + }, + { + "epoch": 1.429311437893337, + "grad_norm": 0.12770582735538483, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 369740 + }, + { + "epoch": 1.4293500950967204, + "grad_norm": 0.09583185613155365, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 369750 + }, + { + "epoch": 1.4293887523001036, + "grad_norm": 0.11037515103816986, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 369760 + }, + { + "epoch": 1.429427409503487, + "grad_norm": 0.1029893308877945, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 369770 + }, + { + "epoch": 1.4294660667068702, + "grad_norm": 0.10304558277130127, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 369780 + }, + { + "epoch": 1.4295047239102534, + "grad_norm": 0.12139219045639038, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 369790 + }, + { + "epoch": 1.4295433811136367, + "grad_norm": 0.10977718979120255, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 369800 + }, + { + "epoch": 1.4295820383170201, + "grad_norm": 0.10141117870807648, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 369810 + }, + { + "epoch": 1.4296206955204034, + "grad_norm": 0.10547979921102524, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 369820 + }, + { + "epoch": 1.4296593527237866, + "grad_norm": 0.0912381038069725, + "learning_rate": 0.002, + "loss": 2.32, + "step": 369830 + }, + { + "epoch": 1.4296980099271699, + "grad_norm": 0.09478603303432465, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 369840 + }, + { + "epoch": 1.4297366671305531, + "grad_norm": 0.10601737350225449, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 369850 + }, + { + "epoch": 1.4297753243339364, + "grad_norm": 0.11179604381322861, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 369860 + }, + { + "epoch": 1.4298139815373196, + "grad_norm": 0.10227113217115402, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 369870 + }, + { + "epoch": 1.429852638740703, + "grad_norm": 0.0985608771443367, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 369880 + }, + { + "epoch": 1.4298912959440861, + "grad_norm": 0.11969029903411865, + "learning_rate": 0.002, + "loss": 2.3113, + "step": 369890 + }, + { + "epoch": 1.4299299531474694, + "grad_norm": 0.10263413190841675, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 369900 + }, + { + "epoch": 1.4299686103508527, + "grad_norm": 0.12281263619661331, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 369910 + }, + { + "epoch": 1.4300072675542361, + "grad_norm": 0.10118019580841064, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 369920 + }, + { + "epoch": 1.4300459247576194, + "grad_norm": 0.0986642837524414, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 369930 + }, + { + "epoch": 1.4300845819610026, + "grad_norm": 0.10727502405643463, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 369940 + }, + { + "epoch": 1.4301232391643859, + "grad_norm": 0.09434787184000015, + "learning_rate": 0.002, + "loss": 2.334, + "step": 369950 + }, + { + "epoch": 1.4301618963677691, + "grad_norm": 0.103243887424469, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 369960 + }, + { + "epoch": 1.4302005535711524, + "grad_norm": 0.11206620931625366, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 369970 + }, + { + "epoch": 1.4302392107745359, + "grad_norm": 0.11155019700527191, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 369980 + }, + { + "epoch": 1.4302778679779191, + "grad_norm": 0.10238495469093323, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 369990 + }, + { + "epoch": 1.4303165251813024, + "grad_norm": 0.11440207809209824, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 370000 + }, + { + "epoch": 1.4303551823846856, + "grad_norm": 0.10964350402355194, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 370010 + }, + { + "epoch": 1.4303938395880689, + "grad_norm": 0.09666993468999863, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 370020 + }, + { + "epoch": 1.4304324967914521, + "grad_norm": 0.10145600140094757, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 370030 + }, + { + "epoch": 1.4304711539948354, + "grad_norm": 0.12468435615301132, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 370040 + }, + { + "epoch": 1.4305098111982186, + "grad_norm": 0.10010754317045212, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 370050 + }, + { + "epoch": 1.4305484684016019, + "grad_norm": 0.0995609387755394, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 370060 + }, + { + "epoch": 1.4305871256049851, + "grad_norm": 0.0948537290096283, + "learning_rate": 0.002, + "loss": 2.3093, + "step": 370070 + }, + { + "epoch": 1.4306257828083684, + "grad_norm": 0.1278780996799469, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 370080 + }, + { + "epoch": 1.4306644400117519, + "grad_norm": 0.09928832203149796, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 370090 + }, + { + "epoch": 1.4307030972151351, + "grad_norm": 0.11928533017635345, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 370100 + }, + { + "epoch": 1.4307417544185184, + "grad_norm": 0.13536189496517181, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 370110 + }, + { + "epoch": 1.4307804116219016, + "grad_norm": 0.10143552720546722, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 370120 + }, + { + "epoch": 1.4308190688252849, + "grad_norm": 0.10408347100019455, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 370130 + }, + { + "epoch": 1.4308577260286681, + "grad_norm": 0.09588273614645004, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 370140 + }, + { + "epoch": 1.4308963832320516, + "grad_norm": 0.09177899360656738, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 370150 + }, + { + "epoch": 1.4309350404354348, + "grad_norm": 0.11381170153617859, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 370160 + }, + { + "epoch": 1.430973697638818, + "grad_norm": 0.09797380119562149, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 370170 + }, + { + "epoch": 1.4310123548422014, + "grad_norm": 0.11212203651666641, + "learning_rate": 0.002, + "loss": 2.333, + "step": 370180 + }, + { + "epoch": 1.4310510120455846, + "grad_norm": 0.10767749696969986, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 370190 + }, + { + "epoch": 1.4310896692489679, + "grad_norm": 0.09107781946659088, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 370200 + }, + { + "epoch": 1.431128326452351, + "grad_norm": 0.10297054052352905, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 370210 + }, + { + "epoch": 1.4311669836557344, + "grad_norm": 0.09875304996967316, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 370220 + }, + { + "epoch": 1.4312056408591176, + "grad_norm": 0.0951622724533081, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 370230 + }, + { + "epoch": 1.4312442980625009, + "grad_norm": 0.10550680011510849, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 370240 + }, + { + "epoch": 1.4312829552658841, + "grad_norm": 0.10152022540569305, + "learning_rate": 0.002, + "loss": 2.328, + "step": 370250 + }, + { + "epoch": 1.4313216124692676, + "grad_norm": 0.09326834976673126, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 370260 + }, + { + "epoch": 1.4313602696726508, + "grad_norm": 0.10848992317914963, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 370270 + }, + { + "epoch": 1.431398926876034, + "grad_norm": 0.09691794961690903, + "learning_rate": 0.002, + "loss": 2.3606, + "step": 370280 + }, + { + "epoch": 1.4314375840794173, + "grad_norm": 0.12346416711807251, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 370290 + }, + { + "epoch": 1.4314762412828006, + "grad_norm": 0.10652803629636765, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 370300 + }, + { + "epoch": 1.4315148984861839, + "grad_norm": 0.13868898153305054, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 370310 + }, + { + "epoch": 1.4315535556895673, + "grad_norm": 0.10139944404363632, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 370320 + }, + { + "epoch": 1.4315922128929506, + "grad_norm": 0.10313576459884644, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 370330 + }, + { + "epoch": 1.4316308700963338, + "grad_norm": 0.09152504056692123, + "learning_rate": 0.002, + "loss": 2.317, + "step": 370340 + }, + { + "epoch": 1.431669527299717, + "grad_norm": 0.10575311630964279, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 370350 + }, + { + "epoch": 1.4317081845031003, + "grad_norm": 0.14318452775478363, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 370360 + }, + { + "epoch": 1.4317468417064836, + "grad_norm": 0.11087655276060104, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 370370 + }, + { + "epoch": 1.4317854989098668, + "grad_norm": 0.10831261426210403, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 370380 + }, + { + "epoch": 1.43182415611325, + "grad_norm": 0.1127522811293602, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 370390 + }, + { + "epoch": 1.4318628133166333, + "grad_norm": 0.09608136862516403, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 370400 + }, + { + "epoch": 1.4319014705200166, + "grad_norm": 0.11116312444210052, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 370410 + }, + { + "epoch": 1.4319401277233998, + "grad_norm": 0.10763781517744064, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 370420 + }, + { + "epoch": 1.4319787849267833, + "grad_norm": 0.1139378771185875, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 370430 + }, + { + "epoch": 1.4320174421301666, + "grad_norm": 0.11814486235380173, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 370440 + }, + { + "epoch": 1.4320560993335498, + "grad_norm": 0.11366909742355347, + "learning_rate": 0.002, + "loss": 2.328, + "step": 370450 + }, + { + "epoch": 1.432094756536933, + "grad_norm": 0.10304597020149231, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 370460 + }, + { + "epoch": 1.4321334137403163, + "grad_norm": 0.09563199430704117, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 370470 + }, + { + "epoch": 1.4321720709436996, + "grad_norm": 0.09992068260908127, + "learning_rate": 0.002, + "loss": 2.344, + "step": 370480 + }, + { + "epoch": 1.432210728147083, + "grad_norm": 0.11581120640039444, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 370490 + }, + { + "epoch": 1.4322493853504663, + "grad_norm": 0.09946183860301971, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 370500 + }, + { + "epoch": 1.4322880425538496, + "grad_norm": 0.10362624377012253, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 370510 + }, + { + "epoch": 1.4323266997572328, + "grad_norm": 0.0960688441991806, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 370520 + }, + { + "epoch": 1.432365356960616, + "grad_norm": 0.11272174119949341, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 370530 + }, + { + "epoch": 1.4324040141639993, + "grad_norm": 0.09724114090204239, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 370540 + }, + { + "epoch": 1.4324426713673826, + "grad_norm": 0.08760736137628555, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 370550 + }, + { + "epoch": 1.4324813285707658, + "grad_norm": 0.13241741061210632, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 370560 + }, + { + "epoch": 1.432519985774149, + "grad_norm": 0.11077427119016647, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 370570 + }, + { + "epoch": 1.4325586429775323, + "grad_norm": 0.09922739863395691, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 370580 + }, + { + "epoch": 1.4325973001809156, + "grad_norm": 0.11597199738025665, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 370590 + }, + { + "epoch": 1.432635957384299, + "grad_norm": 0.11011666059494019, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 370600 + }, + { + "epoch": 1.4326746145876823, + "grad_norm": 0.12466747313737869, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 370610 + }, + { + "epoch": 1.4327132717910656, + "grad_norm": 0.10244567692279816, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 370620 + }, + { + "epoch": 1.4327519289944488, + "grad_norm": 0.09919389337301254, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 370630 + }, + { + "epoch": 1.432790586197832, + "grad_norm": 0.11798831075429916, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 370640 + }, + { + "epoch": 1.4328292434012153, + "grad_norm": 0.12733817100524902, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 370650 + }, + { + "epoch": 1.4328679006045988, + "grad_norm": 0.09250468760728836, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 370660 + }, + { + "epoch": 1.432906557807982, + "grad_norm": 0.13374805450439453, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 370670 + }, + { + "epoch": 1.4329452150113653, + "grad_norm": 0.10336792469024658, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 370680 + }, + { + "epoch": 1.4329838722147485, + "grad_norm": 0.10279474407434464, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 370690 + }, + { + "epoch": 1.4330225294181318, + "grad_norm": 0.09461446106433868, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 370700 + }, + { + "epoch": 1.433061186621515, + "grad_norm": 0.14921467006206512, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 370710 + }, + { + "epoch": 1.4330998438248983, + "grad_norm": 0.16347302496433258, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 370720 + }, + { + "epoch": 1.4331385010282816, + "grad_norm": 0.11512413620948792, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 370730 + }, + { + "epoch": 1.4331771582316648, + "grad_norm": 0.08879565447568893, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 370740 + }, + { + "epoch": 1.433215815435048, + "grad_norm": 0.10276377946138382, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 370750 + }, + { + "epoch": 1.4332544726384313, + "grad_norm": 0.09875697642564774, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 370760 + }, + { + "epoch": 1.4332931298418148, + "grad_norm": 0.1068931594491005, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 370770 + }, + { + "epoch": 1.433331787045198, + "grad_norm": 0.11076817661523819, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 370780 + }, + { + "epoch": 1.4333704442485813, + "grad_norm": 0.10693301260471344, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 370790 + }, + { + "epoch": 1.4334091014519645, + "grad_norm": 0.12842826545238495, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 370800 + }, + { + "epoch": 1.4334477586553478, + "grad_norm": 0.10350693762302399, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 370810 + }, + { + "epoch": 1.433486415858731, + "grad_norm": 0.09163505584001541, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 370820 + }, + { + "epoch": 1.4335250730621145, + "grad_norm": 0.11667435616254807, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 370830 + }, + { + "epoch": 1.4335637302654978, + "grad_norm": 0.10743537545204163, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 370840 + }, + { + "epoch": 1.433602387468881, + "grad_norm": 0.1471959948539734, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 370850 + }, + { + "epoch": 1.4336410446722643, + "grad_norm": 0.10187181085348129, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 370860 + }, + { + "epoch": 1.4336797018756475, + "grad_norm": 0.09852719306945801, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 370870 + }, + { + "epoch": 1.4337183590790308, + "grad_norm": 0.10457748174667358, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 370880 + }, + { + "epoch": 1.433757016282414, + "grad_norm": 0.12135262787342072, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 370890 + }, + { + "epoch": 1.4337956734857973, + "grad_norm": 0.12262275815010071, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 370900 + }, + { + "epoch": 1.4338343306891805, + "grad_norm": 0.10123652964830399, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 370910 + }, + { + "epoch": 1.4338729878925638, + "grad_norm": 0.10346807539463043, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 370920 + }, + { + "epoch": 1.433911645095947, + "grad_norm": 0.1011311262845993, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 370930 + }, + { + "epoch": 1.4339503022993305, + "grad_norm": 0.10298784077167511, + "learning_rate": 0.002, + "loss": 2.324, + "step": 370940 + }, + { + "epoch": 1.4339889595027138, + "grad_norm": 0.10493668168783188, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 370950 + }, + { + "epoch": 1.434027616706097, + "grad_norm": 0.10941628366708755, + "learning_rate": 0.002, + "loss": 2.335, + "step": 370960 + }, + { + "epoch": 1.4340662739094803, + "grad_norm": 0.1208387240767479, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 370970 + }, + { + "epoch": 1.4341049311128635, + "grad_norm": 0.10983309894800186, + "learning_rate": 0.002, + "loss": 2.3085, + "step": 370980 + }, + { + "epoch": 1.4341435883162468, + "grad_norm": 0.10467513650655746, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 370990 + }, + { + "epoch": 1.4341822455196302, + "grad_norm": 0.10286015272140503, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 371000 + }, + { + "epoch": 1.4342209027230135, + "grad_norm": 0.09991083294153214, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 371010 + }, + { + "epoch": 1.4342595599263968, + "grad_norm": 0.09633886069059372, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 371020 + }, + { + "epoch": 1.43429821712978, + "grad_norm": 0.10873128473758698, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 371030 + }, + { + "epoch": 1.4343368743331633, + "grad_norm": 0.11154817789793015, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 371040 + }, + { + "epoch": 1.4343755315365465, + "grad_norm": 0.12193711847066879, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 371050 + }, + { + "epoch": 1.4344141887399298, + "grad_norm": 0.0954829752445221, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 371060 + }, + { + "epoch": 1.434452845943313, + "grad_norm": 0.1483481079339981, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 371070 + }, + { + "epoch": 1.4344915031466963, + "grad_norm": 0.1041557714343071, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 371080 + }, + { + "epoch": 1.4345301603500795, + "grad_norm": 0.10764243453741074, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 371090 + }, + { + "epoch": 1.434568817553463, + "grad_norm": 0.100117988884449, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 371100 + }, + { + "epoch": 1.4346074747568462, + "grad_norm": 0.09908339381217957, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 371110 + }, + { + "epoch": 1.4346461319602295, + "grad_norm": 0.11333918571472168, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 371120 + }, + { + "epoch": 1.4346847891636128, + "grad_norm": 0.10722298920154572, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 371130 + }, + { + "epoch": 1.434723446366996, + "grad_norm": 0.11224247515201569, + "learning_rate": 0.002, + "loss": 2.336, + "step": 371140 + }, + { + "epoch": 1.4347621035703793, + "grad_norm": 0.14375075697898865, + "learning_rate": 0.002, + "loss": 2.3538, + "step": 371150 + }, + { + "epoch": 1.4348007607737625, + "grad_norm": 0.11185460537672043, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 371160 + }, + { + "epoch": 1.434839417977146, + "grad_norm": 0.10739337652921677, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 371170 + }, + { + "epoch": 1.4348780751805292, + "grad_norm": 0.108052097260952, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 371180 + }, + { + "epoch": 1.4349167323839125, + "grad_norm": 0.11570432037115097, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 371190 + }, + { + "epoch": 1.4349553895872957, + "grad_norm": 0.0992949903011322, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 371200 + }, + { + "epoch": 1.434994046790679, + "grad_norm": 0.10696582496166229, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 371210 + }, + { + "epoch": 1.4350327039940622, + "grad_norm": 0.10340410470962524, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 371220 + }, + { + "epoch": 1.4350713611974455, + "grad_norm": 0.10558187961578369, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 371230 + }, + { + "epoch": 1.4351100184008287, + "grad_norm": 0.12378218024969101, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 371240 + }, + { + "epoch": 1.435148675604212, + "grad_norm": 0.09785618633031845, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 371250 + }, + { + "epoch": 1.4351873328075953, + "grad_norm": 0.0971897691488266, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 371260 + }, + { + "epoch": 1.4352259900109787, + "grad_norm": 0.12853021919727325, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 371270 + }, + { + "epoch": 1.435264647214362, + "grad_norm": 0.0908111110329628, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 371280 + }, + { + "epoch": 1.4353033044177452, + "grad_norm": 0.1069912314414978, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 371290 + }, + { + "epoch": 1.4353419616211285, + "grad_norm": 0.10096617788076401, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 371300 + }, + { + "epoch": 1.4353806188245117, + "grad_norm": 0.10879544913768768, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 371310 + }, + { + "epoch": 1.435419276027895, + "grad_norm": 0.1102469265460968, + "learning_rate": 0.002, + "loss": 2.333, + "step": 371320 + }, + { + "epoch": 1.4354579332312782, + "grad_norm": 0.09746189415454865, + "learning_rate": 0.002, + "loss": 2.3136, + "step": 371330 + }, + { + "epoch": 1.4354965904346617, + "grad_norm": 0.1017862856388092, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 371340 + }, + { + "epoch": 1.435535247638045, + "grad_norm": 0.09203585982322693, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 371350 + }, + { + "epoch": 1.4355739048414282, + "grad_norm": 0.09847685694694519, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 371360 + }, + { + "epoch": 1.4356125620448115, + "grad_norm": 0.11136972904205322, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 371370 + }, + { + "epoch": 1.4356512192481947, + "grad_norm": 0.10409975051879883, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 371380 + }, + { + "epoch": 1.435689876451578, + "grad_norm": 0.10757970064878464, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 371390 + }, + { + "epoch": 1.4357285336549612, + "grad_norm": 0.11430218070745468, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 371400 + }, + { + "epoch": 1.4357671908583445, + "grad_norm": 0.11079767346382141, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 371410 + }, + { + "epoch": 1.4358058480617277, + "grad_norm": 0.0886615738272667, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 371420 + }, + { + "epoch": 1.435844505265111, + "grad_norm": 0.09973558038473129, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 371430 + }, + { + "epoch": 1.4358831624684945, + "grad_norm": 0.0972694531083107, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 371440 + }, + { + "epoch": 1.4359218196718777, + "grad_norm": 0.11059543490409851, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 371450 + }, + { + "epoch": 1.435960476875261, + "grad_norm": 0.09584806114435196, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 371460 + }, + { + "epoch": 1.4359991340786442, + "grad_norm": 0.10012396425008774, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 371470 + }, + { + "epoch": 1.4360377912820275, + "grad_norm": 0.11090181022882462, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 371480 + }, + { + "epoch": 1.4360764484854107, + "grad_norm": 0.10269306600093842, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 371490 + }, + { + "epoch": 1.436115105688794, + "grad_norm": 0.11868039518594742, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 371500 + }, + { + "epoch": 1.4361537628921774, + "grad_norm": 0.12083914875984192, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 371510 + }, + { + "epoch": 1.4361924200955607, + "grad_norm": 0.10587406903505325, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 371520 + }, + { + "epoch": 1.436231077298944, + "grad_norm": 0.10817192494869232, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 371530 + }, + { + "epoch": 1.4362697345023272, + "grad_norm": 0.09226145595312119, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 371540 + }, + { + "epoch": 1.4363083917057105, + "grad_norm": 0.09368760883808136, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 371550 + }, + { + "epoch": 1.4363470489090937, + "grad_norm": 0.10719560086727142, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 371560 + }, + { + "epoch": 1.436385706112477, + "grad_norm": 0.09072700887918472, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 371570 + }, + { + "epoch": 1.4364243633158602, + "grad_norm": 0.0942125990986824, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 371580 + }, + { + "epoch": 1.4364630205192435, + "grad_norm": 0.13440567255020142, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 371590 + }, + { + "epoch": 1.4365016777226267, + "grad_norm": 0.26764971017837524, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 371600 + }, + { + "epoch": 1.4365403349260102, + "grad_norm": 0.14455781877040863, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 371610 + }, + { + "epoch": 1.4365789921293934, + "grad_norm": 0.10112746059894562, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 371620 + }, + { + "epoch": 1.4366176493327767, + "grad_norm": 0.09363212436437607, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 371630 + }, + { + "epoch": 1.43665630653616, + "grad_norm": 0.10647587478160858, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 371640 + }, + { + "epoch": 1.4366949637395432, + "grad_norm": 0.10569415986537933, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 371650 + }, + { + "epoch": 1.4367336209429264, + "grad_norm": 0.1061614602804184, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 371660 + }, + { + "epoch": 1.4367722781463097, + "grad_norm": 0.10273336619138718, + "learning_rate": 0.002, + "loss": 2.32, + "step": 371670 + }, + { + "epoch": 1.4368109353496932, + "grad_norm": 0.11125043779611588, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 371680 + }, + { + "epoch": 1.4368495925530764, + "grad_norm": 0.09265997260808945, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 371690 + }, + { + "epoch": 1.4368882497564597, + "grad_norm": 0.11448623239994049, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 371700 + }, + { + "epoch": 1.436926906959843, + "grad_norm": 0.12354601919651031, + "learning_rate": 0.002, + "loss": 2.3091, + "step": 371710 + }, + { + "epoch": 1.4369655641632262, + "grad_norm": 0.10530796647071838, + "learning_rate": 0.002, + "loss": 2.338, + "step": 371720 + }, + { + "epoch": 1.4370042213666094, + "grad_norm": 0.12017491459846497, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 371730 + }, + { + "epoch": 1.4370428785699927, + "grad_norm": 0.14511741697788239, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 371740 + }, + { + "epoch": 1.437081535773376, + "grad_norm": 0.09568100422620773, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 371750 + }, + { + "epoch": 1.4371201929767592, + "grad_norm": 0.09584043174982071, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 371760 + }, + { + "epoch": 1.4371588501801424, + "grad_norm": 0.10420922189950943, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 371770 + }, + { + "epoch": 1.437197507383526, + "grad_norm": 0.11375915259122849, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 371780 + }, + { + "epoch": 1.4372361645869092, + "grad_norm": 0.10612479597330093, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 371790 + }, + { + "epoch": 1.4372748217902924, + "grad_norm": 0.10375890135765076, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 371800 + }, + { + "epoch": 1.4373134789936757, + "grad_norm": 0.09391949325799942, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 371810 + }, + { + "epoch": 1.437352136197059, + "grad_norm": 0.10322212427854538, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 371820 + }, + { + "epoch": 1.4373907934004422, + "grad_norm": 0.10103631019592285, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 371830 + }, + { + "epoch": 1.4374294506038257, + "grad_norm": 0.11324197798967361, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 371840 + }, + { + "epoch": 1.437468107807209, + "grad_norm": 0.10173353552818298, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 371850 + }, + { + "epoch": 1.4375067650105922, + "grad_norm": 0.10180971771478653, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 371860 + }, + { + "epoch": 1.4375454222139754, + "grad_norm": 0.14309395849704742, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 371870 + }, + { + "epoch": 1.4375840794173587, + "grad_norm": 0.11053165793418884, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 371880 + }, + { + "epoch": 1.437622736620742, + "grad_norm": 0.10481183230876923, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 371890 + }, + { + "epoch": 1.4376613938241252, + "grad_norm": 0.13200727105140686, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 371900 + }, + { + "epoch": 1.4377000510275084, + "grad_norm": 0.0949479267001152, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 371910 + }, + { + "epoch": 1.4377387082308917, + "grad_norm": 0.11998516321182251, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 371920 + }, + { + "epoch": 1.437777365434275, + "grad_norm": 0.1394270956516266, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 371930 + }, + { + "epoch": 1.4378160226376582, + "grad_norm": 0.09979026019573212, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 371940 + }, + { + "epoch": 1.4378546798410416, + "grad_norm": 0.10514336824417114, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 371950 + }, + { + "epoch": 1.437893337044425, + "grad_norm": 0.09712082147598267, + "learning_rate": 0.002, + "loss": 2.3536, + "step": 371960 + }, + { + "epoch": 1.4379319942478082, + "grad_norm": 0.10522717982530594, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 371970 + }, + { + "epoch": 1.4379706514511914, + "grad_norm": 0.10137762129306793, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 371980 + }, + { + "epoch": 1.4380093086545747, + "grad_norm": 0.1008489802479744, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 371990 + }, + { + "epoch": 1.438047965857958, + "grad_norm": 0.12621283531188965, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 372000 + }, + { + "epoch": 1.4380866230613414, + "grad_norm": 0.12713102996349335, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 372010 + }, + { + "epoch": 1.4381252802647246, + "grad_norm": 0.13091817498207092, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 372020 + }, + { + "epoch": 1.438163937468108, + "grad_norm": 0.09806538373231888, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 372030 + }, + { + "epoch": 1.4382025946714911, + "grad_norm": 0.12110111862421036, + "learning_rate": 0.002, + "loss": 2.337, + "step": 372040 + }, + { + "epoch": 1.4382412518748744, + "grad_norm": 0.10088954120874405, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 372050 + }, + { + "epoch": 1.4382799090782576, + "grad_norm": 0.12509940564632416, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 372060 + }, + { + "epoch": 1.438318566281641, + "grad_norm": 0.11874911934137344, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 372070 + }, + { + "epoch": 1.4383572234850242, + "grad_norm": 0.12909932434558868, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 372080 + }, + { + "epoch": 1.4383958806884074, + "grad_norm": 0.09152228385210037, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 372090 + }, + { + "epoch": 1.4384345378917907, + "grad_norm": 0.1060309037566185, + "learning_rate": 0.002, + "loss": 2.342, + "step": 372100 + }, + { + "epoch": 1.438473195095174, + "grad_norm": 0.1071513444185257, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 372110 + }, + { + "epoch": 1.4385118522985574, + "grad_norm": 0.1377035528421402, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 372120 + }, + { + "epoch": 1.4385505095019406, + "grad_norm": 0.11575419455766678, + "learning_rate": 0.002, + "loss": 2.339, + "step": 372130 + }, + { + "epoch": 1.4385891667053239, + "grad_norm": 0.12348438054323196, + "learning_rate": 0.002, + "loss": 2.35, + "step": 372140 + }, + { + "epoch": 1.4386278239087071, + "grad_norm": 0.09979524463415146, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 372150 + }, + { + "epoch": 1.4386664811120904, + "grad_norm": 0.10092800855636597, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 372160 + }, + { + "epoch": 1.4387051383154736, + "grad_norm": 0.09440913051366806, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 372170 + }, + { + "epoch": 1.4387437955188571, + "grad_norm": 0.12174414098262787, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 372180 + }, + { + "epoch": 1.4387824527222404, + "grad_norm": 0.10224572569131851, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 372190 + }, + { + "epoch": 1.4388211099256236, + "grad_norm": 0.1092875599861145, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 372200 + }, + { + "epoch": 1.4388597671290069, + "grad_norm": 0.10130403935909271, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 372210 + }, + { + "epoch": 1.4388984243323901, + "grad_norm": 0.12170014530420303, + "learning_rate": 0.002, + "loss": 2.3579, + "step": 372220 + }, + { + "epoch": 1.4389370815357734, + "grad_norm": 0.1164335086941719, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 372230 + }, + { + "epoch": 1.4389757387391566, + "grad_norm": 0.10222030431032181, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 372240 + }, + { + "epoch": 1.4390143959425399, + "grad_norm": 0.11554889380931854, + "learning_rate": 0.002, + "loss": 2.3138, + "step": 372250 + }, + { + "epoch": 1.4390530531459231, + "grad_norm": 0.11896473169326782, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 372260 + }, + { + "epoch": 1.4390917103493064, + "grad_norm": 0.09890048950910568, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 372270 + }, + { + "epoch": 1.4391303675526896, + "grad_norm": 0.10989270359277725, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 372280 + }, + { + "epoch": 1.4391690247560731, + "grad_norm": 0.10503815114498138, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 372290 + }, + { + "epoch": 1.4392076819594564, + "grad_norm": 0.10028046369552612, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 372300 + }, + { + "epoch": 1.4392463391628396, + "grad_norm": 0.10095027834177017, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 372310 + }, + { + "epoch": 1.4392849963662229, + "grad_norm": 0.11232791095972061, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 372320 + }, + { + "epoch": 1.4393236535696061, + "grad_norm": 0.11338062584400177, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 372330 + }, + { + "epoch": 1.4393623107729894, + "grad_norm": 0.12447841465473175, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 372340 + }, + { + "epoch": 1.4394009679763728, + "grad_norm": 0.10727968066930771, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 372350 + }, + { + "epoch": 1.439439625179756, + "grad_norm": 0.12589871883392334, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 372360 + }, + { + "epoch": 1.4394782823831394, + "grad_norm": 0.09049642831087112, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 372370 + }, + { + "epoch": 1.4395169395865226, + "grad_norm": 0.10203811526298523, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 372380 + }, + { + "epoch": 1.4395555967899059, + "grad_norm": 0.11543015390634537, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 372390 + }, + { + "epoch": 1.439594253993289, + "grad_norm": 0.10948812961578369, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 372400 + }, + { + "epoch": 1.4396329111966724, + "grad_norm": 0.11874116957187653, + "learning_rate": 0.002, + "loss": 2.335, + "step": 372410 + }, + { + "epoch": 1.4396715684000556, + "grad_norm": 0.09726309031248093, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 372420 + }, + { + "epoch": 1.4397102256034389, + "grad_norm": 0.10006396472454071, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 372430 + }, + { + "epoch": 1.4397488828068221, + "grad_norm": 0.10876845568418503, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 372440 + }, + { + "epoch": 1.4397875400102054, + "grad_norm": 0.10026299953460693, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 372450 + }, + { + "epoch": 1.4398261972135888, + "grad_norm": 0.0973251610994339, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 372460 + }, + { + "epoch": 1.439864854416972, + "grad_norm": 0.11038711667060852, + "learning_rate": 0.002, + "loss": 2.329, + "step": 372470 + }, + { + "epoch": 1.4399035116203553, + "grad_norm": 0.11058340966701508, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 372480 + }, + { + "epoch": 1.4399421688237386, + "grad_norm": 0.09188947081565857, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 372490 + }, + { + "epoch": 1.4399808260271219, + "grad_norm": 0.1101926639676094, + "learning_rate": 0.002, + "loss": 2.336, + "step": 372500 + }, + { + "epoch": 1.440019483230505, + "grad_norm": 0.2913362681865692, + "learning_rate": 0.002, + "loss": 2.3484, + "step": 372510 + }, + { + "epoch": 1.4400581404338886, + "grad_norm": 0.13520562648773193, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 372520 + }, + { + "epoch": 1.4400967976372718, + "grad_norm": 0.09579768031835556, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 372530 + }, + { + "epoch": 1.440135454840655, + "grad_norm": 0.1331591010093689, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 372540 + }, + { + "epoch": 1.4401741120440383, + "grad_norm": 0.10888656973838806, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 372550 + }, + { + "epoch": 1.4402127692474216, + "grad_norm": 0.09306596219539642, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 372560 + }, + { + "epoch": 1.4402514264508048, + "grad_norm": 0.104879230260849, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 372570 + }, + { + "epoch": 1.440290083654188, + "grad_norm": 0.09254723787307739, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 372580 + }, + { + "epoch": 1.4403287408575713, + "grad_norm": 0.09524793177843094, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 372590 + }, + { + "epoch": 1.4403673980609546, + "grad_norm": 0.09294036030769348, + "learning_rate": 0.002, + "loss": 2.342, + "step": 372600 + }, + { + "epoch": 1.4404060552643378, + "grad_norm": 0.11674389243125916, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 372610 + }, + { + "epoch": 1.440444712467721, + "grad_norm": 0.10271896421909332, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 372620 + }, + { + "epoch": 1.4404833696711046, + "grad_norm": 0.11283275485038757, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 372630 + }, + { + "epoch": 1.4405220268744878, + "grad_norm": 0.1119888424873352, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 372640 + }, + { + "epoch": 1.440560684077871, + "grad_norm": 0.09449056535959244, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 372650 + }, + { + "epoch": 1.4405993412812543, + "grad_norm": 0.09541455656290054, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 372660 + }, + { + "epoch": 1.4406379984846376, + "grad_norm": 0.1262357532978058, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 372670 + }, + { + "epoch": 1.4406766556880208, + "grad_norm": 0.10903370380401611, + "learning_rate": 0.002, + "loss": 2.331, + "step": 372680 + }, + { + "epoch": 1.4407153128914043, + "grad_norm": 0.11235390603542328, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 372690 + }, + { + "epoch": 1.4407539700947876, + "grad_norm": 0.1031249538064003, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 372700 + }, + { + "epoch": 1.4407926272981708, + "grad_norm": 0.10202572494745255, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 372710 + }, + { + "epoch": 1.440831284501554, + "grad_norm": 0.10625158995389938, + "learning_rate": 0.002, + "loss": 2.327, + "step": 372720 + }, + { + "epoch": 1.4408699417049373, + "grad_norm": 0.10402391105890274, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 372730 + }, + { + "epoch": 1.4409085989083206, + "grad_norm": 0.09886647760868073, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 372740 + }, + { + "epoch": 1.4409472561117038, + "grad_norm": 0.10791648924350739, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 372750 + }, + { + "epoch": 1.440985913315087, + "grad_norm": 0.10573247075080872, + "learning_rate": 0.002, + "loss": 2.3566, + "step": 372760 + }, + { + "epoch": 1.4410245705184703, + "grad_norm": 0.08799899369478226, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 372770 + }, + { + "epoch": 1.4410632277218536, + "grad_norm": 0.12291400879621506, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 372780 + }, + { + "epoch": 1.4411018849252368, + "grad_norm": 0.09974503517150879, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 372790 + }, + { + "epoch": 1.4411405421286203, + "grad_norm": 0.09110969305038452, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 372800 + }, + { + "epoch": 1.4411791993320036, + "grad_norm": 0.09680027514696121, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 372810 + }, + { + "epoch": 1.4412178565353868, + "grad_norm": 0.10247483104467392, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 372820 + }, + { + "epoch": 1.44125651373877, + "grad_norm": 0.13590359687805176, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 372830 + }, + { + "epoch": 1.4412951709421533, + "grad_norm": 0.0950893685221672, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 372840 + }, + { + "epoch": 1.4413338281455366, + "grad_norm": 0.10643938183784485, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 372850 + }, + { + "epoch": 1.44137248534892, + "grad_norm": 0.09532453864812851, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 372860 + }, + { + "epoch": 1.4414111425523033, + "grad_norm": 0.10018929094076157, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 372870 + }, + { + "epoch": 1.4414497997556865, + "grad_norm": 0.1031702384352684, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 372880 + }, + { + "epoch": 1.4414884569590698, + "grad_norm": 0.11543572694063187, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 372890 + }, + { + "epoch": 1.441527114162453, + "grad_norm": 0.08839228004217148, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 372900 + }, + { + "epoch": 1.4415657713658363, + "grad_norm": 0.11016961932182312, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 372910 + }, + { + "epoch": 1.4416044285692196, + "grad_norm": 0.10460019111633301, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 372920 + }, + { + "epoch": 1.4416430857726028, + "grad_norm": 0.14126625657081604, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 372930 + }, + { + "epoch": 1.441681742975986, + "grad_norm": 0.10394603759050369, + "learning_rate": 0.002, + "loss": 2.3515, + "step": 372940 + }, + { + "epoch": 1.4417204001793693, + "grad_norm": 0.10750212520360947, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 372950 + }, + { + "epoch": 1.4417590573827528, + "grad_norm": 0.1336800754070282, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 372960 + }, + { + "epoch": 1.441797714586136, + "grad_norm": 0.11134114116430283, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 372970 + }, + { + "epoch": 1.4418363717895193, + "grad_norm": 0.10422998666763306, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 372980 + }, + { + "epoch": 1.4418750289929025, + "grad_norm": 0.10585644096136093, + "learning_rate": 0.002, + "loss": 2.34, + "step": 372990 + }, + { + "epoch": 1.4419136861962858, + "grad_norm": 0.11790245026350021, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 373000 + }, + { + "epoch": 1.441952343399669, + "grad_norm": 0.10526230931282043, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 373010 + }, + { + "epoch": 1.4419910006030523, + "grad_norm": 0.0975332036614418, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 373020 + }, + { + "epoch": 1.4420296578064358, + "grad_norm": 0.10068307816982269, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 373030 + }, + { + "epoch": 1.442068315009819, + "grad_norm": 0.11506905406713486, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 373040 + }, + { + "epoch": 1.4421069722132023, + "grad_norm": 0.10308445990085602, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 373050 + }, + { + "epoch": 1.4421456294165855, + "grad_norm": 0.10953045636415482, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 373060 + }, + { + "epoch": 1.4421842866199688, + "grad_norm": 0.10586000978946686, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 373070 + }, + { + "epoch": 1.442222943823352, + "grad_norm": 0.11068696528673172, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 373080 + }, + { + "epoch": 1.4422616010267353, + "grad_norm": 0.10807348042726517, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 373090 + }, + { + "epoch": 1.4423002582301185, + "grad_norm": 0.10999464243650436, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 373100 + }, + { + "epoch": 1.4423389154335018, + "grad_norm": 0.0915793851017952, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 373110 + }, + { + "epoch": 1.442377572636885, + "grad_norm": 0.1043555811047554, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 373120 + }, + { + "epoch": 1.4424162298402685, + "grad_norm": 0.1145467758178711, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 373130 + }, + { + "epoch": 1.4424548870436518, + "grad_norm": 0.10055268555879593, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 373140 + }, + { + "epoch": 1.442493544247035, + "grad_norm": 0.11176237463951111, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 373150 + }, + { + "epoch": 1.4425322014504183, + "grad_norm": 0.10501953214406967, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 373160 + }, + { + "epoch": 1.4425708586538015, + "grad_norm": 0.09456057846546173, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 373170 + }, + { + "epoch": 1.4426095158571848, + "grad_norm": 0.09280961751937866, + "learning_rate": 0.002, + "loss": 2.344, + "step": 373180 + }, + { + "epoch": 1.442648173060568, + "grad_norm": 0.10198284685611725, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 373190 + }, + { + "epoch": 1.4426868302639515, + "grad_norm": 0.10452098399400711, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 373200 + }, + { + "epoch": 1.4427254874673348, + "grad_norm": 0.09441589564085007, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 373210 + }, + { + "epoch": 1.442764144670718, + "grad_norm": 0.11990165710449219, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 373220 + }, + { + "epoch": 1.4428028018741013, + "grad_norm": 0.10947558283805847, + "learning_rate": 0.002, + "loss": 2.331, + "step": 373230 + }, + { + "epoch": 1.4428414590774845, + "grad_norm": 0.1156168282032013, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 373240 + }, + { + "epoch": 1.4428801162808678, + "grad_norm": 0.09495019912719727, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 373250 + }, + { + "epoch": 1.442918773484251, + "grad_norm": 0.1100836843252182, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 373260 + }, + { + "epoch": 1.4429574306876343, + "grad_norm": 0.11163510382175446, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 373270 + }, + { + "epoch": 1.4429960878910175, + "grad_norm": 0.09736929833889008, + "learning_rate": 0.002, + "loss": 2.337, + "step": 373280 + }, + { + "epoch": 1.4430347450944008, + "grad_norm": 0.09500592201948166, + "learning_rate": 0.002, + "loss": 2.337, + "step": 373290 + }, + { + "epoch": 1.4430734022977842, + "grad_norm": 0.11622362583875656, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 373300 + }, + { + "epoch": 1.4431120595011675, + "grad_norm": 0.09297885000705719, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 373310 + }, + { + "epoch": 1.4431507167045508, + "grad_norm": 0.11191659420728683, + "learning_rate": 0.002, + "loss": 2.337, + "step": 373320 + }, + { + "epoch": 1.443189373907934, + "grad_norm": 0.11442656069993973, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 373330 + }, + { + "epoch": 1.4432280311113173, + "grad_norm": 0.1114255040884018, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 373340 + }, + { + "epoch": 1.4432666883147005, + "grad_norm": 0.09715000540018082, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 373350 + }, + { + "epoch": 1.4433053455180838, + "grad_norm": 0.10942159593105316, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 373360 + }, + { + "epoch": 1.4433440027214672, + "grad_norm": 0.10923704504966736, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 373370 + }, + { + "epoch": 1.4433826599248505, + "grad_norm": 0.10843712091445923, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 373380 + }, + { + "epoch": 1.4434213171282337, + "grad_norm": 0.1049380972981453, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 373390 + }, + { + "epoch": 1.443459974331617, + "grad_norm": 0.10164526849985123, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 373400 + }, + { + "epoch": 1.4434986315350002, + "grad_norm": 0.09803440421819687, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 373410 + }, + { + "epoch": 1.4435372887383835, + "grad_norm": 0.09612613916397095, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 373420 + }, + { + "epoch": 1.4435759459417667, + "grad_norm": 0.1065620705485344, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 373430 + }, + { + "epoch": 1.44361460314515, + "grad_norm": 0.10151802748441696, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 373440 + }, + { + "epoch": 1.4436532603485333, + "grad_norm": 0.09904639422893524, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 373450 + }, + { + "epoch": 1.4436919175519165, + "grad_norm": 0.09335681051015854, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 373460 + }, + { + "epoch": 1.4437305747553, + "grad_norm": 0.10069163888692856, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 373470 + }, + { + "epoch": 1.4437692319586832, + "grad_norm": 0.15649782121181488, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 373480 + }, + { + "epoch": 1.4438078891620665, + "grad_norm": 0.10765941441059113, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 373490 + }, + { + "epoch": 1.4438465463654497, + "grad_norm": 0.10586480796337128, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 373500 + }, + { + "epoch": 1.443885203568833, + "grad_norm": 0.09421324729919434, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 373510 + }, + { + "epoch": 1.4439238607722162, + "grad_norm": 0.08989991247653961, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 373520 + }, + { + "epoch": 1.4439625179755995, + "grad_norm": 0.10273010283708572, + "learning_rate": 0.002, + "loss": 2.34, + "step": 373530 + }, + { + "epoch": 1.444001175178983, + "grad_norm": 0.11481129378080368, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 373540 + }, + { + "epoch": 1.4440398323823662, + "grad_norm": 0.12491723895072937, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 373550 + }, + { + "epoch": 1.4440784895857495, + "grad_norm": 0.10945707559585571, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 373560 + }, + { + "epoch": 1.4441171467891327, + "grad_norm": 0.10013280808925629, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 373570 + }, + { + "epoch": 1.444155803992516, + "grad_norm": 0.10499537736177444, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 373580 + }, + { + "epoch": 1.4441944611958992, + "grad_norm": 0.08887854218482971, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 373590 + }, + { + "epoch": 1.4442331183992825, + "grad_norm": 0.10871432721614838, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 373600 + }, + { + "epoch": 1.4442717756026657, + "grad_norm": 0.1107456162571907, + "learning_rate": 0.002, + "loss": 2.345, + "step": 373610 + }, + { + "epoch": 1.444310432806049, + "grad_norm": 0.11917014420032501, + "learning_rate": 0.002, + "loss": 2.334, + "step": 373620 + }, + { + "epoch": 1.4443490900094322, + "grad_norm": 0.10580477118492126, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 373630 + }, + { + "epoch": 1.4443877472128157, + "grad_norm": 0.10364178568124771, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 373640 + }, + { + "epoch": 1.444426404416199, + "grad_norm": 0.11463356763124466, + "learning_rate": 0.002, + "loss": 2.336, + "step": 373650 + }, + { + "epoch": 1.4444650616195822, + "grad_norm": 0.09823574125766754, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 373660 + }, + { + "epoch": 1.4445037188229655, + "grad_norm": 0.13183921575546265, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 373670 + }, + { + "epoch": 1.4445423760263487, + "grad_norm": 0.09379124641418457, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 373680 + }, + { + "epoch": 1.444581033229732, + "grad_norm": 0.08960752934217453, + "learning_rate": 0.002, + "loss": 2.349, + "step": 373690 + }, + { + "epoch": 1.4446196904331154, + "grad_norm": 0.12396460771560669, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 373700 + }, + { + "epoch": 1.4446583476364987, + "grad_norm": 0.10179699957370758, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 373710 + }, + { + "epoch": 1.444697004839882, + "grad_norm": 0.10384551435709, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 373720 + }, + { + "epoch": 1.4447356620432652, + "grad_norm": 0.10939621925354004, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 373730 + }, + { + "epoch": 1.4447743192466485, + "grad_norm": 0.10100575536489487, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 373740 + }, + { + "epoch": 1.4448129764500317, + "grad_norm": 0.10216158628463745, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 373750 + }, + { + "epoch": 1.444851633653415, + "grad_norm": 0.1268494725227356, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 373760 + }, + { + "epoch": 1.4448902908567982, + "grad_norm": 0.10251862555742264, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 373770 + }, + { + "epoch": 1.4449289480601815, + "grad_norm": 0.1032252311706543, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 373780 + }, + { + "epoch": 1.4449676052635647, + "grad_norm": 0.09969396889209747, + "learning_rate": 0.002, + "loss": 2.334, + "step": 373790 + }, + { + "epoch": 1.445006262466948, + "grad_norm": 0.11264482885599136, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 373800 + }, + { + "epoch": 1.4450449196703314, + "grad_norm": 0.10568080842494965, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 373810 + }, + { + "epoch": 1.4450835768737147, + "grad_norm": 0.09127136319875717, + "learning_rate": 0.002, + "loss": 2.343, + "step": 373820 + }, + { + "epoch": 1.445122234077098, + "grad_norm": 0.10587592422962189, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 373830 + }, + { + "epoch": 1.4451608912804812, + "grad_norm": 0.11365770548582077, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 373840 + }, + { + "epoch": 1.4451995484838644, + "grad_norm": 0.11311656981706619, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 373850 + }, + { + "epoch": 1.4452382056872477, + "grad_norm": 0.10600202530622482, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 373860 + }, + { + "epoch": 1.4452768628906312, + "grad_norm": 0.12684810161590576, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 373870 + }, + { + "epoch": 1.4453155200940144, + "grad_norm": 0.11892486363649368, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 373880 + }, + { + "epoch": 1.4453541772973977, + "grad_norm": 0.09933153539896011, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 373890 + }, + { + "epoch": 1.445392834500781, + "grad_norm": 0.11293446272611618, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 373900 + }, + { + "epoch": 1.4454314917041642, + "grad_norm": 0.1038590744137764, + "learning_rate": 0.002, + "loss": 2.322, + "step": 373910 + }, + { + "epoch": 1.4454701489075474, + "grad_norm": 0.11509344726800919, + "learning_rate": 0.002, + "loss": 2.333, + "step": 373920 + }, + { + "epoch": 1.4455088061109307, + "grad_norm": 0.0943727046251297, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 373930 + }, + { + "epoch": 1.445547463314314, + "grad_norm": 0.10036980360746384, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 373940 + }, + { + "epoch": 1.4455861205176972, + "grad_norm": 0.11143596470355988, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 373950 + }, + { + "epoch": 1.4456247777210804, + "grad_norm": 0.10541340708732605, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 373960 + }, + { + "epoch": 1.4456634349244637, + "grad_norm": 0.10758457332849503, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 373970 + }, + { + "epoch": 1.4457020921278472, + "grad_norm": 0.12086843699216843, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 373980 + }, + { + "epoch": 1.4457407493312304, + "grad_norm": 0.09697914123535156, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 373990 + }, + { + "epoch": 1.4457794065346137, + "grad_norm": 0.09337199479341507, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 374000 + }, + { + "epoch": 1.445818063737997, + "grad_norm": 0.14395758509635925, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 374010 + }, + { + "epoch": 1.4458567209413802, + "grad_norm": 0.11079844832420349, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 374020 + }, + { + "epoch": 1.4458953781447634, + "grad_norm": 0.1177358403801918, + "learning_rate": 0.002, + "loss": 2.327, + "step": 374030 + }, + { + "epoch": 1.445934035348147, + "grad_norm": 0.10449257493019104, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 374040 + }, + { + "epoch": 1.4459726925515302, + "grad_norm": 0.10623761266469955, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 374050 + }, + { + "epoch": 1.4460113497549134, + "grad_norm": 0.11113753914833069, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 374060 + }, + { + "epoch": 1.4460500069582967, + "grad_norm": 0.10962066054344177, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 374070 + }, + { + "epoch": 1.44608866416168, + "grad_norm": 0.11375898867845535, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 374080 + }, + { + "epoch": 1.4461273213650632, + "grad_norm": 0.08839374035596848, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 374090 + }, + { + "epoch": 1.4461659785684464, + "grad_norm": 0.11471235752105713, + "learning_rate": 0.002, + "loss": 2.3525, + "step": 374100 + }, + { + "epoch": 1.4462046357718297, + "grad_norm": 0.10845217853784561, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 374110 + }, + { + "epoch": 1.446243292975213, + "grad_norm": 0.10325498133897781, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 374120 + }, + { + "epoch": 1.4462819501785962, + "grad_norm": 0.0919291228055954, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 374130 + }, + { + "epoch": 1.4463206073819794, + "grad_norm": 0.10365841537714005, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 374140 + }, + { + "epoch": 1.446359264585363, + "grad_norm": 0.0947713851928711, + "learning_rate": 0.002, + "loss": 2.327, + "step": 374150 + }, + { + "epoch": 1.4463979217887462, + "grad_norm": 0.11603802442550659, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 374160 + }, + { + "epoch": 1.4464365789921294, + "grad_norm": 0.10967438668012619, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 374170 + }, + { + "epoch": 1.4464752361955127, + "grad_norm": 0.11252371966838837, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 374180 + }, + { + "epoch": 1.446513893398896, + "grad_norm": 0.09533124417066574, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 374190 + }, + { + "epoch": 1.4465525506022792, + "grad_norm": 0.10764352977275848, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 374200 + }, + { + "epoch": 1.4465912078056626, + "grad_norm": 0.10111424326896667, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 374210 + }, + { + "epoch": 1.446629865009046, + "grad_norm": 0.11522792279720306, + "learning_rate": 0.002, + "loss": 2.351, + "step": 374220 + }, + { + "epoch": 1.4466685222124291, + "grad_norm": 0.09983590245246887, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 374230 + }, + { + "epoch": 1.4467071794158124, + "grad_norm": 0.10484399646520615, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 374240 + }, + { + "epoch": 1.4467458366191956, + "grad_norm": 0.12569187581539154, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 374250 + }, + { + "epoch": 1.446784493822579, + "grad_norm": 0.10629899054765701, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 374260 + }, + { + "epoch": 1.4468231510259622, + "grad_norm": 0.10669715702533722, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 374270 + }, + { + "epoch": 1.4468618082293454, + "grad_norm": 0.1442335695028305, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 374280 + }, + { + "epoch": 1.4469004654327287, + "grad_norm": 0.1721041202545166, + "learning_rate": 0.002, + "loss": 2.3526, + "step": 374290 + }, + { + "epoch": 1.446939122636112, + "grad_norm": 0.1116456538438797, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 374300 + }, + { + "epoch": 1.4469777798394952, + "grad_norm": 0.11294256150722504, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 374310 + }, + { + "epoch": 1.4470164370428786, + "grad_norm": 0.08833135664463043, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 374320 + }, + { + "epoch": 1.4470550942462619, + "grad_norm": 0.09844326227903366, + "learning_rate": 0.002, + "loss": 2.329, + "step": 374330 + }, + { + "epoch": 1.4470937514496451, + "grad_norm": 0.1368217021226883, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 374340 + }, + { + "epoch": 1.4471324086530284, + "grad_norm": 0.1035735234618187, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 374350 + }, + { + "epoch": 1.4471710658564116, + "grad_norm": 0.09134948998689651, + "learning_rate": 0.002, + "loss": 2.334, + "step": 374360 + }, + { + "epoch": 1.447209723059795, + "grad_norm": 0.11816652864217758, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 374370 + }, + { + "epoch": 1.4472483802631784, + "grad_norm": 0.1168443039059639, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 374380 + }, + { + "epoch": 1.4472870374665616, + "grad_norm": 0.0934310182929039, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 374390 + }, + { + "epoch": 1.4473256946699449, + "grad_norm": 0.10161089152097702, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 374400 + }, + { + "epoch": 1.4473643518733281, + "grad_norm": 0.10577352344989777, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 374410 + }, + { + "epoch": 1.4474030090767114, + "grad_norm": 0.10527346283197403, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 374420 + }, + { + "epoch": 1.4474416662800946, + "grad_norm": 0.09274306893348694, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 374430 + }, + { + "epoch": 1.4474803234834779, + "grad_norm": 0.10285604000091553, + "learning_rate": 0.002, + "loss": 2.344, + "step": 374440 + }, + { + "epoch": 1.4475189806868611, + "grad_norm": 0.11490141600370407, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 374450 + }, + { + "epoch": 1.4475576378902444, + "grad_norm": 0.1002807691693306, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 374460 + }, + { + "epoch": 1.4475962950936276, + "grad_norm": 0.10294970870018005, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 374470 + }, + { + "epoch": 1.447634952297011, + "grad_norm": 0.10106101632118225, + "learning_rate": 0.002, + "loss": 2.34, + "step": 374480 + }, + { + "epoch": 1.4476736095003944, + "grad_norm": 0.12514422833919525, + "learning_rate": 0.002, + "loss": 2.337, + "step": 374490 + }, + { + "epoch": 1.4477122667037776, + "grad_norm": 0.10771064460277557, + "learning_rate": 0.002, + "loss": 2.332, + "step": 374500 + }, + { + "epoch": 1.4477509239071609, + "grad_norm": 0.11269853264093399, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 374510 + }, + { + "epoch": 1.4477895811105441, + "grad_norm": 0.10505794733762741, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 374520 + }, + { + "epoch": 1.4478282383139274, + "grad_norm": 0.08940539509057999, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 374530 + }, + { + "epoch": 1.4478668955173106, + "grad_norm": 0.10429862141609192, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 374540 + }, + { + "epoch": 1.447905552720694, + "grad_norm": 0.09255179017782211, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 374550 + }, + { + "epoch": 1.4479442099240774, + "grad_norm": 0.13329355418682098, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 374560 + }, + { + "epoch": 1.4479828671274606, + "grad_norm": 0.10370327532291412, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 374570 + }, + { + "epoch": 1.4480215243308439, + "grad_norm": 0.10171709209680557, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 374580 + }, + { + "epoch": 1.448060181534227, + "grad_norm": 0.1019992083311081, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 374590 + }, + { + "epoch": 1.4480988387376104, + "grad_norm": 0.10621504485607147, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 374600 + }, + { + "epoch": 1.4481374959409936, + "grad_norm": 0.0987897738814354, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 374610 + }, + { + "epoch": 1.4481761531443769, + "grad_norm": 0.11098815500736237, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 374620 + }, + { + "epoch": 1.4482148103477601, + "grad_norm": 0.09835632145404816, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 374630 + }, + { + "epoch": 1.4482534675511434, + "grad_norm": 0.11043904721736908, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 374640 + }, + { + "epoch": 1.4482921247545266, + "grad_norm": 0.1077338308095932, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 374650 + }, + { + "epoch": 1.44833078195791, + "grad_norm": 0.10682597011327744, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 374660 + }, + { + "epoch": 1.4483694391612933, + "grad_norm": 0.10108894109725952, + "learning_rate": 0.002, + "loss": 2.326, + "step": 374670 + }, + { + "epoch": 1.4484080963646766, + "grad_norm": 0.10345392674207687, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 374680 + }, + { + "epoch": 1.4484467535680599, + "grad_norm": 0.09838715195655823, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 374690 + }, + { + "epoch": 1.448485410771443, + "grad_norm": 0.08886837959289551, + "learning_rate": 0.002, + "loss": 2.328, + "step": 374700 + }, + { + "epoch": 1.4485240679748264, + "grad_norm": 0.09721381217241287, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 374710 + }, + { + "epoch": 1.4485627251782098, + "grad_norm": 0.09373707324266434, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 374720 + }, + { + "epoch": 1.448601382381593, + "grad_norm": 0.10279977321624756, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 374730 + }, + { + "epoch": 1.4486400395849763, + "grad_norm": 0.11030231416225433, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 374740 + }, + { + "epoch": 1.4486786967883596, + "grad_norm": 0.18042241036891937, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 374750 + }, + { + "epoch": 1.4487173539917428, + "grad_norm": 0.11614906787872314, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 374760 + }, + { + "epoch": 1.448756011195126, + "grad_norm": 0.1080334410071373, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 374770 + }, + { + "epoch": 1.4487946683985093, + "grad_norm": 0.11193375289440155, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 374780 + }, + { + "epoch": 1.4488333256018926, + "grad_norm": 0.10424870997667313, + "learning_rate": 0.002, + "loss": 2.338, + "step": 374790 + }, + { + "epoch": 1.4488719828052758, + "grad_norm": 0.12174300104379654, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 374800 + }, + { + "epoch": 1.448910640008659, + "grad_norm": 0.09835692495107651, + "learning_rate": 0.002, + "loss": 2.3521, + "step": 374810 + }, + { + "epoch": 1.4489492972120426, + "grad_norm": 0.09823215752840042, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 374820 + }, + { + "epoch": 1.4489879544154258, + "grad_norm": 0.10451362282037735, + "learning_rate": 0.002, + "loss": 2.328, + "step": 374830 + }, + { + "epoch": 1.449026611618809, + "grad_norm": 0.0901821181178093, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 374840 + }, + { + "epoch": 1.4490652688221923, + "grad_norm": 0.09381404519081116, + "learning_rate": 0.002, + "loss": 2.3551, + "step": 374850 + }, + { + "epoch": 1.4491039260255756, + "grad_norm": 0.0980507880449295, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 374860 + }, + { + "epoch": 1.4491425832289588, + "grad_norm": 0.11305723339319229, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 374870 + }, + { + "epoch": 1.449181240432342, + "grad_norm": 0.11033497005701065, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 374880 + }, + { + "epoch": 1.4492198976357256, + "grad_norm": 0.1097717210650444, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 374890 + }, + { + "epoch": 1.4492585548391088, + "grad_norm": 0.11004069447517395, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 374900 + }, + { + "epoch": 1.449297212042492, + "grad_norm": 0.10695581883192062, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 374910 + }, + { + "epoch": 1.4493358692458753, + "grad_norm": 0.1037338525056839, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 374920 + }, + { + "epoch": 1.4493745264492586, + "grad_norm": 0.1020689606666565, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 374930 + }, + { + "epoch": 1.4494131836526418, + "grad_norm": 0.11201310902833939, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 374940 + }, + { + "epoch": 1.449451840856025, + "grad_norm": 0.09727621078491211, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 374950 + }, + { + "epoch": 1.4494904980594083, + "grad_norm": 0.10292645543813705, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 374960 + }, + { + "epoch": 1.4495291552627916, + "grad_norm": 0.09569789469242096, + "learning_rate": 0.002, + "loss": 2.341, + "step": 374970 + }, + { + "epoch": 1.4495678124661748, + "grad_norm": 0.09703312069177628, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 374980 + }, + { + "epoch": 1.4496064696695583, + "grad_norm": 0.10222285240888596, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 374990 + }, + { + "epoch": 1.4496451268729416, + "grad_norm": 0.11462683975696564, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 375000 + }, + { + "epoch": 1.4496837840763248, + "grad_norm": 0.10431713610887527, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 375010 + }, + { + "epoch": 1.449722441279708, + "grad_norm": 0.09858272969722748, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 375020 + }, + { + "epoch": 1.4497610984830913, + "grad_norm": 0.12057144939899445, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 375030 + }, + { + "epoch": 1.4497997556864746, + "grad_norm": 0.10652951151132584, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 375040 + }, + { + "epoch": 1.4498384128898578, + "grad_norm": 0.1226535513997078, + "learning_rate": 0.002, + "loss": 2.333, + "step": 375050 + }, + { + "epoch": 1.4498770700932413, + "grad_norm": 0.09500709921121597, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 375060 + }, + { + "epoch": 1.4499157272966245, + "grad_norm": 0.10096275806427002, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 375070 + }, + { + "epoch": 1.4499543845000078, + "grad_norm": 0.09361151605844498, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 375080 + }, + { + "epoch": 1.449993041703391, + "grad_norm": 0.09544364362955093, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 375090 + }, + { + "epoch": 1.4500316989067743, + "grad_norm": 0.1043727919459343, + "learning_rate": 0.002, + "loss": 2.341, + "step": 375100 + }, + { + "epoch": 1.4500703561101576, + "grad_norm": 0.17423953115940094, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 375110 + }, + { + "epoch": 1.4501090133135408, + "grad_norm": 0.10786591470241547, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 375120 + }, + { + "epoch": 1.450147670516924, + "grad_norm": 0.10140660405158997, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 375130 + }, + { + "epoch": 1.4501863277203073, + "grad_norm": 0.0983930379152298, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 375140 + }, + { + "epoch": 1.4502249849236906, + "grad_norm": 0.10323572158813477, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 375150 + }, + { + "epoch": 1.450263642127074, + "grad_norm": 0.10888148099184036, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 375160 + }, + { + "epoch": 1.4503022993304573, + "grad_norm": 0.12469719350337982, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 375170 + }, + { + "epoch": 1.4503409565338405, + "grad_norm": 0.08669634163379669, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 375180 + }, + { + "epoch": 1.4503796137372238, + "grad_norm": 0.09478214383125305, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 375190 + }, + { + "epoch": 1.450418270940607, + "grad_norm": 0.1561504751443863, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 375200 + }, + { + "epoch": 1.4504569281439903, + "grad_norm": 0.09788401424884796, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 375210 + }, + { + "epoch": 1.4504955853473736, + "grad_norm": 0.10349985957145691, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 375220 + }, + { + "epoch": 1.450534242550757, + "grad_norm": 0.8829423189163208, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 375230 + }, + { + "epoch": 1.4505728997541403, + "grad_norm": 0.12734265625476837, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 375240 + }, + { + "epoch": 1.4506115569575235, + "grad_norm": 0.1019926369190216, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 375250 + }, + { + "epoch": 1.4506502141609068, + "grad_norm": 0.10075262188911438, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 375260 + }, + { + "epoch": 1.45068887136429, + "grad_norm": 0.09703978151082993, + "learning_rate": 0.002, + "loss": 2.349, + "step": 375270 + }, + { + "epoch": 1.4507275285676733, + "grad_norm": 0.10557262599468231, + "learning_rate": 0.002, + "loss": 2.334, + "step": 375280 + }, + { + "epoch": 1.4507661857710565, + "grad_norm": 0.10136254876852036, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 375290 + }, + { + "epoch": 1.4508048429744398, + "grad_norm": 0.09263923764228821, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 375300 + }, + { + "epoch": 1.450843500177823, + "grad_norm": 0.12233812361955643, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 375310 + }, + { + "epoch": 1.4508821573812063, + "grad_norm": 0.10406205803155899, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 375320 + }, + { + "epoch": 1.4509208145845898, + "grad_norm": 0.11512085795402527, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 375330 + }, + { + "epoch": 1.450959471787973, + "grad_norm": 0.10936763137578964, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 375340 + }, + { + "epoch": 1.4509981289913563, + "grad_norm": 0.12254436314105988, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 375350 + }, + { + "epoch": 1.4510367861947395, + "grad_norm": 0.10936383157968521, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 375360 + }, + { + "epoch": 1.4510754433981228, + "grad_norm": 0.1023615226149559, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 375370 + }, + { + "epoch": 1.451114100601506, + "grad_norm": 0.0934467613697052, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 375380 + }, + { + "epoch": 1.4511527578048893, + "grad_norm": 0.09618617594242096, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 375390 + }, + { + "epoch": 1.4511914150082728, + "grad_norm": 0.12151792645454407, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 375400 + }, + { + "epoch": 1.451230072211656, + "grad_norm": 0.09098203480243683, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 375410 + }, + { + "epoch": 1.4512687294150393, + "grad_norm": 0.14736823737621307, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 375420 + }, + { + "epoch": 1.4513073866184225, + "grad_norm": 0.10443034023046494, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 375430 + }, + { + "epoch": 1.4513460438218058, + "grad_norm": 0.103264719247818, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 375440 + }, + { + "epoch": 1.451384701025189, + "grad_norm": 0.12326298654079437, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 375450 + }, + { + "epoch": 1.4514233582285723, + "grad_norm": 0.10376644879579544, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 375460 + }, + { + "epoch": 1.4514620154319555, + "grad_norm": 0.12066710740327835, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 375470 + }, + { + "epoch": 1.4515006726353388, + "grad_norm": 0.09960687160491943, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 375480 + }, + { + "epoch": 1.451539329838722, + "grad_norm": 0.10828446596860886, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 375490 + }, + { + "epoch": 1.4515779870421055, + "grad_norm": 0.09412367641925812, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 375500 + }, + { + "epoch": 1.4516166442454888, + "grad_norm": 0.11398924887180328, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 375510 + }, + { + "epoch": 1.451655301448872, + "grad_norm": 0.10298270732164383, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 375520 + }, + { + "epoch": 1.4516939586522553, + "grad_norm": 0.16039882600307465, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 375530 + }, + { + "epoch": 1.4517326158556385, + "grad_norm": 0.08908867835998535, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 375540 + }, + { + "epoch": 1.4517712730590218, + "grad_norm": 0.11916134506464005, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 375550 + }, + { + "epoch": 1.4518099302624052, + "grad_norm": 0.11261797696352005, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 375560 + }, + { + "epoch": 1.4518485874657885, + "grad_norm": 0.10552900284528732, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 375570 + }, + { + "epoch": 1.4518872446691717, + "grad_norm": 0.10971780866384506, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 375580 + }, + { + "epoch": 1.451925901872555, + "grad_norm": 0.10335514694452286, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 375590 + }, + { + "epoch": 1.4519645590759382, + "grad_norm": 0.11158467084169388, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 375600 + }, + { + "epoch": 1.4520032162793215, + "grad_norm": 0.10805515944957733, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 375610 + }, + { + "epoch": 1.4520418734827047, + "grad_norm": 0.09409262239933014, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 375620 + }, + { + "epoch": 1.452080530686088, + "grad_norm": 0.08583605289459229, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 375630 + }, + { + "epoch": 1.4521191878894713, + "grad_norm": 0.0960557758808136, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 375640 + }, + { + "epoch": 1.4521578450928545, + "grad_norm": 0.10765881836414337, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 375650 + }, + { + "epoch": 1.4521965022962378, + "grad_norm": 0.09392059594392776, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 375660 + }, + { + "epoch": 1.4522351594996212, + "grad_norm": 0.10155150294303894, + "learning_rate": 0.002, + "loss": 2.346, + "step": 375670 + }, + { + "epoch": 1.4522738167030045, + "grad_norm": 0.09984404593706131, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 375680 + }, + { + "epoch": 1.4523124739063877, + "grad_norm": 0.10842925310134888, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 375690 + }, + { + "epoch": 1.452351131109771, + "grad_norm": 0.0975777730345726, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 375700 + }, + { + "epoch": 1.4523897883131542, + "grad_norm": 0.1043943464756012, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 375710 + }, + { + "epoch": 1.4524284455165375, + "grad_norm": 0.09681592136621475, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 375720 + }, + { + "epoch": 1.452467102719921, + "grad_norm": 0.09512980282306671, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 375730 + }, + { + "epoch": 1.4525057599233042, + "grad_norm": 0.09035705775022507, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 375740 + }, + { + "epoch": 1.4525444171266875, + "grad_norm": 0.10851699858903885, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 375750 + }, + { + "epoch": 1.4525830743300707, + "grad_norm": 0.10928475111722946, + "learning_rate": 0.002, + "loss": 2.337, + "step": 375760 + }, + { + "epoch": 1.452621731533454, + "grad_norm": 0.08919432759284973, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 375770 + }, + { + "epoch": 1.4526603887368372, + "grad_norm": 0.10121583938598633, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 375780 + }, + { + "epoch": 1.4526990459402205, + "grad_norm": 0.10289706289768219, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 375790 + }, + { + "epoch": 1.4527377031436037, + "grad_norm": 0.1273885816335678, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 375800 + }, + { + "epoch": 1.452776360346987, + "grad_norm": 0.10660862177610397, + "learning_rate": 0.002, + "loss": 2.335, + "step": 375810 + }, + { + "epoch": 1.4528150175503702, + "grad_norm": 0.09819815307855606, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 375820 + }, + { + "epoch": 1.4528536747537535, + "grad_norm": 0.0908627137541771, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 375830 + }, + { + "epoch": 1.452892331957137, + "grad_norm": 0.11086965352296829, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 375840 + }, + { + "epoch": 1.4529309891605202, + "grad_norm": 0.11052382737398148, + "learning_rate": 0.002, + "loss": 2.3502, + "step": 375850 + }, + { + "epoch": 1.4529696463639035, + "grad_norm": 0.09581246227025986, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 375860 + }, + { + "epoch": 1.4530083035672867, + "grad_norm": 0.13408619165420532, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 375870 + }, + { + "epoch": 1.45304696077067, + "grad_norm": 0.10207131505012512, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 375880 + }, + { + "epoch": 1.4530856179740532, + "grad_norm": 0.10608415305614471, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 375890 + }, + { + "epoch": 1.4531242751774367, + "grad_norm": 0.10444062203168869, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 375900 + }, + { + "epoch": 1.45316293238082, + "grad_norm": 0.10438374429941177, + "learning_rate": 0.002, + "loss": 2.321, + "step": 375910 + }, + { + "epoch": 1.4532015895842032, + "grad_norm": 0.13011299073696136, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 375920 + }, + { + "epoch": 1.4532402467875865, + "grad_norm": 0.11127454042434692, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 375930 + }, + { + "epoch": 1.4532789039909697, + "grad_norm": 0.10193561762571335, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 375940 + }, + { + "epoch": 1.453317561194353, + "grad_norm": 0.14563407003879547, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 375950 + }, + { + "epoch": 1.4533562183977362, + "grad_norm": 0.10720785707235336, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 375960 + }, + { + "epoch": 1.4533948756011195, + "grad_norm": 0.13315080106258392, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 375970 + }, + { + "epoch": 1.4534335328045027, + "grad_norm": 0.09596723318099976, + "learning_rate": 0.002, + "loss": 2.326, + "step": 375980 + }, + { + "epoch": 1.453472190007886, + "grad_norm": 0.09336968511343002, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 375990 + }, + { + "epoch": 1.4535108472112692, + "grad_norm": 0.10079991817474365, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 376000 + }, + { + "epoch": 1.4535495044146527, + "grad_norm": 0.1005874052643776, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 376010 + }, + { + "epoch": 1.453588161618036, + "grad_norm": 0.1043470948934555, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 376020 + }, + { + "epoch": 1.4536268188214192, + "grad_norm": 0.1530982404947281, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 376030 + }, + { + "epoch": 1.4536654760248024, + "grad_norm": 0.12257708609104156, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 376040 + }, + { + "epoch": 1.4537041332281857, + "grad_norm": 0.0920775905251503, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 376050 + }, + { + "epoch": 1.453742790431569, + "grad_norm": 0.09294003248214722, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 376060 + }, + { + "epoch": 1.4537814476349524, + "grad_norm": 0.09124291688203812, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 376070 + }, + { + "epoch": 1.4538201048383357, + "grad_norm": 0.11236456781625748, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 376080 + }, + { + "epoch": 1.453858762041719, + "grad_norm": 0.0982632264494896, + "learning_rate": 0.002, + "loss": 2.332, + "step": 376090 + }, + { + "epoch": 1.4538974192451022, + "grad_norm": 0.09487993270158768, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 376100 + }, + { + "epoch": 1.4539360764484854, + "grad_norm": 0.10584530234336853, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 376110 + }, + { + "epoch": 1.4539747336518687, + "grad_norm": 0.11138825863599777, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 376120 + }, + { + "epoch": 1.454013390855252, + "grad_norm": 0.11696814000606537, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 376130 + }, + { + "epoch": 1.4540520480586352, + "grad_norm": 0.10326026380062103, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 376140 + }, + { + "epoch": 1.4540907052620184, + "grad_norm": 0.10197250545024872, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 376150 + }, + { + "epoch": 1.4541293624654017, + "grad_norm": 0.09561163932085037, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 376160 + }, + { + "epoch": 1.454168019668785, + "grad_norm": 0.1013605147600174, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 376170 + }, + { + "epoch": 1.4542066768721684, + "grad_norm": 0.1090017557144165, + "learning_rate": 0.002, + "loss": 2.333, + "step": 376180 + }, + { + "epoch": 1.4542453340755517, + "grad_norm": 0.09583274275064468, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 376190 + }, + { + "epoch": 1.454283991278935, + "grad_norm": 0.11330226063728333, + "learning_rate": 0.002, + "loss": 2.3163, + "step": 376200 + }, + { + "epoch": 1.4543226484823182, + "grad_norm": 0.19393405318260193, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 376210 + }, + { + "epoch": 1.4543613056857014, + "grad_norm": 0.08783680945634842, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 376220 + }, + { + "epoch": 1.4543999628890847, + "grad_norm": 0.09217392653226852, + "learning_rate": 0.002, + "loss": 2.307, + "step": 376230 + }, + { + "epoch": 1.4544386200924682, + "grad_norm": 0.12929874658584595, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 376240 + }, + { + "epoch": 1.4544772772958514, + "grad_norm": 0.09602414816617966, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 376250 + }, + { + "epoch": 1.4545159344992347, + "grad_norm": 0.09557639807462692, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 376260 + }, + { + "epoch": 1.454554591702618, + "grad_norm": 0.10556343197822571, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 376270 + }, + { + "epoch": 1.4545932489060012, + "grad_norm": 0.12296749651432037, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 376280 + }, + { + "epoch": 1.4546319061093844, + "grad_norm": 0.10371430218219757, + "learning_rate": 0.002, + "loss": 2.321, + "step": 376290 + }, + { + "epoch": 1.4546705633127677, + "grad_norm": 0.09034077078104019, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 376300 + }, + { + "epoch": 1.454709220516151, + "grad_norm": 0.10609731823205948, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 376310 + }, + { + "epoch": 1.4547478777195342, + "grad_norm": 0.11761943250894547, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 376320 + }, + { + "epoch": 1.4547865349229174, + "grad_norm": 0.10954983532428741, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 376330 + }, + { + "epoch": 1.4548251921263007, + "grad_norm": 0.09743141382932663, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 376340 + }, + { + "epoch": 1.4548638493296842, + "grad_norm": 0.10274147242307663, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 376350 + }, + { + "epoch": 1.4549025065330674, + "grad_norm": 0.10671969503164291, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 376360 + }, + { + "epoch": 1.4549411637364507, + "grad_norm": 0.09649311006069183, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 376370 + }, + { + "epoch": 1.454979820939834, + "grad_norm": 0.10887061059474945, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 376380 + }, + { + "epoch": 1.4550184781432172, + "grad_norm": 0.09647853672504425, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 376390 + }, + { + "epoch": 1.4550571353466004, + "grad_norm": 0.09214019030332565, + "learning_rate": 0.002, + "loss": 2.3219, + "step": 376400 + }, + { + "epoch": 1.455095792549984, + "grad_norm": 0.13373766839504242, + "learning_rate": 0.002, + "loss": 2.337, + "step": 376410 + }, + { + "epoch": 1.4551344497533671, + "grad_norm": 0.09883366525173187, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 376420 + }, + { + "epoch": 1.4551731069567504, + "grad_norm": 0.08454938232898712, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 376430 + }, + { + "epoch": 1.4552117641601336, + "grad_norm": 0.09244359284639359, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 376440 + }, + { + "epoch": 1.455250421363517, + "grad_norm": 0.11290130019187927, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 376450 + }, + { + "epoch": 1.4552890785669002, + "grad_norm": 0.12210394442081451, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 376460 + }, + { + "epoch": 1.4553277357702834, + "grad_norm": 0.10246371477842331, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 376470 + }, + { + "epoch": 1.4553663929736667, + "grad_norm": 0.09917712956666946, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 376480 + }, + { + "epoch": 1.45540505017705, + "grad_norm": 0.11077070236206055, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 376490 + }, + { + "epoch": 1.4554437073804332, + "grad_norm": 0.10274385660886765, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 376500 + }, + { + "epoch": 1.4554823645838164, + "grad_norm": 0.09519847482442856, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 376510 + }, + { + "epoch": 1.4555210217871999, + "grad_norm": 0.10795559734106064, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 376520 + }, + { + "epoch": 1.4555596789905831, + "grad_norm": 0.09167300164699554, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 376530 + }, + { + "epoch": 1.4555983361939664, + "grad_norm": 0.09464503079652786, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 376540 + }, + { + "epoch": 1.4556369933973496, + "grad_norm": 0.11465884000062943, + "learning_rate": 0.002, + "loss": 2.335, + "step": 376550 + }, + { + "epoch": 1.455675650600733, + "grad_norm": 0.08918526023626328, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 376560 + }, + { + "epoch": 1.4557143078041161, + "grad_norm": 0.11020664125680923, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 376570 + }, + { + "epoch": 1.4557529650074996, + "grad_norm": 0.09470126032829285, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 376580 + }, + { + "epoch": 1.4557916222108829, + "grad_norm": 0.09877173602581024, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 376590 + }, + { + "epoch": 1.4558302794142661, + "grad_norm": 0.08799102902412415, + "learning_rate": 0.002, + "loss": 2.33, + "step": 376600 + }, + { + "epoch": 1.4558689366176494, + "grad_norm": 0.09482432901859283, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 376610 + }, + { + "epoch": 1.4559075938210326, + "grad_norm": 0.12014751881361008, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 376620 + }, + { + "epoch": 1.4559462510244159, + "grad_norm": 0.09875979274511337, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 376630 + }, + { + "epoch": 1.4559849082277991, + "grad_norm": 0.09889055788516998, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 376640 + }, + { + "epoch": 1.4560235654311824, + "grad_norm": 0.10025811195373535, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 376650 + }, + { + "epoch": 1.4560622226345656, + "grad_norm": 0.1114993616938591, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 376660 + }, + { + "epoch": 1.456100879837949, + "grad_norm": 0.10707735270261765, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 376670 + }, + { + "epoch": 1.4561395370413321, + "grad_norm": 0.1128287985920906, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 376680 + }, + { + "epoch": 1.4561781942447156, + "grad_norm": 0.09880905598402023, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 376690 + }, + { + "epoch": 1.4562168514480989, + "grad_norm": 0.120734304189682, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 376700 + }, + { + "epoch": 1.4562555086514821, + "grad_norm": 0.0999571681022644, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 376710 + }, + { + "epoch": 1.4562941658548654, + "grad_norm": 0.1232922375202179, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 376720 + }, + { + "epoch": 1.4563328230582486, + "grad_norm": 0.1032818928360939, + "learning_rate": 0.002, + "loss": 2.344, + "step": 376730 + }, + { + "epoch": 1.4563714802616319, + "grad_norm": 0.0872046947479248, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 376740 + }, + { + "epoch": 1.4564101374650154, + "grad_norm": 0.09458109736442566, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 376750 + }, + { + "epoch": 1.4564487946683986, + "grad_norm": 0.11395496875047684, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 376760 + }, + { + "epoch": 1.4564874518717819, + "grad_norm": 0.09434529393911362, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 376770 + }, + { + "epoch": 1.456526109075165, + "grad_norm": 0.1036214679479599, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 376780 + }, + { + "epoch": 1.4565647662785484, + "grad_norm": 0.12814341485500336, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 376790 + }, + { + "epoch": 1.4566034234819316, + "grad_norm": 0.0936191976070404, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 376800 + }, + { + "epoch": 1.4566420806853149, + "grad_norm": 0.10926380008459091, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 376810 + }, + { + "epoch": 1.4566807378886981, + "grad_norm": 0.0996718481183052, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 376820 + }, + { + "epoch": 1.4567193950920814, + "grad_norm": 0.11856534332036972, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 376830 + }, + { + "epoch": 1.4567580522954646, + "grad_norm": 0.08964511752128601, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 376840 + }, + { + "epoch": 1.456796709498848, + "grad_norm": 0.10973100364208221, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 376850 + }, + { + "epoch": 1.4568353667022313, + "grad_norm": 0.09312520176172256, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 376860 + }, + { + "epoch": 1.4568740239056146, + "grad_norm": 0.09315123409032822, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 376870 + }, + { + "epoch": 1.4569126811089979, + "grad_norm": 0.10019497573375702, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 376880 + }, + { + "epoch": 1.456951338312381, + "grad_norm": 0.10009029507637024, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 376890 + }, + { + "epoch": 1.4569899955157644, + "grad_norm": 0.10956374555826187, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 376900 + }, + { + "epoch": 1.4570286527191476, + "grad_norm": 0.08636657893657684, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 376910 + }, + { + "epoch": 1.457067309922531, + "grad_norm": 0.10253308713436127, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 376920 + }, + { + "epoch": 1.4571059671259143, + "grad_norm": 0.12445124238729477, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 376930 + }, + { + "epoch": 1.4571446243292976, + "grad_norm": 0.10057014971971512, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 376940 + }, + { + "epoch": 1.4571832815326808, + "grad_norm": 0.09831283986568451, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 376950 + }, + { + "epoch": 1.457221938736064, + "grad_norm": 0.11100064963102341, + "learning_rate": 0.002, + "loss": 2.3125, + "step": 376960 + }, + { + "epoch": 1.4572605959394473, + "grad_norm": 0.10230191051959991, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 376970 + }, + { + "epoch": 1.4572992531428306, + "grad_norm": 0.1209181398153305, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 376980 + }, + { + "epoch": 1.4573379103462138, + "grad_norm": 0.12005867809057236, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 376990 + }, + { + "epoch": 1.457376567549597, + "grad_norm": 0.09351613372564316, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 377000 + }, + { + "epoch": 1.4574152247529804, + "grad_norm": 0.117181695997715, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 377010 + }, + { + "epoch": 1.4574538819563638, + "grad_norm": 0.09451831877231598, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 377020 + }, + { + "epoch": 1.457492539159747, + "grad_norm": 0.10694488883018494, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 377030 + }, + { + "epoch": 1.4575311963631303, + "grad_norm": 0.10419536381959915, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 377040 + }, + { + "epoch": 1.4575698535665136, + "grad_norm": 0.09017110615968704, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 377050 + }, + { + "epoch": 1.4576085107698968, + "grad_norm": 0.09499950706958771, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 377060 + }, + { + "epoch": 1.45764716797328, + "grad_norm": 0.1026366576552391, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 377070 + }, + { + "epoch": 1.4576858251766633, + "grad_norm": 0.11473041772842407, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 377080 + }, + { + "epoch": 1.4577244823800468, + "grad_norm": 0.11454009264707565, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 377090 + }, + { + "epoch": 1.45776313958343, + "grad_norm": 0.10464174300432205, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 377100 + }, + { + "epoch": 1.4578017967868133, + "grad_norm": 0.10501186549663544, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 377110 + }, + { + "epoch": 1.4578404539901966, + "grad_norm": 0.11394108086824417, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 377120 + }, + { + "epoch": 1.4578791111935798, + "grad_norm": 0.10638362914323807, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 377130 + }, + { + "epoch": 1.457917768396963, + "grad_norm": 0.11045945435762405, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 377140 + }, + { + "epoch": 1.4579564256003463, + "grad_norm": 0.09159348905086517, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 377150 + }, + { + "epoch": 1.4579950828037296, + "grad_norm": 0.10121490061283112, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 377160 + }, + { + "epoch": 1.4580337400071128, + "grad_norm": 0.09991408884525299, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 377170 + }, + { + "epoch": 1.458072397210496, + "grad_norm": 0.11415287107229233, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 377180 + }, + { + "epoch": 1.4581110544138796, + "grad_norm": 0.10838111490011215, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 377190 + }, + { + "epoch": 1.4581497116172628, + "grad_norm": 0.3309662938117981, + "learning_rate": 0.002, + "loss": 2.34, + "step": 377200 + }, + { + "epoch": 1.458188368820646, + "grad_norm": 0.09142623096704483, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 377210 + }, + { + "epoch": 1.4582270260240293, + "grad_norm": 0.09920763224363327, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 377220 + }, + { + "epoch": 1.4582656832274126, + "grad_norm": 0.10411565005779266, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 377230 + }, + { + "epoch": 1.4583043404307958, + "grad_norm": 0.11159609258174896, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 377240 + }, + { + "epoch": 1.458342997634179, + "grad_norm": 0.14390446245670319, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 377250 + }, + { + "epoch": 1.4583816548375625, + "grad_norm": 0.11994519084692001, + "learning_rate": 0.002, + "loss": 2.337, + "step": 377260 + }, + { + "epoch": 1.4584203120409458, + "grad_norm": 0.09185947477817535, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 377270 + }, + { + "epoch": 1.458458969244329, + "grad_norm": 0.0885382816195488, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 377280 + }, + { + "epoch": 1.4584976264477123, + "grad_norm": 0.10349847376346588, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 377290 + }, + { + "epoch": 1.4585362836510956, + "grad_norm": 0.10722818970680237, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 377300 + }, + { + "epoch": 1.4585749408544788, + "grad_norm": 0.10624206811189651, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 377310 + }, + { + "epoch": 1.458613598057862, + "grad_norm": 0.09444423019886017, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 377320 + }, + { + "epoch": 1.4586522552612453, + "grad_norm": 0.1137901172041893, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 377330 + }, + { + "epoch": 1.4586909124646286, + "grad_norm": 0.10508893430233002, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 377340 + }, + { + "epoch": 1.4587295696680118, + "grad_norm": 0.1037166640162468, + "learning_rate": 0.002, + "loss": 2.351, + "step": 377350 + }, + { + "epoch": 1.4587682268713953, + "grad_norm": 0.11206698417663574, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 377360 + }, + { + "epoch": 1.4588068840747785, + "grad_norm": 0.11649308353662491, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 377370 + }, + { + "epoch": 1.4588455412781618, + "grad_norm": 0.08731440454721451, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 377380 + }, + { + "epoch": 1.458884198481545, + "grad_norm": 0.09099102020263672, + "learning_rate": 0.002, + "loss": 2.334, + "step": 377390 + }, + { + "epoch": 1.4589228556849283, + "grad_norm": 0.0877019613981247, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 377400 + }, + { + "epoch": 1.4589615128883116, + "grad_norm": 0.1126909926533699, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 377410 + }, + { + "epoch": 1.4590001700916948, + "grad_norm": 0.10737772285938263, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 377420 + }, + { + "epoch": 1.4590388272950783, + "grad_norm": 0.1171116977930069, + "learning_rate": 0.002, + "loss": 2.3528, + "step": 377430 + }, + { + "epoch": 1.4590774844984615, + "grad_norm": 0.10207915306091309, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 377440 + }, + { + "epoch": 1.4591161417018448, + "grad_norm": 0.10968824476003647, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 377450 + }, + { + "epoch": 1.459154798905228, + "grad_norm": 0.12337537109851837, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 377460 + }, + { + "epoch": 1.4591934561086113, + "grad_norm": 0.0989091545343399, + "learning_rate": 0.002, + "loss": 2.337, + "step": 377470 + }, + { + "epoch": 1.4592321133119945, + "grad_norm": 0.10799914598464966, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 377480 + }, + { + "epoch": 1.4592707705153778, + "grad_norm": 0.10297781229019165, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 377490 + }, + { + "epoch": 1.459309427718761, + "grad_norm": 0.09718317538499832, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 377500 + }, + { + "epoch": 1.4593480849221443, + "grad_norm": 0.1368032544851303, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 377510 + }, + { + "epoch": 1.4593867421255275, + "grad_norm": 0.10432197153568268, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 377520 + }, + { + "epoch": 1.459425399328911, + "grad_norm": 0.10635431855916977, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 377530 + }, + { + "epoch": 1.4594640565322943, + "grad_norm": 0.10219579190015793, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 377540 + }, + { + "epoch": 1.4595027137356775, + "grad_norm": 0.09423661231994629, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 377550 + }, + { + "epoch": 1.4595413709390608, + "grad_norm": 0.09498999267816544, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 377560 + }, + { + "epoch": 1.459580028142444, + "grad_norm": 0.10256657749414444, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 377570 + }, + { + "epoch": 1.4596186853458273, + "grad_norm": 0.10926370322704315, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 377580 + }, + { + "epoch": 1.4596573425492108, + "grad_norm": 0.10336575657129288, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 377590 + }, + { + "epoch": 1.459695999752594, + "grad_norm": 0.11170030385255814, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 377600 + }, + { + "epoch": 1.4597346569559773, + "grad_norm": 0.11618107557296753, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 377610 + }, + { + "epoch": 1.4597733141593605, + "grad_norm": 0.10613735765218735, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 377620 + }, + { + "epoch": 1.4598119713627438, + "grad_norm": 0.1042288988828659, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 377630 + }, + { + "epoch": 1.459850628566127, + "grad_norm": 0.09028930217027664, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 377640 + }, + { + "epoch": 1.4598892857695103, + "grad_norm": 0.11619842052459717, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 377650 + }, + { + "epoch": 1.4599279429728935, + "grad_norm": 0.24519114196300507, + "learning_rate": 0.002, + "loss": 2.337, + "step": 377660 + }, + { + "epoch": 1.4599666001762768, + "grad_norm": 0.09756876528263092, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 377670 + }, + { + "epoch": 1.46000525737966, + "grad_norm": 0.1016656905412674, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 377680 + }, + { + "epoch": 1.4600439145830433, + "grad_norm": 0.09221766144037247, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 377690 + }, + { + "epoch": 1.4600825717864268, + "grad_norm": 0.10488689690828323, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 377700 + }, + { + "epoch": 1.46012122898981, + "grad_norm": 0.10696990042924881, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 377710 + }, + { + "epoch": 1.4601598861931933, + "grad_norm": 0.10517062246799469, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 377720 + }, + { + "epoch": 1.4601985433965765, + "grad_norm": 0.10095066577196121, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 377730 + }, + { + "epoch": 1.4602372005999598, + "grad_norm": 0.09365548193454742, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 377740 + }, + { + "epoch": 1.460275857803343, + "grad_norm": 0.11912385374307632, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 377750 + }, + { + "epoch": 1.4603145150067265, + "grad_norm": 0.1095656305551529, + "learning_rate": 0.002, + "loss": 2.3078, + "step": 377760 + }, + { + "epoch": 1.4603531722101097, + "grad_norm": 0.0963665023446083, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 377770 + }, + { + "epoch": 1.460391829413493, + "grad_norm": 0.10575723648071289, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 377780 + }, + { + "epoch": 1.4604304866168762, + "grad_norm": 0.10483500361442566, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 377790 + }, + { + "epoch": 1.4604691438202595, + "grad_norm": 0.10449696332216263, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 377800 + }, + { + "epoch": 1.4605078010236427, + "grad_norm": 0.12368028610944748, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 377810 + }, + { + "epoch": 1.460546458227026, + "grad_norm": 0.09814267605543137, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 377820 + }, + { + "epoch": 1.4605851154304093, + "grad_norm": 0.09859205782413483, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 377830 + }, + { + "epoch": 1.4606237726337925, + "grad_norm": 0.08651961386203766, + "learning_rate": 0.002, + "loss": 2.333, + "step": 377840 + }, + { + "epoch": 1.4606624298371758, + "grad_norm": 0.11479639261960983, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 377850 + }, + { + "epoch": 1.460701087040559, + "grad_norm": 0.10261834412813187, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 377860 + }, + { + "epoch": 1.4607397442439425, + "grad_norm": 0.09389077872037888, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 377870 + }, + { + "epoch": 1.4607784014473257, + "grad_norm": 0.10234487801790237, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 377880 + }, + { + "epoch": 1.460817058650709, + "grad_norm": 0.1089874878525734, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 377890 + }, + { + "epoch": 1.4608557158540922, + "grad_norm": 0.12259659171104431, + "learning_rate": 0.002, + "loss": 2.341, + "step": 377900 + }, + { + "epoch": 1.4608943730574755, + "grad_norm": 0.10895906388759613, + "learning_rate": 0.002, + "loss": 2.3476, + "step": 377910 + }, + { + "epoch": 1.4609330302608587, + "grad_norm": 0.21146343648433685, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 377920 + }, + { + "epoch": 1.4609716874642422, + "grad_norm": 0.0988226905465126, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 377930 + }, + { + "epoch": 1.4610103446676255, + "grad_norm": 0.10519076883792877, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 377940 + }, + { + "epoch": 1.4610490018710087, + "grad_norm": 0.09804467856884003, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 377950 + }, + { + "epoch": 1.461087659074392, + "grad_norm": 0.08854498714208603, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 377960 + }, + { + "epoch": 1.4611263162777752, + "grad_norm": 0.11327465623617172, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 377970 + }, + { + "epoch": 1.4611649734811585, + "grad_norm": 0.09992703795433044, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 377980 + }, + { + "epoch": 1.4612036306845417, + "grad_norm": 0.10690799355506897, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 377990 + }, + { + "epoch": 1.461242287887925, + "grad_norm": 0.11535067856311798, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 378000 + }, + { + "epoch": 1.4612809450913082, + "grad_norm": 0.11228501796722412, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 378010 + }, + { + "epoch": 1.4613196022946915, + "grad_norm": 0.1051812395453453, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 378020 + }, + { + "epoch": 1.4613582594980747, + "grad_norm": 0.09751439839601517, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 378030 + }, + { + "epoch": 1.4613969167014582, + "grad_norm": 0.12290656566619873, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 378040 + }, + { + "epoch": 1.4614355739048415, + "grad_norm": 0.11187057942152023, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 378050 + }, + { + "epoch": 1.4614742311082247, + "grad_norm": 0.11574528366327286, + "learning_rate": 0.002, + "loss": 2.351, + "step": 378060 + }, + { + "epoch": 1.461512888311608, + "grad_norm": 0.2262689471244812, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 378070 + }, + { + "epoch": 1.4615515455149912, + "grad_norm": 0.10998236387968063, + "learning_rate": 0.002, + "loss": 2.354, + "step": 378080 + }, + { + "epoch": 1.4615902027183745, + "grad_norm": 0.12994958460330963, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 378090 + }, + { + "epoch": 1.461628859921758, + "grad_norm": 1.093963384628296, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 378100 + }, + { + "epoch": 1.4616675171251412, + "grad_norm": 0.12165454775094986, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 378110 + }, + { + "epoch": 1.4617061743285245, + "grad_norm": 0.1254856437444687, + "learning_rate": 0.002, + "loss": 2.343, + "step": 378120 + }, + { + "epoch": 1.4617448315319077, + "grad_norm": 0.09542275965213776, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 378130 + }, + { + "epoch": 1.461783488735291, + "grad_norm": 0.11822837591171265, + "learning_rate": 0.002, + "loss": 2.344, + "step": 378140 + }, + { + "epoch": 1.4618221459386742, + "grad_norm": 0.10291677713394165, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 378150 + }, + { + "epoch": 1.4618608031420575, + "grad_norm": 0.09813756495714188, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 378160 + }, + { + "epoch": 1.4618994603454407, + "grad_norm": 0.15027743577957153, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 378170 + }, + { + "epoch": 1.461938117548824, + "grad_norm": 0.10020308941602707, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 378180 + }, + { + "epoch": 1.4619767747522072, + "grad_norm": 0.21416075527668, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 378190 + }, + { + "epoch": 1.4620154319555905, + "grad_norm": 0.10870862007141113, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 378200 + }, + { + "epoch": 1.462054089158974, + "grad_norm": 0.10715006291866302, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 378210 + }, + { + "epoch": 1.4620927463623572, + "grad_norm": 0.10555832833051682, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 378220 + }, + { + "epoch": 1.4621314035657405, + "grad_norm": 0.1517215073108673, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 378230 + }, + { + "epoch": 1.4621700607691237, + "grad_norm": 0.1058911606669426, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 378240 + }, + { + "epoch": 1.462208717972507, + "grad_norm": 0.09682868421077728, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 378250 + }, + { + "epoch": 1.4622473751758902, + "grad_norm": 0.09074389934539795, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 378260 + }, + { + "epoch": 1.4622860323792737, + "grad_norm": 0.11195127665996552, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 378270 + }, + { + "epoch": 1.462324689582657, + "grad_norm": 0.1032756045460701, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 378280 + }, + { + "epoch": 1.4623633467860402, + "grad_norm": 0.10251615196466446, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 378290 + }, + { + "epoch": 1.4624020039894234, + "grad_norm": 0.11106961965560913, + "learning_rate": 0.002, + "loss": 2.316, + "step": 378300 + }, + { + "epoch": 1.4624406611928067, + "grad_norm": 0.10614031553268433, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 378310 + }, + { + "epoch": 1.46247931839619, + "grad_norm": 0.1287071406841278, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 378320 + }, + { + "epoch": 1.4625179755995732, + "grad_norm": 0.11179511249065399, + "learning_rate": 0.002, + "loss": 2.3564, + "step": 378330 + }, + { + "epoch": 1.4625566328029564, + "grad_norm": 0.11233927309513092, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 378340 + }, + { + "epoch": 1.4625952900063397, + "grad_norm": 0.13310958445072174, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 378350 + }, + { + "epoch": 1.462633947209723, + "grad_norm": 0.11643460392951965, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 378360 + }, + { + "epoch": 1.4626726044131062, + "grad_norm": 0.115506611764431, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 378370 + }, + { + "epoch": 1.4627112616164897, + "grad_norm": 0.1059376448392868, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 378380 + }, + { + "epoch": 1.462749918819873, + "grad_norm": 0.12319108843803406, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 378390 + }, + { + "epoch": 1.4627885760232562, + "grad_norm": 0.09284790605306625, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 378400 + }, + { + "epoch": 1.4628272332266394, + "grad_norm": 0.1385800689458847, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 378410 + }, + { + "epoch": 1.4628658904300227, + "grad_norm": 0.11187001317739487, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 378420 + }, + { + "epoch": 1.462904547633406, + "grad_norm": 0.1008906215429306, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 378430 + }, + { + "epoch": 1.4629432048367894, + "grad_norm": 0.11877201497554779, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 378440 + }, + { + "epoch": 1.4629818620401727, + "grad_norm": 0.1024615615606308, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 378450 + }, + { + "epoch": 1.463020519243556, + "grad_norm": 0.10329898446798325, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 378460 + }, + { + "epoch": 1.4630591764469392, + "grad_norm": 0.1304638385772705, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 378470 + }, + { + "epoch": 1.4630978336503224, + "grad_norm": 0.09835382550954819, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 378480 + }, + { + "epoch": 1.4631364908537057, + "grad_norm": 0.11532345414161682, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 378490 + }, + { + "epoch": 1.463175148057089, + "grad_norm": 0.09121698886156082, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 378500 + }, + { + "epoch": 1.4632138052604722, + "grad_norm": 0.10565804690122604, + "learning_rate": 0.002, + "loss": 2.345, + "step": 378510 + }, + { + "epoch": 1.4632524624638554, + "grad_norm": 0.10543417930603027, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 378520 + }, + { + "epoch": 1.4632911196672387, + "grad_norm": 0.0975838452577591, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 378530 + }, + { + "epoch": 1.463329776870622, + "grad_norm": 0.09796416759490967, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 378540 + }, + { + "epoch": 1.4633684340740054, + "grad_norm": 0.1139095202088356, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 378550 + }, + { + "epoch": 1.4634070912773887, + "grad_norm": 0.1038951724767685, + "learning_rate": 0.002, + "loss": 2.322, + "step": 378560 + }, + { + "epoch": 1.463445748480772, + "grad_norm": 0.11512892693281174, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 378570 + }, + { + "epoch": 1.4634844056841552, + "grad_norm": 0.10317057371139526, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 378580 + }, + { + "epoch": 1.4635230628875384, + "grad_norm": 0.09820453077554703, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 378590 + }, + { + "epoch": 1.4635617200909217, + "grad_norm": 0.09886424243450165, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 378600 + }, + { + "epoch": 1.4636003772943051, + "grad_norm": 0.09394435584545135, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 378610 + }, + { + "epoch": 1.4636390344976884, + "grad_norm": 0.09522797167301178, + "learning_rate": 0.002, + "loss": 2.3092, + "step": 378620 + }, + { + "epoch": 1.4636776917010716, + "grad_norm": 0.13810744881629944, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 378630 + }, + { + "epoch": 1.463716348904455, + "grad_norm": 0.10484451800584793, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 378640 + }, + { + "epoch": 1.4637550061078382, + "grad_norm": 0.10630256682634354, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 378650 + }, + { + "epoch": 1.4637936633112214, + "grad_norm": 0.10911909490823746, + "learning_rate": 0.002, + "loss": 2.348, + "step": 378660 + }, + { + "epoch": 1.4638323205146047, + "grad_norm": 0.10513444989919662, + "learning_rate": 0.002, + "loss": 2.3133, + "step": 378670 + }, + { + "epoch": 1.463870977717988, + "grad_norm": 0.12285730987787247, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 378680 + }, + { + "epoch": 1.4639096349213712, + "grad_norm": 0.10793665051460266, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 378690 + }, + { + "epoch": 1.4639482921247544, + "grad_norm": 0.12629415094852448, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 378700 + }, + { + "epoch": 1.4639869493281379, + "grad_norm": 0.10023588687181473, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 378710 + }, + { + "epoch": 1.4640256065315211, + "grad_norm": 0.13200624287128448, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 378720 + }, + { + "epoch": 1.4640642637349044, + "grad_norm": 0.1099928468465805, + "learning_rate": 0.002, + "loss": 2.3131, + "step": 378730 + }, + { + "epoch": 1.4641029209382876, + "grad_norm": 0.11238476634025574, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 378740 + }, + { + "epoch": 1.464141578141671, + "grad_norm": 0.08446121960878372, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 378750 + }, + { + "epoch": 1.4641802353450541, + "grad_norm": 0.12352073937654495, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 378760 + }, + { + "epoch": 1.4642188925484374, + "grad_norm": 0.09753142297267914, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 378770 + }, + { + "epoch": 1.4642575497518209, + "grad_norm": 0.1746557056903839, + "learning_rate": 0.002, + "loss": 2.336, + "step": 378780 + }, + { + "epoch": 1.4642962069552041, + "grad_norm": 0.11375793814659119, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 378790 + }, + { + "epoch": 1.4643348641585874, + "grad_norm": 0.11853129416704178, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 378800 + }, + { + "epoch": 1.4643735213619706, + "grad_norm": 0.10554022341966629, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 378810 + }, + { + "epoch": 1.4644121785653539, + "grad_norm": 0.10978643596172333, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 378820 + }, + { + "epoch": 1.4644508357687371, + "grad_norm": 0.10757063329219818, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 378830 + }, + { + "epoch": 1.4644894929721204, + "grad_norm": 0.09296415001153946, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 378840 + }, + { + "epoch": 1.4645281501755036, + "grad_norm": 0.11280360817909241, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 378850 + }, + { + "epoch": 1.464566807378887, + "grad_norm": 0.11102467775344849, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 378860 + }, + { + "epoch": 1.4646054645822701, + "grad_norm": 0.10691647231578827, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 378870 + }, + { + "epoch": 1.4646441217856536, + "grad_norm": 0.11569046974182129, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 378880 + }, + { + "epoch": 1.4646827789890369, + "grad_norm": 0.10578905045986176, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 378890 + }, + { + "epoch": 1.4647214361924201, + "grad_norm": 0.11721711605787277, + "learning_rate": 0.002, + "loss": 2.342, + "step": 378900 + }, + { + "epoch": 1.4647600933958034, + "grad_norm": 0.10432325303554535, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 378910 + }, + { + "epoch": 1.4647987505991866, + "grad_norm": 0.09794896841049194, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 378920 + }, + { + "epoch": 1.4648374078025699, + "grad_norm": 0.10016597807407379, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 378930 + }, + { + "epoch": 1.4648760650059531, + "grad_norm": 0.09569735080003738, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 378940 + }, + { + "epoch": 1.4649147222093366, + "grad_norm": 0.11742109805345535, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 378950 + }, + { + "epoch": 1.4649533794127199, + "grad_norm": 0.13616660237312317, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 378960 + }, + { + "epoch": 1.464992036616103, + "grad_norm": 0.09571486711502075, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 378970 + }, + { + "epoch": 1.4650306938194864, + "grad_norm": 0.09083391726016998, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 378980 + }, + { + "epoch": 1.4650693510228696, + "grad_norm": 0.10643785446882248, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 378990 + }, + { + "epoch": 1.4651080082262529, + "grad_norm": 0.11423055082559586, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 379000 + }, + { + "epoch": 1.4651466654296361, + "grad_norm": 0.10717245191335678, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 379010 + }, + { + "epoch": 1.4651853226330194, + "grad_norm": 0.11718068271875381, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 379020 + }, + { + "epoch": 1.4652239798364026, + "grad_norm": 0.09448101371526718, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 379030 + }, + { + "epoch": 1.4652626370397859, + "grad_norm": 0.10101408511400223, + "learning_rate": 0.002, + "loss": 2.337, + "step": 379040 + }, + { + "epoch": 1.4653012942431693, + "grad_norm": 0.09884758293628693, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 379050 + }, + { + "epoch": 1.4653399514465526, + "grad_norm": 0.10572511702775955, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 379060 + }, + { + "epoch": 1.4653786086499359, + "grad_norm": 0.12739481031894684, + "learning_rate": 0.002, + "loss": 2.336, + "step": 379070 + }, + { + "epoch": 1.465417265853319, + "grad_norm": 0.09359881281852722, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 379080 + }, + { + "epoch": 1.4654559230567024, + "grad_norm": 0.17386668920516968, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 379090 + }, + { + "epoch": 1.4654945802600856, + "grad_norm": 0.11996670067310333, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 379100 + }, + { + "epoch": 1.4655332374634689, + "grad_norm": 0.1109367087483406, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 379110 + }, + { + "epoch": 1.4655718946668523, + "grad_norm": 0.10811998695135117, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 379120 + }, + { + "epoch": 1.4656105518702356, + "grad_norm": 0.10904763638973236, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 379130 + }, + { + "epoch": 1.4656492090736188, + "grad_norm": 0.10743703693151474, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 379140 + }, + { + "epoch": 1.465687866277002, + "grad_norm": 0.09433609992265701, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 379150 + }, + { + "epoch": 1.4657265234803853, + "grad_norm": 0.09775488823652267, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 379160 + }, + { + "epoch": 1.4657651806837686, + "grad_norm": 0.11015506088733673, + "learning_rate": 0.002, + "loss": 2.3166, + "step": 379170 + }, + { + "epoch": 1.4658038378871519, + "grad_norm": 0.1063729077577591, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 379180 + }, + { + "epoch": 1.465842495090535, + "grad_norm": 0.11362296342849731, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 379190 + }, + { + "epoch": 1.4658811522939184, + "grad_norm": 0.11729138344526291, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 379200 + }, + { + "epoch": 1.4659198094973016, + "grad_norm": 0.09225037693977356, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 379210 + }, + { + "epoch": 1.465958466700685, + "grad_norm": 0.0946132019162178, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 379220 + }, + { + "epoch": 1.4659971239040683, + "grad_norm": 0.1005450114607811, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 379230 + }, + { + "epoch": 1.4660357811074516, + "grad_norm": 0.11385586857795715, + "learning_rate": 0.002, + "loss": 2.327, + "step": 379240 + }, + { + "epoch": 1.4660744383108348, + "grad_norm": 0.09891478717327118, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 379250 + }, + { + "epoch": 1.466113095514218, + "grad_norm": 0.09788984805345535, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 379260 + }, + { + "epoch": 1.4661517527176013, + "grad_norm": 0.09577134996652603, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 379270 + }, + { + "epoch": 1.4661904099209846, + "grad_norm": 0.11400555819272995, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 379280 + }, + { + "epoch": 1.466229067124368, + "grad_norm": 0.0947289690375328, + "learning_rate": 0.002, + "loss": 2.335, + "step": 379290 + }, + { + "epoch": 1.4662677243277513, + "grad_norm": 0.10459941625595093, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 379300 + }, + { + "epoch": 1.4663063815311346, + "grad_norm": 0.13126231729984283, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 379310 + }, + { + "epoch": 1.4663450387345178, + "grad_norm": 0.0940525233745575, + "learning_rate": 0.002, + "loss": 2.3126, + "step": 379320 + }, + { + "epoch": 1.466383695937901, + "grad_norm": 0.09635006636381149, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 379330 + }, + { + "epoch": 1.4664223531412843, + "grad_norm": 0.11484242975711823, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 379340 + }, + { + "epoch": 1.4664610103446676, + "grad_norm": 0.10275579243898392, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 379350 + }, + { + "epoch": 1.4664996675480508, + "grad_norm": 0.1047661229968071, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 379360 + }, + { + "epoch": 1.466538324751434, + "grad_norm": 0.08567144721746445, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 379370 + }, + { + "epoch": 1.4665769819548173, + "grad_norm": 0.4120446741580963, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 379380 + }, + { + "epoch": 1.4666156391582008, + "grad_norm": 0.11681369692087173, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 379390 + }, + { + "epoch": 1.466654296361584, + "grad_norm": 0.12154824286699295, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 379400 + }, + { + "epoch": 1.4666929535649673, + "grad_norm": 0.09770403057336807, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 379410 + }, + { + "epoch": 1.4667316107683506, + "grad_norm": 0.09427760541439056, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 379420 + }, + { + "epoch": 1.4667702679717338, + "grad_norm": 0.10652682930231094, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 379430 + }, + { + "epoch": 1.466808925175117, + "grad_norm": 0.1023586243391037, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 379440 + }, + { + "epoch": 1.4668475823785005, + "grad_norm": 0.10801868885755539, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 379450 + }, + { + "epoch": 1.4668862395818838, + "grad_norm": 0.10652651637792587, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 379460 + }, + { + "epoch": 1.466924896785267, + "grad_norm": 0.1005072072148323, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 379470 + }, + { + "epoch": 1.4669635539886503, + "grad_norm": 0.1006920337677002, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 379480 + }, + { + "epoch": 1.4670022111920336, + "grad_norm": 0.12021995335817337, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 379490 + }, + { + "epoch": 1.4670408683954168, + "grad_norm": 0.11323700100183487, + "learning_rate": 0.002, + "loss": 2.331, + "step": 379500 + }, + { + "epoch": 1.4670795255988, + "grad_norm": 0.10849696397781372, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 379510 + }, + { + "epoch": 1.4671181828021833, + "grad_norm": 0.09491071850061417, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 379520 + }, + { + "epoch": 1.4671568400055666, + "grad_norm": 0.1152421236038208, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 379530 + }, + { + "epoch": 1.4671954972089498, + "grad_norm": 0.0976879745721817, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 379540 + }, + { + "epoch": 1.467234154412333, + "grad_norm": 0.13243097066879272, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 379550 + }, + { + "epoch": 1.4672728116157165, + "grad_norm": 0.09705587476491928, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 379560 + }, + { + "epoch": 1.4673114688190998, + "grad_norm": 0.09247449040412903, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 379570 + }, + { + "epoch": 1.467350126022483, + "grad_norm": 0.14580531418323517, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 379580 + }, + { + "epoch": 1.4673887832258663, + "grad_norm": 0.11156206578016281, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 379590 + }, + { + "epoch": 1.4674274404292496, + "grad_norm": 0.11129026859998703, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 379600 + }, + { + "epoch": 1.4674660976326328, + "grad_norm": 0.1293782740831375, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 379610 + }, + { + "epoch": 1.4675047548360163, + "grad_norm": 0.10700634866952896, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 379620 + }, + { + "epoch": 1.4675434120393995, + "grad_norm": 0.09107174724340439, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 379630 + }, + { + "epoch": 1.4675820692427828, + "grad_norm": 0.0936293751001358, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 379640 + }, + { + "epoch": 1.467620726446166, + "grad_norm": 0.09288164973258972, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 379650 + }, + { + "epoch": 1.4676593836495493, + "grad_norm": 0.11261691898107529, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 379660 + }, + { + "epoch": 1.4676980408529325, + "grad_norm": 0.10153496265411377, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 379670 + }, + { + "epoch": 1.4677366980563158, + "grad_norm": 0.104087233543396, + "learning_rate": 0.002, + "loss": 2.3124, + "step": 379680 + }, + { + "epoch": 1.467775355259699, + "grad_norm": 0.10559988766908646, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 379690 + }, + { + "epoch": 1.4678140124630823, + "grad_norm": 0.09458968043327332, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 379700 + }, + { + "epoch": 1.4678526696664655, + "grad_norm": 0.09753791242837906, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 379710 + }, + { + "epoch": 1.4678913268698488, + "grad_norm": 0.10440224409103394, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 379720 + }, + { + "epoch": 1.4679299840732323, + "grad_norm": 0.11204987019300461, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 379730 + }, + { + "epoch": 1.4679686412766155, + "grad_norm": 0.09139106422662735, + "learning_rate": 0.002, + "loss": 2.331, + "step": 379740 + }, + { + "epoch": 1.4680072984799988, + "grad_norm": 0.09165173023939133, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 379750 + }, + { + "epoch": 1.468045955683382, + "grad_norm": 0.11202359199523926, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 379760 + }, + { + "epoch": 1.4680846128867653, + "grad_norm": 0.11179552227258682, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 379770 + }, + { + "epoch": 1.4681232700901485, + "grad_norm": 0.1006980687379837, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 379780 + }, + { + "epoch": 1.468161927293532, + "grad_norm": 0.09841970354318619, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 379790 + }, + { + "epoch": 1.4682005844969153, + "grad_norm": 0.10231813788414001, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 379800 + }, + { + "epoch": 1.4682392417002985, + "grad_norm": 0.11085063964128494, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 379810 + }, + { + "epoch": 1.4682778989036818, + "grad_norm": 0.09536705911159515, + "learning_rate": 0.002, + "loss": 2.334, + "step": 379820 + }, + { + "epoch": 1.468316556107065, + "grad_norm": 0.10429663956165314, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 379830 + }, + { + "epoch": 1.4683552133104483, + "grad_norm": 0.10179862380027771, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 379840 + }, + { + "epoch": 1.4683938705138315, + "grad_norm": 0.10341744124889374, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 379850 + }, + { + "epoch": 1.4684325277172148, + "grad_norm": 0.10987520217895508, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 379860 + }, + { + "epoch": 1.468471184920598, + "grad_norm": 0.1209501177072525, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 379870 + }, + { + "epoch": 1.4685098421239813, + "grad_norm": 0.10270653665065765, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 379880 + }, + { + "epoch": 1.4685484993273645, + "grad_norm": 0.10785645246505737, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 379890 + }, + { + "epoch": 1.468587156530748, + "grad_norm": 0.10725623369216919, + "learning_rate": 0.002, + "loss": 2.3176, + "step": 379900 + }, + { + "epoch": 1.4686258137341313, + "grad_norm": 0.09680724889039993, + "learning_rate": 0.002, + "loss": 2.3145, + "step": 379910 + }, + { + "epoch": 1.4686644709375145, + "grad_norm": 0.11619780212640762, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 379920 + }, + { + "epoch": 1.4687031281408978, + "grad_norm": 0.10926903039216995, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 379930 + }, + { + "epoch": 1.468741785344281, + "grad_norm": 0.09844309836626053, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 379940 + }, + { + "epoch": 1.4687804425476643, + "grad_norm": 0.11584539711475372, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 379950 + }, + { + "epoch": 1.4688190997510477, + "grad_norm": 0.09956274181604385, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 379960 + }, + { + "epoch": 1.468857756954431, + "grad_norm": 0.10570421069860458, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 379970 + }, + { + "epoch": 1.4688964141578142, + "grad_norm": 0.09156245738267899, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 379980 + }, + { + "epoch": 1.4689350713611975, + "grad_norm": 0.11181099712848663, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 379990 + }, + { + "epoch": 1.4689737285645807, + "grad_norm": 0.09745943546295166, + "learning_rate": 0.002, + "loss": 2.3108, + "step": 380000 + }, + { + "epoch": 1.469012385767964, + "grad_norm": 0.09387210011482239, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 380010 + }, + { + "epoch": 1.4690510429713473, + "grad_norm": 0.11295516043901443, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 380020 + }, + { + "epoch": 1.4690897001747305, + "grad_norm": 0.10584930330514908, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 380030 + }, + { + "epoch": 1.4691283573781138, + "grad_norm": 0.11459202319383621, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 380040 + }, + { + "epoch": 1.469167014581497, + "grad_norm": 0.11405713111162186, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 380050 + }, + { + "epoch": 1.4692056717848803, + "grad_norm": 0.09181715548038483, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 380060 + }, + { + "epoch": 1.4692443289882637, + "grad_norm": 0.09268059581518173, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 380070 + }, + { + "epoch": 1.469282986191647, + "grad_norm": 0.11425948888063431, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 380080 + }, + { + "epoch": 1.4693216433950302, + "grad_norm": 0.1236632838845253, + "learning_rate": 0.002, + "loss": 2.336, + "step": 380090 + }, + { + "epoch": 1.4693603005984135, + "grad_norm": 0.0985027402639389, + "learning_rate": 0.002, + "loss": 2.3446, + "step": 380100 + }, + { + "epoch": 1.4693989578017967, + "grad_norm": 0.10117731988430023, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 380110 + }, + { + "epoch": 1.46943761500518, + "grad_norm": 0.12691277265548706, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 380120 + }, + { + "epoch": 1.4694762722085635, + "grad_norm": 0.09519045799970627, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 380130 + }, + { + "epoch": 1.4695149294119467, + "grad_norm": 0.09453404694795609, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 380140 + }, + { + "epoch": 1.46955358661533, + "grad_norm": 0.09491042047739029, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 380150 + }, + { + "epoch": 1.4695922438187132, + "grad_norm": 0.10169180482625961, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 380160 + }, + { + "epoch": 1.4696309010220965, + "grad_norm": 0.10007622092962265, + "learning_rate": 0.002, + "loss": 2.3556, + "step": 380170 + }, + { + "epoch": 1.4696695582254797, + "grad_norm": 0.1122666671872139, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 380180 + }, + { + "epoch": 1.469708215428863, + "grad_norm": 0.09976609796285629, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 380190 + }, + { + "epoch": 1.4697468726322462, + "grad_norm": 0.12167870253324509, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 380200 + }, + { + "epoch": 1.4697855298356295, + "grad_norm": 0.09556052088737488, + "learning_rate": 0.002, + "loss": 2.339, + "step": 380210 + }, + { + "epoch": 1.4698241870390127, + "grad_norm": 0.11083319783210754, + "learning_rate": 0.002, + "loss": 2.33, + "step": 380220 + }, + { + "epoch": 1.469862844242396, + "grad_norm": 0.11835497617721558, + "learning_rate": 0.002, + "loss": 2.331, + "step": 380230 + }, + { + "epoch": 1.4699015014457795, + "grad_norm": 0.11649499833583832, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 380240 + }, + { + "epoch": 1.4699401586491627, + "grad_norm": 0.10010214895009995, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 380250 + }, + { + "epoch": 1.469978815852546, + "grad_norm": 0.10207920521497726, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 380260 + }, + { + "epoch": 1.4700174730559292, + "grad_norm": 0.14429835975170135, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 380270 + }, + { + "epoch": 1.4700561302593125, + "grad_norm": 0.09240922331809998, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 380280 + }, + { + "epoch": 1.4700947874626957, + "grad_norm": 0.0997718796133995, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 380290 + }, + { + "epoch": 1.4701334446660792, + "grad_norm": 0.11467599868774414, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 380300 + }, + { + "epoch": 1.4701721018694625, + "grad_norm": 0.09451211243867874, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 380310 + }, + { + "epoch": 1.4702107590728457, + "grad_norm": 0.1025175079703331, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 380320 + }, + { + "epoch": 1.470249416276229, + "grad_norm": 0.1079663336277008, + "learning_rate": 0.002, + "loss": 2.343, + "step": 380330 + }, + { + "epoch": 1.4702880734796122, + "grad_norm": 0.10581180453300476, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 380340 + }, + { + "epoch": 1.4703267306829955, + "grad_norm": 0.09240443259477615, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 380350 + }, + { + "epoch": 1.4703653878863787, + "grad_norm": 0.10867279767990112, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 380360 + }, + { + "epoch": 1.470404045089762, + "grad_norm": 0.11264824122190475, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 380370 + }, + { + "epoch": 1.4704427022931452, + "grad_norm": 0.11031465977430344, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 380380 + }, + { + "epoch": 1.4704813594965285, + "grad_norm": 0.10774829238653183, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 380390 + }, + { + "epoch": 1.4705200166999117, + "grad_norm": 0.11938739567995071, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 380400 + }, + { + "epoch": 1.4705586739032952, + "grad_norm": 0.10929541289806366, + "learning_rate": 0.002, + "loss": 2.335, + "step": 380410 + }, + { + "epoch": 1.4705973311066785, + "grad_norm": 0.10994645208120346, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 380420 + }, + { + "epoch": 1.4706359883100617, + "grad_norm": 0.11571916937828064, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 380430 + }, + { + "epoch": 1.470674645513445, + "grad_norm": 0.10880973190069199, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 380440 + }, + { + "epoch": 1.4707133027168282, + "grad_norm": 0.10868261009454727, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 380450 + }, + { + "epoch": 1.4707519599202115, + "grad_norm": 0.1179015189409256, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 380460 + }, + { + "epoch": 1.470790617123595, + "grad_norm": 0.11172597110271454, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 380470 + }, + { + "epoch": 1.4708292743269782, + "grad_norm": 0.08885904401540756, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 380480 + }, + { + "epoch": 1.4708679315303614, + "grad_norm": 0.1096772775053978, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 380490 + }, + { + "epoch": 1.4709065887337447, + "grad_norm": 0.1212785616517067, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 380500 + }, + { + "epoch": 1.470945245937128, + "grad_norm": 0.10107403248548508, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 380510 + }, + { + "epoch": 1.4709839031405112, + "grad_norm": 0.1215515285730362, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 380520 + }, + { + "epoch": 1.4710225603438944, + "grad_norm": 0.11970778554677963, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 380530 + }, + { + "epoch": 1.4710612175472777, + "grad_norm": 0.09164808690547943, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 380540 + }, + { + "epoch": 1.471099874750661, + "grad_norm": 0.09449364989995956, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 380550 + }, + { + "epoch": 1.4711385319540442, + "grad_norm": 0.0985458716750145, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 380560 + }, + { + "epoch": 1.4711771891574277, + "grad_norm": 0.10552497208118439, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 380570 + }, + { + "epoch": 1.471215846360811, + "grad_norm": 0.13049247860908508, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 380580 + }, + { + "epoch": 1.4712545035641942, + "grad_norm": 0.11239859461784363, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 380590 + }, + { + "epoch": 1.4712931607675774, + "grad_norm": 0.10022827237844467, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 380600 + }, + { + "epoch": 1.4713318179709607, + "grad_norm": 0.10231848806142807, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 380610 + }, + { + "epoch": 1.471370475174344, + "grad_norm": 0.10004410892724991, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 380620 + }, + { + "epoch": 1.4714091323777272, + "grad_norm": 0.10348884761333466, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 380630 + }, + { + "epoch": 1.4714477895811107, + "grad_norm": 0.11574351042509079, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 380640 + }, + { + "epoch": 1.471486446784494, + "grad_norm": 0.09569251537322998, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 380650 + }, + { + "epoch": 1.4715251039878772, + "grad_norm": 0.09906301647424698, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 380660 + }, + { + "epoch": 1.4715637611912604, + "grad_norm": 0.11972828209400177, + "learning_rate": 0.002, + "loss": 2.333, + "step": 380670 + }, + { + "epoch": 1.4716024183946437, + "grad_norm": 0.11014852672815323, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 380680 + }, + { + "epoch": 1.471641075598027, + "grad_norm": 0.09081583470106125, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 380690 + }, + { + "epoch": 1.4716797328014102, + "grad_norm": 0.11451420187950134, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 380700 + }, + { + "epoch": 1.4717183900047934, + "grad_norm": 0.11507417261600494, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 380710 + }, + { + "epoch": 1.4717570472081767, + "grad_norm": 0.10528484731912613, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 380720 + }, + { + "epoch": 1.47179570441156, + "grad_norm": 0.1073676198720932, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 380730 + }, + { + "epoch": 1.4718343616149434, + "grad_norm": 0.09003012627363205, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 380740 + }, + { + "epoch": 1.4718730188183267, + "grad_norm": 0.11079955101013184, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 380750 + }, + { + "epoch": 1.47191167602171, + "grad_norm": 0.11685178428888321, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 380760 + }, + { + "epoch": 1.4719503332250932, + "grad_norm": 0.0931723341345787, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 380770 + }, + { + "epoch": 1.4719889904284764, + "grad_norm": 0.13615788519382477, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 380780 + }, + { + "epoch": 1.4720276476318597, + "grad_norm": 0.0943218395113945, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 380790 + }, + { + "epoch": 1.472066304835243, + "grad_norm": 0.09162439405918121, + "learning_rate": 0.002, + "loss": 2.3433, + "step": 380800 + }, + { + "epoch": 1.4721049620386264, + "grad_norm": 0.12335995584726334, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 380810 + }, + { + "epoch": 1.4721436192420096, + "grad_norm": 0.09435350447893143, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 380820 + }, + { + "epoch": 1.472182276445393, + "grad_norm": 0.10461000353097916, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 380830 + }, + { + "epoch": 1.4722209336487762, + "grad_norm": 0.09914273768663406, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 380840 + }, + { + "epoch": 1.4722595908521594, + "grad_norm": 0.102699413895607, + "learning_rate": 0.002, + "loss": 2.321, + "step": 380850 + }, + { + "epoch": 1.4722982480555427, + "grad_norm": 0.10240603238344193, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 380860 + }, + { + "epoch": 1.472336905258926, + "grad_norm": 0.11398480832576752, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 380870 + }, + { + "epoch": 1.4723755624623092, + "grad_norm": 0.10690321773290634, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 380880 + }, + { + "epoch": 1.4724142196656924, + "grad_norm": 0.10043326020240784, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 380890 + }, + { + "epoch": 1.4724528768690757, + "grad_norm": 0.1416574865579605, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 380900 + }, + { + "epoch": 1.4724915340724591, + "grad_norm": 0.09897167980670929, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 380910 + }, + { + "epoch": 1.4725301912758424, + "grad_norm": 0.10017742216587067, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 380920 + }, + { + "epoch": 1.4725688484792256, + "grad_norm": 0.09924539178609848, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 380930 + }, + { + "epoch": 1.472607505682609, + "grad_norm": 0.10560602694749832, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 380940 + }, + { + "epoch": 1.4726461628859921, + "grad_norm": 0.11045221984386444, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 380950 + }, + { + "epoch": 1.4726848200893754, + "grad_norm": 0.10794904828071594, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 380960 + }, + { + "epoch": 1.4727234772927587, + "grad_norm": 0.12349242717027664, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 380970 + }, + { + "epoch": 1.4727621344961421, + "grad_norm": 0.11813584715127945, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 380980 + }, + { + "epoch": 1.4728007916995254, + "grad_norm": 0.09766297787427902, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 380990 + }, + { + "epoch": 1.4728394489029086, + "grad_norm": 0.16462789475917816, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 381000 + }, + { + "epoch": 1.4728781061062919, + "grad_norm": 0.10614291578531265, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 381010 + }, + { + "epoch": 1.4729167633096751, + "grad_norm": 0.09676255285739899, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 381020 + }, + { + "epoch": 1.4729554205130584, + "grad_norm": 0.09638655185699463, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 381030 + }, + { + "epoch": 1.4729940777164416, + "grad_norm": 0.10229352861642838, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 381040 + }, + { + "epoch": 1.473032734919825, + "grad_norm": 0.11363990604877472, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 381050 + }, + { + "epoch": 1.4730713921232081, + "grad_norm": 0.26950469613075256, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 381060 + }, + { + "epoch": 1.4731100493265914, + "grad_norm": 0.11464790999889374, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 381070 + }, + { + "epoch": 1.4731487065299749, + "grad_norm": 0.10803456604480743, + "learning_rate": 0.002, + "loss": 2.323, + "step": 381080 + }, + { + "epoch": 1.4731873637333581, + "grad_norm": 0.11394017934799194, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 381090 + }, + { + "epoch": 1.4732260209367414, + "grad_norm": 0.10622947663068771, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 381100 + }, + { + "epoch": 1.4732646781401246, + "grad_norm": 0.09255822747945786, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 381110 + }, + { + "epoch": 1.4733033353435079, + "grad_norm": 0.1168975681066513, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 381120 + }, + { + "epoch": 1.4733419925468911, + "grad_norm": 0.09841727465391159, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 381130 + }, + { + "epoch": 1.4733806497502744, + "grad_norm": 0.09333381801843643, + "learning_rate": 0.002, + "loss": 2.329, + "step": 381140 + }, + { + "epoch": 1.4734193069536579, + "grad_norm": 0.09599963575601578, + "learning_rate": 0.002, + "loss": 2.336, + "step": 381150 + }, + { + "epoch": 1.4734579641570411, + "grad_norm": 0.11897846311330795, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 381160 + }, + { + "epoch": 1.4734966213604244, + "grad_norm": 0.11470766365528107, + "learning_rate": 0.002, + "loss": 2.3195, + "step": 381170 + }, + { + "epoch": 1.4735352785638076, + "grad_norm": 0.10256505012512207, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 381180 + }, + { + "epoch": 1.4735739357671909, + "grad_norm": 0.10822631418704987, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 381190 + }, + { + "epoch": 1.4736125929705741, + "grad_norm": 0.09520353376865387, + "learning_rate": 0.002, + "loss": 2.3126, + "step": 381200 + }, + { + "epoch": 1.4736512501739574, + "grad_norm": 0.10728409886360168, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 381210 + }, + { + "epoch": 1.4736899073773406, + "grad_norm": 0.10907592624425888, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 381220 + }, + { + "epoch": 1.4737285645807239, + "grad_norm": 0.11383340507745743, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 381230 + }, + { + "epoch": 1.4737672217841071, + "grad_norm": 0.0988120511174202, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 381240 + }, + { + "epoch": 1.4738058789874906, + "grad_norm": 0.09999316930770874, + "learning_rate": 0.002, + "loss": 2.317, + "step": 381250 + }, + { + "epoch": 1.4738445361908739, + "grad_norm": 0.10439565777778625, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 381260 + }, + { + "epoch": 1.473883193394257, + "grad_norm": 0.1033417209982872, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 381270 + }, + { + "epoch": 1.4739218505976404, + "grad_norm": 0.11376326531171799, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 381280 + }, + { + "epoch": 1.4739605078010236, + "grad_norm": 0.09683159738779068, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 381290 + }, + { + "epoch": 1.4739991650044069, + "grad_norm": 0.1309593915939331, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 381300 + }, + { + "epoch": 1.4740378222077903, + "grad_norm": 0.09711278229951859, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 381310 + }, + { + "epoch": 1.4740764794111736, + "grad_norm": 0.09251412004232407, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 381320 + }, + { + "epoch": 1.4741151366145568, + "grad_norm": 0.10818198323249817, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 381330 + }, + { + "epoch": 1.47415379381794, + "grad_norm": 0.17244568467140198, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 381340 + }, + { + "epoch": 1.4741924510213233, + "grad_norm": 0.10578557848930359, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 381350 + }, + { + "epoch": 1.4742311082247066, + "grad_norm": 0.10104954242706299, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 381360 + }, + { + "epoch": 1.4742697654280899, + "grad_norm": 0.10733964294195175, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 381370 + }, + { + "epoch": 1.474308422631473, + "grad_norm": 0.10672634840011597, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 381380 + }, + { + "epoch": 1.4743470798348564, + "grad_norm": 0.09510699659585953, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 381390 + }, + { + "epoch": 1.4743857370382396, + "grad_norm": 0.10608115047216415, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 381400 + }, + { + "epoch": 1.4744243942416229, + "grad_norm": 0.0924900472164154, + "learning_rate": 0.002, + "loss": 2.3529, + "step": 381410 + }, + { + "epoch": 1.4744630514450063, + "grad_norm": 0.09588416665792465, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 381420 + }, + { + "epoch": 1.4745017086483896, + "grad_norm": 0.10708600282669067, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 381430 + }, + { + "epoch": 1.4745403658517728, + "grad_norm": 0.12301962822675705, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 381440 + }, + { + "epoch": 1.474579023055156, + "grad_norm": 0.10980133712291718, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 381450 + }, + { + "epoch": 1.4746176802585393, + "grad_norm": 0.0985681489109993, + "learning_rate": 0.002, + "loss": 2.335, + "step": 381460 + }, + { + "epoch": 1.4746563374619226, + "grad_norm": 0.10325771570205688, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 381470 + }, + { + "epoch": 1.474694994665306, + "grad_norm": 0.11751792579889297, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 381480 + }, + { + "epoch": 1.4747336518686893, + "grad_norm": 0.11009322106838226, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 381490 + }, + { + "epoch": 1.4747723090720726, + "grad_norm": 0.11105030030012131, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 381500 + }, + { + "epoch": 1.4748109662754558, + "grad_norm": 0.11428146064281464, + "learning_rate": 0.002, + "loss": 2.321, + "step": 381510 + }, + { + "epoch": 1.474849623478839, + "grad_norm": 0.09404455125331879, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 381520 + }, + { + "epoch": 1.4748882806822223, + "grad_norm": 0.10543165355920792, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 381530 + }, + { + "epoch": 1.4749269378856056, + "grad_norm": 0.10001705586910248, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 381540 + }, + { + "epoch": 1.4749655950889888, + "grad_norm": 0.08789706975221634, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 381550 + }, + { + "epoch": 1.475004252292372, + "grad_norm": 0.1343705952167511, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 381560 + }, + { + "epoch": 1.4750429094957553, + "grad_norm": 0.10130036622285843, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 381570 + }, + { + "epoch": 1.4750815666991386, + "grad_norm": 0.09015454351902008, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 381580 + }, + { + "epoch": 1.475120223902522, + "grad_norm": 0.12282449007034302, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 381590 + }, + { + "epoch": 1.4751588811059053, + "grad_norm": 0.09663018584251404, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 381600 + }, + { + "epoch": 1.4751975383092886, + "grad_norm": 0.11174379289150238, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 381610 + }, + { + "epoch": 1.4752361955126718, + "grad_norm": 0.11045493185520172, + "learning_rate": 0.002, + "loss": 2.324, + "step": 381620 + }, + { + "epoch": 1.475274852716055, + "grad_norm": 0.10591579973697662, + "learning_rate": 0.002, + "loss": 2.324, + "step": 381630 + }, + { + "epoch": 1.4753135099194383, + "grad_norm": 0.12079647928476334, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 381640 + }, + { + "epoch": 1.4753521671228218, + "grad_norm": 0.10993454605340958, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 381650 + }, + { + "epoch": 1.475390824326205, + "grad_norm": 0.10719642043113708, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 381660 + }, + { + "epoch": 1.4754294815295883, + "grad_norm": 0.10297977179288864, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 381670 + }, + { + "epoch": 1.4754681387329716, + "grad_norm": 0.10150489956140518, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 381680 + }, + { + "epoch": 1.4755067959363548, + "grad_norm": 0.10953573882579803, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 381690 + }, + { + "epoch": 1.475545453139738, + "grad_norm": 0.15350480377674103, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 381700 + }, + { + "epoch": 1.4755841103431213, + "grad_norm": 0.10827025026082993, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 381710 + }, + { + "epoch": 1.4756227675465046, + "grad_norm": 0.10280109941959381, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 381720 + }, + { + "epoch": 1.4756614247498878, + "grad_norm": 0.09801103174686432, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 381730 + }, + { + "epoch": 1.475700081953271, + "grad_norm": 0.11004986613988876, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 381740 + }, + { + "epoch": 1.4757387391566543, + "grad_norm": 0.12494536489248276, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 381750 + }, + { + "epoch": 1.4757773963600378, + "grad_norm": 0.1095493733882904, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 381760 + }, + { + "epoch": 1.475816053563421, + "grad_norm": 0.13262827694416046, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 381770 + }, + { + "epoch": 1.4758547107668043, + "grad_norm": 0.1285833716392517, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 381780 + }, + { + "epoch": 1.4758933679701876, + "grad_norm": 0.11003441363573074, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 381790 + }, + { + "epoch": 1.4759320251735708, + "grad_norm": 0.10204324126243591, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 381800 + }, + { + "epoch": 1.475970682376954, + "grad_norm": 0.10408364236354828, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 381810 + }, + { + "epoch": 1.4760093395803375, + "grad_norm": 0.12155904620885849, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 381820 + }, + { + "epoch": 1.4760479967837208, + "grad_norm": 0.09382772445678711, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 381830 + }, + { + "epoch": 1.476086653987104, + "grad_norm": 0.10493974387645721, + "learning_rate": 0.002, + "loss": 2.338, + "step": 381840 + }, + { + "epoch": 1.4761253111904873, + "grad_norm": 0.09143518656492233, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 381850 + }, + { + "epoch": 1.4761639683938705, + "grad_norm": 0.09787338227033615, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 381860 + }, + { + "epoch": 1.4762026255972538, + "grad_norm": 0.0987553671002388, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 381870 + }, + { + "epoch": 1.476241282800637, + "grad_norm": 0.10685688257217407, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 381880 + }, + { + "epoch": 1.4762799400040203, + "grad_norm": 0.10927334427833557, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 381890 + }, + { + "epoch": 1.4763185972074035, + "grad_norm": 0.1169334352016449, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 381900 + }, + { + "epoch": 1.4763572544107868, + "grad_norm": 0.09676641970872879, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 381910 + }, + { + "epoch": 1.47639591161417, + "grad_norm": 0.10892260074615479, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 381920 + }, + { + "epoch": 1.4764345688175535, + "grad_norm": 0.12671910226345062, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 381930 + }, + { + "epoch": 1.4764732260209368, + "grad_norm": 0.12871995568275452, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 381940 + }, + { + "epoch": 1.47651188322432, + "grad_norm": 0.10366962105035782, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 381950 + }, + { + "epoch": 1.4765505404277033, + "grad_norm": 0.10168401896953583, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 381960 + }, + { + "epoch": 1.4765891976310865, + "grad_norm": 0.11718802154064178, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 381970 + }, + { + "epoch": 1.4766278548344698, + "grad_norm": 0.11141911149024963, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 381980 + }, + { + "epoch": 1.4766665120378533, + "grad_norm": 0.10923470556735992, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 381990 + }, + { + "epoch": 1.4767051692412365, + "grad_norm": 0.09513156116008759, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 382000 + }, + { + "epoch": 1.4767438264446198, + "grad_norm": 0.12428732961416245, + "learning_rate": 0.002, + "loss": 2.337, + "step": 382010 + }, + { + "epoch": 1.476782483648003, + "grad_norm": 0.10048487782478333, + "learning_rate": 0.002, + "loss": 2.3183, + "step": 382020 + }, + { + "epoch": 1.4768211408513863, + "grad_norm": 0.10726416856050491, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 382030 + }, + { + "epoch": 1.4768597980547695, + "grad_norm": 0.140085369348526, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 382040 + }, + { + "epoch": 1.4768984552581528, + "grad_norm": 0.10316112637519836, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 382050 + }, + { + "epoch": 1.476937112461536, + "grad_norm": 0.10740186274051666, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 382060 + }, + { + "epoch": 1.4769757696649193, + "grad_norm": 0.102906733751297, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 382070 + }, + { + "epoch": 1.4770144268683025, + "grad_norm": 0.10092508047819138, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 382080 + }, + { + "epoch": 1.4770530840716858, + "grad_norm": 0.09575673937797546, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 382090 + }, + { + "epoch": 1.4770917412750693, + "grad_norm": 0.11741238832473755, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 382100 + }, + { + "epoch": 1.4771303984784525, + "grad_norm": 0.12052875757217407, + "learning_rate": 0.002, + "loss": 2.3229, + "step": 382110 + }, + { + "epoch": 1.4771690556818358, + "grad_norm": 0.106513611972332, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 382120 + }, + { + "epoch": 1.477207712885219, + "grad_norm": 0.1042778491973877, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 382130 + }, + { + "epoch": 1.4772463700886023, + "grad_norm": 0.10151872038841248, + "learning_rate": 0.002, + "loss": 2.3174, + "step": 382140 + }, + { + "epoch": 1.4772850272919855, + "grad_norm": 0.12854936718940735, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 382150 + }, + { + "epoch": 1.477323684495369, + "grad_norm": 0.11349263787269592, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 382160 + }, + { + "epoch": 1.4773623416987522, + "grad_norm": 0.09675941616296768, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 382170 + }, + { + "epoch": 1.4774009989021355, + "grad_norm": 0.10710638016462326, + "learning_rate": 0.002, + "loss": 2.341, + "step": 382180 + }, + { + "epoch": 1.4774396561055188, + "grad_norm": 0.10345502942800522, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 382190 + }, + { + "epoch": 1.477478313308902, + "grad_norm": 0.09632916003465652, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 382200 + }, + { + "epoch": 1.4775169705122853, + "grad_norm": 0.09333551675081253, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 382210 + }, + { + "epoch": 1.4775556277156685, + "grad_norm": 0.09522828459739685, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 382220 + }, + { + "epoch": 1.4775942849190518, + "grad_norm": 0.11494068056344986, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 382230 + }, + { + "epoch": 1.477632942122435, + "grad_norm": 0.1599724292755127, + "learning_rate": 0.002, + "loss": 2.325, + "step": 382240 + }, + { + "epoch": 1.4776715993258183, + "grad_norm": 0.10708048194646835, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 382250 + }, + { + "epoch": 1.4777102565292015, + "grad_norm": 0.18249209225177765, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 382260 + }, + { + "epoch": 1.477748913732585, + "grad_norm": 0.10732556134462357, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 382270 + }, + { + "epoch": 1.4777875709359682, + "grad_norm": 0.11067913472652435, + "learning_rate": 0.002, + "loss": 2.3395, + "step": 382280 + }, + { + "epoch": 1.4778262281393515, + "grad_norm": 0.09666985273361206, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 382290 + }, + { + "epoch": 1.4778648853427347, + "grad_norm": 0.10674726217985153, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 382300 + }, + { + "epoch": 1.477903542546118, + "grad_norm": 0.1110454648733139, + "learning_rate": 0.002, + "loss": 2.33, + "step": 382310 + }, + { + "epoch": 1.4779421997495013, + "grad_norm": 0.10222259163856506, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 382320 + }, + { + "epoch": 1.4779808569528847, + "grad_norm": 0.11711250245571136, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 382330 + }, + { + "epoch": 1.478019514156268, + "grad_norm": 0.0934610366821289, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 382340 + }, + { + "epoch": 1.4780581713596512, + "grad_norm": 0.10337325185537338, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 382350 + }, + { + "epoch": 1.4780968285630345, + "grad_norm": 0.11120796948671341, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 382360 + }, + { + "epoch": 1.4781354857664177, + "grad_norm": 0.10233265906572342, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 382370 + }, + { + "epoch": 1.478174142969801, + "grad_norm": 0.11238359659910202, + "learning_rate": 0.002, + "loss": 2.348, + "step": 382380 + }, + { + "epoch": 1.4782128001731842, + "grad_norm": 0.11099530011415482, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 382390 + }, + { + "epoch": 1.4782514573765675, + "grad_norm": 0.13393400609493256, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 382400 + }, + { + "epoch": 1.4782901145799507, + "grad_norm": 0.10745197534561157, + "learning_rate": 0.002, + "loss": 2.3103, + "step": 382410 + }, + { + "epoch": 1.478328771783334, + "grad_norm": 0.09651700407266617, + "learning_rate": 0.002, + "loss": 2.3201, + "step": 382420 + }, + { + "epoch": 1.4783674289867172, + "grad_norm": 0.13040949404239655, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 382430 + }, + { + "epoch": 1.4784060861901007, + "grad_norm": 0.1285749077796936, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 382440 + }, + { + "epoch": 1.478444743393484, + "grad_norm": 0.11416109651327133, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 382450 + }, + { + "epoch": 1.4784834005968672, + "grad_norm": 0.09473222494125366, + "learning_rate": 0.002, + "loss": 2.33, + "step": 382460 + }, + { + "epoch": 1.4785220578002505, + "grad_norm": 0.09636333584785461, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 382470 + }, + { + "epoch": 1.4785607150036337, + "grad_norm": 0.1212683841586113, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 382480 + }, + { + "epoch": 1.478599372207017, + "grad_norm": 0.10573016852140427, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 382490 + }, + { + "epoch": 1.4786380294104005, + "grad_norm": 0.0973353236913681, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 382500 + }, + { + "epoch": 1.4786766866137837, + "grad_norm": 0.10334083437919617, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 382510 + }, + { + "epoch": 1.478715343817167, + "grad_norm": 0.11552633345127106, + "learning_rate": 0.002, + "loss": 2.335, + "step": 382520 + }, + { + "epoch": 1.4787540010205502, + "grad_norm": 0.10685543715953827, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 382530 + }, + { + "epoch": 1.4787926582239335, + "grad_norm": 0.09678447246551514, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 382540 + }, + { + "epoch": 1.4788313154273167, + "grad_norm": 0.10663742572069168, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 382550 + }, + { + "epoch": 1.4788699726307, + "grad_norm": 0.1363559365272522, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 382560 + }, + { + "epoch": 1.4789086298340832, + "grad_norm": 0.12492944300174713, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 382570 + }, + { + "epoch": 1.4789472870374665, + "grad_norm": 0.10026931017637253, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 382580 + }, + { + "epoch": 1.4789859442408497, + "grad_norm": 0.1090857982635498, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 382590 + }, + { + "epoch": 1.4790246014442332, + "grad_norm": 0.09713274985551834, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 382600 + }, + { + "epoch": 1.4790632586476165, + "grad_norm": 0.0868186354637146, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 382610 + }, + { + "epoch": 1.4791019158509997, + "grad_norm": 0.11653818935155869, + "learning_rate": 0.002, + "loss": 2.337, + "step": 382620 + }, + { + "epoch": 1.479140573054383, + "grad_norm": 0.10660745203495026, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 382630 + }, + { + "epoch": 1.4791792302577662, + "grad_norm": 0.10817324370145798, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 382640 + }, + { + "epoch": 1.4792178874611495, + "grad_norm": 0.11057060956954956, + "learning_rate": 0.002, + "loss": 2.3143, + "step": 382650 + }, + { + "epoch": 1.4792565446645327, + "grad_norm": 0.10341325402259827, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 382660 + }, + { + "epoch": 1.4792952018679162, + "grad_norm": 0.11677426844835281, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 382670 + }, + { + "epoch": 1.4793338590712994, + "grad_norm": 0.15986883640289307, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 382680 + }, + { + "epoch": 1.4793725162746827, + "grad_norm": 0.10989416390657425, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 382690 + }, + { + "epoch": 1.479411173478066, + "grad_norm": 0.09436041861772537, + "learning_rate": 0.002, + "loss": 2.332, + "step": 382700 + }, + { + "epoch": 1.4794498306814492, + "grad_norm": 0.11045808345079422, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 382710 + }, + { + "epoch": 1.4794884878848324, + "grad_norm": 0.10220617055892944, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 382720 + }, + { + "epoch": 1.4795271450882157, + "grad_norm": 0.0920969769358635, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 382730 + }, + { + "epoch": 1.479565802291599, + "grad_norm": 0.2078670710325241, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 382740 + }, + { + "epoch": 1.4796044594949822, + "grad_norm": 0.1230267584323883, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 382750 + }, + { + "epoch": 1.4796431166983655, + "grad_norm": 0.10537160933017731, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 382760 + }, + { + "epoch": 1.479681773901749, + "grad_norm": 0.33487197756767273, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 382770 + }, + { + "epoch": 1.4797204311051322, + "grad_norm": 0.10333807021379471, + "learning_rate": 0.002, + "loss": 2.3577, + "step": 382780 + }, + { + "epoch": 1.4797590883085154, + "grad_norm": 0.09185725450515747, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 382790 + }, + { + "epoch": 1.4797977455118987, + "grad_norm": 0.11369094252586365, + "learning_rate": 0.002, + "loss": 2.3423, + "step": 382800 + }, + { + "epoch": 1.479836402715282, + "grad_norm": 0.6589314341545105, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 382810 + }, + { + "epoch": 1.4798750599186652, + "grad_norm": 0.12008057534694672, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 382820 + }, + { + "epoch": 1.4799137171220484, + "grad_norm": 0.10496290773153305, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 382830 + }, + { + "epoch": 1.479952374325432, + "grad_norm": 0.1102655827999115, + "learning_rate": 0.002, + "loss": 2.354, + "step": 382840 + }, + { + "epoch": 1.4799910315288152, + "grad_norm": 0.09562227129936218, + "learning_rate": 0.002, + "loss": 2.3055, + "step": 382850 + }, + { + "epoch": 1.4800296887321984, + "grad_norm": 0.12711955606937408, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 382860 + }, + { + "epoch": 1.4800683459355817, + "grad_norm": 0.11347995698451996, + "learning_rate": 0.002, + "loss": 2.326, + "step": 382870 + }, + { + "epoch": 1.480107003138965, + "grad_norm": 0.11589667201042175, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 382880 + }, + { + "epoch": 1.4801456603423482, + "grad_norm": 0.1088893711566925, + "learning_rate": 0.002, + "loss": 2.3435, + "step": 382890 + }, + { + "epoch": 1.4801843175457314, + "grad_norm": 0.10397517681121826, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 382900 + }, + { + "epoch": 1.4802229747491147, + "grad_norm": 0.10150329023599625, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 382910 + }, + { + "epoch": 1.480261631952498, + "grad_norm": 0.1259625405073166, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 382920 + }, + { + "epoch": 1.4803002891558812, + "grad_norm": 0.10090825706720352, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 382930 + }, + { + "epoch": 1.4803389463592647, + "grad_norm": 0.11497049778699875, + "learning_rate": 0.002, + "loss": 2.3509, + "step": 382940 + }, + { + "epoch": 1.480377603562648, + "grad_norm": 0.12436963617801666, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 382950 + }, + { + "epoch": 1.4804162607660312, + "grad_norm": 0.09711844474077225, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 382960 + }, + { + "epoch": 1.4804549179694144, + "grad_norm": 0.11406796425580978, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 382970 + }, + { + "epoch": 1.4804935751727977, + "grad_norm": 0.09396304190158844, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 382980 + }, + { + "epoch": 1.480532232376181, + "grad_norm": 0.10211638361215591, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 382990 + }, + { + "epoch": 1.4805708895795642, + "grad_norm": 0.08367765694856644, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 383000 + }, + { + "epoch": 1.4806095467829476, + "grad_norm": 0.13419294357299805, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 383010 + }, + { + "epoch": 1.480648203986331, + "grad_norm": 0.10420286655426025, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 383020 + }, + { + "epoch": 1.4806868611897142, + "grad_norm": 0.09555057436227798, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 383030 + }, + { + "epoch": 1.4807255183930974, + "grad_norm": 0.11132855713367462, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 383040 + }, + { + "epoch": 1.4807641755964807, + "grad_norm": 0.11654718965291977, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 383050 + }, + { + "epoch": 1.480802832799864, + "grad_norm": 0.10852136462926865, + "learning_rate": 0.002, + "loss": 2.339, + "step": 383060 + }, + { + "epoch": 1.4808414900032472, + "grad_norm": 0.10440509021282196, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 383070 + }, + { + "epoch": 1.4808801472066304, + "grad_norm": 0.10147271305322647, + "learning_rate": 0.002, + "loss": 2.333, + "step": 383080 + }, + { + "epoch": 1.4809188044100137, + "grad_norm": 0.11680194735527039, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 383090 + }, + { + "epoch": 1.480957461613397, + "grad_norm": 0.10291650146245956, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 383100 + }, + { + "epoch": 1.4809961188167804, + "grad_norm": 0.0920812115073204, + "learning_rate": 0.002, + "loss": 2.3527, + "step": 383110 + }, + { + "epoch": 1.4810347760201636, + "grad_norm": 0.12201771140098572, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 383120 + }, + { + "epoch": 1.481073433223547, + "grad_norm": 0.09390729665756226, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 383130 + }, + { + "epoch": 1.4811120904269302, + "grad_norm": 0.13557425141334534, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 383140 + }, + { + "epoch": 1.4811507476303134, + "grad_norm": 0.1080607920885086, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 383150 + }, + { + "epoch": 1.4811894048336967, + "grad_norm": 0.09251314401626587, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 383160 + }, + { + "epoch": 1.48122806203708, + "grad_norm": 0.09981363266706467, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 383170 + }, + { + "epoch": 1.4812667192404634, + "grad_norm": 0.12779657542705536, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 383180 + }, + { + "epoch": 1.4813053764438466, + "grad_norm": 0.10266245901584625, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 383190 + }, + { + "epoch": 1.4813440336472299, + "grad_norm": 0.09837732464075089, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 383200 + }, + { + "epoch": 1.4813826908506131, + "grad_norm": 0.11256790161132812, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 383210 + }, + { + "epoch": 1.4814213480539964, + "grad_norm": 0.11383913457393646, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 383220 + }, + { + "epoch": 1.4814600052573796, + "grad_norm": 0.10398650914430618, + "learning_rate": 0.002, + "loss": 2.334, + "step": 383230 + }, + { + "epoch": 1.481498662460763, + "grad_norm": 0.08707722276449203, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 383240 + }, + { + "epoch": 1.4815373196641461, + "grad_norm": 0.1149417906999588, + "learning_rate": 0.002, + "loss": 2.337, + "step": 383250 + }, + { + "epoch": 1.4815759768675294, + "grad_norm": 0.10394676774740219, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 383260 + }, + { + "epoch": 1.4816146340709127, + "grad_norm": 0.115664541721344, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 383270 + }, + { + "epoch": 1.4816532912742961, + "grad_norm": 0.12399958074092865, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 383280 + }, + { + "epoch": 1.4816919484776794, + "grad_norm": 0.09220393002033234, + "learning_rate": 0.002, + "loss": 2.344, + "step": 383290 + }, + { + "epoch": 1.4817306056810626, + "grad_norm": 0.10916592925786972, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 383300 + }, + { + "epoch": 1.4817692628844459, + "grad_norm": 0.09219096601009369, + "learning_rate": 0.002, + "loss": 2.33, + "step": 383310 + }, + { + "epoch": 1.4818079200878291, + "grad_norm": 0.10437247902154922, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 383320 + }, + { + "epoch": 1.4818465772912124, + "grad_norm": 0.10224609076976776, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 383330 + }, + { + "epoch": 1.4818852344945959, + "grad_norm": 0.12552587687969208, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 383340 + }, + { + "epoch": 1.4819238916979791, + "grad_norm": 0.11228864639997482, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 383350 + }, + { + "epoch": 1.4819625489013624, + "grad_norm": 0.11492089182138443, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 383360 + }, + { + "epoch": 1.4820012061047456, + "grad_norm": 0.10515844821929932, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 383370 + }, + { + "epoch": 1.4820398633081289, + "grad_norm": 0.09570827335119247, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 383380 + }, + { + "epoch": 1.4820785205115121, + "grad_norm": 0.13947443664073944, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 383390 + }, + { + "epoch": 1.4821171777148954, + "grad_norm": 0.10615614056587219, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 383400 + }, + { + "epoch": 1.4821558349182786, + "grad_norm": 0.10525351017713547, + "learning_rate": 0.002, + "loss": 2.335, + "step": 383410 + }, + { + "epoch": 1.4821944921216619, + "grad_norm": 0.09245917946100235, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 383420 + }, + { + "epoch": 1.4822331493250451, + "grad_norm": 0.10440874099731445, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 383430 + }, + { + "epoch": 1.4822718065284284, + "grad_norm": 0.11015748977661133, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 383440 + }, + { + "epoch": 1.4823104637318119, + "grad_norm": 0.14614816009998322, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 383450 + }, + { + "epoch": 1.482349120935195, + "grad_norm": 0.09687640517950058, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 383460 + }, + { + "epoch": 1.4823877781385784, + "grad_norm": 0.09319257736206055, + "learning_rate": 0.002, + "loss": 2.331, + "step": 383470 + }, + { + "epoch": 1.4824264353419616, + "grad_norm": 0.10338862240314484, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 383480 + }, + { + "epoch": 1.4824650925453449, + "grad_norm": 0.10386480391025543, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 383490 + }, + { + "epoch": 1.4825037497487281, + "grad_norm": 0.10086138546466827, + "learning_rate": 0.002, + "loss": 2.339, + "step": 383500 + }, + { + "epoch": 1.4825424069521116, + "grad_norm": 0.10217492282390594, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 383510 + }, + { + "epoch": 1.4825810641554948, + "grad_norm": 0.09466515481472015, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 383520 + }, + { + "epoch": 1.482619721358878, + "grad_norm": 0.10521284490823746, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 383530 + }, + { + "epoch": 1.4826583785622613, + "grad_norm": 0.09716833382844925, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 383540 + }, + { + "epoch": 1.4826970357656446, + "grad_norm": 0.1123156026005745, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 383550 + }, + { + "epoch": 1.4827356929690279, + "grad_norm": 0.09585864841938019, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 383560 + }, + { + "epoch": 1.482774350172411, + "grad_norm": 0.09589291363954544, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 383570 + }, + { + "epoch": 1.4828130073757944, + "grad_norm": 0.1105579063296318, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 383580 + }, + { + "epoch": 1.4828516645791776, + "grad_norm": 0.10001932829618454, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 383590 + }, + { + "epoch": 1.4828903217825609, + "grad_norm": 0.09974181652069092, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 383600 + }, + { + "epoch": 1.4829289789859441, + "grad_norm": 0.10484086722135544, + "learning_rate": 0.002, + "loss": 2.3581, + "step": 383610 + }, + { + "epoch": 1.4829676361893276, + "grad_norm": 0.10299315303564072, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 383620 + }, + { + "epoch": 1.4830062933927108, + "grad_norm": 0.11832892894744873, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 383630 + }, + { + "epoch": 1.483044950596094, + "grad_norm": 0.12004309892654419, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 383640 + }, + { + "epoch": 1.4830836077994773, + "grad_norm": 0.14523443579673767, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 383650 + }, + { + "epoch": 1.4831222650028606, + "grad_norm": 0.11291990429162979, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 383660 + }, + { + "epoch": 1.4831609222062438, + "grad_norm": 0.10504403710365295, + "learning_rate": 0.002, + "loss": 2.34, + "step": 383670 + }, + { + "epoch": 1.4831995794096273, + "grad_norm": 0.09405023604631424, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 383680 + }, + { + "epoch": 1.4832382366130106, + "grad_norm": 0.10415950417518616, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 383690 + }, + { + "epoch": 1.4832768938163938, + "grad_norm": 0.09688252210617065, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 383700 + }, + { + "epoch": 1.483315551019777, + "grad_norm": 0.130942702293396, + "learning_rate": 0.002, + "loss": 2.3381, + "step": 383710 + }, + { + "epoch": 1.4833542082231603, + "grad_norm": 0.09914401918649673, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 383720 + }, + { + "epoch": 1.4833928654265436, + "grad_norm": 0.11842868477106094, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 383730 + }, + { + "epoch": 1.4834315226299268, + "grad_norm": 0.1023070365190506, + "learning_rate": 0.002, + "loss": 2.327, + "step": 383740 + }, + { + "epoch": 1.48347017983331, + "grad_norm": 0.1171945184469223, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 383750 + }, + { + "epoch": 1.4835088370366933, + "grad_norm": 0.10847219824790955, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 383760 + }, + { + "epoch": 1.4835474942400766, + "grad_norm": 0.10732583701610565, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 383770 + }, + { + "epoch": 1.4835861514434598, + "grad_norm": 0.09026862680912018, + "learning_rate": 0.002, + "loss": 2.32, + "step": 383780 + }, + { + "epoch": 1.4836248086468433, + "grad_norm": 0.09737653285264969, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 383790 + }, + { + "epoch": 1.4836634658502266, + "grad_norm": 0.1116415336728096, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 383800 + }, + { + "epoch": 1.4837021230536098, + "grad_norm": 0.10554317384958267, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 383810 + }, + { + "epoch": 1.483740780256993, + "grad_norm": 0.10899829119443893, + "learning_rate": 0.002, + "loss": 2.332, + "step": 383820 + }, + { + "epoch": 1.4837794374603763, + "grad_norm": 0.09907833486795425, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 383830 + }, + { + "epoch": 1.4838180946637596, + "grad_norm": 0.10291578620672226, + "learning_rate": 0.002, + "loss": 2.326, + "step": 383840 + }, + { + "epoch": 1.483856751867143, + "grad_norm": 0.10465496778488159, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 383850 + }, + { + "epoch": 1.4838954090705263, + "grad_norm": 0.12064478546380997, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 383860 + }, + { + "epoch": 1.4839340662739096, + "grad_norm": 0.12056712806224823, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 383870 + }, + { + "epoch": 1.4839727234772928, + "grad_norm": 0.11756856739521027, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 383880 + }, + { + "epoch": 1.484011380680676, + "grad_norm": 0.09254784137010574, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 383890 + }, + { + "epoch": 1.4840500378840593, + "grad_norm": 0.10828401148319244, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 383900 + }, + { + "epoch": 1.4840886950874426, + "grad_norm": 0.09548208862543106, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 383910 + }, + { + "epoch": 1.4841273522908258, + "grad_norm": 0.09640761464834213, + "learning_rate": 0.002, + "loss": 2.339, + "step": 383920 + }, + { + "epoch": 1.484166009494209, + "grad_norm": 0.11068131029605865, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 383930 + }, + { + "epoch": 1.4842046666975923, + "grad_norm": 0.10378747433423996, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 383940 + }, + { + "epoch": 1.4842433239009756, + "grad_norm": 0.10128451883792877, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 383950 + }, + { + "epoch": 1.484281981104359, + "grad_norm": 0.10668580234050751, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 383960 + }, + { + "epoch": 1.4843206383077423, + "grad_norm": 0.10479380190372467, + "learning_rate": 0.002, + "loss": 2.3487, + "step": 383970 + }, + { + "epoch": 1.4843592955111256, + "grad_norm": 0.10641813278198242, + "learning_rate": 0.002, + "loss": 2.341, + "step": 383980 + }, + { + "epoch": 1.4843979527145088, + "grad_norm": 0.10178200155496597, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 383990 + }, + { + "epoch": 1.484436609917892, + "grad_norm": 0.10124867409467697, + "learning_rate": 0.002, + "loss": 2.326, + "step": 384000 + }, + { + "epoch": 1.4844752671212753, + "grad_norm": 0.10476204007863998, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 384010 + }, + { + "epoch": 1.4845139243246588, + "grad_norm": 0.10116813331842422, + "learning_rate": 0.002, + "loss": 2.346, + "step": 384020 + }, + { + "epoch": 1.484552581528042, + "grad_norm": 0.12537890672683716, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 384030 + }, + { + "epoch": 1.4845912387314253, + "grad_norm": 0.10372122377157211, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 384040 + }, + { + "epoch": 1.4846298959348085, + "grad_norm": 0.10350830852985382, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 384050 + }, + { + "epoch": 1.4846685531381918, + "grad_norm": 0.09183838963508606, + "learning_rate": 0.002, + "loss": 2.327, + "step": 384060 + }, + { + "epoch": 1.484707210341575, + "grad_norm": 0.12090179324150085, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 384070 + }, + { + "epoch": 1.4847458675449583, + "grad_norm": 0.1174481064081192, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 384080 + }, + { + "epoch": 1.4847845247483415, + "grad_norm": 0.08650671690702438, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 384090 + }, + { + "epoch": 1.4848231819517248, + "grad_norm": 0.08859454840421677, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 384100 + }, + { + "epoch": 1.484861839155108, + "grad_norm": 0.0890578106045723, + "learning_rate": 0.002, + "loss": 2.3051, + "step": 384110 + }, + { + "epoch": 1.4849004963584913, + "grad_norm": 0.12500403821468353, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 384120 + }, + { + "epoch": 1.4849391535618748, + "grad_norm": 0.11174352467060089, + "learning_rate": 0.002, + "loss": 2.329, + "step": 384130 + }, + { + "epoch": 1.484977810765258, + "grad_norm": 0.12891241908073425, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 384140 + }, + { + "epoch": 1.4850164679686413, + "grad_norm": 0.1084713265299797, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 384150 + }, + { + "epoch": 1.4850551251720245, + "grad_norm": 0.10541419684886932, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 384160 + }, + { + "epoch": 1.4850937823754078, + "grad_norm": 0.11375588923692703, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 384170 + }, + { + "epoch": 1.485132439578791, + "grad_norm": 0.09189879149198532, + "learning_rate": 0.002, + "loss": 2.339, + "step": 384180 + }, + { + "epoch": 1.4851710967821745, + "grad_norm": 0.09924867004156113, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 384190 + }, + { + "epoch": 1.4852097539855578, + "grad_norm": 0.10338280349969864, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 384200 + }, + { + "epoch": 1.485248411188941, + "grad_norm": 0.10292736440896988, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 384210 + }, + { + "epoch": 1.4852870683923243, + "grad_norm": 0.1111016720533371, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 384220 + }, + { + "epoch": 1.4853257255957075, + "grad_norm": 0.1034284457564354, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 384230 + }, + { + "epoch": 1.4853643827990908, + "grad_norm": 0.08710968494415283, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 384240 + }, + { + "epoch": 1.485403040002474, + "grad_norm": 0.10321284830570221, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 384250 + }, + { + "epoch": 1.4854416972058573, + "grad_norm": 0.10766103863716125, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 384260 + }, + { + "epoch": 1.4854803544092405, + "grad_norm": 0.10859043151140213, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 384270 + }, + { + "epoch": 1.4855190116126238, + "grad_norm": 0.09920382499694824, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 384280 + }, + { + "epoch": 1.485557668816007, + "grad_norm": 0.09533470869064331, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 384290 + }, + { + "epoch": 1.4855963260193905, + "grad_norm": 0.10594595968723297, + "learning_rate": 0.002, + "loss": 2.3148, + "step": 384300 + }, + { + "epoch": 1.4856349832227738, + "grad_norm": 0.12473361939191818, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 384310 + }, + { + "epoch": 1.485673640426157, + "grad_norm": 0.10555955022573471, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 384320 + }, + { + "epoch": 1.4857122976295403, + "grad_norm": 0.09441909939050674, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 384330 + }, + { + "epoch": 1.4857509548329235, + "grad_norm": 0.1008702889084816, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 384340 + }, + { + "epoch": 1.4857896120363068, + "grad_norm": 0.0997135266661644, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 384350 + }, + { + "epoch": 1.4858282692396902, + "grad_norm": 0.11072135716676712, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 384360 + }, + { + "epoch": 1.4858669264430735, + "grad_norm": 0.1106172502040863, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 384370 + }, + { + "epoch": 1.4859055836464568, + "grad_norm": 0.09268863499164581, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 384380 + }, + { + "epoch": 1.48594424084984, + "grad_norm": 0.09718073159456253, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 384390 + }, + { + "epoch": 1.4859828980532233, + "grad_norm": 0.10194788128137589, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 384400 + }, + { + "epoch": 1.4860215552566065, + "grad_norm": 0.13043349981307983, + "learning_rate": 0.002, + "loss": 2.3508, + "step": 384410 + }, + { + "epoch": 1.4860602124599898, + "grad_norm": 0.09998510032892227, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 384420 + }, + { + "epoch": 1.486098869663373, + "grad_norm": 0.1066591814160347, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 384430 + }, + { + "epoch": 1.4861375268667563, + "grad_norm": 0.11668353527784348, + "learning_rate": 0.002, + "loss": 2.3558, + "step": 384440 + }, + { + "epoch": 1.4861761840701395, + "grad_norm": 0.1100035160779953, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 384450 + }, + { + "epoch": 1.486214841273523, + "grad_norm": 0.11471410095691681, + "learning_rate": 0.002, + "loss": 2.308, + "step": 384460 + }, + { + "epoch": 1.4862534984769062, + "grad_norm": 0.09942575544118881, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 384470 + }, + { + "epoch": 1.4862921556802895, + "grad_norm": 0.11599720269441605, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 384480 + }, + { + "epoch": 1.4863308128836727, + "grad_norm": 0.10169614106416702, + "learning_rate": 0.002, + "loss": 2.318, + "step": 384490 + }, + { + "epoch": 1.486369470087056, + "grad_norm": 0.12186551839113235, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 384500 + }, + { + "epoch": 1.4864081272904393, + "grad_norm": 0.10848033428192139, + "learning_rate": 0.002, + "loss": 2.3498, + "step": 384510 + }, + { + "epoch": 1.4864467844938225, + "grad_norm": 0.1230197325348854, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 384520 + }, + { + "epoch": 1.486485441697206, + "grad_norm": 0.11099942028522491, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 384530 + }, + { + "epoch": 1.4865240989005892, + "grad_norm": 0.11096183955669403, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 384540 + }, + { + "epoch": 1.4865627561039725, + "grad_norm": 0.10845883190631866, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 384550 + }, + { + "epoch": 1.4866014133073557, + "grad_norm": 0.11498292535543442, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 384560 + }, + { + "epoch": 1.486640070510739, + "grad_norm": 0.10446729511022568, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 384570 + }, + { + "epoch": 1.4866787277141222, + "grad_norm": 0.1114344671368599, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 384580 + }, + { + "epoch": 1.4867173849175055, + "grad_norm": 0.10594511777162552, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 384590 + }, + { + "epoch": 1.4867560421208887, + "grad_norm": 0.09707600623369217, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 384600 + }, + { + "epoch": 1.486794699324272, + "grad_norm": 0.1228179857134819, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 384610 + }, + { + "epoch": 1.4868333565276552, + "grad_norm": 0.10634078830480576, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 384620 + }, + { + "epoch": 1.4868720137310387, + "grad_norm": 0.1053246408700943, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 384630 + }, + { + "epoch": 1.486910670934422, + "grad_norm": 0.10659126192331314, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 384640 + }, + { + "epoch": 1.4869493281378052, + "grad_norm": 0.10877703130245209, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 384650 + }, + { + "epoch": 1.4869879853411885, + "grad_norm": 0.11045970767736435, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 384660 + }, + { + "epoch": 1.4870266425445717, + "grad_norm": 0.09530500322580338, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 384670 + }, + { + "epoch": 1.487065299747955, + "grad_norm": 0.11263837665319443, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 384680 + }, + { + "epoch": 1.4871039569513382, + "grad_norm": 0.10159396380186081, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 384690 + }, + { + "epoch": 1.4871426141547217, + "grad_norm": 0.10858163982629776, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 384700 + }, + { + "epoch": 1.487181271358105, + "grad_norm": 0.11083827912807465, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 384710 + }, + { + "epoch": 1.4872199285614882, + "grad_norm": 0.1142810732126236, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 384720 + }, + { + "epoch": 1.4872585857648715, + "grad_norm": 0.10030544549226761, + "learning_rate": 0.002, + "loss": 2.3506, + "step": 384730 + }, + { + "epoch": 1.4872972429682547, + "grad_norm": 0.10645314306020737, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 384740 + }, + { + "epoch": 1.487335900171638, + "grad_norm": 0.0968933179974556, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 384750 + }, + { + "epoch": 1.4873745573750212, + "grad_norm": 0.11533058434724808, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 384760 + }, + { + "epoch": 1.4874132145784045, + "grad_norm": 0.10017845034599304, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 384770 + }, + { + "epoch": 1.4874518717817877, + "grad_norm": 0.11839465796947479, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 384780 + }, + { + "epoch": 1.487490528985171, + "grad_norm": 0.11330369114875793, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 384790 + }, + { + "epoch": 1.4875291861885545, + "grad_norm": 0.1114191859960556, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 384800 + }, + { + "epoch": 1.4875678433919377, + "grad_norm": 0.10950705409049988, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 384810 + }, + { + "epoch": 1.487606500595321, + "grad_norm": 0.11004837602376938, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 384820 + }, + { + "epoch": 1.4876451577987042, + "grad_norm": 0.09497445821762085, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 384830 + }, + { + "epoch": 1.4876838150020875, + "grad_norm": 0.09108157455921173, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 384840 + }, + { + "epoch": 1.4877224722054707, + "grad_norm": 0.10038923472166061, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 384850 + }, + { + "epoch": 1.487761129408854, + "grad_norm": 0.10554396361112595, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 384860 + }, + { + "epoch": 1.4877997866122374, + "grad_norm": 0.10369051992893219, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 384870 + }, + { + "epoch": 1.4878384438156207, + "grad_norm": 0.1258969008922577, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 384880 + }, + { + "epoch": 1.487877101019004, + "grad_norm": 0.08914817869663239, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 384890 + }, + { + "epoch": 1.4879157582223872, + "grad_norm": 0.09517431259155273, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 384900 + }, + { + "epoch": 1.4879544154257704, + "grad_norm": 0.09845437854528427, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 384910 + }, + { + "epoch": 1.4879930726291537, + "grad_norm": 0.12158235162496567, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 384920 + }, + { + "epoch": 1.488031729832537, + "grad_norm": 0.10567854344844818, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 384930 + }, + { + "epoch": 1.4880703870359202, + "grad_norm": 0.09979545325040817, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 384940 + }, + { + "epoch": 1.4881090442393035, + "grad_norm": 0.1211281418800354, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 384950 + }, + { + "epoch": 1.4881477014426867, + "grad_norm": 0.09864045679569244, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 384960 + }, + { + "epoch": 1.4881863586460702, + "grad_norm": 0.1125154048204422, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 384970 + }, + { + "epoch": 1.4882250158494534, + "grad_norm": 0.11239704489707947, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 384980 + }, + { + "epoch": 1.4882636730528367, + "grad_norm": 0.10237913578748703, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 384990 + }, + { + "epoch": 1.48830233025622, + "grad_norm": 0.10310379415750504, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 385000 + }, + { + "epoch": 1.4883409874596032, + "grad_norm": 0.10797710716724396, + "learning_rate": 0.002, + "loss": 2.335, + "step": 385010 + }, + { + "epoch": 1.4883796446629864, + "grad_norm": 0.10414762794971466, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 385020 + }, + { + "epoch": 1.4884183018663697, + "grad_norm": 0.12361656874418259, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 385030 + }, + { + "epoch": 1.4884569590697532, + "grad_norm": 0.10109461098909378, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 385040 + }, + { + "epoch": 1.4884956162731364, + "grad_norm": 0.1054452583193779, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 385050 + }, + { + "epoch": 1.4885342734765197, + "grad_norm": 0.09931855648756027, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 385060 + }, + { + "epoch": 1.488572930679903, + "grad_norm": 0.12945152819156647, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 385070 + }, + { + "epoch": 1.4886115878832862, + "grad_norm": 0.10160331428050995, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 385080 + }, + { + "epoch": 1.4886502450866694, + "grad_norm": 0.09898512065410614, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 385090 + }, + { + "epoch": 1.4886889022900527, + "grad_norm": 0.09146331250667572, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 385100 + }, + { + "epoch": 1.488727559493436, + "grad_norm": 0.10068368166685104, + "learning_rate": 0.002, + "loss": 2.308, + "step": 385110 + }, + { + "epoch": 1.4887662166968192, + "grad_norm": 0.10230806469917297, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 385120 + }, + { + "epoch": 1.4888048739002024, + "grad_norm": 0.10139543563127518, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 385130 + }, + { + "epoch": 1.488843531103586, + "grad_norm": 0.10228900611400604, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 385140 + }, + { + "epoch": 1.4888821883069692, + "grad_norm": 0.110586978495121, + "learning_rate": 0.002, + "loss": 2.32, + "step": 385150 + }, + { + "epoch": 1.4889208455103524, + "grad_norm": 0.10921701043844223, + "learning_rate": 0.002, + "loss": 2.332, + "step": 385160 + }, + { + "epoch": 1.4889595027137357, + "grad_norm": 0.10913387686014175, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 385170 + }, + { + "epoch": 1.488998159917119, + "grad_norm": 0.10443458706140518, + "learning_rate": 0.002, + "loss": 2.344, + "step": 385180 + }, + { + "epoch": 1.4890368171205022, + "grad_norm": 0.11597229540348053, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 385190 + }, + { + "epoch": 1.4890754743238857, + "grad_norm": 0.09303038567304611, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 385200 + }, + { + "epoch": 1.489114131527269, + "grad_norm": 0.0993490144610405, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 385210 + }, + { + "epoch": 1.4891527887306522, + "grad_norm": 0.10018926113843918, + "learning_rate": 0.002, + "loss": 2.324, + "step": 385220 + }, + { + "epoch": 1.4891914459340354, + "grad_norm": 0.1158030554652214, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 385230 + }, + { + "epoch": 1.4892301031374187, + "grad_norm": 0.10592015087604523, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 385240 + }, + { + "epoch": 1.489268760340802, + "grad_norm": 0.10533588379621506, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 385250 + }, + { + "epoch": 1.4893074175441852, + "grad_norm": 0.10306498408317566, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 385260 + }, + { + "epoch": 1.4893460747475684, + "grad_norm": 0.1061411127448082, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 385270 + }, + { + "epoch": 1.4893847319509517, + "grad_norm": 0.0947585478425026, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 385280 + }, + { + "epoch": 1.489423389154335, + "grad_norm": 0.13337740302085876, + "learning_rate": 0.002, + "loss": 2.3428, + "step": 385290 + }, + { + "epoch": 1.4894620463577182, + "grad_norm": 0.11710366606712341, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 385300 + }, + { + "epoch": 1.4895007035611016, + "grad_norm": 0.11103884130716324, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 385310 + }, + { + "epoch": 1.489539360764485, + "grad_norm": 0.1108136996626854, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 385320 + }, + { + "epoch": 1.4895780179678682, + "grad_norm": 0.11765240877866745, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 385330 + }, + { + "epoch": 1.4896166751712514, + "grad_norm": 0.09105757623910904, + "learning_rate": 0.002, + "loss": 2.314, + "step": 385340 + }, + { + "epoch": 1.4896553323746347, + "grad_norm": 0.09805571287870407, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 385350 + }, + { + "epoch": 1.489693989578018, + "grad_norm": 0.11820369213819504, + "learning_rate": 0.002, + "loss": 2.3193, + "step": 385360 + }, + { + "epoch": 1.4897326467814014, + "grad_norm": 0.09663347899913788, + "learning_rate": 0.002, + "loss": 2.324, + "step": 385370 + }, + { + "epoch": 1.4897713039847846, + "grad_norm": 0.10458124428987503, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 385380 + }, + { + "epoch": 1.4898099611881679, + "grad_norm": 0.10823596268892288, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 385390 + }, + { + "epoch": 1.4898486183915511, + "grad_norm": 0.09211437404155731, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 385400 + }, + { + "epoch": 1.4898872755949344, + "grad_norm": 0.1029345691204071, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 385410 + }, + { + "epoch": 1.4899259327983176, + "grad_norm": 0.09940619766712189, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 385420 + }, + { + "epoch": 1.489964590001701, + "grad_norm": 0.0912894532084465, + "learning_rate": 0.002, + "loss": 2.318, + "step": 385430 + }, + { + "epoch": 1.4900032472050841, + "grad_norm": 0.10252334922552109, + "learning_rate": 0.002, + "loss": 2.3122, + "step": 385440 + }, + { + "epoch": 1.4900419044084674, + "grad_norm": 0.11161807179450989, + "learning_rate": 0.002, + "loss": 2.344, + "step": 385450 + }, + { + "epoch": 1.4900805616118507, + "grad_norm": 0.11407460272312164, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 385460 + }, + { + "epoch": 1.490119218815234, + "grad_norm": 0.10580111294984818, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 385470 + }, + { + "epoch": 1.4901578760186174, + "grad_norm": 0.10977879911661148, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 385480 + }, + { + "epoch": 1.4901965332220006, + "grad_norm": 0.11768648028373718, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 385490 + }, + { + "epoch": 1.4902351904253839, + "grad_norm": 0.09398915618658066, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 385500 + }, + { + "epoch": 1.4902738476287671, + "grad_norm": 0.09989374130964279, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 385510 + }, + { + "epoch": 1.4903125048321504, + "grad_norm": 0.1387077122926712, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 385520 + }, + { + "epoch": 1.4903511620355336, + "grad_norm": 0.11738712340593338, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 385530 + }, + { + "epoch": 1.4903898192389171, + "grad_norm": 0.10570180416107178, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 385540 + }, + { + "epoch": 1.4904284764423004, + "grad_norm": 0.12425248324871063, + "learning_rate": 0.002, + "loss": 2.3622, + "step": 385550 + }, + { + "epoch": 1.4904671336456836, + "grad_norm": 0.11239184439182281, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 385560 + }, + { + "epoch": 1.4905057908490669, + "grad_norm": 0.11231973767280579, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 385570 + }, + { + "epoch": 1.4905444480524501, + "grad_norm": 0.10970614850521088, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 385580 + }, + { + "epoch": 1.4905831052558334, + "grad_norm": 0.09998834133148193, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 385590 + }, + { + "epoch": 1.4906217624592166, + "grad_norm": 0.10915309935808182, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 385600 + }, + { + "epoch": 1.4906604196625999, + "grad_norm": 0.1345834732055664, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 385610 + }, + { + "epoch": 1.4906990768659831, + "grad_norm": 0.09776283800601959, + "learning_rate": 0.002, + "loss": 2.343, + "step": 385620 + }, + { + "epoch": 1.4907377340693664, + "grad_norm": 0.10310234874486923, + "learning_rate": 0.002, + "loss": 2.342, + "step": 385630 + }, + { + "epoch": 1.4907763912727496, + "grad_norm": 0.11326014995574951, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 385640 + }, + { + "epoch": 1.490815048476133, + "grad_norm": 0.09007581323385239, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 385650 + }, + { + "epoch": 1.4908537056795164, + "grad_norm": 0.09305016696453094, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 385660 + }, + { + "epoch": 1.4908923628828996, + "grad_norm": 0.12081436812877655, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 385670 + }, + { + "epoch": 1.4909310200862829, + "grad_norm": 0.1021009087562561, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 385680 + }, + { + "epoch": 1.4909696772896661, + "grad_norm": 0.11606641858816147, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 385690 + }, + { + "epoch": 1.4910083344930494, + "grad_norm": 0.10912982374429703, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 385700 + }, + { + "epoch": 1.4910469916964328, + "grad_norm": 0.11273853480815887, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 385710 + }, + { + "epoch": 1.491085648899816, + "grad_norm": 0.10597539693117142, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 385720 + }, + { + "epoch": 1.4911243061031993, + "grad_norm": 0.10982988774776459, + "learning_rate": 0.002, + "loss": 2.331, + "step": 385730 + }, + { + "epoch": 1.4911629633065826, + "grad_norm": 0.10974101722240448, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 385740 + }, + { + "epoch": 1.4912016205099659, + "grad_norm": 0.09794044494628906, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 385750 + }, + { + "epoch": 1.491240277713349, + "grad_norm": 0.09805993735790253, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 385760 + }, + { + "epoch": 1.4912789349167324, + "grad_norm": 0.11157072335481644, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 385770 + }, + { + "epoch": 1.4913175921201156, + "grad_norm": 0.09369926154613495, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 385780 + }, + { + "epoch": 1.4913562493234989, + "grad_norm": 0.10773736238479614, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 385790 + }, + { + "epoch": 1.4913949065268821, + "grad_norm": 0.09465586394071579, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 385800 + }, + { + "epoch": 1.4914335637302654, + "grad_norm": 0.09090068191289902, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 385810 + }, + { + "epoch": 1.4914722209336488, + "grad_norm": 0.14364342391490936, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 385820 + }, + { + "epoch": 1.491510878137032, + "grad_norm": 0.09867400676012039, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 385830 + }, + { + "epoch": 1.4915495353404153, + "grad_norm": 0.1080484688282013, + "learning_rate": 0.002, + "loss": 2.3491, + "step": 385840 + }, + { + "epoch": 1.4915881925437986, + "grad_norm": 0.09689328819513321, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 385850 + }, + { + "epoch": 1.4916268497471818, + "grad_norm": 0.11398959159851074, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 385860 + }, + { + "epoch": 1.491665506950565, + "grad_norm": 0.09113884717226028, + "learning_rate": 0.002, + "loss": 2.3114, + "step": 385870 + }, + { + "epoch": 1.4917041641539486, + "grad_norm": 0.10891013592481613, + "learning_rate": 0.002, + "loss": 2.343, + "step": 385880 + }, + { + "epoch": 1.4917428213573318, + "grad_norm": 0.09847282618284225, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 385890 + }, + { + "epoch": 1.491781478560715, + "grad_norm": 0.13759636878967285, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 385900 + }, + { + "epoch": 1.4918201357640983, + "grad_norm": 0.10775196552276611, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 385910 + }, + { + "epoch": 1.4918587929674816, + "grad_norm": 0.10851259529590607, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 385920 + }, + { + "epoch": 1.4918974501708648, + "grad_norm": 0.10537663847208023, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 385930 + }, + { + "epoch": 1.491936107374248, + "grad_norm": 0.09828297793865204, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 385940 + }, + { + "epoch": 1.4919747645776313, + "grad_norm": 0.09515740722417831, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 385950 + }, + { + "epoch": 1.4920134217810146, + "grad_norm": 0.1303916722536087, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 385960 + }, + { + "epoch": 1.4920520789843978, + "grad_norm": 0.09538701176643372, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 385970 + }, + { + "epoch": 1.492090736187781, + "grad_norm": 0.10584598779678345, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 385980 + }, + { + "epoch": 1.4921293933911646, + "grad_norm": 0.1097467914223671, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 385990 + }, + { + "epoch": 1.4921680505945478, + "grad_norm": 0.10740872472524643, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 386000 + }, + { + "epoch": 1.492206707797931, + "grad_norm": 0.09600348770618439, + "learning_rate": 0.002, + "loss": 2.3159, + "step": 386010 + }, + { + "epoch": 1.4922453650013143, + "grad_norm": 0.1136220321059227, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 386020 + }, + { + "epoch": 1.4922840222046976, + "grad_norm": 0.10824783891439438, + "learning_rate": 0.002, + "loss": 2.345, + "step": 386030 + }, + { + "epoch": 1.4923226794080808, + "grad_norm": 0.09880796074867249, + "learning_rate": 0.002, + "loss": 2.351, + "step": 386040 + }, + { + "epoch": 1.4923613366114643, + "grad_norm": 0.10647264122962952, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 386050 + }, + { + "epoch": 1.4923999938148476, + "grad_norm": 0.08816510438919067, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 386060 + }, + { + "epoch": 1.4924386510182308, + "grad_norm": 0.10047812014818192, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 386070 + }, + { + "epoch": 1.492477308221614, + "grad_norm": 0.12441463768482208, + "learning_rate": 0.002, + "loss": 2.317, + "step": 386080 + }, + { + "epoch": 1.4925159654249973, + "grad_norm": 0.10287167876958847, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 386090 + }, + { + "epoch": 1.4925546226283806, + "grad_norm": 0.11641228199005127, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 386100 + }, + { + "epoch": 1.4925932798317638, + "grad_norm": 0.11843828856945038, + "learning_rate": 0.002, + "loss": 2.332, + "step": 386110 + }, + { + "epoch": 1.492631937035147, + "grad_norm": 0.10662206262350082, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 386120 + }, + { + "epoch": 1.4926705942385303, + "grad_norm": 0.10384664684534073, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 386130 + }, + { + "epoch": 1.4927092514419136, + "grad_norm": 0.10603508353233337, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 386140 + }, + { + "epoch": 1.4927479086452968, + "grad_norm": 0.11298702657222748, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 386150 + }, + { + "epoch": 1.4927865658486803, + "grad_norm": 0.08943584561347961, + "learning_rate": 0.002, + "loss": 2.345, + "step": 386160 + }, + { + "epoch": 1.4928252230520636, + "grad_norm": 0.11151974648237228, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 386170 + }, + { + "epoch": 1.4928638802554468, + "grad_norm": 0.10686797648668289, + "learning_rate": 0.002, + "loss": 2.3493, + "step": 386180 + }, + { + "epoch": 1.49290253745883, + "grad_norm": 0.1035001203417778, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 386190 + }, + { + "epoch": 1.4929411946622133, + "grad_norm": 0.10172740370035172, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 386200 + }, + { + "epoch": 1.4929798518655966, + "grad_norm": 0.11338132619857788, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 386210 + }, + { + "epoch": 1.49301850906898, + "grad_norm": 0.11095208674669266, + "learning_rate": 0.002, + "loss": 2.3464, + "step": 386220 + }, + { + "epoch": 1.4930571662723633, + "grad_norm": 0.111575648188591, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 386230 + }, + { + "epoch": 1.4930958234757465, + "grad_norm": 0.09547489881515503, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 386240 + }, + { + "epoch": 1.4931344806791298, + "grad_norm": 0.09410912543535233, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 386250 + }, + { + "epoch": 1.493173137882513, + "grad_norm": 0.1237073466181755, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 386260 + }, + { + "epoch": 1.4932117950858963, + "grad_norm": 0.11605266481637955, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 386270 + }, + { + "epoch": 1.4932504522892796, + "grad_norm": 0.10717470943927765, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 386280 + }, + { + "epoch": 1.4932891094926628, + "grad_norm": 0.10141540318727493, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 386290 + }, + { + "epoch": 1.493327766696046, + "grad_norm": 0.1000044122338295, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 386300 + }, + { + "epoch": 1.4933664238994293, + "grad_norm": 0.0962546244263649, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 386310 + }, + { + "epoch": 1.4934050811028126, + "grad_norm": 0.09642874449491501, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 386320 + }, + { + "epoch": 1.493443738306196, + "grad_norm": 0.08658089488744736, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 386330 + }, + { + "epoch": 1.4934823955095793, + "grad_norm": 0.14702236652374268, + "learning_rate": 0.002, + "loss": 2.338, + "step": 386340 + }, + { + "epoch": 1.4935210527129625, + "grad_norm": 0.10530516505241394, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 386350 + }, + { + "epoch": 1.4935597099163458, + "grad_norm": 0.0912630707025528, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 386360 + }, + { + "epoch": 1.493598367119729, + "grad_norm": 0.12375325709581375, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 386370 + }, + { + "epoch": 1.4936370243231123, + "grad_norm": 0.0921991690993309, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 386380 + }, + { + "epoch": 1.4936756815264958, + "grad_norm": 0.11260882019996643, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 386390 + }, + { + "epoch": 1.493714338729879, + "grad_norm": 0.11492011696100235, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 386400 + }, + { + "epoch": 1.4937529959332623, + "grad_norm": 0.11711040884256363, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 386410 + }, + { + "epoch": 1.4937916531366455, + "grad_norm": 0.1272195279598236, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 386420 + }, + { + "epoch": 1.4938303103400288, + "grad_norm": 0.09441424161195755, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 386430 + }, + { + "epoch": 1.493868967543412, + "grad_norm": 0.09789464622735977, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 386440 + }, + { + "epoch": 1.4939076247467953, + "grad_norm": 0.10984098166227341, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 386450 + }, + { + "epoch": 1.4939462819501785, + "grad_norm": 0.0962597206234932, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 386460 + }, + { + "epoch": 1.4939849391535618, + "grad_norm": 0.10608778893947601, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 386470 + }, + { + "epoch": 1.494023596356945, + "grad_norm": 0.10052426904439926, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 386480 + }, + { + "epoch": 1.4940622535603285, + "grad_norm": 0.10627039521932602, + "learning_rate": 0.002, + "loss": 2.3462, + "step": 386490 + }, + { + "epoch": 1.4941009107637118, + "grad_norm": 0.10974103212356567, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 386500 + }, + { + "epoch": 1.494139567967095, + "grad_norm": 0.10237738490104675, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 386510 + }, + { + "epoch": 1.4941782251704783, + "grad_norm": 0.11119193583726883, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 386520 + }, + { + "epoch": 1.4942168823738615, + "grad_norm": 0.12733414769172668, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 386530 + }, + { + "epoch": 1.4942555395772448, + "grad_norm": 0.11605283617973328, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 386540 + }, + { + "epoch": 1.494294196780628, + "grad_norm": 0.1292264312505722, + "learning_rate": 0.002, + "loss": 2.323, + "step": 386550 + }, + { + "epoch": 1.4943328539840115, + "grad_norm": 0.11857552826404572, + "learning_rate": 0.002, + "loss": 2.331, + "step": 386560 + }, + { + "epoch": 1.4943715111873948, + "grad_norm": 0.0968799740076065, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 386570 + }, + { + "epoch": 1.494410168390778, + "grad_norm": 0.2860582768917084, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 386580 + }, + { + "epoch": 1.4944488255941613, + "grad_norm": 0.11790402233600616, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 386590 + }, + { + "epoch": 1.4944874827975445, + "grad_norm": 0.1172618493437767, + "learning_rate": 0.002, + "loss": 2.3282, + "step": 386600 + }, + { + "epoch": 1.4945261400009278, + "grad_norm": 0.08819235861301422, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 386610 + }, + { + "epoch": 1.494564797204311, + "grad_norm": 0.11440522223711014, + "learning_rate": 0.002, + "loss": 2.332, + "step": 386620 + }, + { + "epoch": 1.4946034544076943, + "grad_norm": 0.11602221429347992, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 386630 + }, + { + "epoch": 1.4946421116110775, + "grad_norm": 0.08647876977920532, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 386640 + }, + { + "epoch": 1.4946807688144608, + "grad_norm": 0.12004893273115158, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 386650 + }, + { + "epoch": 1.4947194260178442, + "grad_norm": 0.1069132536649704, + "learning_rate": 0.002, + "loss": 2.3426, + "step": 386660 + }, + { + "epoch": 1.4947580832212275, + "grad_norm": 0.09967175871133804, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 386670 + }, + { + "epoch": 1.4947967404246107, + "grad_norm": 0.11542779952287674, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 386680 + }, + { + "epoch": 1.494835397627994, + "grad_norm": 0.10156113654375076, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 386690 + }, + { + "epoch": 1.4948740548313773, + "grad_norm": 0.10142820328474045, + "learning_rate": 0.002, + "loss": 2.3445, + "step": 386700 + }, + { + "epoch": 1.4949127120347605, + "grad_norm": 0.10076209157705307, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 386710 + }, + { + "epoch": 1.4949513692381438, + "grad_norm": 0.10062038898468018, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 386720 + }, + { + "epoch": 1.4949900264415272, + "grad_norm": 0.0967542752623558, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 386730 + }, + { + "epoch": 1.4950286836449105, + "grad_norm": 0.11722204834222794, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 386740 + }, + { + "epoch": 1.4950673408482937, + "grad_norm": 0.10882771760225296, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 386750 + }, + { + "epoch": 1.495105998051677, + "grad_norm": 0.109149269759655, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 386760 + }, + { + "epoch": 1.4951446552550602, + "grad_norm": 0.10933854430913925, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 386770 + }, + { + "epoch": 1.4951833124584435, + "grad_norm": 0.10297413170337677, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 386780 + }, + { + "epoch": 1.4952219696618267, + "grad_norm": 0.10061938315629959, + "learning_rate": 0.002, + "loss": 2.3117, + "step": 386790 + }, + { + "epoch": 1.49526062686521, + "grad_norm": 0.10365668684244156, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 386800 + }, + { + "epoch": 1.4952992840685932, + "grad_norm": 0.10860082507133484, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 386810 + }, + { + "epoch": 1.4953379412719765, + "grad_norm": 0.1317634880542755, + "learning_rate": 0.002, + "loss": 2.3093, + "step": 386820 + }, + { + "epoch": 1.49537659847536, + "grad_norm": 0.12806427478790283, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 386830 + }, + { + "epoch": 1.4954152556787432, + "grad_norm": 0.11585642397403717, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 386840 + }, + { + "epoch": 1.4954539128821265, + "grad_norm": 0.09713537245988846, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 386850 + }, + { + "epoch": 1.4954925700855097, + "grad_norm": 0.0927940234541893, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 386860 + }, + { + "epoch": 1.495531227288893, + "grad_norm": 0.09935614466667175, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 386870 + }, + { + "epoch": 1.4955698844922762, + "grad_norm": 0.10377293825149536, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 386880 + }, + { + "epoch": 1.4956085416956595, + "grad_norm": 0.11830423027276993, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 386890 + }, + { + "epoch": 1.495647198899043, + "grad_norm": 0.10357511788606644, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 386900 + }, + { + "epoch": 1.4956858561024262, + "grad_norm": 0.10574238002300262, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 386910 + }, + { + "epoch": 1.4957245133058095, + "grad_norm": 0.11580254137516022, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 386920 + }, + { + "epoch": 1.4957631705091927, + "grad_norm": 0.10394071042537689, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 386930 + }, + { + "epoch": 1.495801827712576, + "grad_norm": 0.14849720895290375, + "learning_rate": 0.002, + "loss": 2.341, + "step": 386940 + }, + { + "epoch": 1.4958404849159592, + "grad_norm": 0.0992945060133934, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 386950 + }, + { + "epoch": 1.4958791421193425, + "grad_norm": 0.09691601246595383, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 386960 + }, + { + "epoch": 1.4959177993227257, + "grad_norm": 0.09586106985807419, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 386970 + }, + { + "epoch": 1.495956456526109, + "grad_norm": 0.10614022612571716, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 386980 + }, + { + "epoch": 1.4959951137294922, + "grad_norm": 0.10315316915512085, + "learning_rate": 0.002, + "loss": 2.319, + "step": 386990 + }, + { + "epoch": 1.4960337709328757, + "grad_norm": 0.11343605816364288, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 387000 + }, + { + "epoch": 1.496072428136259, + "grad_norm": 0.11363035440444946, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 387010 + }, + { + "epoch": 1.4961110853396422, + "grad_norm": 0.09986026585102081, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 387020 + }, + { + "epoch": 1.4961497425430255, + "grad_norm": 0.11407621949911118, + "learning_rate": 0.002, + "loss": 2.318, + "step": 387030 + }, + { + "epoch": 1.4961883997464087, + "grad_norm": 0.11337780207395554, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 387040 + }, + { + "epoch": 1.496227056949792, + "grad_norm": 0.0949147418141365, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 387050 + }, + { + "epoch": 1.4962657141531754, + "grad_norm": 0.10442779213190079, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 387060 + }, + { + "epoch": 1.4963043713565587, + "grad_norm": 0.1179739385843277, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 387070 + }, + { + "epoch": 1.496343028559942, + "grad_norm": 0.09415707737207413, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 387080 + }, + { + "epoch": 1.4963816857633252, + "grad_norm": 0.10857003182172775, + "learning_rate": 0.002, + "loss": 2.314, + "step": 387090 + }, + { + "epoch": 1.4964203429667084, + "grad_norm": 0.11177841573953629, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 387100 + }, + { + "epoch": 1.4964590001700917, + "grad_norm": 0.1178489550948143, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 387110 + }, + { + "epoch": 1.496497657373475, + "grad_norm": 0.11042745411396027, + "learning_rate": 0.002, + "loss": 2.328, + "step": 387120 + }, + { + "epoch": 1.4965363145768582, + "grad_norm": 0.09763308614492416, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 387130 + }, + { + "epoch": 1.4965749717802415, + "grad_norm": 0.11204870790243149, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 387140 + }, + { + "epoch": 1.4966136289836247, + "grad_norm": 0.09637466073036194, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 387150 + }, + { + "epoch": 1.496652286187008, + "grad_norm": 0.11716560274362564, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 387160 + }, + { + "epoch": 1.4966909433903914, + "grad_norm": 0.10886909812688828, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 387170 + }, + { + "epoch": 1.4967296005937747, + "grad_norm": 0.11475536227226257, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 387180 + }, + { + "epoch": 1.496768257797158, + "grad_norm": 0.08800867199897766, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 387190 + }, + { + "epoch": 1.4968069150005412, + "grad_norm": 0.11351599544286728, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 387200 + }, + { + "epoch": 1.4968455722039244, + "grad_norm": 0.10470616072416306, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 387210 + }, + { + "epoch": 1.4968842294073077, + "grad_norm": 0.11616797000169754, + "learning_rate": 0.002, + "loss": 2.325, + "step": 387220 + }, + { + "epoch": 1.4969228866106912, + "grad_norm": 0.10372336953878403, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 387230 + }, + { + "epoch": 1.4969615438140744, + "grad_norm": 0.14390446245670319, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 387240 + }, + { + "epoch": 1.4970002010174577, + "grad_norm": 0.10057645291090012, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 387250 + }, + { + "epoch": 1.497038858220841, + "grad_norm": 0.09963632375001907, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 387260 + }, + { + "epoch": 1.4970775154242242, + "grad_norm": 0.1073041632771492, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 387270 + }, + { + "epoch": 1.4971161726276074, + "grad_norm": 0.10896219313144684, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 387280 + }, + { + "epoch": 1.4971548298309907, + "grad_norm": 0.10329505801200867, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 387290 + }, + { + "epoch": 1.497193487034374, + "grad_norm": 0.10008035600185394, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 387300 + }, + { + "epoch": 1.4972321442377572, + "grad_norm": 0.09460999816656113, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 387310 + }, + { + "epoch": 1.4972708014411404, + "grad_norm": 0.11439158022403717, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 387320 + }, + { + "epoch": 1.4973094586445237, + "grad_norm": 0.09630564600229263, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 387330 + }, + { + "epoch": 1.4973481158479072, + "grad_norm": 0.12669190764427185, + "learning_rate": 0.002, + "loss": 2.336, + "step": 387340 + }, + { + "epoch": 1.4973867730512904, + "grad_norm": 0.2987734377384186, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 387350 + }, + { + "epoch": 1.4974254302546737, + "grad_norm": 0.10269183665513992, + "learning_rate": 0.002, + "loss": 2.3467, + "step": 387360 + }, + { + "epoch": 1.497464087458057, + "grad_norm": 0.2718169093132019, + "learning_rate": 0.002, + "loss": 2.327, + "step": 387370 + }, + { + "epoch": 1.4975027446614402, + "grad_norm": 0.09955942630767822, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 387380 + }, + { + "epoch": 1.4975414018648234, + "grad_norm": 0.13938391208648682, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 387390 + }, + { + "epoch": 1.497580059068207, + "grad_norm": 0.10868766158819199, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 387400 + }, + { + "epoch": 1.4976187162715902, + "grad_norm": 0.10905612260103226, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 387410 + }, + { + "epoch": 1.4976573734749734, + "grad_norm": 0.10838504135608673, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 387420 + }, + { + "epoch": 1.4976960306783567, + "grad_norm": 0.11081354320049286, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 387430 + }, + { + "epoch": 1.49773468788174, + "grad_norm": 0.11287319660186768, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 387440 + }, + { + "epoch": 1.4977733450851232, + "grad_norm": 0.11388552933931351, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 387450 + }, + { + "epoch": 1.4978120022885064, + "grad_norm": 0.09228865057229996, + "learning_rate": 0.002, + "loss": 2.32, + "step": 387460 + }, + { + "epoch": 1.4978506594918897, + "grad_norm": 0.10086563974618912, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 387470 + }, + { + "epoch": 1.497889316695273, + "grad_norm": 0.1103566586971283, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 387480 + }, + { + "epoch": 1.4979279738986562, + "grad_norm": 0.10129038989543915, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 387490 + }, + { + "epoch": 1.4979666311020394, + "grad_norm": 0.10978537052869797, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 387500 + }, + { + "epoch": 1.498005288305423, + "grad_norm": 0.09721244871616364, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 387510 + }, + { + "epoch": 1.4980439455088062, + "grad_norm": 0.10568886995315552, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 387520 + }, + { + "epoch": 1.4980826027121894, + "grad_norm": 0.10114246606826782, + "learning_rate": 0.002, + "loss": 2.336, + "step": 387530 + }, + { + "epoch": 1.4981212599155727, + "grad_norm": 0.08565915375947952, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 387540 + }, + { + "epoch": 1.498159917118956, + "grad_norm": 0.08483897149562836, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 387550 + }, + { + "epoch": 1.4981985743223392, + "grad_norm": 0.08971007168292999, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 387560 + }, + { + "epoch": 1.4982372315257226, + "grad_norm": 0.13412101566791534, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 387570 + }, + { + "epoch": 1.4982758887291059, + "grad_norm": 0.10396832227706909, + "learning_rate": 0.002, + "loss": 2.331, + "step": 387580 + }, + { + "epoch": 1.4983145459324891, + "grad_norm": 0.1287054866552353, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 387590 + }, + { + "epoch": 1.4983532031358724, + "grad_norm": 0.09784513711929321, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 387600 + }, + { + "epoch": 1.4983918603392556, + "grad_norm": 0.10984116792678833, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 387610 + }, + { + "epoch": 1.498430517542639, + "grad_norm": 0.10213734209537506, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 387620 + }, + { + "epoch": 1.4984691747460221, + "grad_norm": 0.11160784214735031, + "learning_rate": 0.002, + "loss": 2.3447, + "step": 387630 + }, + { + "epoch": 1.4985078319494054, + "grad_norm": 0.10323362797498703, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 387640 + }, + { + "epoch": 1.4985464891527887, + "grad_norm": 0.11253419518470764, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 387650 + }, + { + "epoch": 1.498585146356172, + "grad_norm": 0.1090523898601532, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 387660 + }, + { + "epoch": 1.4986238035595552, + "grad_norm": 0.11632861196994781, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 387670 + }, + { + "epoch": 1.4986624607629386, + "grad_norm": 0.09930185228586197, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 387680 + }, + { + "epoch": 1.4987011179663219, + "grad_norm": 0.09560258686542511, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 387690 + }, + { + "epoch": 1.4987397751697051, + "grad_norm": 0.10566894710063934, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 387700 + }, + { + "epoch": 1.4987784323730884, + "grad_norm": 0.11293548345565796, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 387710 + }, + { + "epoch": 1.4988170895764716, + "grad_norm": 0.10041575133800507, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 387720 + }, + { + "epoch": 1.498855746779855, + "grad_norm": 0.10108038783073425, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 387730 + }, + { + "epoch": 1.4988944039832384, + "grad_norm": 0.1080123633146286, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 387740 + }, + { + "epoch": 1.4989330611866216, + "grad_norm": 0.0993170365691185, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 387750 + }, + { + "epoch": 1.4989717183900049, + "grad_norm": 0.11185912042856216, + "learning_rate": 0.002, + "loss": 2.3497, + "step": 387760 + }, + { + "epoch": 1.4990103755933881, + "grad_norm": 0.10889036953449249, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 387770 + }, + { + "epoch": 1.4990490327967714, + "grad_norm": 0.09817574918270111, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 387780 + }, + { + "epoch": 1.4990876900001546, + "grad_norm": 0.10236465930938721, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 387790 + }, + { + "epoch": 1.4991263472035379, + "grad_norm": 0.09964855760335922, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 387800 + }, + { + "epoch": 1.4991650044069211, + "grad_norm": 0.10301323235034943, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 387810 + }, + { + "epoch": 1.4992036616103044, + "grad_norm": 0.11032721400260925, + "learning_rate": 0.002, + "loss": 2.344, + "step": 387820 + }, + { + "epoch": 1.4992423188136876, + "grad_norm": 0.09334992617368698, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 387830 + }, + { + "epoch": 1.4992809760170709, + "grad_norm": 0.0951884463429451, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 387840 + }, + { + "epoch": 1.4993196332204544, + "grad_norm": 0.09448539465665817, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 387850 + }, + { + "epoch": 1.4993582904238376, + "grad_norm": 0.10757434368133545, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 387860 + }, + { + "epoch": 1.4993969476272209, + "grad_norm": 0.11576647311449051, + "learning_rate": 0.002, + "loss": 2.331, + "step": 387870 + }, + { + "epoch": 1.4994356048306041, + "grad_norm": 0.09661038964986801, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 387880 + }, + { + "epoch": 1.4994742620339874, + "grad_norm": 0.11678434163331985, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 387890 + }, + { + "epoch": 1.4995129192373706, + "grad_norm": 0.09969460219144821, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 387900 + }, + { + "epoch": 1.499551576440754, + "grad_norm": 0.11960793286561966, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 387910 + }, + { + "epoch": 1.4995902336441373, + "grad_norm": 0.11385287344455719, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 387920 + }, + { + "epoch": 1.4996288908475206, + "grad_norm": 0.09566864371299744, + "learning_rate": 0.002, + "loss": 2.3557, + "step": 387930 + }, + { + "epoch": 1.4996675480509039, + "grad_norm": 0.09156309813261032, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 387940 + }, + { + "epoch": 1.499706205254287, + "grad_norm": 0.11333674937486649, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 387950 + }, + { + "epoch": 1.4997448624576704, + "grad_norm": 0.10646463185548782, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 387960 + }, + { + "epoch": 1.4997835196610536, + "grad_norm": 0.09774358570575714, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 387970 + }, + { + "epoch": 1.4998221768644369, + "grad_norm": 0.10020807385444641, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 387980 + }, + { + "epoch": 1.4998608340678201, + "grad_norm": 0.1096162423491478, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 387990 + }, + { + "epoch": 1.4998994912712034, + "grad_norm": 0.12573811411857605, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 388000 + }, + { + "epoch": 1.4999381484745866, + "grad_norm": 0.08667264133691788, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 388010 + }, + { + "epoch": 1.49997680567797, + "grad_norm": 0.10994665324687958, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 388020 + }, + { + "epoch": 1.5000154628813533, + "grad_norm": 0.10760389268398285, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 388030 + }, + { + "epoch": 1.5000541200847366, + "grad_norm": 0.10332886129617691, + "learning_rate": 0.002, + "loss": 2.3099, + "step": 388040 + }, + { + "epoch": 1.5000927772881198, + "grad_norm": 0.10724914819002151, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 388050 + }, + { + "epoch": 1.500131434491503, + "grad_norm": 0.10977024585008621, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 388060 + }, + { + "epoch": 1.5001700916948866, + "grad_norm": 0.10817869752645493, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 388070 + }, + { + "epoch": 1.5002087488982698, + "grad_norm": 0.11673028767108917, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 388080 + }, + { + "epoch": 1.500247406101653, + "grad_norm": 0.10535792261362076, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 388090 + }, + { + "epoch": 1.5002860633050363, + "grad_norm": 0.10003501921892166, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 388100 + }, + { + "epoch": 1.5003247205084196, + "grad_norm": 0.1060367152094841, + "learning_rate": 0.002, + "loss": 2.3209, + "step": 388110 + }, + { + "epoch": 1.5003633777118028, + "grad_norm": 0.1156829223036766, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 388120 + }, + { + "epoch": 1.500402034915186, + "grad_norm": 0.11129516363143921, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 388130 + }, + { + "epoch": 1.5004406921185693, + "grad_norm": 0.44771406054496765, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 388140 + }, + { + "epoch": 1.5004793493219526, + "grad_norm": 0.10634361952543259, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 388150 + }, + { + "epoch": 1.5005180065253358, + "grad_norm": 0.0964827761054039, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 388160 + }, + { + "epoch": 1.500556663728719, + "grad_norm": 0.0940362885594368, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 388170 + }, + { + "epoch": 1.5005953209321024, + "grad_norm": 0.10004458576440811, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 388180 + }, + { + "epoch": 1.5006339781354856, + "grad_norm": 0.10141144692897797, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 388190 + }, + { + "epoch": 1.500672635338869, + "grad_norm": 0.10415209829807281, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 388200 + }, + { + "epoch": 1.5007112925422523, + "grad_norm": 0.12072297185659409, + "learning_rate": 0.002, + "loss": 2.3137, + "step": 388210 + }, + { + "epoch": 1.5007499497456356, + "grad_norm": 0.120872363448143, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 388220 + }, + { + "epoch": 1.5007886069490188, + "grad_norm": 0.11353632062673569, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 388230 + }, + { + "epoch": 1.5008272641524023, + "grad_norm": 0.10656017065048218, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 388240 + }, + { + "epoch": 1.5008659213557856, + "grad_norm": 0.10381469875574112, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 388250 + }, + { + "epoch": 1.5009045785591688, + "grad_norm": 0.11682920902967453, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 388260 + }, + { + "epoch": 1.500943235762552, + "grad_norm": 0.109528087079525, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 388270 + }, + { + "epoch": 1.5009818929659353, + "grad_norm": 0.08958955109119415, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 388280 + }, + { + "epoch": 1.5010205501693186, + "grad_norm": 0.1280771642923355, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 388290 + }, + { + "epoch": 1.5010592073727018, + "grad_norm": 0.10596900433301926, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 388300 + }, + { + "epoch": 1.501097864576085, + "grad_norm": 0.10658132284879684, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 388310 + }, + { + "epoch": 1.5011365217794683, + "grad_norm": 0.11552879959344864, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 388320 + }, + { + "epoch": 1.5011751789828516, + "grad_norm": 0.11456441134214401, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 388330 + }, + { + "epoch": 1.5012138361862348, + "grad_norm": 0.12030810117721558, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 388340 + }, + { + "epoch": 1.501252493389618, + "grad_norm": 0.0922018438577652, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 388350 + }, + { + "epoch": 1.5012911505930013, + "grad_norm": 0.106600321829319, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 388360 + }, + { + "epoch": 1.5013298077963848, + "grad_norm": 0.10056577622890472, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 388370 + }, + { + "epoch": 1.501368464999768, + "grad_norm": 0.09994447231292725, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 388380 + }, + { + "epoch": 1.5014071222031513, + "grad_norm": 0.09490270912647247, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 388390 + }, + { + "epoch": 1.5014457794065346, + "grad_norm": 0.10684575140476227, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 388400 + }, + { + "epoch": 1.501484436609918, + "grad_norm": 0.11132889240980148, + "learning_rate": 0.002, + "loss": 2.328, + "step": 388410 + }, + { + "epoch": 1.5015230938133013, + "grad_norm": 0.11740515381097794, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 388420 + }, + { + "epoch": 1.5015617510166845, + "grad_norm": 0.12547734379768372, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 388430 + }, + { + "epoch": 1.5016004082200678, + "grad_norm": 0.0951928198337555, + "learning_rate": 0.002, + "loss": 2.328, + "step": 388440 + }, + { + "epoch": 1.501639065423451, + "grad_norm": 0.09670039266347885, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 388450 + }, + { + "epoch": 1.5016777226268343, + "grad_norm": 0.08767662197351456, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 388460 + }, + { + "epoch": 1.5017163798302176, + "grad_norm": 0.08616555482149124, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 388470 + }, + { + "epoch": 1.5017550370336008, + "grad_norm": 0.10425017029047012, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 388480 + }, + { + "epoch": 1.501793694236984, + "grad_norm": 0.11210408806800842, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 388490 + }, + { + "epoch": 1.5018323514403673, + "grad_norm": 0.10728723555803299, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 388500 + }, + { + "epoch": 1.5018710086437506, + "grad_norm": 0.09564007073640823, + "learning_rate": 0.002, + "loss": 2.335, + "step": 388510 + }, + { + "epoch": 1.5019096658471338, + "grad_norm": 0.11068980395793915, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 388520 + }, + { + "epoch": 1.5019483230505173, + "grad_norm": 0.11430082470178604, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 388530 + }, + { + "epoch": 1.5019869802539005, + "grad_norm": 0.10830912739038467, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 388540 + }, + { + "epoch": 1.5020256374572838, + "grad_norm": 0.11302468180656433, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 388550 + }, + { + "epoch": 1.502064294660667, + "grad_norm": 0.09648089855909348, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 388560 + }, + { + "epoch": 1.5021029518640503, + "grad_norm": 0.09799264371395111, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 388570 + }, + { + "epoch": 1.5021416090674338, + "grad_norm": 0.10546073317527771, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 388580 + }, + { + "epoch": 1.502180266270817, + "grad_norm": 0.1253112256526947, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 388590 + }, + { + "epoch": 1.5022189234742003, + "grad_norm": 0.10033942759037018, + "learning_rate": 0.002, + "loss": 2.319, + "step": 388600 + }, + { + "epoch": 1.5022575806775835, + "grad_norm": 0.09231418371200562, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 388610 + }, + { + "epoch": 1.5022962378809668, + "grad_norm": 0.11819812655448914, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 388620 + }, + { + "epoch": 1.50233489508435, + "grad_norm": 0.10543312877416611, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 388630 + }, + { + "epoch": 1.5023735522877333, + "grad_norm": 0.10718866437673569, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 388640 + }, + { + "epoch": 1.5024122094911165, + "grad_norm": 0.11451803147792816, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 388650 + }, + { + "epoch": 1.5024508666944998, + "grad_norm": 0.09051160514354706, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 388660 + }, + { + "epoch": 1.502489523897883, + "grad_norm": 0.6592245697975159, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 388670 + }, + { + "epoch": 1.5025281811012663, + "grad_norm": 0.1896670162677765, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 388680 + }, + { + "epoch": 1.5025668383046495, + "grad_norm": 0.09931972622871399, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 388690 + }, + { + "epoch": 1.502605495508033, + "grad_norm": 0.10684159398078918, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 388700 + }, + { + "epoch": 1.5026441527114163, + "grad_norm": 0.09554118663072586, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 388710 + }, + { + "epoch": 1.5026828099147995, + "grad_norm": 0.10077395290136337, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 388720 + }, + { + "epoch": 1.5027214671181828, + "grad_norm": 0.09313789755105972, + "learning_rate": 0.002, + "loss": 2.315, + "step": 388730 + }, + { + "epoch": 1.502760124321566, + "grad_norm": 0.10598669201135635, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 388740 + }, + { + "epoch": 1.5027987815249495, + "grad_norm": 0.10307098925113678, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 388750 + }, + { + "epoch": 1.5028374387283328, + "grad_norm": 0.08475697040557861, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 388760 + }, + { + "epoch": 1.502876095931716, + "grad_norm": 0.15715770423412323, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 388770 + }, + { + "epoch": 1.5029147531350993, + "grad_norm": 0.0934685617685318, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 388780 + }, + { + "epoch": 1.5029534103384825, + "grad_norm": 0.1194806918501854, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 388790 + }, + { + "epoch": 1.5029920675418658, + "grad_norm": 0.09156420081853867, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 388800 + }, + { + "epoch": 1.503030724745249, + "grad_norm": 0.12355215847492218, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 388810 + }, + { + "epoch": 1.5030693819486323, + "grad_norm": 0.1338273584842682, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 388820 + }, + { + "epoch": 1.5031080391520155, + "grad_norm": 0.10630382597446442, + "learning_rate": 0.002, + "loss": 2.336, + "step": 388830 + }, + { + "epoch": 1.5031466963553988, + "grad_norm": 0.090221107006073, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 388840 + }, + { + "epoch": 1.503185353558782, + "grad_norm": 0.09536974132061005, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 388850 + }, + { + "epoch": 1.5032240107621653, + "grad_norm": 0.09492926299571991, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 388860 + }, + { + "epoch": 1.5032626679655487, + "grad_norm": 0.10294625908136368, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 388870 + }, + { + "epoch": 1.503301325168932, + "grad_norm": 0.12637640535831451, + "learning_rate": 0.002, + "loss": 2.33, + "step": 388880 + }, + { + "epoch": 1.5033399823723153, + "grad_norm": 0.10367640107870102, + "learning_rate": 0.002, + "loss": 2.32, + "step": 388890 + }, + { + "epoch": 1.5033786395756985, + "grad_norm": 0.10841293632984161, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 388900 + }, + { + "epoch": 1.503417296779082, + "grad_norm": 0.09370643645524979, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 388910 + }, + { + "epoch": 1.5034559539824652, + "grad_norm": 0.11507739871740341, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 388920 + }, + { + "epoch": 1.5034946111858485, + "grad_norm": 0.11928284913301468, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 388930 + }, + { + "epoch": 1.5035332683892317, + "grad_norm": 0.1128237247467041, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 388940 + }, + { + "epoch": 1.503571925592615, + "grad_norm": 0.09719345718622208, + "learning_rate": 0.002, + "loss": 2.3196, + "step": 388950 + }, + { + "epoch": 1.5036105827959982, + "grad_norm": 0.10155039280653, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 388960 + }, + { + "epoch": 1.5036492399993815, + "grad_norm": 0.1289808750152588, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 388970 + }, + { + "epoch": 1.5036878972027647, + "grad_norm": 0.10769186913967133, + "learning_rate": 0.002, + "loss": 2.3177, + "step": 388980 + }, + { + "epoch": 1.503726554406148, + "grad_norm": 0.09606771916151047, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 388990 + }, + { + "epoch": 1.5037652116095312, + "grad_norm": 0.0984034314751625, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 389000 + }, + { + "epoch": 1.5038038688129145, + "grad_norm": 0.10618776828050613, + "learning_rate": 0.002, + "loss": 2.32, + "step": 389010 + }, + { + "epoch": 1.5038425260162978, + "grad_norm": 0.10699150711297989, + "learning_rate": 0.002, + "loss": 2.339, + "step": 389020 + }, + { + "epoch": 1.503881183219681, + "grad_norm": 0.09430860728025436, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 389030 + }, + { + "epoch": 1.5039198404230645, + "grad_norm": 0.0932871401309967, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 389040 + }, + { + "epoch": 1.5039584976264477, + "grad_norm": 0.10846278071403503, + "learning_rate": 0.002, + "loss": 2.3451, + "step": 389050 + }, + { + "epoch": 1.503997154829831, + "grad_norm": 0.10263998061418533, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 389060 + }, + { + "epoch": 1.5040358120332142, + "grad_norm": 0.11393225938081741, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 389070 + }, + { + "epoch": 1.5040744692365977, + "grad_norm": 0.10867276042699814, + "learning_rate": 0.002, + "loss": 2.3116, + "step": 389080 + }, + { + "epoch": 1.504113126439981, + "grad_norm": 0.10557813942432404, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 389090 + }, + { + "epoch": 1.5041517836433642, + "grad_norm": 0.0988297313451767, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 389100 + }, + { + "epoch": 1.5041904408467475, + "grad_norm": 0.13613122701644897, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 389110 + }, + { + "epoch": 1.5042290980501307, + "grad_norm": 0.10214740037918091, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 389120 + }, + { + "epoch": 1.504267755253514, + "grad_norm": 0.10097360610961914, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 389130 + }, + { + "epoch": 1.5043064124568972, + "grad_norm": 0.10136395692825317, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 389140 + }, + { + "epoch": 1.5043450696602805, + "grad_norm": 0.10534156858921051, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 389150 + }, + { + "epoch": 1.5043837268636637, + "grad_norm": 0.10916571319103241, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 389160 + }, + { + "epoch": 1.504422384067047, + "grad_norm": 0.11262747645378113, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 389170 + }, + { + "epoch": 1.5044610412704302, + "grad_norm": 0.11168365180492401, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 389180 + }, + { + "epoch": 1.5044996984738135, + "grad_norm": 0.10077814012765884, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 389190 + }, + { + "epoch": 1.5045383556771967, + "grad_norm": 0.10950718075037003, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 389200 + }, + { + "epoch": 1.5045770128805802, + "grad_norm": 0.1116180568933487, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 389210 + }, + { + "epoch": 1.5046156700839635, + "grad_norm": 0.10967687517404556, + "learning_rate": 0.002, + "loss": 2.3128, + "step": 389220 + }, + { + "epoch": 1.5046543272873467, + "grad_norm": 0.10847678780555725, + "learning_rate": 0.002, + "loss": 2.344, + "step": 389230 + }, + { + "epoch": 1.50469298449073, + "grad_norm": 0.11077504605054855, + "learning_rate": 0.002, + "loss": 2.3114, + "step": 389240 + }, + { + "epoch": 1.5047316416941134, + "grad_norm": 0.12560157477855682, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 389250 + }, + { + "epoch": 1.5047702988974967, + "grad_norm": 0.10352113842964172, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 389260 + }, + { + "epoch": 1.50480895610088, + "grad_norm": 0.09635389596223831, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 389270 + }, + { + "epoch": 1.5048476133042632, + "grad_norm": 0.10735803097486496, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 389280 + }, + { + "epoch": 1.5048862705076465, + "grad_norm": 0.12441831827163696, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 389290 + }, + { + "epoch": 1.5049249277110297, + "grad_norm": 0.10879199206829071, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 389300 + }, + { + "epoch": 1.504963584914413, + "grad_norm": 0.0954468846321106, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 389310 + }, + { + "epoch": 1.5050022421177962, + "grad_norm": 0.09573718160390854, + "learning_rate": 0.002, + "loss": 2.3161, + "step": 389320 + }, + { + "epoch": 1.5050408993211795, + "grad_norm": 0.10864290595054626, + "learning_rate": 0.002, + "loss": 2.3123, + "step": 389330 + }, + { + "epoch": 1.5050795565245627, + "grad_norm": 0.09222857654094696, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 389340 + }, + { + "epoch": 1.505118213727946, + "grad_norm": 0.09859822690486908, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 389350 + }, + { + "epoch": 1.5051568709313292, + "grad_norm": 0.12920109927654266, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 389360 + }, + { + "epoch": 1.5051955281347125, + "grad_norm": 0.09436677396297455, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 389370 + }, + { + "epoch": 1.505234185338096, + "grad_norm": 0.10126109421253204, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 389380 + }, + { + "epoch": 1.5052728425414792, + "grad_norm": 0.09820156544446945, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 389390 + }, + { + "epoch": 1.5053114997448624, + "grad_norm": 0.16399995982646942, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 389400 + }, + { + "epoch": 1.5053501569482457, + "grad_norm": 0.11499249935150146, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 389410 + }, + { + "epoch": 1.5053888141516292, + "grad_norm": 0.19883383810520172, + "learning_rate": 0.002, + "loss": 2.3144, + "step": 389420 + }, + { + "epoch": 1.5054274713550124, + "grad_norm": 0.09507088363170624, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 389430 + }, + { + "epoch": 1.5054661285583957, + "grad_norm": 0.12007997930049896, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 389440 + }, + { + "epoch": 1.505504785761779, + "grad_norm": 0.09999562054872513, + "learning_rate": 0.002, + "loss": 2.3235, + "step": 389450 + }, + { + "epoch": 1.5055434429651622, + "grad_norm": 0.10346845537424088, + "learning_rate": 0.002, + "loss": 2.334, + "step": 389460 + }, + { + "epoch": 1.5055821001685454, + "grad_norm": 0.09966317564249039, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 389470 + }, + { + "epoch": 1.5056207573719287, + "grad_norm": 0.3257620930671692, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 389480 + }, + { + "epoch": 1.505659414575312, + "grad_norm": 0.10240617394447327, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 389490 + }, + { + "epoch": 1.5056980717786952, + "grad_norm": 0.10182981193065643, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 389500 + }, + { + "epoch": 1.5057367289820784, + "grad_norm": 0.09937752783298492, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 389510 + }, + { + "epoch": 1.5057753861854617, + "grad_norm": 0.11027289927005768, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 389520 + }, + { + "epoch": 1.505814043388845, + "grad_norm": 0.10598906129598618, + "learning_rate": 0.002, + "loss": 2.337, + "step": 389530 + }, + { + "epoch": 1.5058527005922282, + "grad_norm": 0.09967856109142303, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 389540 + }, + { + "epoch": 1.5058913577956117, + "grad_norm": 0.12473419308662415, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 389550 + }, + { + "epoch": 1.505930014998995, + "grad_norm": 0.09534024447202682, + "learning_rate": 0.002, + "loss": 2.3202, + "step": 389560 + }, + { + "epoch": 1.5059686722023782, + "grad_norm": 0.1028229296207428, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 389570 + }, + { + "epoch": 1.5060073294057614, + "grad_norm": 0.09971043467521667, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 389580 + }, + { + "epoch": 1.506045986609145, + "grad_norm": 0.09162665158510208, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 389590 + }, + { + "epoch": 1.5060846438125282, + "grad_norm": 0.13003022968769073, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 389600 + }, + { + "epoch": 1.5061233010159114, + "grad_norm": 0.11770327389240265, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 389610 + }, + { + "epoch": 1.5061619582192947, + "grad_norm": 0.09348583221435547, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 389620 + }, + { + "epoch": 1.506200615422678, + "grad_norm": 0.11873598396778107, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 389630 + }, + { + "epoch": 1.5062392726260612, + "grad_norm": 0.11417189985513687, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 389640 + }, + { + "epoch": 1.5062779298294444, + "grad_norm": 0.12200142443180084, + "learning_rate": 0.002, + "loss": 2.3217, + "step": 389650 + }, + { + "epoch": 1.5063165870328277, + "grad_norm": 0.11383046954870224, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 389660 + }, + { + "epoch": 1.506355244236211, + "grad_norm": 0.1026448905467987, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 389670 + }, + { + "epoch": 1.5063939014395942, + "grad_norm": 0.09607816487550735, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 389680 + }, + { + "epoch": 1.5064325586429774, + "grad_norm": 0.10966033488512039, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 389690 + }, + { + "epoch": 1.5064712158463607, + "grad_norm": 0.0915357917547226, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 389700 + }, + { + "epoch": 1.506509873049744, + "grad_norm": 0.10979775339365005, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 389710 + }, + { + "epoch": 1.5065485302531274, + "grad_norm": 0.10046348720788956, + "learning_rate": 0.002, + "loss": 2.321, + "step": 389720 + }, + { + "epoch": 1.5065871874565107, + "grad_norm": 0.09790916740894318, + "learning_rate": 0.002, + "loss": 2.3146, + "step": 389730 + }, + { + "epoch": 1.506625844659894, + "grad_norm": 0.09642428904771805, + "learning_rate": 0.002, + "loss": 2.3158, + "step": 389740 + }, + { + "epoch": 1.5066645018632772, + "grad_norm": 0.09451036900281906, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 389750 + }, + { + "epoch": 1.5067031590666606, + "grad_norm": 0.10986850410699844, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 389760 + }, + { + "epoch": 1.5067418162700439, + "grad_norm": 0.08941185474395752, + "learning_rate": 0.002, + "loss": 2.341, + "step": 389770 + }, + { + "epoch": 1.5067804734734271, + "grad_norm": 0.09333249181509018, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 389780 + }, + { + "epoch": 1.5068191306768104, + "grad_norm": 0.10135620832443237, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 389790 + }, + { + "epoch": 1.5068577878801936, + "grad_norm": 0.09368425607681274, + "learning_rate": 0.002, + "loss": 2.3169, + "step": 389800 + }, + { + "epoch": 1.506896445083577, + "grad_norm": 0.1552366465330124, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 389810 + }, + { + "epoch": 1.5069351022869601, + "grad_norm": 0.09722664952278137, + "learning_rate": 0.002, + "loss": 2.3272, + "step": 389820 + }, + { + "epoch": 1.5069737594903434, + "grad_norm": 0.11551640927791595, + "learning_rate": 0.002, + "loss": 2.34, + "step": 389830 + }, + { + "epoch": 1.5070124166937267, + "grad_norm": 0.11581946909427643, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 389840 + }, + { + "epoch": 1.50705107389711, + "grad_norm": 0.09836060553789139, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 389850 + }, + { + "epoch": 1.5070897311004932, + "grad_norm": 0.12190762907266617, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 389860 + }, + { + "epoch": 1.5071283883038764, + "grad_norm": 0.11505207419395447, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 389870 + }, + { + "epoch": 1.5071670455072597, + "grad_norm": 0.0856177881360054, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 389880 + }, + { + "epoch": 1.5072057027106431, + "grad_norm": 0.09246792644262314, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 389890 + }, + { + "epoch": 1.5072443599140264, + "grad_norm": 0.10114166885614395, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 389900 + }, + { + "epoch": 1.5072830171174096, + "grad_norm": 0.09140849113464355, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 389910 + }, + { + "epoch": 1.507321674320793, + "grad_norm": 0.10203917324542999, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 389920 + }, + { + "epoch": 1.5073603315241764, + "grad_norm": 0.10773786902427673, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 389930 + }, + { + "epoch": 1.5073989887275596, + "grad_norm": 0.09965097904205322, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 389940 + }, + { + "epoch": 1.5074376459309429, + "grad_norm": 0.13344642519950867, + "learning_rate": 0.002, + "loss": 2.324, + "step": 389950 + }, + { + "epoch": 1.5074763031343261, + "grad_norm": 0.0945010781288147, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 389960 + }, + { + "epoch": 1.5075149603377094, + "grad_norm": 0.09982645511627197, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 389970 + }, + { + "epoch": 1.5075536175410926, + "grad_norm": 0.10593358427286148, + "learning_rate": 0.002, + "loss": 2.34, + "step": 389980 + }, + { + "epoch": 1.5075922747444759, + "grad_norm": 0.14716258645057678, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 389990 + }, + { + "epoch": 1.5076309319478591, + "grad_norm": 0.1291695535182953, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 390000 + }, + { + "epoch": 1.5076695891512424, + "grad_norm": 0.09642203152179718, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 390010 + }, + { + "epoch": 1.5077082463546256, + "grad_norm": 0.09814012795686722, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 390020 + }, + { + "epoch": 1.5077469035580089, + "grad_norm": 0.09167808294296265, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 390030 + }, + { + "epoch": 1.5077855607613921, + "grad_norm": 0.10567633807659149, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 390040 + }, + { + "epoch": 1.5078242179647754, + "grad_norm": 0.10716531425714493, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 390050 + }, + { + "epoch": 1.5078628751681589, + "grad_norm": 0.11084076017141342, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 390060 + }, + { + "epoch": 1.5079015323715421, + "grad_norm": 0.11794474720954895, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 390070 + }, + { + "epoch": 1.5079401895749254, + "grad_norm": 0.10359127074480057, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 390080 + }, + { + "epoch": 1.5079788467783086, + "grad_norm": 0.09916546940803528, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 390090 + }, + { + "epoch": 1.508017503981692, + "grad_norm": 0.0951470211148262, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 390100 + }, + { + "epoch": 1.5080561611850754, + "grad_norm": 0.10114279389381409, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 390110 + }, + { + "epoch": 1.5080948183884586, + "grad_norm": 0.1029725968837738, + "learning_rate": 0.002, + "loss": 2.3492, + "step": 390120 + }, + { + "epoch": 1.5081334755918419, + "grad_norm": 0.11488725990056992, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 390130 + }, + { + "epoch": 1.508172132795225, + "grad_norm": 0.11002848297357559, + "learning_rate": 0.002, + "loss": 2.3131, + "step": 390140 + }, + { + "epoch": 1.5082107899986084, + "grad_norm": 0.11786045879125595, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 390150 + }, + { + "epoch": 1.5082494472019916, + "grad_norm": 0.12056462466716766, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 390160 + }, + { + "epoch": 1.5082881044053749, + "grad_norm": 0.13139678537845612, + "learning_rate": 0.002, + "loss": 2.3138, + "step": 390170 + }, + { + "epoch": 1.5083267616087581, + "grad_norm": 0.1203528419137001, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 390180 + }, + { + "epoch": 1.5083654188121414, + "grad_norm": 0.10807789117097855, + "learning_rate": 0.002, + "loss": 2.333, + "step": 390190 + }, + { + "epoch": 1.5084040760155246, + "grad_norm": 0.08393408358097076, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 390200 + }, + { + "epoch": 1.5084427332189079, + "grad_norm": 0.11528384685516357, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 390210 + }, + { + "epoch": 1.5084813904222911, + "grad_norm": 0.10376711189746857, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 390220 + }, + { + "epoch": 1.5085200476256746, + "grad_norm": 0.0873163640499115, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 390230 + }, + { + "epoch": 1.5085587048290579, + "grad_norm": 0.11890695244073868, + "learning_rate": 0.002, + "loss": 2.3386, + "step": 390240 + }, + { + "epoch": 1.508597362032441, + "grad_norm": 0.11122336238622665, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 390250 + }, + { + "epoch": 1.5086360192358244, + "grad_norm": 0.12941014766693115, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 390260 + }, + { + "epoch": 1.5086746764392078, + "grad_norm": 0.10037616640329361, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 390270 + }, + { + "epoch": 1.508713333642591, + "grad_norm": 0.1099221259355545, + "learning_rate": 0.002, + "loss": 2.345, + "step": 390280 + }, + { + "epoch": 1.5087519908459743, + "grad_norm": 0.10333079099655151, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 390290 + }, + { + "epoch": 1.5087906480493576, + "grad_norm": 0.11339517682790756, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 390300 + }, + { + "epoch": 1.5088293052527408, + "grad_norm": 0.0993972197175026, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 390310 + }, + { + "epoch": 1.508867962456124, + "grad_norm": 0.14479485154151917, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 390320 + }, + { + "epoch": 1.5089066196595073, + "grad_norm": 0.10708092153072357, + "learning_rate": 0.002, + "loss": 2.3461, + "step": 390330 + }, + { + "epoch": 1.5089452768628906, + "grad_norm": 0.11167121678590775, + "learning_rate": 0.002, + "loss": 2.342, + "step": 390340 + }, + { + "epoch": 1.5089839340662738, + "grad_norm": 0.09349657595157623, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 390350 + }, + { + "epoch": 1.509022591269657, + "grad_norm": 0.0951918363571167, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 390360 + }, + { + "epoch": 1.5090612484730404, + "grad_norm": 0.10206221789121628, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 390370 + }, + { + "epoch": 1.5090999056764236, + "grad_norm": 0.13456571102142334, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 390380 + }, + { + "epoch": 1.509138562879807, + "grad_norm": 0.10076375305652618, + "learning_rate": 0.002, + "loss": 2.3187, + "step": 390390 + }, + { + "epoch": 1.5091772200831903, + "grad_norm": 0.11521826684474945, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 390400 + }, + { + "epoch": 1.5092158772865736, + "grad_norm": 0.10876718908548355, + "learning_rate": 0.002, + "loss": 2.333, + "step": 390410 + }, + { + "epoch": 1.5092545344899568, + "grad_norm": 0.09847059100866318, + "learning_rate": 0.002, + "loss": 2.332, + "step": 390420 + }, + { + "epoch": 1.50929319169334, + "grad_norm": 0.11620467156171799, + "learning_rate": 0.002, + "loss": 2.3425, + "step": 390430 + }, + { + "epoch": 1.5093318488967236, + "grad_norm": 0.10162986814975739, + "learning_rate": 0.002, + "loss": 2.3518, + "step": 390440 + }, + { + "epoch": 1.5093705061001068, + "grad_norm": 0.1363476663827896, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 390450 + }, + { + "epoch": 1.50940916330349, + "grad_norm": 0.10407718271017075, + "learning_rate": 0.002, + "loss": 2.329, + "step": 390460 + }, + { + "epoch": 1.5094478205068733, + "grad_norm": 0.10302364081144333, + "learning_rate": 0.002, + "loss": 2.324, + "step": 390470 + }, + { + "epoch": 1.5094864777102566, + "grad_norm": 0.10686130076646805, + "learning_rate": 0.002, + "loss": 2.356, + "step": 390480 + }, + { + "epoch": 1.5095251349136398, + "grad_norm": 0.10023646801710129, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 390490 + }, + { + "epoch": 1.509563792117023, + "grad_norm": 0.11389179527759552, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 390500 + }, + { + "epoch": 1.5096024493204063, + "grad_norm": 0.12031827121973038, + "learning_rate": 0.002, + "loss": 2.3444, + "step": 390510 + }, + { + "epoch": 1.5096411065237896, + "grad_norm": 0.09122280776500702, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 390520 + }, + { + "epoch": 1.5096797637271728, + "grad_norm": 0.11488986760377884, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 390530 + }, + { + "epoch": 1.509718420930556, + "grad_norm": 0.10780785232782364, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 390540 + }, + { + "epoch": 1.5097570781339393, + "grad_norm": 0.08984780311584473, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 390550 + }, + { + "epoch": 1.5097957353373228, + "grad_norm": 0.12073812633752823, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 390560 + }, + { + "epoch": 1.509834392540706, + "grad_norm": 0.11867832392454147, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 390570 + }, + { + "epoch": 1.5098730497440893, + "grad_norm": 0.11204609274864197, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 390580 + }, + { + "epoch": 1.5099117069474726, + "grad_norm": 0.1025635302066803, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 390590 + }, + { + "epoch": 1.5099503641508558, + "grad_norm": 0.09087958931922913, + "learning_rate": 0.002, + "loss": 2.3203, + "step": 390600 + }, + { + "epoch": 1.5099890213542393, + "grad_norm": 0.09487861394882202, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 390610 + }, + { + "epoch": 1.5100276785576225, + "grad_norm": 0.10458683222532272, + "learning_rate": 0.002, + "loss": 2.343, + "step": 390620 + }, + { + "epoch": 1.5100663357610058, + "grad_norm": 0.11646603047847748, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 390630 + }, + { + "epoch": 1.510104992964389, + "grad_norm": 0.106773741543293, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 390640 + }, + { + "epoch": 1.5101436501677723, + "grad_norm": 0.09964160621166229, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 390650 + }, + { + "epoch": 1.5101823073711556, + "grad_norm": 0.09841714799404144, + "learning_rate": 0.002, + "loss": 2.3049, + "step": 390660 + }, + { + "epoch": 1.5102209645745388, + "grad_norm": 0.0869053304195404, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 390670 + }, + { + "epoch": 1.510259621777922, + "grad_norm": 0.09558701515197754, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 390680 + }, + { + "epoch": 1.5102982789813053, + "grad_norm": 0.13511855900287628, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 390690 + }, + { + "epoch": 1.5103369361846886, + "grad_norm": 0.12006426602602005, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 390700 + }, + { + "epoch": 1.5103755933880718, + "grad_norm": 0.09411891549825668, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 390710 + }, + { + "epoch": 1.510414250591455, + "grad_norm": 0.1079292893409729, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 390720 + }, + { + "epoch": 1.5104529077948385, + "grad_norm": 0.09245080500841141, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 390730 + }, + { + "epoch": 1.5104915649982218, + "grad_norm": 0.10134102404117584, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 390740 + }, + { + "epoch": 1.510530222201605, + "grad_norm": 0.11004859209060669, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 390750 + }, + { + "epoch": 1.5105688794049883, + "grad_norm": 0.10117950290441513, + "learning_rate": 0.002, + "loss": 2.336, + "step": 390760 + }, + { + "epoch": 1.5106075366083718, + "grad_norm": 0.1033184751868248, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 390770 + }, + { + "epoch": 1.510646193811755, + "grad_norm": 0.11043199896812439, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 390780 + }, + { + "epoch": 1.5106848510151383, + "grad_norm": 0.21440497040748596, + "learning_rate": 0.002, + "loss": 2.319, + "step": 390790 + }, + { + "epoch": 1.5107235082185215, + "grad_norm": 0.10097640007734299, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 390800 + }, + { + "epoch": 1.5107621654219048, + "grad_norm": 0.11292038857936859, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 390810 + }, + { + "epoch": 1.510800822625288, + "grad_norm": 0.12639853358268738, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 390820 + }, + { + "epoch": 1.5108394798286713, + "grad_norm": 0.12269312143325806, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 390830 + }, + { + "epoch": 1.5108781370320545, + "grad_norm": 0.10088943690061569, + "learning_rate": 0.002, + "loss": 2.328, + "step": 390840 + }, + { + "epoch": 1.5109167942354378, + "grad_norm": 0.0912739560008049, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 390850 + }, + { + "epoch": 1.510955451438821, + "grad_norm": 0.1321217268705368, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 390860 + }, + { + "epoch": 1.5109941086422043, + "grad_norm": 0.3824848532676697, + "learning_rate": 0.002, + "loss": 2.3477, + "step": 390870 + }, + { + "epoch": 1.5110327658455875, + "grad_norm": 0.10984724014997482, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 390880 + }, + { + "epoch": 1.5110714230489708, + "grad_norm": 0.09105230122804642, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 390890 + }, + { + "epoch": 1.5111100802523543, + "grad_norm": 0.10240257531404495, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 390900 + }, + { + "epoch": 1.5111487374557375, + "grad_norm": 0.1014326810836792, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 390910 + }, + { + "epoch": 1.5111873946591208, + "grad_norm": 0.0920289158821106, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 390920 + }, + { + "epoch": 1.511226051862504, + "grad_norm": 0.10732376575469971, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 390930 + }, + { + "epoch": 1.5112647090658875, + "grad_norm": 0.1001841351389885, + "learning_rate": 0.002, + "loss": 2.318, + "step": 390940 + }, + { + "epoch": 1.5113033662692708, + "grad_norm": 0.10073614120483398, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 390950 + }, + { + "epoch": 1.511342023472654, + "grad_norm": 0.09710587561130524, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 390960 + }, + { + "epoch": 1.5113806806760373, + "grad_norm": 0.1239316314458847, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 390970 + }, + { + "epoch": 1.5114193378794205, + "grad_norm": 0.10505892336368561, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 390980 + }, + { + "epoch": 1.5114579950828038, + "grad_norm": 0.10494199395179749, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 390990 + }, + { + "epoch": 1.511496652286187, + "grad_norm": 0.10229931026697159, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 391000 + }, + { + "epoch": 1.5115353094895703, + "grad_norm": 0.09727927297353745, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 391010 + }, + { + "epoch": 1.5115739666929535, + "grad_norm": 0.09767213463783264, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 391020 + }, + { + "epoch": 1.5116126238963368, + "grad_norm": 0.10535608232021332, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 391030 + }, + { + "epoch": 1.51165128109972, + "grad_norm": 0.12566813826560974, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 391040 + }, + { + "epoch": 1.5116899383031033, + "grad_norm": 0.10017907619476318, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 391050 + }, + { + "epoch": 1.5117285955064865, + "grad_norm": 0.10999967157840729, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 391060 + }, + { + "epoch": 1.51176725270987, + "grad_norm": 0.11590699851512909, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 391070 + }, + { + "epoch": 1.5118059099132533, + "grad_norm": 0.09183014929294586, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 391080 + }, + { + "epoch": 1.5118445671166365, + "grad_norm": 0.09417787194252014, + "learning_rate": 0.002, + "loss": 2.3142, + "step": 391090 + }, + { + "epoch": 1.5118832243200198, + "grad_norm": 0.12490889430046082, + "learning_rate": 0.002, + "loss": 2.3519, + "step": 391100 + }, + { + "epoch": 1.5119218815234032, + "grad_norm": 0.09948685765266418, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 391110 + }, + { + "epoch": 1.5119605387267865, + "grad_norm": 0.10057361423969269, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 391120 + }, + { + "epoch": 1.5119991959301697, + "grad_norm": 0.10864268243312836, + "learning_rate": 0.002, + "loss": 2.329, + "step": 391130 + }, + { + "epoch": 1.512037853133553, + "grad_norm": 0.11731230467557907, + "learning_rate": 0.002, + "loss": 2.3465, + "step": 391140 + }, + { + "epoch": 1.5120765103369362, + "grad_norm": 0.10575708001852036, + "learning_rate": 0.002, + "loss": 2.337, + "step": 391150 + }, + { + "epoch": 1.5121151675403195, + "grad_norm": 0.12365453690290451, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 391160 + }, + { + "epoch": 1.5121538247437027, + "grad_norm": 0.12001720815896988, + "learning_rate": 0.002, + "loss": 2.3524, + "step": 391170 + }, + { + "epoch": 1.512192481947086, + "grad_norm": 0.12133695185184479, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 391180 + }, + { + "epoch": 1.5122311391504693, + "grad_norm": 0.10062947124242783, + "learning_rate": 0.002, + "loss": 2.3605, + "step": 391190 + }, + { + "epoch": 1.5122697963538525, + "grad_norm": 0.10657189786434174, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 391200 + }, + { + "epoch": 1.5123084535572358, + "grad_norm": 0.10898979008197784, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 391210 + }, + { + "epoch": 1.512347110760619, + "grad_norm": 0.09677448123693466, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 391220 + }, + { + "epoch": 1.5123857679640023, + "grad_norm": 0.09534822404384613, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 391230 + }, + { + "epoch": 1.5124244251673857, + "grad_norm": 0.12047450989484787, + "learning_rate": 0.002, + "loss": 2.3181, + "step": 391240 + }, + { + "epoch": 1.512463082370769, + "grad_norm": 0.10149037837982178, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 391250 + }, + { + "epoch": 1.5125017395741522, + "grad_norm": 0.11668939143419266, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 391260 + }, + { + "epoch": 1.5125403967775355, + "grad_norm": 0.09661033749580383, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 391270 + }, + { + "epoch": 1.512579053980919, + "grad_norm": 0.10099072754383087, + "learning_rate": 0.002, + "loss": 2.3431, + "step": 391280 + }, + { + "epoch": 1.5126177111843022, + "grad_norm": 0.09791026264429092, + "learning_rate": 0.002, + "loss": 2.322, + "step": 391290 + }, + { + "epoch": 1.5126563683876855, + "grad_norm": 0.10209483653306961, + "learning_rate": 0.002, + "loss": 2.329, + "step": 391300 + }, + { + "epoch": 1.5126950255910687, + "grad_norm": 0.0877443328499794, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 391310 + }, + { + "epoch": 1.512733682794452, + "grad_norm": 0.11895333230495453, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 391320 + }, + { + "epoch": 1.5127723399978352, + "grad_norm": 0.11299550533294678, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 391330 + }, + { + "epoch": 1.5128109972012185, + "grad_norm": 0.09337560087442398, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 391340 + }, + { + "epoch": 1.5128496544046017, + "grad_norm": 0.11867666244506836, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 391350 + }, + { + "epoch": 1.512888311607985, + "grad_norm": 0.09304529428482056, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 391360 + }, + { + "epoch": 1.5129269688113682, + "grad_norm": 0.10020868480205536, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 391370 + }, + { + "epoch": 1.5129656260147515, + "grad_norm": 0.1085643619298935, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 391380 + }, + { + "epoch": 1.5130042832181347, + "grad_norm": 0.1056111752986908, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 391390 + }, + { + "epoch": 1.513042940421518, + "grad_norm": 0.10014656186103821, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 391400 + }, + { + "epoch": 1.5130815976249015, + "grad_norm": 0.09461404383182526, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 391410 + }, + { + "epoch": 1.5131202548282847, + "grad_norm": 0.12414918839931488, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 391420 + }, + { + "epoch": 1.513158912031668, + "grad_norm": 0.10402705520391464, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 391430 + }, + { + "epoch": 1.5131975692350512, + "grad_norm": 0.09142938256263733, + "learning_rate": 0.002, + "loss": 2.323, + "step": 391440 + }, + { + "epoch": 1.5132362264384347, + "grad_norm": 0.09494287520647049, + "learning_rate": 0.002, + "loss": 2.34, + "step": 391450 + }, + { + "epoch": 1.513274883641818, + "grad_norm": 0.11556043475866318, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 391460 + }, + { + "epoch": 1.5133135408452012, + "grad_norm": 0.08956260234117508, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 391470 + }, + { + "epoch": 1.5133521980485845, + "grad_norm": 0.11527398973703384, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 391480 + }, + { + "epoch": 1.5133908552519677, + "grad_norm": 0.1307714432477951, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 391490 + }, + { + "epoch": 1.513429512455351, + "grad_norm": 0.09584154933691025, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 391500 + }, + { + "epoch": 1.5134681696587342, + "grad_norm": 0.10020260512828827, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 391510 + }, + { + "epoch": 1.5135068268621175, + "grad_norm": 0.09970816969871521, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 391520 + }, + { + "epoch": 1.5135454840655007, + "grad_norm": 0.10133900493383408, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 391530 + }, + { + "epoch": 1.513584141268884, + "grad_norm": 0.09789116680622101, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 391540 + }, + { + "epoch": 1.5136227984722672, + "grad_norm": 0.09601163864135742, + "learning_rate": 0.002, + "loss": 2.3385, + "step": 391550 + }, + { + "epoch": 1.5136614556756505, + "grad_norm": 0.12760160863399506, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 391560 + }, + { + "epoch": 1.5137001128790337, + "grad_norm": 0.13389061391353607, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 391570 + }, + { + "epoch": 1.5137387700824172, + "grad_norm": 0.10897321254014969, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 391580 + }, + { + "epoch": 1.5137774272858004, + "grad_norm": 0.10551299899816513, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 391590 + }, + { + "epoch": 1.5138160844891837, + "grad_norm": 0.09069501608610153, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 391600 + }, + { + "epoch": 1.513854741692567, + "grad_norm": 0.09852441400289536, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 391610 + }, + { + "epoch": 1.5138933988959504, + "grad_norm": 0.10836633294820786, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 391620 + }, + { + "epoch": 1.5139320560993337, + "grad_norm": 0.09398753941059113, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 391630 + }, + { + "epoch": 1.513970713302717, + "grad_norm": 0.10845766961574554, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 391640 + }, + { + "epoch": 1.5140093705061002, + "grad_norm": 0.1267317682504654, + "learning_rate": 0.002, + "loss": 2.3543, + "step": 391650 + }, + { + "epoch": 1.5140480277094834, + "grad_norm": 0.096676304936409, + "learning_rate": 0.002, + "loss": 2.3222, + "step": 391660 + }, + { + "epoch": 1.5140866849128667, + "grad_norm": 0.10404936224222183, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 391670 + }, + { + "epoch": 1.51412534211625, + "grad_norm": 0.12528268992900848, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 391680 + }, + { + "epoch": 1.5141639993196332, + "grad_norm": 0.09098008275032043, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 391690 + }, + { + "epoch": 1.5142026565230164, + "grad_norm": 0.09539708495140076, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 391700 + }, + { + "epoch": 1.5142413137263997, + "grad_norm": 0.1156436949968338, + "learning_rate": 0.002, + "loss": 2.3379, + "step": 391710 + }, + { + "epoch": 1.514279970929783, + "grad_norm": 0.0968218669295311, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 391720 + }, + { + "epoch": 1.5143186281331662, + "grad_norm": 0.10383110493421555, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 391730 + }, + { + "epoch": 1.5143572853365495, + "grad_norm": 0.09785936027765274, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 391740 + }, + { + "epoch": 1.514395942539933, + "grad_norm": 0.12443191558122635, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 391750 + }, + { + "epoch": 1.5144345997433162, + "grad_norm": 0.09383685141801834, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 391760 + }, + { + "epoch": 1.5144732569466994, + "grad_norm": 0.09789196401834488, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 391770 + }, + { + "epoch": 1.5145119141500827, + "grad_norm": 0.09333156794309616, + "learning_rate": 0.002, + "loss": 2.343, + "step": 391780 + }, + { + "epoch": 1.5145505713534662, + "grad_norm": 0.4236663281917572, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 391790 + }, + { + "epoch": 1.5145892285568494, + "grad_norm": 0.10581768304109573, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 391800 + }, + { + "epoch": 1.5146278857602327, + "grad_norm": 0.10425762087106705, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 391810 + }, + { + "epoch": 1.514666542963616, + "grad_norm": 0.11275427788496017, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 391820 + }, + { + "epoch": 1.5147052001669992, + "grad_norm": 0.09525428712368011, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 391830 + }, + { + "epoch": 1.5147438573703824, + "grad_norm": 0.11205447465181351, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 391840 + }, + { + "epoch": 1.5147825145737657, + "grad_norm": 0.1030084639787674, + "learning_rate": 0.002, + "loss": 2.3458, + "step": 391850 + }, + { + "epoch": 1.514821171777149, + "grad_norm": 0.0894642099738121, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 391860 + }, + { + "epoch": 1.5148598289805322, + "grad_norm": 0.14282263815402985, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 391870 + }, + { + "epoch": 1.5148984861839154, + "grad_norm": 0.15263572335243225, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 391880 + }, + { + "epoch": 1.5149371433872987, + "grad_norm": 0.09568876773118973, + "learning_rate": 0.002, + "loss": 2.338, + "step": 391890 + }, + { + "epoch": 1.514975800590682, + "grad_norm": 0.11003223806619644, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 391900 + }, + { + "epoch": 1.5150144577940652, + "grad_norm": 0.09703919291496277, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 391910 + }, + { + "epoch": 1.5150531149974487, + "grad_norm": 0.10533502697944641, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 391920 + }, + { + "epoch": 1.515091772200832, + "grad_norm": 0.09649696201086044, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 391930 + }, + { + "epoch": 1.5151304294042152, + "grad_norm": 0.12361782789230347, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 391940 + }, + { + "epoch": 1.5151690866075984, + "grad_norm": 0.09518574178218842, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 391950 + }, + { + "epoch": 1.5152077438109819, + "grad_norm": 0.10137498378753662, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 391960 + }, + { + "epoch": 1.5152464010143651, + "grad_norm": 0.1020670160651207, + "learning_rate": 0.002, + "loss": 2.331, + "step": 391970 + }, + { + "epoch": 1.5152850582177484, + "grad_norm": 0.11717270314693451, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 391980 + }, + { + "epoch": 1.5153237154211316, + "grad_norm": 0.10038861632347107, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 391990 + }, + { + "epoch": 1.515362372624515, + "grad_norm": 0.0989314541220665, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 392000 + }, + { + "epoch": 1.5154010298278981, + "grad_norm": 0.09754546731710434, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 392010 + }, + { + "epoch": 1.5154396870312814, + "grad_norm": 0.10955166816711426, + "learning_rate": 0.002, + "loss": 2.3427, + "step": 392020 + }, + { + "epoch": 1.5154783442346647, + "grad_norm": 0.10340434312820435, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 392030 + }, + { + "epoch": 1.515517001438048, + "grad_norm": 0.11696350574493408, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 392040 + }, + { + "epoch": 1.5155556586414312, + "grad_norm": 0.11961662024259567, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 392050 + }, + { + "epoch": 1.5155943158448144, + "grad_norm": 0.09642060101032257, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 392060 + }, + { + "epoch": 1.5156329730481977, + "grad_norm": 0.09937610477209091, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 392070 + }, + { + "epoch": 1.515671630251581, + "grad_norm": 0.09241262823343277, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 392080 + }, + { + "epoch": 1.5157102874549644, + "grad_norm": 0.10047882050275803, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 392090 + }, + { + "epoch": 1.5157489446583476, + "grad_norm": 0.09795857965946198, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 392100 + }, + { + "epoch": 1.515787601861731, + "grad_norm": 0.10768328607082367, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 392110 + }, + { + "epoch": 1.5158262590651141, + "grad_norm": 0.10577775537967682, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 392120 + }, + { + "epoch": 1.5158649162684976, + "grad_norm": 0.09501665830612183, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 392130 + }, + { + "epoch": 1.5159035734718809, + "grad_norm": 0.09370652586221695, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 392140 + }, + { + "epoch": 1.5159422306752641, + "grad_norm": 0.11130591481924057, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 392150 + }, + { + "epoch": 1.5159808878786474, + "grad_norm": 0.09607101231813431, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 392160 + }, + { + "epoch": 1.5160195450820306, + "grad_norm": 0.09282945096492767, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 392170 + }, + { + "epoch": 1.5160582022854139, + "grad_norm": 0.09593275189399719, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 392180 + }, + { + "epoch": 1.5160968594887971, + "grad_norm": 0.1060456782579422, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 392190 + }, + { + "epoch": 1.5161355166921804, + "grad_norm": 0.08822032809257507, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 392200 + }, + { + "epoch": 1.5161741738955636, + "grad_norm": 0.1242724061012268, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 392210 + }, + { + "epoch": 1.516212831098947, + "grad_norm": 0.15245239436626434, + "learning_rate": 0.002, + "loss": 2.352, + "step": 392220 + }, + { + "epoch": 1.5162514883023301, + "grad_norm": 0.11181936413049698, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 392230 + }, + { + "epoch": 1.5162901455057134, + "grad_norm": 0.10632267594337463, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 392240 + }, + { + "epoch": 1.5163288027090969, + "grad_norm": 0.11932023614645004, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 392250 + }, + { + "epoch": 1.5163674599124801, + "grad_norm": 0.09645699709653854, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 392260 + }, + { + "epoch": 1.5164061171158634, + "grad_norm": 0.09505350142717361, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 392270 + }, + { + "epoch": 1.5164447743192466, + "grad_norm": 0.10155574977397919, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 392280 + }, + { + "epoch": 1.5164834315226299, + "grad_norm": 0.09071482717990875, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 392290 + }, + { + "epoch": 1.5165220887260134, + "grad_norm": 0.10598849505186081, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 392300 + }, + { + "epoch": 1.5165607459293966, + "grad_norm": 0.09445805847644806, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 392310 + }, + { + "epoch": 1.5165994031327799, + "grad_norm": 0.11329188942909241, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 392320 + }, + { + "epoch": 1.516638060336163, + "grad_norm": 0.12282945215702057, + "learning_rate": 0.002, + "loss": 2.3343, + "step": 392330 + }, + { + "epoch": 1.5166767175395464, + "grad_norm": 0.11929178982973099, + "learning_rate": 0.002, + "loss": 2.3172, + "step": 392340 + }, + { + "epoch": 1.5167153747429296, + "grad_norm": 0.0997723639011383, + "learning_rate": 0.002, + "loss": 2.341, + "step": 392350 + }, + { + "epoch": 1.5167540319463129, + "grad_norm": 0.10382483899593353, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 392360 + }, + { + "epoch": 1.5167926891496961, + "grad_norm": 0.09186827391386032, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 392370 + }, + { + "epoch": 1.5168313463530794, + "grad_norm": 0.11631743609905243, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 392380 + }, + { + "epoch": 1.5168700035564626, + "grad_norm": 0.11780108511447906, + "learning_rate": 0.002, + "loss": 2.332, + "step": 392390 + }, + { + "epoch": 1.5169086607598459, + "grad_norm": 0.11376313865184784, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 392400 + }, + { + "epoch": 1.5169473179632291, + "grad_norm": 0.11365222185850143, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 392410 + }, + { + "epoch": 1.5169859751666126, + "grad_norm": 0.09479023516178131, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 392420 + }, + { + "epoch": 1.5170246323699959, + "grad_norm": 0.09805808961391449, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 392430 + }, + { + "epoch": 1.517063289573379, + "grad_norm": 0.0988486185669899, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 392440 + }, + { + "epoch": 1.5171019467767624, + "grad_norm": 0.14714302122592926, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 392450 + }, + { + "epoch": 1.5171406039801456, + "grad_norm": 0.10289258509874344, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 392460 + }, + { + "epoch": 1.517179261183529, + "grad_norm": 0.11664602160453796, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 392470 + }, + { + "epoch": 1.5172179183869123, + "grad_norm": 0.10490322858095169, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 392480 + }, + { + "epoch": 1.5172565755902956, + "grad_norm": 0.1358417272567749, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 392490 + }, + { + "epoch": 1.5172952327936788, + "grad_norm": 0.11767023056745529, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 392500 + }, + { + "epoch": 1.517333889997062, + "grad_norm": 0.1056869849562645, + "learning_rate": 0.002, + "loss": 2.334, + "step": 392510 + }, + { + "epoch": 1.5173725472004453, + "grad_norm": 0.09077652543783188, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 392520 + }, + { + "epoch": 1.5174112044038286, + "grad_norm": 0.11978007107973099, + "learning_rate": 0.002, + "loss": 2.333, + "step": 392530 + }, + { + "epoch": 1.5174498616072118, + "grad_norm": 0.09781042486429214, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 392540 + }, + { + "epoch": 1.517488518810595, + "grad_norm": 0.1132543534040451, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 392550 + }, + { + "epoch": 1.5175271760139784, + "grad_norm": 0.10332779586315155, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 392560 + }, + { + "epoch": 1.5175658332173616, + "grad_norm": 0.0938902273774147, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 392570 + }, + { + "epoch": 1.5176044904207449, + "grad_norm": 0.10210373252630234, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 392580 + }, + { + "epoch": 1.5176431476241283, + "grad_norm": 0.10769516974687576, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 392590 + }, + { + "epoch": 1.5176818048275116, + "grad_norm": 0.11146010458469391, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 392600 + }, + { + "epoch": 1.5177204620308948, + "grad_norm": 0.0972304567694664, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 392610 + }, + { + "epoch": 1.517759119234278, + "grad_norm": 0.09830550849437714, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 392620 + }, + { + "epoch": 1.5177977764376613, + "grad_norm": 0.11365142464637756, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 392630 + }, + { + "epoch": 1.5178364336410448, + "grad_norm": 0.09081844240427017, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 392640 + }, + { + "epoch": 1.517875090844428, + "grad_norm": 0.09644568711519241, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 392650 + }, + { + "epoch": 1.5179137480478113, + "grad_norm": 0.11101473867893219, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 392660 + }, + { + "epoch": 1.5179524052511946, + "grad_norm": 0.10434384644031525, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 392670 + }, + { + "epoch": 1.5179910624545778, + "grad_norm": 0.095488540828228, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 392680 + }, + { + "epoch": 1.518029719657961, + "grad_norm": 0.11795365810394287, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 392690 + }, + { + "epoch": 1.5180683768613443, + "grad_norm": 0.11635580658912659, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 392700 + }, + { + "epoch": 1.5181070340647276, + "grad_norm": 0.10605958849191666, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 392710 + }, + { + "epoch": 1.5181456912681108, + "grad_norm": 0.11753251403570175, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 392720 + }, + { + "epoch": 1.518184348471494, + "grad_norm": 0.10806383937597275, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 392730 + }, + { + "epoch": 1.5182230056748773, + "grad_norm": 0.1053234115242958, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 392740 + }, + { + "epoch": 1.5182616628782606, + "grad_norm": 0.09364716708660126, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 392750 + }, + { + "epoch": 1.518300320081644, + "grad_norm": 0.09259460121393204, + "learning_rate": 0.002, + "loss": 2.3417, + "step": 392760 + }, + { + "epoch": 1.5183389772850273, + "grad_norm": 0.10869884490966797, + "learning_rate": 0.002, + "loss": 2.3142, + "step": 392770 + }, + { + "epoch": 1.5183776344884106, + "grad_norm": 0.10413431376218796, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 392780 + }, + { + "epoch": 1.5184162916917938, + "grad_norm": 0.09960522502660751, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 392790 + }, + { + "epoch": 1.5184549488951773, + "grad_norm": 0.10901309549808502, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 392800 + }, + { + "epoch": 1.5184936060985605, + "grad_norm": 0.09266559034585953, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 392810 + }, + { + "epoch": 1.5185322633019438, + "grad_norm": 0.12511612474918365, + "learning_rate": 0.002, + "loss": 2.343, + "step": 392820 + }, + { + "epoch": 1.518570920505327, + "grad_norm": 0.0981638953089714, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 392830 + }, + { + "epoch": 1.5186095777087103, + "grad_norm": 0.12114229053258896, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 392840 + }, + { + "epoch": 1.5186482349120936, + "grad_norm": 0.1143832877278328, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 392850 + }, + { + "epoch": 1.5186868921154768, + "grad_norm": 0.0972919762134552, + "learning_rate": 0.002, + "loss": 2.3485, + "step": 392860 + }, + { + "epoch": 1.51872554931886, + "grad_norm": 0.10748854279518127, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 392870 + }, + { + "epoch": 1.5187642065222433, + "grad_norm": 0.12689507007598877, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 392880 + }, + { + "epoch": 1.5188028637256266, + "grad_norm": 0.09913776814937592, + "learning_rate": 0.002, + "loss": 2.3178, + "step": 392890 + }, + { + "epoch": 1.5188415209290098, + "grad_norm": 0.10747122764587402, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 392900 + }, + { + "epoch": 1.518880178132393, + "grad_norm": 0.12248285859823227, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 392910 + }, + { + "epoch": 1.5189188353357763, + "grad_norm": 0.0993824154138565, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 392920 + }, + { + "epoch": 1.5189574925391598, + "grad_norm": 0.10456233471632004, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 392930 + }, + { + "epoch": 1.518996149742543, + "grad_norm": 0.10220813006162643, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 392940 + }, + { + "epoch": 1.5190348069459263, + "grad_norm": 0.12464220821857452, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 392950 + }, + { + "epoch": 1.5190734641493095, + "grad_norm": 0.12892554700374603, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 392960 + }, + { + "epoch": 1.519112121352693, + "grad_norm": 0.09762685000896454, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 392970 + }, + { + "epoch": 1.5191507785560763, + "grad_norm": 0.1277654469013214, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 392980 + }, + { + "epoch": 1.5191894357594595, + "grad_norm": 0.09789907187223434, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 392990 + }, + { + "epoch": 1.5192280929628428, + "grad_norm": 0.10237940400838852, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 393000 + }, + { + "epoch": 1.519266750166226, + "grad_norm": 0.11480651795864105, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 393010 + }, + { + "epoch": 1.5193054073696093, + "grad_norm": 0.09746021777391434, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 393020 + }, + { + "epoch": 1.5193440645729925, + "grad_norm": 0.10675664246082306, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 393030 + }, + { + "epoch": 1.5193827217763758, + "grad_norm": 0.09594815224409103, + "learning_rate": 0.002, + "loss": 2.3173, + "step": 393040 + }, + { + "epoch": 1.519421378979759, + "grad_norm": 0.11100509762763977, + "learning_rate": 0.002, + "loss": 2.3115, + "step": 393050 + }, + { + "epoch": 1.5194600361831423, + "grad_norm": 0.11012808233499527, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 393060 + }, + { + "epoch": 1.5194986933865255, + "grad_norm": 0.10745590925216675, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 393070 + }, + { + "epoch": 1.5195373505899088, + "grad_norm": 0.10260172188282013, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 393080 + }, + { + "epoch": 1.519576007793292, + "grad_norm": 0.10885608941316605, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 393090 + }, + { + "epoch": 1.5196146649966755, + "grad_norm": 0.11682126671075821, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 393100 + }, + { + "epoch": 1.5196533222000588, + "grad_norm": 0.10573320090770721, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 393110 + }, + { + "epoch": 1.519691979403442, + "grad_norm": 0.10078450292348862, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 393120 + }, + { + "epoch": 1.5197306366068253, + "grad_norm": 0.1034998968243599, + "learning_rate": 0.002, + "loss": 2.324, + "step": 393130 + }, + { + "epoch": 1.5197692938102088, + "grad_norm": 0.09276104718446732, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 393140 + }, + { + "epoch": 1.519807951013592, + "grad_norm": 0.10877040028572083, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 393150 + }, + { + "epoch": 1.5198466082169753, + "grad_norm": 0.10812745988368988, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 393160 + }, + { + "epoch": 1.5198852654203585, + "grad_norm": 0.12653662264347076, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 393170 + }, + { + "epoch": 1.5199239226237418, + "grad_norm": 0.10978109389543533, + "learning_rate": 0.002, + "loss": 2.3469, + "step": 393180 + }, + { + "epoch": 1.519962579827125, + "grad_norm": 0.10237731039524078, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 393190 + }, + { + "epoch": 1.5200012370305083, + "grad_norm": 0.10582784563302994, + "learning_rate": 0.002, + "loss": 2.3182, + "step": 393200 + }, + { + "epoch": 1.5200398942338915, + "grad_norm": 0.10746137797832489, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 393210 + }, + { + "epoch": 1.5200785514372748, + "grad_norm": 0.1339419037103653, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 393220 + }, + { + "epoch": 1.520117208640658, + "grad_norm": 0.1077442392706871, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 393230 + }, + { + "epoch": 1.5201558658440413, + "grad_norm": 0.11086498200893402, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 393240 + }, + { + "epoch": 1.5201945230474245, + "grad_norm": 0.11237195134162903, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 393250 + }, + { + "epoch": 1.5202331802508078, + "grad_norm": 0.12104542553424835, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 393260 + }, + { + "epoch": 1.5202718374541913, + "grad_norm": 0.10996460914611816, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 393270 + }, + { + "epoch": 1.5203104946575745, + "grad_norm": 0.09453155845403671, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 393280 + }, + { + "epoch": 1.5203491518609578, + "grad_norm": 0.10569145530462265, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 393290 + }, + { + "epoch": 1.520387809064341, + "grad_norm": 0.1049167811870575, + "learning_rate": 0.002, + "loss": 2.3166, + "step": 393300 + }, + { + "epoch": 1.5204264662677245, + "grad_norm": 0.10674849152565002, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 393310 + }, + { + "epoch": 1.5204651234711077, + "grad_norm": 0.09160462021827698, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 393320 + }, + { + "epoch": 1.520503780674491, + "grad_norm": 0.11467369645833969, + "learning_rate": 0.002, + "loss": 2.3119, + "step": 393330 + }, + { + "epoch": 1.5205424378778742, + "grad_norm": 0.11105100810527802, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 393340 + }, + { + "epoch": 1.5205810950812575, + "grad_norm": 0.09533059597015381, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 393350 + }, + { + "epoch": 1.5206197522846407, + "grad_norm": 0.10581649094820023, + "learning_rate": 0.002, + "loss": 2.325, + "step": 393360 + }, + { + "epoch": 1.520658409488024, + "grad_norm": 0.12146256864070892, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 393370 + }, + { + "epoch": 1.5206970666914073, + "grad_norm": 0.14656734466552734, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 393380 + }, + { + "epoch": 1.5207357238947905, + "grad_norm": 0.10592775046825409, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 393390 + }, + { + "epoch": 1.5207743810981738, + "grad_norm": 0.10509035736322403, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 393400 + }, + { + "epoch": 1.520813038301557, + "grad_norm": 0.1095312312245369, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 393410 + }, + { + "epoch": 1.5208516955049403, + "grad_norm": 0.0920720025897026, + "learning_rate": 0.002, + "loss": 2.326, + "step": 393420 + }, + { + "epoch": 1.5208903527083235, + "grad_norm": 0.10379816591739655, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 393430 + }, + { + "epoch": 1.520929009911707, + "grad_norm": 0.8264697790145874, + "learning_rate": 0.002, + "loss": 2.3277, + "step": 393440 + }, + { + "epoch": 1.5209676671150902, + "grad_norm": 0.10276079922914505, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 393450 + }, + { + "epoch": 1.5210063243184735, + "grad_norm": 0.13902220129966736, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 393460 + }, + { + "epoch": 1.5210449815218567, + "grad_norm": 0.12761332094669342, + "learning_rate": 0.002, + "loss": 2.3361, + "step": 393470 + }, + { + "epoch": 1.5210836387252402, + "grad_norm": 0.09847622364759445, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 393480 + }, + { + "epoch": 1.5211222959286235, + "grad_norm": 0.10445865988731384, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 393490 + }, + { + "epoch": 1.5211609531320067, + "grad_norm": 0.10974553227424622, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 393500 + }, + { + "epoch": 1.52119961033539, + "grad_norm": 0.09774612635374069, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 393510 + }, + { + "epoch": 1.5212382675387732, + "grad_norm": 0.10803424566984177, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 393520 + }, + { + "epoch": 1.5212769247421565, + "grad_norm": 0.10066314786672592, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 393530 + }, + { + "epoch": 1.5213155819455397, + "grad_norm": 0.10231117159128189, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 393540 + }, + { + "epoch": 1.521354239148923, + "grad_norm": 0.12019024789333344, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 393550 + }, + { + "epoch": 1.5213928963523062, + "grad_norm": 0.10460830479860306, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 393560 + }, + { + "epoch": 1.5214315535556895, + "grad_norm": 0.11702663451433182, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 393570 + }, + { + "epoch": 1.5214702107590727, + "grad_norm": 0.10864273458719254, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 393580 + }, + { + "epoch": 1.521508867962456, + "grad_norm": 0.09753713756799698, + "learning_rate": 0.002, + "loss": 2.323, + "step": 393590 + }, + { + "epoch": 1.5215475251658392, + "grad_norm": 0.11186327785253525, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 393600 + }, + { + "epoch": 1.5215861823692227, + "grad_norm": 0.10732456296682358, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 393610 + }, + { + "epoch": 1.521624839572606, + "grad_norm": 0.10576072335243225, + "learning_rate": 0.002, + "loss": 2.3367, + "step": 393620 + }, + { + "epoch": 1.5216634967759892, + "grad_norm": 0.10033950954675674, + "learning_rate": 0.002, + "loss": 2.3157, + "step": 393630 + }, + { + "epoch": 1.5217021539793725, + "grad_norm": 0.11310286819934845, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 393640 + }, + { + "epoch": 1.521740811182756, + "grad_norm": 0.09552452713251114, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 393650 + }, + { + "epoch": 1.5217794683861392, + "grad_norm": 0.11281601339578629, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 393660 + }, + { + "epoch": 1.5218181255895225, + "grad_norm": 0.10923857986927032, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 393670 + }, + { + "epoch": 1.5218567827929057, + "grad_norm": 0.10855615139007568, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 393680 + }, + { + "epoch": 1.521895439996289, + "grad_norm": 0.11590652912855148, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 393690 + }, + { + "epoch": 1.5219340971996722, + "grad_norm": 0.10057484358549118, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 393700 + }, + { + "epoch": 1.5219727544030555, + "grad_norm": 0.10506322979927063, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 393710 + }, + { + "epoch": 1.5220114116064387, + "grad_norm": 0.11757268756628036, + "learning_rate": 0.002, + "loss": 2.3421, + "step": 393720 + }, + { + "epoch": 1.522050068809822, + "grad_norm": 0.1272018998861313, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 393730 + }, + { + "epoch": 1.5220887260132052, + "grad_norm": 0.10142607241868973, + "learning_rate": 0.002, + "loss": 2.3563, + "step": 393740 + }, + { + "epoch": 1.5221273832165885, + "grad_norm": 0.12956005334854126, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 393750 + }, + { + "epoch": 1.5221660404199717, + "grad_norm": 0.0910429060459137, + "learning_rate": 0.002, + "loss": 2.3312, + "step": 393760 + }, + { + "epoch": 1.522204697623355, + "grad_norm": 0.1021728664636612, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 393770 + }, + { + "epoch": 1.5222433548267384, + "grad_norm": 0.1251039206981659, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 393780 + }, + { + "epoch": 1.5222820120301217, + "grad_norm": 0.10015079379081726, + "learning_rate": 0.002, + "loss": 2.3309, + "step": 393790 + }, + { + "epoch": 1.522320669233505, + "grad_norm": 0.09192286431789398, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 393800 + }, + { + "epoch": 1.5223593264368882, + "grad_norm": 0.13426172733306885, + "learning_rate": 0.002, + "loss": 2.3378, + "step": 393810 + }, + { + "epoch": 1.5223979836402717, + "grad_norm": 0.11347789317369461, + "learning_rate": 0.002, + "loss": 2.328, + "step": 393820 + }, + { + "epoch": 1.522436640843655, + "grad_norm": 0.13141945004463196, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 393830 + }, + { + "epoch": 1.5224752980470382, + "grad_norm": 0.09779248386621475, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 393840 + }, + { + "epoch": 1.5225139552504214, + "grad_norm": 0.1317087858915329, + "learning_rate": 0.002, + "loss": 2.3105, + "step": 393850 + }, + { + "epoch": 1.5225526124538047, + "grad_norm": 0.11914301663637161, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 393860 + }, + { + "epoch": 1.522591269657188, + "grad_norm": 0.10186275094747543, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 393870 + }, + { + "epoch": 1.5226299268605712, + "grad_norm": 0.09285727143287659, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 393880 + }, + { + "epoch": 1.5226685840639544, + "grad_norm": 0.11961885541677475, + "learning_rate": 0.002, + "loss": 2.335, + "step": 393890 + }, + { + "epoch": 1.5227072412673377, + "grad_norm": 0.09913287311792374, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 393900 + }, + { + "epoch": 1.522745898470721, + "grad_norm": 0.10889461636543274, + "learning_rate": 0.002, + "loss": 2.3531, + "step": 393910 + }, + { + "epoch": 1.5227845556741042, + "grad_norm": 0.10375667363405228, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 393920 + }, + { + "epoch": 1.5228232128774875, + "grad_norm": 0.11258301883935928, + "learning_rate": 0.002, + "loss": 2.3128, + "step": 393930 + }, + { + "epoch": 1.5228618700808707, + "grad_norm": 0.10796305537223816, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 393940 + }, + { + "epoch": 1.5229005272842542, + "grad_norm": 0.11623138934373856, + "learning_rate": 0.002, + "loss": 2.32, + "step": 393950 + }, + { + "epoch": 1.5229391844876374, + "grad_norm": 0.10926652699708939, + "learning_rate": 0.002, + "loss": 2.3238, + "step": 393960 + }, + { + "epoch": 1.5229778416910207, + "grad_norm": 0.1066916286945343, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 393970 + }, + { + "epoch": 1.523016498894404, + "grad_norm": 0.10033965855836868, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 393980 + }, + { + "epoch": 1.5230551560977874, + "grad_norm": 0.09625542908906937, + "learning_rate": 0.002, + "loss": 2.3429, + "step": 393990 + }, + { + "epoch": 1.5230938133011707, + "grad_norm": 0.10865096747875214, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 394000 + }, + { + "epoch": 1.523132470504554, + "grad_norm": 0.09583982825279236, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 394010 + }, + { + "epoch": 1.5231711277079372, + "grad_norm": 0.1044599786400795, + "learning_rate": 0.002, + "loss": 2.337, + "step": 394020 + }, + { + "epoch": 1.5232097849113204, + "grad_norm": 0.11785249412059784, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 394030 + }, + { + "epoch": 1.5232484421147037, + "grad_norm": 0.20341509580612183, + "learning_rate": 0.002, + "loss": 2.326, + "step": 394040 + }, + { + "epoch": 1.523287099318087, + "grad_norm": 0.11942635476589203, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 394050 + }, + { + "epoch": 1.5233257565214702, + "grad_norm": 0.10689658671617508, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 394060 + }, + { + "epoch": 1.5233644137248534, + "grad_norm": 0.1341332197189331, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 394070 + }, + { + "epoch": 1.5234030709282367, + "grad_norm": 0.10913897305727005, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 394080 + }, + { + "epoch": 1.52344172813162, + "grad_norm": 0.10185789316892624, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 394090 + }, + { + "epoch": 1.5234803853350032, + "grad_norm": 0.12783731520175934, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 394100 + }, + { + "epoch": 1.5235190425383864, + "grad_norm": 0.10477516800165176, + "learning_rate": 0.002, + "loss": 2.3481, + "step": 394110 + }, + { + "epoch": 1.52355769974177, + "grad_norm": 0.10059119760990143, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 394120 + }, + { + "epoch": 1.5235963569451532, + "grad_norm": 0.09582138806581497, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 394130 + }, + { + "epoch": 1.5236350141485364, + "grad_norm": 0.09843972325325012, + "learning_rate": 0.002, + "loss": 2.3495, + "step": 394140 + }, + { + "epoch": 1.5236736713519197, + "grad_norm": 0.10571901500225067, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 394150 + }, + { + "epoch": 1.5237123285553031, + "grad_norm": 0.12133859097957611, + "learning_rate": 0.002, + "loss": 2.3438, + "step": 394160 + }, + { + "epoch": 1.5237509857586864, + "grad_norm": 0.11906891316175461, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 394170 + }, + { + "epoch": 1.5237896429620696, + "grad_norm": 0.11050893366336823, + "learning_rate": 0.002, + "loss": 2.344, + "step": 394180 + }, + { + "epoch": 1.523828300165453, + "grad_norm": 0.0925726592540741, + "learning_rate": 0.002, + "loss": 2.309, + "step": 394190 + }, + { + "epoch": 1.5238669573688362, + "grad_norm": 0.09126723557710648, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 394200 + }, + { + "epoch": 1.5239056145722194, + "grad_norm": 0.12772385776042938, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 394210 + }, + { + "epoch": 1.5239442717756027, + "grad_norm": 0.10459309816360474, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 394220 + }, + { + "epoch": 1.523982928978986, + "grad_norm": 0.13920532166957855, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 394230 + }, + { + "epoch": 1.5240215861823692, + "grad_norm": 0.11099079251289368, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 394240 + }, + { + "epoch": 1.5240602433857524, + "grad_norm": 0.11004325747489929, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 394250 + }, + { + "epoch": 1.5240989005891357, + "grad_norm": 0.10719511657953262, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 394260 + }, + { + "epoch": 1.524137557792519, + "grad_norm": 0.092842236161232, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 394270 + }, + { + "epoch": 1.5241762149959024, + "grad_norm": 0.1256948709487915, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 394280 + }, + { + "epoch": 1.5242148721992856, + "grad_norm": 0.11076880246400833, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 394290 + }, + { + "epoch": 1.524253529402669, + "grad_norm": 0.09827834367752075, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 394300 + }, + { + "epoch": 1.5242921866060521, + "grad_norm": 0.10175015777349472, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 394310 + }, + { + "epoch": 1.5243308438094354, + "grad_norm": 0.09242283552885056, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 394320 + }, + { + "epoch": 1.5243695010128189, + "grad_norm": 0.12954500317573547, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 394330 + }, + { + "epoch": 1.5244081582162021, + "grad_norm": 0.10394249111413956, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 394340 + }, + { + "epoch": 1.5244468154195854, + "grad_norm": 0.09540977329015732, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 394350 + }, + { + "epoch": 1.5244854726229686, + "grad_norm": 0.1039431244134903, + "learning_rate": 0.002, + "loss": 2.3245, + "step": 394360 + }, + { + "epoch": 1.5245241298263519, + "grad_norm": 0.09400424361228943, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 394370 + }, + { + "epoch": 1.5245627870297351, + "grad_norm": 0.10155437886714935, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 394380 + }, + { + "epoch": 1.5246014442331184, + "grad_norm": 0.16589802503585815, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 394390 + }, + { + "epoch": 1.5246401014365016, + "grad_norm": 0.1328371912240982, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 394400 + }, + { + "epoch": 1.524678758639885, + "grad_norm": 0.12050023674964905, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 394410 + }, + { + "epoch": 1.5247174158432681, + "grad_norm": 0.1132076308131218, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 394420 + }, + { + "epoch": 1.5247560730466514, + "grad_norm": 0.09805141389369965, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 394430 + }, + { + "epoch": 1.5247947302500346, + "grad_norm": 0.25652357935905457, + "learning_rate": 0.002, + "loss": 2.3667, + "step": 394440 + }, + { + "epoch": 1.5248333874534181, + "grad_norm": 0.11934979259967804, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 394450 + }, + { + "epoch": 1.5248720446568014, + "grad_norm": 0.09216290712356567, + "learning_rate": 0.002, + "loss": 2.3308, + "step": 394460 + }, + { + "epoch": 1.5249107018601846, + "grad_norm": 0.09738843142986298, + "learning_rate": 0.002, + "loss": 2.324, + "step": 394470 + }, + { + "epoch": 1.5249493590635679, + "grad_norm": 0.10043909400701523, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 394480 + }, + { + "epoch": 1.5249880162669511, + "grad_norm": 0.09155519306659698, + "learning_rate": 0.002, + "loss": 2.3225, + "step": 394490 + }, + { + "epoch": 1.5250266734703346, + "grad_norm": 0.10652053356170654, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 394500 + }, + { + "epoch": 1.5250653306737179, + "grad_norm": 0.10862810164690018, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 394510 + }, + { + "epoch": 1.525103987877101, + "grad_norm": 0.11527835577726364, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 394520 + }, + { + "epoch": 1.5251426450804844, + "grad_norm": 0.09903129935264587, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 394530 + }, + { + "epoch": 1.5251813022838676, + "grad_norm": 0.10090334713459015, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 394540 + }, + { + "epoch": 1.5252199594872509, + "grad_norm": 0.11012939363718033, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 394550 + }, + { + "epoch": 1.5252586166906341, + "grad_norm": 0.13572582602500916, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 394560 + }, + { + "epoch": 1.5252972738940174, + "grad_norm": 0.0971146821975708, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 394570 + }, + { + "epoch": 1.5253359310974006, + "grad_norm": 0.10251978039741516, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 394580 + }, + { + "epoch": 1.5253745883007839, + "grad_norm": 0.1334100067615509, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 394590 + }, + { + "epoch": 1.5254132455041671, + "grad_norm": 0.10141900181770325, + "learning_rate": 0.002, + "loss": 2.3537, + "step": 394600 + }, + { + "epoch": 1.5254519027075504, + "grad_norm": 0.10914376378059387, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 394610 + }, + { + "epoch": 1.5254905599109339, + "grad_norm": 0.1215430423617363, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 394620 + }, + { + "epoch": 1.525529217114317, + "grad_norm": 0.0969533771276474, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 394630 + }, + { + "epoch": 1.5255678743177004, + "grad_norm": 0.12010926753282547, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 394640 + }, + { + "epoch": 1.5256065315210836, + "grad_norm": 0.10832379013299942, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 394650 + }, + { + "epoch": 1.525645188724467, + "grad_norm": 0.11077246814966202, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 394660 + }, + { + "epoch": 1.5256838459278503, + "grad_norm": 0.09438929706811905, + "learning_rate": 0.002, + "loss": 2.3407, + "step": 394670 + }, + { + "epoch": 1.5257225031312336, + "grad_norm": 0.09369917958974838, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 394680 + }, + { + "epoch": 1.5257611603346168, + "grad_norm": 0.09520909935235977, + "learning_rate": 0.002, + "loss": 2.3472, + "step": 394690 + }, + { + "epoch": 1.525799817538, + "grad_norm": 0.10132263600826263, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 394700 + }, + { + "epoch": 1.5258384747413833, + "grad_norm": 0.10649420320987701, + "learning_rate": 0.002, + "loss": 2.34, + "step": 394710 + }, + { + "epoch": 1.5258771319447666, + "grad_norm": 0.09204483777284622, + "learning_rate": 0.002, + "loss": 2.3321, + "step": 394720 + }, + { + "epoch": 1.5259157891481498, + "grad_norm": 0.0947391539812088, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 394730 + }, + { + "epoch": 1.525954446351533, + "grad_norm": 0.11095141619443893, + "learning_rate": 0.002, + "loss": 2.3486, + "step": 394740 + }, + { + "epoch": 1.5259931035549164, + "grad_norm": 0.11917775869369507, + "learning_rate": 0.002, + "loss": 2.3364, + "step": 394750 + }, + { + "epoch": 1.5260317607582996, + "grad_norm": 0.09193374961614609, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 394760 + }, + { + "epoch": 1.5260704179616829, + "grad_norm": 0.10913031548261642, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 394770 + }, + { + "epoch": 1.526109075165066, + "grad_norm": 0.08642298728227615, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 394780 + }, + { + "epoch": 1.5261477323684496, + "grad_norm": 0.10410486906766891, + "learning_rate": 0.002, + "loss": 2.3514, + "step": 394790 + }, + { + "epoch": 1.5261863895718328, + "grad_norm": 0.10142649710178375, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 394800 + }, + { + "epoch": 1.526225046775216, + "grad_norm": 0.1226276382803917, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 394810 + }, + { + "epoch": 1.5262637039785993, + "grad_norm": 0.09792998433113098, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 394820 + }, + { + "epoch": 1.5263023611819828, + "grad_norm": 0.1416461020708084, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 394830 + }, + { + "epoch": 1.526341018385366, + "grad_norm": 0.1003195121884346, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 394840 + }, + { + "epoch": 1.5263796755887493, + "grad_norm": 0.11260812729597092, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 394850 + }, + { + "epoch": 1.5264183327921326, + "grad_norm": 0.09097535163164139, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 394860 + }, + { + "epoch": 1.5264569899955158, + "grad_norm": 0.10058214515447617, + "learning_rate": 0.002, + "loss": 2.33, + "step": 394870 + }, + { + "epoch": 1.526495647198899, + "grad_norm": 0.11305031180381775, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 394880 + }, + { + "epoch": 1.5265343044022823, + "grad_norm": 0.11100471764802933, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 394890 + }, + { + "epoch": 1.5265729616056656, + "grad_norm": 0.09917481243610382, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 394900 + }, + { + "epoch": 1.5266116188090488, + "grad_norm": 0.16251282393932343, + "learning_rate": 0.002, + "loss": 2.3175, + "step": 394910 + }, + { + "epoch": 1.526650276012432, + "grad_norm": 0.09185020625591278, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 394920 + }, + { + "epoch": 1.5266889332158153, + "grad_norm": 0.09810732305049896, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 394930 + }, + { + "epoch": 1.5267275904191986, + "grad_norm": 0.09562589228153229, + "learning_rate": 0.002, + "loss": 2.3206, + "step": 394940 + }, + { + "epoch": 1.5267662476225818, + "grad_norm": 0.0937427207827568, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 394950 + }, + { + "epoch": 1.5268049048259653, + "grad_norm": 0.10749828815460205, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 394960 + }, + { + "epoch": 1.5268435620293486, + "grad_norm": 0.10349355638027191, + "learning_rate": 0.002, + "loss": 2.3345, + "step": 394970 + }, + { + "epoch": 1.5268822192327318, + "grad_norm": 0.08923173695802689, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 394980 + }, + { + "epoch": 1.526920876436115, + "grad_norm": 0.09006751328706741, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 394990 + }, + { + "epoch": 1.5269595336394985, + "grad_norm": 0.0997283011674881, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 395000 + }, + { + "epoch": 1.5269981908428818, + "grad_norm": 0.106237031519413, + "learning_rate": 0.002, + "loss": 2.336, + "step": 395010 + }, + { + "epoch": 1.527036848046265, + "grad_norm": 0.1262180507183075, + "learning_rate": 0.002, + "loss": 2.3576, + "step": 395020 + }, + { + "epoch": 1.5270755052496483, + "grad_norm": 0.09994278103113174, + "learning_rate": 0.002, + "loss": 2.3478, + "step": 395030 + }, + { + "epoch": 1.5271141624530316, + "grad_norm": 0.12491146475076675, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 395040 + }, + { + "epoch": 1.5271528196564148, + "grad_norm": 0.09989090263843536, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 395050 + }, + { + "epoch": 1.527191476859798, + "grad_norm": 0.09762978553771973, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 395060 + }, + { + "epoch": 1.5272301340631813, + "grad_norm": 0.10708542168140411, + "learning_rate": 0.002, + "loss": 2.336, + "step": 395070 + }, + { + "epoch": 1.5272687912665646, + "grad_norm": 0.10440186411142349, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 395080 + }, + { + "epoch": 1.5273074484699478, + "grad_norm": 0.10143192112445831, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 395090 + }, + { + "epoch": 1.527346105673331, + "grad_norm": 0.10683754086494446, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 395100 + }, + { + "epoch": 1.5273847628767143, + "grad_norm": 0.10886263102293015, + "learning_rate": 0.002, + "loss": 2.339, + "step": 395110 + }, + { + "epoch": 1.5274234200800976, + "grad_norm": 0.10051087290048599, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 395120 + }, + { + "epoch": 1.527462077283481, + "grad_norm": 0.10115080326795578, + "learning_rate": 0.002, + "loss": 2.346, + "step": 395130 + }, + { + "epoch": 1.5275007344868643, + "grad_norm": 0.09336403757333755, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 395140 + }, + { + "epoch": 1.5275393916902475, + "grad_norm": 0.12161771953105927, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 395150 + }, + { + "epoch": 1.5275780488936308, + "grad_norm": 0.10998454689979553, + "learning_rate": 0.002, + "loss": 2.3388, + "step": 395160 + }, + { + "epoch": 1.5276167060970143, + "grad_norm": 0.10109839588403702, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 395170 + }, + { + "epoch": 1.5276553633003975, + "grad_norm": 0.13047313690185547, + "learning_rate": 0.002, + "loss": 2.3192, + "step": 395180 + }, + { + "epoch": 1.5276940205037808, + "grad_norm": 0.09682628512382507, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 395190 + }, + { + "epoch": 1.527732677707164, + "grad_norm": 0.1216890811920166, + "learning_rate": 0.002, + "loss": 2.3199, + "step": 395200 + }, + { + "epoch": 1.5277713349105473, + "grad_norm": 0.10351161658763885, + "learning_rate": 0.002, + "loss": 2.3375, + "step": 395210 + }, + { + "epoch": 1.5278099921139305, + "grad_norm": 0.10402938723564148, + "learning_rate": 0.002, + "loss": 2.3179, + "step": 395220 + }, + { + "epoch": 1.5278486493173138, + "grad_norm": 0.1108585074543953, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 395230 + }, + { + "epoch": 1.527887306520697, + "grad_norm": 0.10756135731935501, + "learning_rate": 0.002, + "loss": 2.324, + "step": 395240 + }, + { + "epoch": 1.5279259637240803, + "grad_norm": 0.10325378179550171, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 395250 + }, + { + "epoch": 1.5279646209274635, + "grad_norm": 0.09820621460676193, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 395260 + }, + { + "epoch": 1.5280032781308468, + "grad_norm": 0.17582575976848602, + "learning_rate": 0.002, + "loss": 2.3366, + "step": 395270 + }, + { + "epoch": 1.52804193533423, + "grad_norm": 0.0954713225364685, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 395280 + }, + { + "epoch": 1.5280805925376133, + "grad_norm": 0.10267660766839981, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 395290 + }, + { + "epoch": 1.5281192497409968, + "grad_norm": 0.16547106206417084, + "learning_rate": 0.002, + "loss": 2.318, + "step": 395300 + }, + { + "epoch": 1.52815790694438, + "grad_norm": 0.11562098562717438, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 395310 + }, + { + "epoch": 1.5281965641477633, + "grad_norm": 0.09530088305473328, + "learning_rate": 0.002, + "loss": 2.335, + "step": 395320 + }, + { + "epoch": 1.5282352213511465, + "grad_norm": 0.10034076869487762, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 395330 + }, + { + "epoch": 1.52827387855453, + "grad_norm": 0.11918345093727112, + "learning_rate": 0.002, + "loss": 2.311, + "step": 395340 + }, + { + "epoch": 1.5283125357579133, + "grad_norm": 0.11170210689306259, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 395350 + }, + { + "epoch": 1.5283511929612965, + "grad_norm": 0.09193076193332672, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 395360 + }, + { + "epoch": 1.5283898501646798, + "grad_norm": 0.10835543274879456, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 395370 + }, + { + "epoch": 1.528428507368063, + "grad_norm": 0.1091921404004097, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 395380 + }, + { + "epoch": 1.5284671645714463, + "grad_norm": 0.11564579606056213, + "learning_rate": 0.002, + "loss": 2.3215, + "step": 395390 + }, + { + "epoch": 1.5285058217748295, + "grad_norm": 0.09592635929584503, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 395400 + }, + { + "epoch": 1.5285444789782128, + "grad_norm": 0.1146683543920517, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 395410 + }, + { + "epoch": 1.528583136181596, + "grad_norm": 0.10991352051496506, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 395420 + }, + { + "epoch": 1.5286217933849793, + "grad_norm": 0.08941232413053513, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 395430 + }, + { + "epoch": 1.5286604505883625, + "grad_norm": 0.10512373596429825, + "learning_rate": 0.002, + "loss": 2.3322, + "step": 395440 + }, + { + "epoch": 1.5286991077917458, + "grad_norm": 0.10848497599363327, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 395450 + }, + { + "epoch": 1.528737764995129, + "grad_norm": 0.1271876096725464, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 395460 + }, + { + "epoch": 1.5287764221985125, + "grad_norm": 0.11504293978214264, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 395470 + }, + { + "epoch": 1.5288150794018958, + "grad_norm": 0.11468256264925003, + "learning_rate": 0.002, + "loss": 2.3468, + "step": 395480 + }, + { + "epoch": 1.528853736605279, + "grad_norm": 0.1417941004037857, + "learning_rate": 0.002, + "loss": 2.3292, + "step": 395490 + }, + { + "epoch": 1.5288923938086623, + "grad_norm": 0.09736809879541397, + "learning_rate": 0.002, + "loss": 2.3311, + "step": 395500 + }, + { + "epoch": 1.5289310510120457, + "grad_norm": 0.09892872720956802, + "learning_rate": 0.002, + "loss": 2.3439, + "step": 395510 + }, + { + "epoch": 1.528969708215429, + "grad_norm": 0.09419476240873337, + "learning_rate": 0.002, + "loss": 2.3539, + "step": 395520 + }, + { + "epoch": 1.5290083654188122, + "grad_norm": 0.10254430770874023, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 395530 + }, + { + "epoch": 1.5290470226221955, + "grad_norm": 0.10115151852369308, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 395540 + }, + { + "epoch": 1.5290856798255787, + "grad_norm": 0.09870092570781708, + "learning_rate": 0.002, + "loss": 2.331, + "step": 395550 + }, + { + "epoch": 1.529124337028962, + "grad_norm": 0.09946366399526596, + "learning_rate": 0.002, + "loss": 2.325, + "step": 395560 + }, + { + "epoch": 1.5291629942323453, + "grad_norm": 0.09750016778707504, + "learning_rate": 0.002, + "loss": 2.335, + "step": 395570 + }, + { + "epoch": 1.5292016514357285, + "grad_norm": 0.09980180114507675, + "learning_rate": 0.002, + "loss": 2.34, + "step": 395580 + }, + { + "epoch": 1.5292403086391118, + "grad_norm": 0.10483616590499878, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 395590 + }, + { + "epoch": 1.529278965842495, + "grad_norm": 0.09471538662910461, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 395600 + }, + { + "epoch": 1.5293176230458783, + "grad_norm": 0.1223699301481247, + "learning_rate": 0.002, + "loss": 2.3241, + "step": 395610 + }, + { + "epoch": 1.5293562802492615, + "grad_norm": 0.09756605327129364, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 395620 + }, + { + "epoch": 1.5293949374526448, + "grad_norm": 0.10068816691637039, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 395630 + }, + { + "epoch": 1.5294335946560282, + "grad_norm": 0.10722284764051437, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 395640 + }, + { + "epoch": 1.5294722518594115, + "grad_norm": 0.08979428559541702, + "learning_rate": 0.002, + "loss": 2.3303, + "step": 395650 + }, + { + "epoch": 1.5295109090627947, + "grad_norm": 0.09775793552398682, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 395660 + }, + { + "epoch": 1.529549566266178, + "grad_norm": 0.10150935500860214, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 395670 + }, + { + "epoch": 1.5295882234695615, + "grad_norm": 0.1311400681734085, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 395680 + }, + { + "epoch": 1.5296268806729447, + "grad_norm": 0.11002326011657715, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 395690 + }, + { + "epoch": 1.529665537876328, + "grad_norm": 0.11130055040121078, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 395700 + }, + { + "epoch": 1.5297041950797112, + "grad_norm": 0.10423920303583145, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 395710 + }, + { + "epoch": 1.5297428522830945, + "grad_norm": 0.10642270743846893, + "learning_rate": 0.002, + "loss": 2.3191, + "step": 395720 + }, + { + "epoch": 1.5297815094864777, + "grad_norm": 0.09561317414045334, + "learning_rate": 0.002, + "loss": 2.3335, + "step": 395730 + }, + { + "epoch": 1.529820166689861, + "grad_norm": 0.10052410513162613, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 395740 + }, + { + "epoch": 1.5298588238932442, + "grad_norm": 0.12037631869316101, + "learning_rate": 0.002, + "loss": 2.342, + "step": 395750 + }, + { + "epoch": 1.5298974810966275, + "grad_norm": 0.08717092871665955, + "learning_rate": 0.002, + "loss": 2.3287, + "step": 395760 + }, + { + "epoch": 1.5299361383000107, + "grad_norm": 0.10462912172079086, + "learning_rate": 0.002, + "loss": 2.3141, + "step": 395770 + }, + { + "epoch": 1.529974795503394, + "grad_norm": 0.09321984648704529, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 395780 + }, + { + "epoch": 1.5300134527067772, + "grad_norm": 0.39973539113998413, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 395790 + }, + { + "epoch": 1.5300521099101605, + "grad_norm": 0.1333955079317093, + "learning_rate": 0.002, + "loss": 2.3516, + "step": 395800 + }, + { + "epoch": 1.530090767113544, + "grad_norm": 0.10980807989835739, + "learning_rate": 0.002, + "loss": 2.3544, + "step": 395810 + }, + { + "epoch": 1.5301294243169272, + "grad_norm": 0.09195207804441452, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 395820 + }, + { + "epoch": 1.5301680815203105, + "grad_norm": 0.0882343202829361, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 395830 + }, + { + "epoch": 1.5302067387236937, + "grad_norm": 0.11924585700035095, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 395840 + }, + { + "epoch": 1.5302453959270772, + "grad_norm": 0.10046351701021194, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 395850 + }, + { + "epoch": 1.5302840531304605, + "grad_norm": 0.09576009958982468, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 395860 + }, + { + "epoch": 1.5303227103338437, + "grad_norm": 0.09291372448205948, + "learning_rate": 0.002, + "loss": 2.3273, + "step": 395870 + }, + { + "epoch": 1.530361367537227, + "grad_norm": 0.12952889502048492, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 395880 + }, + { + "epoch": 1.5304000247406102, + "grad_norm": 0.11749241501092911, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 395890 + }, + { + "epoch": 1.5304386819439935, + "grad_norm": 0.10607244074344635, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 395900 + }, + { + "epoch": 1.5304773391473767, + "grad_norm": 0.1116856187582016, + "learning_rate": 0.002, + "loss": 2.337, + "step": 395910 + }, + { + "epoch": 1.53051599635076, + "grad_norm": 0.10862328857183456, + "learning_rate": 0.002, + "loss": 2.3393, + "step": 395920 + }, + { + "epoch": 1.5305546535541432, + "grad_norm": 0.10131102055311203, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 395930 + }, + { + "epoch": 1.5305933107575265, + "grad_norm": 0.17241306602954865, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 395940 + }, + { + "epoch": 1.5306319679609097, + "grad_norm": 0.09801255911588669, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 395950 + }, + { + "epoch": 1.530670625164293, + "grad_norm": 0.10967551171779633, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 395960 + }, + { + "epoch": 1.5307092823676762, + "grad_norm": 0.11190198361873627, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 395970 + }, + { + "epoch": 1.5307479395710597, + "grad_norm": 0.10620824247598648, + "learning_rate": 0.002, + "loss": 2.3214, + "step": 395980 + }, + { + "epoch": 1.530786596774443, + "grad_norm": 0.09396962076425552, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 395990 + }, + { + "epoch": 1.5308252539778262, + "grad_norm": 0.10517092794179916, + "learning_rate": 0.002, + "loss": 2.3473, + "step": 396000 + }, + { + "epoch": 1.5308639111812095, + "grad_norm": 0.09492594748735428, + "learning_rate": 0.002, + "loss": 2.3504, + "step": 396010 + }, + { + "epoch": 1.530902568384593, + "grad_norm": 0.09977784752845764, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 396020 + }, + { + "epoch": 1.5309412255879762, + "grad_norm": 0.11173679679632187, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 396030 + }, + { + "epoch": 1.5309798827913594, + "grad_norm": 0.0971636101603508, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 396040 + }, + { + "epoch": 1.5310185399947427, + "grad_norm": 0.0979008898139, + "learning_rate": 0.002, + "loss": 2.3216, + "step": 396050 + }, + { + "epoch": 1.531057197198126, + "grad_norm": 0.1309940665960312, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 396060 + }, + { + "epoch": 1.5310958544015092, + "grad_norm": 0.10882627218961716, + "learning_rate": 0.002, + "loss": 2.3186, + "step": 396070 + }, + { + "epoch": 1.5311345116048924, + "grad_norm": 0.09825605899095535, + "learning_rate": 0.002, + "loss": 2.335, + "step": 396080 + }, + { + "epoch": 1.5311731688082757, + "grad_norm": 0.10115810483694077, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 396090 + }, + { + "epoch": 1.531211826011659, + "grad_norm": 0.10364280641078949, + "learning_rate": 0.002, + "loss": 2.3226, + "step": 396100 + }, + { + "epoch": 1.5312504832150422, + "grad_norm": 0.09675593674182892, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 396110 + }, + { + "epoch": 1.5312891404184255, + "grad_norm": 0.12637415528297424, + "learning_rate": 0.002, + "loss": 2.3596, + "step": 396120 + }, + { + "epoch": 1.5313277976218087, + "grad_norm": 0.12695544958114624, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 396130 + }, + { + "epoch": 1.5313664548251922, + "grad_norm": 0.11234147101640701, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 396140 + }, + { + "epoch": 1.5314051120285754, + "grad_norm": 0.09313137084245682, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 396150 + }, + { + "epoch": 1.5314437692319587, + "grad_norm": 0.09089941531419754, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 396160 + }, + { + "epoch": 1.531482426435342, + "grad_norm": 0.09164874255657196, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 396170 + }, + { + "epoch": 1.5315210836387252, + "grad_norm": 0.10642282664775848, + "learning_rate": 0.002, + "loss": 2.3341, + "step": 396180 + }, + { + "epoch": 1.5315597408421087, + "grad_norm": 0.11467402428388596, + "learning_rate": 0.002, + "loss": 2.3171, + "step": 396190 + }, + { + "epoch": 1.531598398045492, + "grad_norm": 0.12948563694953918, + "learning_rate": 0.002, + "loss": 2.3101, + "step": 396200 + }, + { + "epoch": 1.5316370552488752, + "grad_norm": 0.1263953000307083, + "learning_rate": 0.002, + "loss": 2.318, + "step": 396210 + }, + { + "epoch": 1.5316757124522584, + "grad_norm": 0.10095341503620148, + "learning_rate": 0.002, + "loss": 2.329, + "step": 396220 + }, + { + "epoch": 1.5317143696556417, + "grad_norm": 0.09533122181892395, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 396230 + }, + { + "epoch": 1.531753026859025, + "grad_norm": 0.10737673193216324, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 396240 + }, + { + "epoch": 1.5317916840624082, + "grad_norm": 0.12329886853694916, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 396250 + }, + { + "epoch": 1.5318303412657914, + "grad_norm": 0.11738262325525284, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 396260 + }, + { + "epoch": 1.5318689984691747, + "grad_norm": 0.1225343570113182, + "learning_rate": 0.002, + "loss": 2.3151, + "step": 396270 + }, + { + "epoch": 1.531907655672558, + "grad_norm": 0.10273466259241104, + "learning_rate": 0.002, + "loss": 2.337, + "step": 396280 + }, + { + "epoch": 1.5319463128759412, + "grad_norm": 0.10513811558485031, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 396290 + }, + { + "epoch": 1.5319849700793244, + "grad_norm": 0.12400317937135696, + "learning_rate": 0.002, + "loss": 2.339, + "step": 396300 + }, + { + "epoch": 1.532023627282708, + "grad_norm": 0.12455243617296219, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 396310 + }, + { + "epoch": 1.5320622844860912, + "grad_norm": 0.20735898613929749, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 396320 + }, + { + "epoch": 1.5321009416894744, + "grad_norm": 0.10270749777555466, + "learning_rate": 0.002, + "loss": 2.3107, + "step": 396330 + }, + { + "epoch": 1.5321395988928577, + "grad_norm": 0.12237662076950073, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 396340 + }, + { + "epoch": 1.532178256096241, + "grad_norm": 0.10366332530975342, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 396350 + }, + { + "epoch": 1.5322169132996244, + "grad_norm": 0.09420368820428848, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 396360 + }, + { + "epoch": 1.5322555705030076, + "grad_norm": 0.0979878306388855, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 396370 + }, + { + "epoch": 1.532294227706391, + "grad_norm": 0.10520835965871811, + "learning_rate": 0.002, + "loss": 2.3383, + "step": 396380 + }, + { + "epoch": 1.5323328849097742, + "grad_norm": 0.0948127955198288, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 396390 + }, + { + "epoch": 1.5323715421131574, + "grad_norm": 0.11359903216362, + "learning_rate": 0.002, + "loss": 2.3412, + "step": 396400 + }, + { + "epoch": 1.5324101993165407, + "grad_norm": 0.11875665932893753, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 396410 + }, + { + "epoch": 1.532448856519924, + "grad_norm": 0.1266757994890213, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 396420 + }, + { + "epoch": 1.5324875137233072, + "grad_norm": 0.10371547937393188, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 396430 + }, + { + "epoch": 1.5325261709266904, + "grad_norm": 0.10963073372840881, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 396440 + }, + { + "epoch": 1.5325648281300737, + "grad_norm": 0.0958590880036354, + "learning_rate": 0.002, + "loss": 2.3505, + "step": 396450 + }, + { + "epoch": 1.532603485333457, + "grad_norm": 0.11888827383518219, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 396460 + }, + { + "epoch": 1.5326421425368402, + "grad_norm": 0.09254291653633118, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 396470 + }, + { + "epoch": 1.5326807997402236, + "grad_norm": 0.10755075514316559, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 396480 + }, + { + "epoch": 1.532719456943607, + "grad_norm": 0.1045212596654892, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 396490 + }, + { + "epoch": 1.5327581141469901, + "grad_norm": 0.1087140217423439, + "learning_rate": 0.002, + "loss": 2.3185, + "step": 396500 + }, + { + "epoch": 1.5327967713503734, + "grad_norm": 0.0878644585609436, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 396510 + }, + { + "epoch": 1.5328354285537569, + "grad_norm": 0.12269987165927887, + "learning_rate": 0.002, + "loss": 2.3212, + "step": 396520 + }, + { + "epoch": 1.5328740857571401, + "grad_norm": 0.11972922086715698, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 396530 + }, + { + "epoch": 1.5329127429605234, + "grad_norm": 0.11032579094171524, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 396540 + }, + { + "epoch": 1.5329514001639066, + "grad_norm": 0.11419215053319931, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 396550 + }, + { + "epoch": 1.5329900573672899, + "grad_norm": 0.10795775800943375, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 396560 + }, + { + "epoch": 1.5330287145706731, + "grad_norm": 0.105446957051754, + "learning_rate": 0.002, + "loss": 2.3336, + "step": 396570 + }, + { + "epoch": 1.5330673717740564, + "grad_norm": 0.1169656291604042, + "learning_rate": 0.002, + "loss": 2.3237, + "step": 396580 + }, + { + "epoch": 1.5331060289774396, + "grad_norm": 0.09742467850446701, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 396590 + }, + { + "epoch": 1.533144686180823, + "grad_norm": 0.12050811201334, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 396600 + }, + { + "epoch": 1.5331833433842061, + "grad_norm": 0.10963039100170135, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 396610 + }, + { + "epoch": 1.5332220005875894, + "grad_norm": 0.09053657948970795, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 396620 + }, + { + "epoch": 1.5332606577909726, + "grad_norm": 0.3095700442790985, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 396630 + }, + { + "epoch": 1.533299314994356, + "grad_norm": 0.09944092482328415, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 396640 + }, + { + "epoch": 1.5333379721977394, + "grad_norm": 0.1099468246102333, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 396650 + }, + { + "epoch": 1.5333766294011226, + "grad_norm": 0.0962558463215828, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 396660 + }, + { + "epoch": 1.5334152866045059, + "grad_norm": 0.12188946455717087, + "learning_rate": 0.002, + "loss": 2.3162, + "step": 396670 + }, + { + "epoch": 1.5334539438078891, + "grad_norm": 0.1150522455573082, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 396680 + }, + { + "epoch": 1.5334926010112726, + "grad_norm": 0.10602287948131561, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 396690 + }, + { + "epoch": 1.5335312582146559, + "grad_norm": 0.09643089771270752, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 396700 + }, + { + "epoch": 1.533569915418039, + "grad_norm": 0.12320626527070999, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 396710 + }, + { + "epoch": 1.5336085726214224, + "grad_norm": 0.10964152216911316, + "learning_rate": 0.002, + "loss": 2.3442, + "step": 396720 + }, + { + "epoch": 1.5336472298248056, + "grad_norm": 0.10293164104223251, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 396730 + }, + { + "epoch": 1.5336858870281889, + "grad_norm": 0.0910605862736702, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 396740 + }, + { + "epoch": 1.5337245442315721, + "grad_norm": 0.10125657171010971, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 396750 + }, + { + "epoch": 1.5337632014349554, + "grad_norm": 0.09557554870843887, + "learning_rate": 0.002, + "loss": 2.3323, + "step": 396760 + }, + { + "epoch": 1.5338018586383386, + "grad_norm": 0.09974601864814758, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 396770 + }, + { + "epoch": 1.5338405158417219, + "grad_norm": 0.09233753383159637, + "learning_rate": 0.002, + "loss": 2.3413, + "step": 396780 + }, + { + "epoch": 1.5338791730451051, + "grad_norm": 0.10616888850927353, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 396790 + }, + { + "epoch": 1.5339178302484884, + "grad_norm": 0.1244562566280365, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 396800 + }, + { + "epoch": 1.5339564874518716, + "grad_norm": 0.11564454436302185, + "learning_rate": 0.002, + "loss": 2.3332, + "step": 396810 + }, + { + "epoch": 1.533995144655255, + "grad_norm": 0.12104111909866333, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 396820 + }, + { + "epoch": 1.5340338018586384, + "grad_norm": 0.10878870636224747, + "learning_rate": 0.002, + "loss": 2.3204, + "step": 396830 + }, + { + "epoch": 1.5340724590620216, + "grad_norm": 0.10166596621274948, + "learning_rate": 0.002, + "loss": 2.3294, + "step": 396840 + }, + { + "epoch": 1.5341111162654049, + "grad_norm": 0.10973954945802689, + "learning_rate": 0.002, + "loss": 2.3304, + "step": 396850 + }, + { + "epoch": 1.5341497734687883, + "grad_norm": 0.09237612038850784, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 396860 + }, + { + "epoch": 1.5341884306721716, + "grad_norm": 0.0972457304596901, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 396870 + }, + { + "epoch": 1.5342270878755548, + "grad_norm": 0.10720162093639374, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 396880 + }, + { + "epoch": 1.534265745078938, + "grad_norm": 0.10426491498947144, + "learning_rate": 0.002, + "loss": 2.3194, + "step": 396890 + }, + { + "epoch": 1.5343044022823213, + "grad_norm": 0.1059766411781311, + "learning_rate": 0.002, + "loss": 2.332, + "step": 396900 + }, + { + "epoch": 1.5343430594857046, + "grad_norm": 0.09205926954746246, + "learning_rate": 0.002, + "loss": 2.3233, + "step": 396910 + }, + { + "epoch": 1.5343817166890878, + "grad_norm": 0.09527570009231567, + "learning_rate": 0.002, + "loss": 2.3376, + "step": 396920 + }, + { + "epoch": 1.534420373892471, + "grad_norm": 0.11123289912939072, + "learning_rate": 0.002, + "loss": 2.328, + "step": 396930 + }, + { + "epoch": 1.5344590310958544, + "grad_norm": 0.12551504373550415, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 396940 + }, + { + "epoch": 1.5344976882992376, + "grad_norm": 0.09078148007392883, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 396950 + }, + { + "epoch": 1.5345363455026209, + "grad_norm": 0.09621188044548035, + "learning_rate": 0.002, + "loss": 2.3494, + "step": 396960 + }, + { + "epoch": 1.534575002706004, + "grad_norm": 0.12395072728395462, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 396970 + }, + { + "epoch": 1.5346136599093874, + "grad_norm": 0.10399714857339859, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 396980 + }, + { + "epoch": 1.5346523171127708, + "grad_norm": 0.12199089676141739, + "learning_rate": 0.002, + "loss": 2.32, + "step": 396990 + }, + { + "epoch": 1.534690974316154, + "grad_norm": 0.12163523584604263, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 397000 + }, + { + "epoch": 1.5347296315195373, + "grad_norm": 0.10492004454135895, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 397010 + }, + { + "epoch": 1.5347682887229206, + "grad_norm": 0.0947810560464859, + "learning_rate": 0.002, + "loss": 2.3591, + "step": 397020 + }, + { + "epoch": 1.534806945926304, + "grad_norm": 0.10641422867774963, + "learning_rate": 0.002, + "loss": 2.317, + "step": 397030 + }, + { + "epoch": 1.5348456031296873, + "grad_norm": 0.1084209531545639, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 397040 + }, + { + "epoch": 1.5348842603330706, + "grad_norm": 0.11186496913433075, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 397050 + }, + { + "epoch": 1.5349229175364538, + "grad_norm": 0.11108078062534332, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 397060 + }, + { + "epoch": 1.534961574739837, + "grad_norm": 0.08751664310693741, + "learning_rate": 0.002, + "loss": 2.3257, + "step": 397070 + }, + { + "epoch": 1.5350002319432203, + "grad_norm": 0.13249637186527252, + "learning_rate": 0.002, + "loss": 2.3313, + "step": 397080 + }, + { + "epoch": 1.5350388891466036, + "grad_norm": 0.1015789583325386, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 397090 + }, + { + "epoch": 1.5350775463499868, + "grad_norm": 0.10492010414600372, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 397100 + }, + { + "epoch": 1.53511620355337, + "grad_norm": 0.11029554158449173, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 397110 + }, + { + "epoch": 1.5351548607567533, + "grad_norm": 0.0864063948392868, + "learning_rate": 0.002, + "loss": 2.327, + "step": 397120 + }, + { + "epoch": 1.5351935179601366, + "grad_norm": 0.11806897073984146, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 397130 + }, + { + "epoch": 1.5352321751635198, + "grad_norm": 0.11673596501350403, + "learning_rate": 0.002, + "loss": 2.3503, + "step": 397140 + }, + { + "epoch": 1.535270832366903, + "grad_norm": 0.10795406252145767, + "learning_rate": 0.002, + "loss": 2.3231, + "step": 397150 + }, + { + "epoch": 1.5353094895702866, + "grad_norm": 0.11805335432291031, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 397160 + }, + { + "epoch": 1.5353481467736698, + "grad_norm": 0.10892137140035629, + "learning_rate": 0.002, + "loss": 2.3262, + "step": 397170 + }, + { + "epoch": 1.535386803977053, + "grad_norm": 0.1233871653676033, + "learning_rate": 0.002, + "loss": 2.3197, + "step": 397180 + }, + { + "epoch": 1.5354254611804363, + "grad_norm": 0.26964542269706726, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 397190 + }, + { + "epoch": 1.5354641183838198, + "grad_norm": 0.09856158494949341, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 397200 + }, + { + "epoch": 1.535502775587203, + "grad_norm": 0.12949031591415405, + "learning_rate": 0.002, + "loss": 2.3326, + "step": 397210 + }, + { + "epoch": 1.5355414327905863, + "grad_norm": 0.11520420014858246, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 397220 + }, + { + "epoch": 1.5355800899939696, + "grad_norm": 0.1231561005115509, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 397230 + }, + { + "epoch": 1.5356187471973528, + "grad_norm": 0.1025674045085907, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 397240 + }, + { + "epoch": 1.535657404400736, + "grad_norm": 0.10563423484563828, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 397250 + }, + { + "epoch": 1.5356960616041193, + "grad_norm": 0.11561419069766998, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 397260 + }, + { + "epoch": 1.5357347188075026, + "grad_norm": 0.09945831447839737, + "learning_rate": 0.002, + "loss": 2.3488, + "step": 397270 + }, + { + "epoch": 1.5357733760108858, + "grad_norm": 0.11000964045524597, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 397280 + }, + { + "epoch": 1.535812033214269, + "grad_norm": 0.11010735481977463, + "learning_rate": 0.002, + "loss": 2.3411, + "step": 397290 + }, + { + "epoch": 1.5358506904176523, + "grad_norm": 0.11759929358959198, + "learning_rate": 0.002, + "loss": 2.3264, + "step": 397300 + }, + { + "epoch": 1.5358893476210356, + "grad_norm": 0.10051693022251129, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 397310 + }, + { + "epoch": 1.5359280048244188, + "grad_norm": 0.10334565490484238, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 397320 + }, + { + "epoch": 1.5359666620278023, + "grad_norm": 0.11563760787248611, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 397330 + }, + { + "epoch": 1.5360053192311856, + "grad_norm": 0.1098022386431694, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 397340 + }, + { + "epoch": 1.5360439764345688, + "grad_norm": 0.10105536133050919, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 397350 + }, + { + "epoch": 1.536082633637952, + "grad_norm": 0.09732984006404877, + "learning_rate": 0.002, + "loss": 2.3397, + "step": 397360 + }, + { + "epoch": 1.5361212908413355, + "grad_norm": 0.10375712811946869, + "learning_rate": 0.002, + "loss": 2.3327, + "step": 397370 + }, + { + "epoch": 1.5361599480447188, + "grad_norm": 0.13661228120326996, + "learning_rate": 0.002, + "loss": 2.3512, + "step": 397380 + }, + { + "epoch": 1.536198605248102, + "grad_norm": 0.09348346292972565, + "learning_rate": 0.002, + "loss": 2.3414, + "step": 397390 + }, + { + "epoch": 1.5362372624514853, + "grad_norm": 0.10297410190105438, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 397400 + }, + { + "epoch": 1.5362759196548685, + "grad_norm": 0.10874597728252411, + "learning_rate": 0.002, + "loss": 2.3299, + "step": 397410 + }, + { + "epoch": 1.5363145768582518, + "grad_norm": 0.12305815517902374, + "learning_rate": 0.002, + "loss": 2.3135, + "step": 397420 + }, + { + "epoch": 1.536353234061635, + "grad_norm": 0.09961279481649399, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 397430 + }, + { + "epoch": 1.5363918912650183, + "grad_norm": 0.0989786684513092, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 397440 + }, + { + "epoch": 1.5364305484684015, + "grad_norm": 0.10146909207105637, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 397450 + }, + { + "epoch": 1.5364692056717848, + "grad_norm": 0.10440550744533539, + "learning_rate": 0.002, + "loss": 2.3449, + "step": 397460 + }, + { + "epoch": 1.536507862875168, + "grad_norm": 0.11641864478588104, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 397470 + }, + { + "epoch": 1.5365465200785513, + "grad_norm": 0.0965433195233345, + "learning_rate": 0.002, + "loss": 2.3154, + "step": 397480 + }, + { + "epoch": 1.5365851772819346, + "grad_norm": 0.1075579971075058, + "learning_rate": 0.002, + "loss": 2.3489, + "step": 397490 + }, + { + "epoch": 1.536623834485318, + "grad_norm": 0.09854072332382202, + "learning_rate": 0.002, + "loss": 2.3349, + "step": 397500 + }, + { + "epoch": 1.5366624916887013, + "grad_norm": 0.08956795185804367, + "learning_rate": 0.002, + "loss": 2.3406, + "step": 397510 + }, + { + "epoch": 1.5367011488920845, + "grad_norm": 0.11217138916254044, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 397520 + }, + { + "epoch": 1.5367398060954678, + "grad_norm": 0.11101838946342468, + "learning_rate": 0.002, + "loss": 2.319, + "step": 397530 + }, + { + "epoch": 1.5367784632988513, + "grad_norm": 0.10932407528162003, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 397540 + }, + { + "epoch": 1.5368171205022345, + "grad_norm": 0.10210220515727997, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 397550 + }, + { + "epoch": 1.5368557777056178, + "grad_norm": 0.11524409055709839, + "learning_rate": 0.002, + "loss": 2.3266, + "step": 397560 + }, + { + "epoch": 1.536894434909001, + "grad_norm": 0.1281406432390213, + "learning_rate": 0.002, + "loss": 2.3242, + "step": 397570 + }, + { + "epoch": 1.5369330921123843, + "grad_norm": 0.09939172863960266, + "learning_rate": 0.002, + "loss": 2.3295, + "step": 397580 + }, + { + "epoch": 1.5369717493157675, + "grad_norm": 0.09247737377882004, + "learning_rate": 0.002, + "loss": 2.337, + "step": 397590 + }, + { + "epoch": 1.5370104065191508, + "grad_norm": 0.09294746816158295, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 397600 + }, + { + "epoch": 1.537049063722534, + "grad_norm": 0.13566049933433533, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 397610 + }, + { + "epoch": 1.5370877209259173, + "grad_norm": 0.10132917016744614, + "learning_rate": 0.002, + "loss": 2.3267, + "step": 397620 + }, + { + "epoch": 1.5371263781293005, + "grad_norm": 0.09911826252937317, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 397630 + }, + { + "epoch": 1.5371650353326838, + "grad_norm": 0.0962437316775322, + "learning_rate": 0.002, + "loss": 2.3153, + "step": 397640 + }, + { + "epoch": 1.537203692536067, + "grad_norm": 0.11543872952461243, + "learning_rate": 0.002, + "loss": 2.3324, + "step": 397650 + }, + { + "epoch": 1.5372423497394503, + "grad_norm": 0.10791458934545517, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 397660 + }, + { + "epoch": 1.5372810069428338, + "grad_norm": 0.09911391139030457, + "learning_rate": 0.002, + "loss": 2.3562, + "step": 397670 + }, + { + "epoch": 1.537319664146217, + "grad_norm": 0.10981152206659317, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 397680 + }, + { + "epoch": 1.5373583213496003, + "grad_norm": 0.10167837142944336, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 397690 + }, + { + "epoch": 1.5373969785529835, + "grad_norm": 0.09902778267860413, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 397700 + }, + { + "epoch": 1.537435635756367, + "grad_norm": 0.09959257394075394, + "learning_rate": 0.002, + "loss": 2.3342, + "step": 397710 + }, + { + "epoch": 1.5374742929597502, + "grad_norm": 0.1057918518781662, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 397720 + }, + { + "epoch": 1.5375129501631335, + "grad_norm": 0.13278275728225708, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 397730 + }, + { + "epoch": 1.5375516073665167, + "grad_norm": 0.11418651789426804, + "learning_rate": 0.002, + "loss": 2.3437, + "step": 397740 + }, + { + "epoch": 1.5375902645699, + "grad_norm": 0.10052768886089325, + "learning_rate": 0.002, + "loss": 2.3315, + "step": 397750 + }, + { + "epoch": 1.5376289217732833, + "grad_norm": 0.09811482578516006, + "learning_rate": 0.002, + "loss": 2.3265, + "step": 397760 + }, + { + "epoch": 1.5376675789766665, + "grad_norm": 0.15639714896678925, + "learning_rate": 0.002, + "loss": 2.3134, + "step": 397770 + }, + { + "epoch": 1.5377062361800498, + "grad_norm": 0.10803427547216415, + "learning_rate": 0.002, + "loss": 2.3346, + "step": 397780 + }, + { + "epoch": 1.537744893383433, + "grad_norm": 0.09680962562561035, + "learning_rate": 0.002, + "loss": 2.3347, + "step": 397790 + }, + { + "epoch": 1.5377835505868163, + "grad_norm": 0.10290376096963882, + "learning_rate": 0.002, + "loss": 2.3289, + "step": 397800 + }, + { + "epoch": 1.5378222077901995, + "grad_norm": 0.09839563071727753, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 397810 + }, + { + "epoch": 1.5378608649935828, + "grad_norm": 0.08913205564022064, + "learning_rate": 0.002, + "loss": 2.3155, + "step": 397820 + }, + { + "epoch": 1.537899522196966, + "grad_norm": 0.10999598354101181, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 397830 + }, + { + "epoch": 1.5379381794003495, + "grad_norm": 0.13350215554237366, + "learning_rate": 0.002, + "loss": 2.3356, + "step": 397840 + }, + { + "epoch": 1.5379768366037327, + "grad_norm": 0.10599138587713242, + "learning_rate": 0.002, + "loss": 2.3391, + "step": 397850 + }, + { + "epoch": 1.538015493807116, + "grad_norm": 0.12027101218700409, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 397860 + }, + { + "epoch": 1.5380541510104992, + "grad_norm": 0.09793560951948166, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 397870 + }, + { + "epoch": 1.5380928082138827, + "grad_norm": 0.09150359034538269, + "learning_rate": 0.002, + "loss": 2.3541, + "step": 397880 + }, + { + "epoch": 1.538131465417266, + "grad_norm": 0.10976289212703705, + "learning_rate": 0.002, + "loss": 2.3281, + "step": 397890 + }, + { + "epoch": 1.5381701226206492, + "grad_norm": 0.10149134695529938, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 397900 + }, + { + "epoch": 1.5382087798240325, + "grad_norm": 0.10179386287927628, + "learning_rate": 0.002, + "loss": 2.3501, + "step": 397910 + }, + { + "epoch": 1.5382474370274157, + "grad_norm": 0.09850426018238068, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 397920 + }, + { + "epoch": 1.538286094230799, + "grad_norm": 0.11354361474514008, + "learning_rate": 0.002, + "loss": 2.3258, + "step": 397930 + }, + { + "epoch": 1.5383247514341822, + "grad_norm": 0.11222802102565765, + "learning_rate": 0.002, + "loss": 2.331, + "step": 397940 + }, + { + "epoch": 1.5383634086375655, + "grad_norm": 0.10113963484764099, + "learning_rate": 0.002, + "loss": 2.3443, + "step": 397950 + }, + { + "epoch": 1.5384020658409487, + "grad_norm": 0.09058237820863724, + "learning_rate": 0.002, + "loss": 2.3398, + "step": 397960 + }, + { + "epoch": 1.538440723044332, + "grad_norm": 0.09591413289308548, + "learning_rate": 0.002, + "loss": 2.3405, + "step": 397970 + }, + { + "epoch": 1.5384793802477152, + "grad_norm": 0.10284294188022614, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 397980 + }, + { + "epoch": 1.5385180374510985, + "grad_norm": 0.11103425920009613, + "learning_rate": 0.002, + "loss": 2.328, + "step": 397990 + }, + { + "epoch": 1.538556694654482, + "grad_norm": 0.10229439288377762, + "learning_rate": 0.002, + "loss": 2.3475, + "step": 398000 + }, + { + "epoch": 1.5385953518578652, + "grad_norm": 0.10271711647510529, + "learning_rate": 0.002, + "loss": 2.3454, + "step": 398010 + }, + { + "epoch": 1.5386340090612485, + "grad_norm": 0.11383017897605896, + "learning_rate": 0.002, + "loss": 2.3114, + "step": 398020 + }, + { + "epoch": 1.5386726662646317, + "grad_norm": 0.09810294210910797, + "learning_rate": 0.002, + "loss": 2.3401, + "step": 398030 + }, + { + "epoch": 1.538711323468015, + "grad_norm": 0.10992488265037537, + "learning_rate": 0.002, + "loss": 2.322, + "step": 398040 + }, + { + "epoch": 1.5387499806713985, + "grad_norm": 0.12305788695812225, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 398050 + }, + { + "epoch": 1.5387886378747817, + "grad_norm": 0.10567454993724823, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 398060 + }, + { + "epoch": 1.538827295078165, + "grad_norm": 0.10405563563108444, + "learning_rate": 0.002, + "loss": 2.3474, + "step": 398070 + }, + { + "epoch": 1.5388659522815482, + "grad_norm": 0.115367092192173, + "learning_rate": 0.002, + "loss": 2.3236, + "step": 398080 + }, + { + "epoch": 1.5389046094849315, + "grad_norm": 0.11122714728116989, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 398090 + }, + { + "epoch": 1.5389432666883147, + "grad_norm": 0.09935643523931503, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 398100 + }, + { + "epoch": 1.538981923891698, + "grad_norm": 0.10723993182182312, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 398110 + }, + { + "epoch": 1.5390205810950812, + "grad_norm": 0.09951582551002502, + "learning_rate": 0.002, + "loss": 2.3373, + "step": 398120 + }, + { + "epoch": 1.5390592382984645, + "grad_norm": 0.12732456624507904, + "learning_rate": 0.002, + "loss": 2.3147, + "step": 398130 + }, + { + "epoch": 1.5390978955018477, + "grad_norm": 0.1021832600235939, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 398140 + }, + { + "epoch": 1.539136552705231, + "grad_norm": 0.11529269069433212, + "learning_rate": 0.002, + "loss": 2.336, + "step": 398150 + }, + { + "epoch": 1.5391752099086142, + "grad_norm": 0.14958423376083374, + "learning_rate": 0.002, + "loss": 2.3253, + "step": 398160 + }, + { + "epoch": 1.5392138671119977, + "grad_norm": 0.09263918548822403, + "learning_rate": 0.002, + "loss": 2.3396, + "step": 398170 + }, + { + "epoch": 1.539252524315381, + "grad_norm": 0.1017642617225647, + "learning_rate": 0.002, + "loss": 2.3296, + "step": 398180 + }, + { + "epoch": 1.5392911815187642, + "grad_norm": 0.09107334166765213, + "learning_rate": 0.002, + "loss": 2.3184, + "step": 398190 + }, + { + "epoch": 1.5393298387221475, + "grad_norm": 0.1224852204322815, + "learning_rate": 0.002, + "loss": 2.3286, + "step": 398200 + }, + { + "epoch": 1.5393684959255307, + "grad_norm": 0.10259949415922165, + "learning_rate": 0.002, + "loss": 2.3319, + "step": 398210 + }, + { + "epoch": 1.5394071531289142, + "grad_norm": 0.1043364554643631, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 398220 + }, + { + "epoch": 1.5394458103322974, + "grad_norm": 0.1084597185254097, + "learning_rate": 0.002, + "loss": 2.3208, + "step": 398230 + }, + { + "epoch": 1.5394844675356807, + "grad_norm": 0.10298971086740494, + "learning_rate": 0.002, + "loss": 2.3357, + "step": 398240 + }, + { + "epoch": 1.539523124739064, + "grad_norm": 0.10925693809986115, + "learning_rate": 0.002, + "loss": 2.3168, + "step": 398250 + }, + { + "epoch": 1.5395617819424472, + "grad_norm": 0.09399773925542831, + "learning_rate": 0.002, + "loss": 2.3268, + "step": 398260 + }, + { + "epoch": 1.5396004391458304, + "grad_norm": 0.15015877783298492, + "learning_rate": 0.002, + "loss": 2.3459, + "step": 398270 + }, + { + "epoch": 1.5396390963492137, + "grad_norm": 0.0967710092663765, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 398280 + }, + { + "epoch": 1.539677753552597, + "grad_norm": 0.09569672495126724, + "learning_rate": 0.002, + "loss": 2.3211, + "step": 398290 + }, + { + "epoch": 1.5397164107559802, + "grad_norm": 0.09085290133953094, + "learning_rate": 0.002, + "loss": 2.3334, + "step": 398300 + }, + { + "epoch": 1.5397550679593635, + "grad_norm": 0.109792560338974, + "learning_rate": 0.002, + "loss": 2.3402, + "step": 398310 + }, + { + "epoch": 1.5397937251627467, + "grad_norm": 0.11429277807474136, + "learning_rate": 0.002, + "loss": 2.3374, + "step": 398320 + }, + { + "epoch": 1.53983238236613, + "grad_norm": 0.10654623806476593, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 398330 + }, + { + "epoch": 1.5398710395695134, + "grad_norm": 0.09431030601263046, + "learning_rate": 0.002, + "loss": 2.3198, + "step": 398340 + }, + { + "epoch": 1.5399096967728967, + "grad_norm": 0.10289266705513, + "learning_rate": 0.002, + "loss": 2.3355, + "step": 398350 + }, + { + "epoch": 1.53994835397628, + "grad_norm": 0.0963275283575058, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 398360 + }, + { + "epoch": 1.5399870111796632, + "grad_norm": 0.12232542783021927, + "learning_rate": 0.002, + "loss": 2.3416, + "step": 398370 + }, + { + "epoch": 1.5400256683830464, + "grad_norm": 0.09303968399763107, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 398380 + }, + { + "epoch": 1.54006432558643, + "grad_norm": 0.10578244924545288, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 398390 + }, + { + "epoch": 1.5401029827898132, + "grad_norm": 0.0913250744342804, + "learning_rate": 0.002, + "loss": 2.3339, + "step": 398400 + }, + { + "epoch": 1.5401416399931964, + "grad_norm": 0.10749612748622894, + "learning_rate": 0.002, + "loss": 2.3188, + "step": 398410 + }, + { + "epoch": 1.5401802971965797, + "grad_norm": 0.09793798625469208, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 398420 + }, + { + "epoch": 1.540218954399963, + "grad_norm": 0.09848188608884811, + "learning_rate": 0.002, + "loss": 2.3244, + "step": 398430 + }, + { + "epoch": 1.5402576116033462, + "grad_norm": 0.10011995583772659, + "learning_rate": 0.002, + "loss": 2.3371, + "step": 398440 + }, + { + "epoch": 1.5402962688067294, + "grad_norm": 0.08867098391056061, + "learning_rate": 0.002, + "loss": 2.3331, + "step": 398450 + }, + { + "epoch": 1.5403349260101127, + "grad_norm": 0.08888798952102661, + "learning_rate": 0.002, + "loss": 2.3189, + "step": 398460 + }, + { + "epoch": 1.540373583213496, + "grad_norm": 0.09818118065595627, + "learning_rate": 0.002, + "loss": 2.3452, + "step": 398470 + }, + { + "epoch": 1.5404122404168792, + "grad_norm": 0.09578051418066025, + "learning_rate": 0.002, + "loss": 2.3285, + "step": 398480 + }, + { + "epoch": 1.5404508976202624, + "grad_norm": 0.10496256500482559, + "learning_rate": 0.002, + "loss": 2.3167, + "step": 398490 + }, + { + "epoch": 1.5404895548236457, + "grad_norm": 0.09453893452882767, + "learning_rate": 0.002, + "loss": 2.3329, + "step": 398500 + }, + { + "epoch": 1.5405282120270292, + "grad_norm": 0.09974279999732971, + "learning_rate": 0.002, + "loss": 2.32, + "step": 398510 + }, + { + "epoch": 1.5405668692304124, + "grad_norm": 0.09207332134246826, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 398520 + }, + { + "epoch": 1.5406055264337957, + "grad_norm": 0.15341641008853912, + "learning_rate": 0.002, + "loss": 2.3243, + "step": 398530 + }, + { + "epoch": 1.540644183637179, + "grad_norm": 0.10787559300661087, + "learning_rate": 0.002, + "loss": 2.3523, + "step": 398540 + }, + { + "epoch": 1.5406828408405624, + "grad_norm": 0.09427941590547562, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 398550 + }, + { + "epoch": 1.5407214980439456, + "grad_norm": 0.13333727419376373, + "learning_rate": 0.002, + "loss": 2.3325, + "step": 398560 + }, + { + "epoch": 1.540760155247329, + "grad_norm": 0.11192180216312408, + "learning_rate": 0.002, + "loss": 2.3254, + "step": 398570 + }, + { + "epoch": 1.5407988124507122, + "grad_norm": 0.10843715071678162, + "learning_rate": 0.002, + "loss": 2.3125, + "step": 398580 + }, + { + "epoch": 1.5408374696540954, + "grad_norm": 0.10502120107412338, + "learning_rate": 0.002, + "loss": 2.3368, + "step": 398590 + }, + { + "epoch": 1.5408761268574787, + "grad_norm": 0.10552159696817398, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 398600 + }, + { + "epoch": 1.540914784060862, + "grad_norm": 0.11767709255218506, + "learning_rate": 0.002, + "loss": 2.3246, + "step": 398610 + }, + { + "epoch": 1.5409534412642452, + "grad_norm": 0.12128245830535889, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 398620 + }, + { + "epoch": 1.5409920984676284, + "grad_norm": 0.10676980763673782, + "learning_rate": 0.002, + "loss": 2.3164, + "step": 398630 + }, + { + "epoch": 1.5410307556710117, + "grad_norm": 0.08841355890035629, + "learning_rate": 0.002, + "loss": 2.3399, + "step": 398640 + }, + { + "epoch": 1.541069412874395, + "grad_norm": 0.11226193606853485, + "learning_rate": 0.002, + "loss": 2.3419, + "step": 398650 + }, + { + "epoch": 1.5411080700777782, + "grad_norm": 0.13144473731517792, + "learning_rate": 0.002, + "loss": 2.3453, + "step": 398660 + }, + { + "epoch": 1.5411467272811614, + "grad_norm": 0.12197545170783997, + "learning_rate": 0.002, + "loss": 2.3466, + "step": 398670 + }, + { + "epoch": 1.541185384484545, + "grad_norm": 0.10014840960502625, + "learning_rate": 0.002, + "loss": 2.3252, + "step": 398680 + }, + { + "epoch": 1.5412240416879281, + "grad_norm": 0.11811138689517975, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 398690 + }, + { + "epoch": 1.5412626988913114, + "grad_norm": 0.10400564223527908, + "learning_rate": 0.002, + "loss": 2.3255, + "step": 398700 + }, + { + "epoch": 1.5413013560946947, + "grad_norm": 0.1099502369761467, + "learning_rate": 0.002, + "loss": 2.3338, + "step": 398710 + }, + { + "epoch": 1.5413400132980781, + "grad_norm": 0.10907261818647385, + "learning_rate": 0.002, + "loss": 2.3279, + "step": 398720 + }, + { + "epoch": 1.5413786705014614, + "grad_norm": 0.11199428141117096, + "learning_rate": 0.002, + "loss": 2.3119, + "step": 398730 + }, + { + "epoch": 1.5414173277048446, + "grad_norm": 0.10097459703683853, + "learning_rate": 0.002, + "loss": 2.3248, + "step": 398740 + }, + { + "epoch": 1.5414559849082279, + "grad_norm": 0.09460058063268661, + "learning_rate": 0.002, + "loss": 2.3223, + "step": 398750 + }, + { + "epoch": 1.5414946421116111, + "grad_norm": 0.13177794218063354, + "learning_rate": 0.002, + "loss": 2.3392, + "step": 398760 + }, + { + "epoch": 1.5415332993149944, + "grad_norm": 0.11397556215524673, + "learning_rate": 0.002, + "loss": 2.3207, + "step": 398770 + }, + { + "epoch": 1.5415719565183776, + "grad_norm": 0.09486106038093567, + "learning_rate": 0.002, + "loss": 2.317, + "step": 398780 + }, + { + "epoch": 1.541610613721761, + "grad_norm": 0.11118344962596893, + "learning_rate": 0.002, + "loss": 2.3234, + "step": 398790 + }, + { + "epoch": 1.5416492709251441, + "grad_norm": 0.13064731657505035, + "learning_rate": 0.002, + "loss": 2.3297, + "step": 398800 + }, + { + "epoch": 1.5416879281285274, + "grad_norm": 0.10388616472482681, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 398810 + }, + { + "epoch": 1.5417265853319106, + "grad_norm": 0.12096244096755981, + "learning_rate": 0.002, + "loss": 2.3358, + "step": 398820 + }, + { + "epoch": 1.541765242535294, + "grad_norm": 0.10955790430307388, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 398830 + }, + { + "epoch": 1.5418038997386772, + "grad_norm": 0.13898706436157227, + "learning_rate": 0.002, + "loss": 2.331, + "step": 398840 + }, + { + "epoch": 1.5418425569420606, + "grad_norm": 0.09149462729692459, + "learning_rate": 0.002, + "loss": 2.3139, + "step": 398850 + }, + { + "epoch": 1.5418812141454439, + "grad_norm": 0.12002673745155334, + "learning_rate": 0.002, + "loss": 2.3387, + "step": 398860 + }, + { + "epoch": 1.5419198713488271, + "grad_norm": 0.09715425968170166, + "learning_rate": 0.002, + "loss": 2.3403, + "step": 398870 + }, + { + "epoch": 1.5419585285522104, + "grad_norm": 0.11434023827314377, + "learning_rate": 0.002, + "loss": 2.327, + "step": 398880 + }, + { + "epoch": 1.5419971857555939, + "grad_norm": 0.10705401003360748, + "learning_rate": 0.002, + "loss": 2.3436, + "step": 398890 + }, + { + "epoch": 1.542035842958977, + "grad_norm": 0.10347422957420349, + "learning_rate": 0.002, + "loss": 2.3118, + "step": 398900 + }, + { + "epoch": 1.5420745001623604, + "grad_norm": 0.10077045857906342, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 398910 + }, + { + "epoch": 1.5421131573657436, + "grad_norm": 0.11980032175779343, + "learning_rate": 0.002, + "loss": 2.3213, + "step": 398920 + }, + { + "epoch": 1.5421518145691269, + "grad_norm": 0.08976440131664276, + "learning_rate": 0.002, + "loss": 2.328, + "step": 398930 + }, + { + "epoch": 1.5421904717725101, + "grad_norm": 0.08498923480510712, + "learning_rate": 0.002, + "loss": 2.3384, + "step": 398940 + }, + { + "epoch": 1.5422291289758934, + "grad_norm": 0.10932603478431702, + "learning_rate": 0.002, + "loss": 2.3112, + "step": 398950 + }, + { + "epoch": 1.5422677861792766, + "grad_norm": 0.10000132769346237, + "learning_rate": 0.002, + "loss": 2.3389, + "step": 398960 + }, + { + "epoch": 1.5423064433826599, + "grad_norm": 0.09836205095052719, + "learning_rate": 0.002, + "loss": 2.3269, + "step": 398970 + }, + { + "epoch": 1.5423451005860431, + "grad_norm": 0.10029017180204391, + "learning_rate": 0.002, + "loss": 2.34, + "step": 398980 + }, + { + "epoch": 1.5423837577894264, + "grad_norm": 0.10624997317790985, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 398990 + }, + { + "epoch": 1.5424224149928096, + "grad_norm": 0.1110740602016449, + "learning_rate": 0.002, + "loss": 2.3293, + "step": 399000 + }, + { + "epoch": 1.5424610721961929, + "grad_norm": 0.09967368841171265, + "learning_rate": 0.002, + "loss": 2.3513, + "step": 399010 + }, + { + "epoch": 1.5424997293995764, + "grad_norm": 0.08127487450838089, + "learning_rate": 0.002, + "loss": 2.3261, + "step": 399020 + }, + { + "epoch": 1.5425383866029596, + "grad_norm": 0.11044812202453613, + "learning_rate": 0.002, + "loss": 2.3247, + "step": 399030 + }, + { + "epoch": 1.5425770438063429, + "grad_norm": 0.10314445942640305, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 399040 + }, + { + "epoch": 1.5426157010097261, + "grad_norm": 0.24173246324062347, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 399050 + }, + { + "epoch": 1.5426543582131096, + "grad_norm": 0.1080498993396759, + "learning_rate": 0.002, + "loss": 2.335, + "step": 399060 + }, + { + "epoch": 1.5426930154164928, + "grad_norm": 0.12131674587726593, + "learning_rate": 0.002, + "loss": 2.3251, + "step": 399070 + }, + { + "epoch": 1.542731672619876, + "grad_norm": 0.09877556562423706, + "learning_rate": 0.002, + "loss": 2.3221, + "step": 399080 + }, + { + "epoch": 1.5427703298232593, + "grad_norm": 0.10076315701007843, + "learning_rate": 0.002, + "loss": 2.3318, + "step": 399090 + }, + { + "epoch": 1.5428089870266426, + "grad_norm": 0.08925936371088028, + "learning_rate": 0.002, + "loss": 2.3218, + "step": 399100 + }, + { + "epoch": 1.5428476442300258, + "grad_norm": 0.12738700211048126, + "learning_rate": 0.002, + "loss": 2.3305, + "step": 399110 + }, + { + "epoch": 1.542886301433409, + "grad_norm": 0.10984507948160172, + "learning_rate": 0.002, + "loss": 2.3205, + "step": 399120 + }, + { + "epoch": 1.5429249586367924, + "grad_norm": 0.18421679735183716, + "learning_rate": 0.002, + "loss": 2.3249, + "step": 399130 + }, + { + "epoch": 1.5429636158401756, + "grad_norm": 0.11120090633630753, + "learning_rate": 0.002, + "loss": 2.3409, + "step": 399140 + }, + { + "epoch": 1.5430022730435589, + "grad_norm": 0.09735604375600815, + "learning_rate": 0.002, + "loss": 2.3463, + "step": 399150 + }, + { + "epoch": 1.543040930246942, + "grad_norm": 0.09944828599691391, + "learning_rate": 0.002, + "loss": 2.3432, + "step": 399160 + }, + { + "epoch": 1.5430795874503254, + "grad_norm": 0.14178155362606049, + "learning_rate": 0.002, + "loss": 2.3545, + "step": 399170 + }, + { + "epoch": 1.5431182446537086, + "grad_norm": 0.09563390910625458, + "learning_rate": 0.002, + "loss": 2.3227, + "step": 399180 + }, + { + "epoch": 1.543156901857092, + "grad_norm": 0.12702666223049164, + "learning_rate": 0.002, + "loss": 2.3496, + "step": 399190 + }, + { + "epoch": 1.5431955590604753, + "grad_norm": 0.1233954131603241, + "learning_rate": 0.002, + "loss": 2.3362, + "step": 399200 + }, + { + "epoch": 1.5432342162638586, + "grad_norm": 0.09861844778060913, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 399210 + }, + { + "epoch": 1.5432728734672418, + "grad_norm": 0.37939682602882385, + "learning_rate": 0.002, + "loss": 2.3676, + "step": 399220 + }, + { + "epoch": 1.5433115306706253, + "grad_norm": 0.12103582173585892, + "learning_rate": 0.002, + "loss": 2.3678, + "step": 399230 + }, + { + "epoch": 1.5433501878740086, + "grad_norm": 0.08898939937353134, + "learning_rate": 0.002, + "loss": 2.3522, + "step": 399240 + }, + { + "epoch": 1.5433888450773918, + "grad_norm": 0.1117500364780426, + "learning_rate": 0.002, + "loss": 2.3441, + "step": 399250 + }, + { + "epoch": 1.543427502280775, + "grad_norm": 0.1187075525522232, + "learning_rate": 0.002, + "loss": 2.3271, + "step": 399260 + }, + { + "epoch": 1.5434661594841583, + "grad_norm": 0.10672876983880997, + "learning_rate": 0.002, + "loss": 2.3404, + "step": 399270 + }, + { + "epoch": 1.5435048166875416, + "grad_norm": 0.12173470854759216, + "learning_rate": 0.002, + "loss": 2.3284, + "step": 399280 + }, + { + "epoch": 1.5435434738909248, + "grad_norm": 0.10364934802055359, + "learning_rate": 0.002, + "loss": 2.3351, + "step": 399290 + }, + { + "epoch": 1.543582131094308, + "grad_norm": 0.09502959996461868, + "learning_rate": 0.002, + "loss": 2.329, + "step": 399300 + }, + { + "epoch": 1.5436207882976913, + "grad_norm": 0.11555942893028259, + "learning_rate": 0.002, + "loss": 2.3228, + "step": 399310 + }, + { + "epoch": 1.5436594455010746, + "grad_norm": 0.10680690407752991, + "learning_rate": 0.002, + "loss": 2.3372, + "step": 399320 + }, + { + "epoch": 1.5436981027044578, + "grad_norm": 0.11268538981676102, + "learning_rate": 0.002, + "loss": 2.3365, + "step": 399330 + }, + { + "epoch": 1.543736759907841, + "grad_norm": 0.09227351099252701, + "learning_rate": 0.002, + "loss": 2.3301, + "step": 399340 + }, + { + "epoch": 1.5437754171112243, + "grad_norm": 0.09874910861253738, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 399350 + }, + { + "epoch": 1.5438140743146078, + "grad_norm": 0.12444418668746948, + "learning_rate": 0.002, + "loss": 2.3232, + "step": 399360 + }, + { + "epoch": 1.543852731517991, + "grad_norm": 0.10006395727396011, + "learning_rate": 0.002, + "loss": 2.3535, + "step": 399370 + }, + { + "epoch": 1.5438913887213743, + "grad_norm": 0.10121716558933258, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 399380 + }, + { + "epoch": 1.5439300459247576, + "grad_norm": 0.32989785075187683, + "learning_rate": 0.002, + "loss": 2.3359, + "step": 399390 + }, + { + "epoch": 1.543968703128141, + "grad_norm": 0.10850565880537033, + "learning_rate": 0.002, + "loss": 2.3344, + "step": 399400 + }, + { + "epoch": 1.5440073603315243, + "grad_norm": 0.10429178178310394, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 399410 + }, + { + "epoch": 1.5440460175349076, + "grad_norm": 0.09502607583999634, + "learning_rate": 0.002, + "loss": 2.3354, + "step": 399420 + }, + { + "epoch": 1.5440846747382908, + "grad_norm": 0.09839571267366409, + "learning_rate": 0.002, + "loss": 2.3239, + "step": 399430 + }, + { + "epoch": 1.544123331941674, + "grad_norm": 0.1268584430217743, + "learning_rate": 0.002, + "loss": 2.3434, + "step": 399440 + }, + { + "epoch": 1.5441619891450573, + "grad_norm": 0.10402169078588486, + "learning_rate": 0.002, + "loss": 2.3471, + "step": 399450 + }, + { + "epoch": 1.5442006463484406, + "grad_norm": 0.11737069487571716, + "learning_rate": 0.002, + "loss": 2.3291, + "step": 399460 + }, + { + "epoch": 1.5442393035518238, + "grad_norm": 0.09239500761032104, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 399470 + }, + { + "epoch": 1.544277960755207, + "grad_norm": 0.11623097956180573, + "learning_rate": 0.002, + "loss": 2.346, + "step": 399480 + }, + { + "epoch": 1.5443166179585903, + "grad_norm": 0.09053388983011246, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 399490 + }, + { + "epoch": 1.5443552751619736, + "grad_norm": 0.10397826880216599, + "learning_rate": 0.002, + "loss": 2.329, + "step": 399500 + }, + { + "epoch": 1.5443939323653568, + "grad_norm": 0.11161120235919952, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 399510 + }, + { + "epoch": 1.54443258956874, + "grad_norm": 0.11719498038291931, + "learning_rate": 0.002, + "loss": 2.3298, + "step": 399520 + }, + { + "epoch": 1.5444712467721236, + "grad_norm": 0.1244446188211441, + "learning_rate": 0.002, + "loss": 2.3275, + "step": 399530 + }, + { + "epoch": 1.5445099039755068, + "grad_norm": 0.10088226944208145, + "learning_rate": 0.002, + "loss": 2.338, + "step": 399540 + }, + { + "epoch": 1.54454856117889, + "grad_norm": 0.10080454498529434, + "learning_rate": 0.002, + "loss": 2.3314, + "step": 399550 + }, + { + "epoch": 1.5445872183822733, + "grad_norm": 0.09458685666322708, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 399560 + }, + { + "epoch": 1.5446258755856568, + "grad_norm": 0.1011287122964859, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 399570 + }, + { + "epoch": 1.54466453278904, + "grad_norm": 0.1155065968632698, + "learning_rate": 0.002, + "loss": 2.3448, + "step": 399580 + }, + { + "epoch": 1.5447031899924233, + "grad_norm": 0.0928688570857048, + "learning_rate": 0.002, + "loss": 2.3263, + "step": 399590 + }, + { + "epoch": 1.5447418471958065, + "grad_norm": 0.13152991235256195, + "learning_rate": 0.002, + "loss": 2.3456, + "step": 399600 + }, + { + "epoch": 1.5447805043991898, + "grad_norm": 0.09968362748622894, + "learning_rate": 0.002, + "loss": 2.3306, + "step": 399610 + }, + { + "epoch": 1.544819161602573, + "grad_norm": 0.10470478981733322, + "learning_rate": 0.002, + "loss": 2.3394, + "step": 399620 + }, + { + "epoch": 1.5448578188059563, + "grad_norm": 0.1123451516032219, + "learning_rate": 0.002, + "loss": 2.3224, + "step": 399630 + }, + { + "epoch": 1.5448964760093395, + "grad_norm": 0.11159180849790573, + "learning_rate": 0.002, + "loss": 2.325, + "step": 399640 + }, + { + "epoch": 1.5449351332127228, + "grad_norm": 0.0886734277009964, + "learning_rate": 0.002, + "loss": 2.3418, + "step": 399650 + }, + { + "epoch": 1.544973790416106, + "grad_norm": 0.10552497208118439, + "learning_rate": 0.002, + "loss": 2.3302, + "step": 399660 + }, + { + "epoch": 1.5450124476194893, + "grad_norm": 0.10136623680591583, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 399670 + }, + { + "epoch": 1.5450511048228726, + "grad_norm": 0.09759674221277237, + "learning_rate": 0.002, + "loss": 2.3377, + "step": 399680 + }, + { + "epoch": 1.5450897620262558, + "grad_norm": 0.1186317577958107, + "learning_rate": 0.002, + "loss": 2.3482, + "step": 399690 + }, + { + "epoch": 1.5451284192296393, + "grad_norm": 0.10864102095365524, + "learning_rate": 0.002, + "loss": 2.326, + "step": 399700 + }, + { + "epoch": 1.5451670764330225, + "grad_norm": 0.11893132328987122, + "learning_rate": 0.002, + "loss": 2.3152, + "step": 399710 + }, + { + "epoch": 1.5452057336364058, + "grad_norm": 0.109439916908741, + "learning_rate": 0.002, + "loss": 2.3408, + "step": 399720 + }, + { + "epoch": 1.545244390839789, + "grad_norm": 0.10437440127134323, + "learning_rate": 0.002, + "loss": 2.3165, + "step": 399730 + }, + { + "epoch": 1.5452830480431725, + "grad_norm": 0.09925030171871185, + "learning_rate": 0.002, + "loss": 2.3278, + "step": 399740 + }, + { + "epoch": 1.5453217052465558, + "grad_norm": 0.09538068622350693, + "learning_rate": 0.002, + "loss": 2.3163, + "step": 399750 + }, + { + "epoch": 1.545360362449939, + "grad_norm": 0.09987644851207733, + "learning_rate": 0.002, + "loss": 2.3424, + "step": 399760 + }, + { + "epoch": 1.5453990196533223, + "grad_norm": 0.2534315586090088, + "learning_rate": 0.002, + "loss": 2.3256, + "step": 399770 + }, + { + "epoch": 1.5454376768567055, + "grad_norm": 0.10734555870294571, + "learning_rate": 0.002, + "loss": 2.3369, + "step": 399780 + }, + { + "epoch": 1.5454763340600888, + "grad_norm": 0.09567726403474808, + "learning_rate": 0.002, + "loss": 2.3348, + "step": 399790 + }, + { + "epoch": 1.545514991263472, + "grad_norm": 0.10363011807203293, + "learning_rate": 0.002, + "loss": 2.3337, + "step": 399800 + }, + { + "epoch": 1.5455536484668553, + "grad_norm": 0.10830151289701462, + "learning_rate": 0.002, + "loss": 2.338, + "step": 399810 + }, + { + "epoch": 1.5455923056702385, + "grad_norm": 0.09303846955299377, + "learning_rate": 0.002, + "loss": 2.3353, + "step": 399820 + }, + { + "epoch": 1.5456309628736218, + "grad_norm": 0.08819768577814102, + "learning_rate": 0.002, + "loss": 2.3363, + "step": 399830 + }, + { + "epoch": 1.545669620077005, + "grad_norm": 0.09079156070947647, + "learning_rate": 0.002, + "loss": 2.3328, + "step": 399840 + }, + { + "epoch": 1.5457082772803883, + "grad_norm": 0.11149446666240692, + "learning_rate": 0.002, + "loss": 2.3422, + "step": 399850 + }, + { + "epoch": 1.5457469344837715, + "grad_norm": 0.5256873965263367, + "learning_rate": 0.002, + "loss": 2.3259, + "step": 399860 + }, + { + "epoch": 1.545785591687155, + "grad_norm": 0.1418309509754181, + "learning_rate": 0.002, + "loss": 2.3457, + "step": 399870 + }, + { + "epoch": 1.5458242488905383, + "grad_norm": 0.3079683482646942, + "learning_rate": 0.002, + "loss": 2.3274, + "step": 399880 + }, + { + "epoch": 1.5458629060939215, + "grad_norm": 0.1174551472067833, + "learning_rate": 0.002, + "loss": 2.3382, + "step": 399890 + }, + { + "epoch": 1.5459015632973048, + "grad_norm": 0.10708323121070862, + "learning_rate": 0.002, + "loss": 2.3283, + "step": 399900 + }, + { + "epoch": 1.5459402205006882, + "grad_norm": 0.18589606881141663, + "learning_rate": 0.002, + "loss": 2.3578, + "step": 399910 + }, + { + "epoch": 1.5459788777040715, + "grad_norm": 0.12244658917188644, + "learning_rate": 0.002, + "loss": 2.3288, + "step": 399920 + }, + { + "epoch": 1.5460175349074547, + "grad_norm": 0.10290535539388657, + "learning_rate": 0.002, + "loss": 2.3307, + "step": 399930 + }, + { + "epoch": 1.546056192110838, + "grad_norm": 0.10942217707633972, + "learning_rate": 0.002, + "loss": 2.3483, + "step": 399940 + }, + { + "epoch": 1.5460948493142213, + "grad_norm": 0.10202532261610031, + "learning_rate": 0.002, + "loss": 2.3316, + "step": 399950 + }, + { + "epoch": 1.5461335065176045, + "grad_norm": 0.09840740263462067, + "learning_rate": 0.002, + "loss": 2.3317, + "step": 399960 + }, + { + "epoch": 1.5461721637209878, + "grad_norm": 0.11503361165523529, + "learning_rate": 0.002, + "loss": 2.3333, + "step": 399970 + }, + { + "epoch": 1.546210820924371, + "grad_norm": 0.10702019184827805, + "learning_rate": 0.002, + "loss": 2.3352, + "step": 399980 + }, + { + "epoch": 1.5462494781277543, + "grad_norm": 0.4364302456378937, + "learning_rate": 0.002, + "loss": 2.3415, + "step": 399990 + }, + { + "epoch": 1.5462881353311375, + "grad_norm": 0.1672603040933609, + "learning_rate": 0.002, + "loss": 2.3276, + "step": 400000 + }, + { + "epoch": 1.5463267925345208, + "grad_norm": 0.09613791108131409, + "learning_rate": 0.00198, + "loss": 2.3347, + "step": 400010 + }, + { + "epoch": 1.546365449737904, + "grad_norm": 0.12125419080257416, + "learning_rate": 0.001971715728752538, + "loss": 2.3203, + "step": 400020 + }, + { + "epoch": 1.5464041069412875, + "grad_norm": 0.10514193773269653, + "learning_rate": 0.0019653589838486227, + "loss": 2.3491, + "step": 400030 + }, + { + "epoch": 1.5464427641446707, + "grad_norm": 0.1063866913318634, + "learning_rate": 0.00196, + "loss": 2.3327, + "step": 400040 + }, + { + "epoch": 1.546481421348054, + "grad_norm": 0.09683094173669815, + "learning_rate": 0.001955278640450004, + "loss": 2.3387, + "step": 400050 + }, + { + "epoch": 1.5465200785514372, + "grad_norm": 0.10151323676109314, + "learning_rate": 0.0019510102051443366, + "loss": 2.3366, + "step": 400060 + }, + { + "epoch": 1.5465587357548205, + "grad_norm": 0.10045294463634491, + "learning_rate": 0.0019470849737787082, + "loss": 2.3392, + "step": 400070 + }, + { + "epoch": 1.546597392958204, + "grad_norm": 0.10340237617492676, + "learning_rate": 0.0019434314575050762, + "loss": 2.3339, + "step": 400080 + }, + { + "epoch": 1.5466360501615872, + "grad_norm": 0.1004454717040062, + "learning_rate": 0.0019399999999999999, + "loss": 2.326, + "step": 400090 + }, + { + "epoch": 1.5466747073649705, + "grad_norm": 0.09741810709238052, + "learning_rate": 0.0019367544467966324, + "loss": 2.3313, + "step": 400100 + }, + { + "epoch": 1.5467133645683537, + "grad_norm": 0.10408639907836914, + "learning_rate": 0.0019336675041928919, + "loss": 2.3397, + "step": 400110 + }, + { + "epoch": 1.546752021771737, + "grad_norm": 0.11467229574918747, + "learning_rate": 0.001930717967697245, + "loss": 2.3468, + "step": 400120 + }, + { + "epoch": 1.5467906789751202, + "grad_norm": 0.102940134704113, + "learning_rate": 0.00192788897449072, + "loss": 2.3373, + "step": 400130 + }, + { + "epoch": 1.5468293361785035, + "grad_norm": 0.09272021800279617, + "learning_rate": 0.0019251668522645212, + "loss": 2.3421, + "step": 400140 + }, + { + "epoch": 1.5468679933818867, + "grad_norm": 0.09395978599786758, + "learning_rate": 0.0019225403330758518, + "loss": 2.3284, + "step": 400150 + }, + { + "epoch": 1.54690665058527, + "grad_norm": 0.10013695806264877, + "learning_rate": 0.00192, + "loss": 2.3276, + "step": 400160 + }, + { + "epoch": 1.5469453077886532, + "grad_norm": 0.10074548423290253, + "learning_rate": 0.0019175378874876467, + "loss": 2.3231, + "step": 400170 + }, + { + "epoch": 1.5469839649920365, + "grad_norm": 0.10383658856153488, + "learning_rate": 0.0019151471862576144, + "loss": 2.3379, + "step": 400180 + }, + { + "epoch": 1.5470226221954197, + "grad_norm": 0.11273407936096191, + "learning_rate": 0.0019128220211291867, + "loss": 2.3302, + "step": 400190 + }, + { + "epoch": 1.5470612793988032, + "grad_norm": 0.10116290301084518, + "learning_rate": 0.0019105572809000086, + "loss": 2.336, + "step": 400200 + }, + { + "epoch": 1.5470999366021865, + "grad_norm": 0.0944078117609024, + "learning_rate": 0.0019083484861008833, + "loss": 2.3357, + "step": 400210 + }, + { + "epoch": 1.5471385938055697, + "grad_norm": 0.09677086025476456, + "learning_rate": 0.0019061916848035314, + "loss": 2.3129, + "step": 400220 + }, + { + "epoch": 1.547177251008953, + "grad_norm": 0.09968365728855133, + "learning_rate": 0.0019040833695337456, + "loss": 2.3417, + "step": 400230 + }, + { + "epoch": 1.5472159082123362, + "grad_norm": 0.10656339675188065, + "learning_rate": 0.001902020410288673, + "loss": 2.3373, + "step": 400240 + }, + { + "epoch": 1.5472545654157197, + "grad_norm": 0.11012217402458191, + "learning_rate": 0.0019, + "loss": 2.3491, + "step": 400250 + }, + { + "epoch": 1.547293222619103, + "grad_norm": 0.08270913362503052, + "learning_rate": 0.0018980196097281444, + "loss": 2.3177, + "step": 400260 + }, + { + "epoch": 1.5473318798224862, + "grad_norm": 0.09015677869319916, + "learning_rate": 0.0018960769515458673, + "loss": 2.3248, + "step": 400270 + }, + { + "epoch": 1.5473705370258695, + "grad_norm": 0.10345932841300964, + "learning_rate": 0.0018941699475574164, + "loss": 2.3395, + "step": 400280 + }, + { + "epoch": 1.5474091942292527, + "grad_norm": 0.10158301144838333, + "learning_rate": 0.00189229670385731, + "loss": 2.3251, + "step": 400290 + }, + { + "epoch": 1.547447851432636, + "grad_norm": 0.09819726645946503, + "learning_rate": 0.0018904554884989668, + "loss": 2.3319, + "step": 400300 + }, + { + "epoch": 1.5474865086360192, + "grad_norm": 0.10514269769191742, + "learning_rate": 0.0018886447127433998, + "loss": 2.3271, + "step": 400310 + }, + { + "epoch": 1.5475251658394025, + "grad_norm": 0.11034499853849411, + "learning_rate": 0.0018868629150101525, + "loss": 2.3264, + "step": 400320 + }, + { + "epoch": 1.5475638230427857, + "grad_norm": 0.09889666736125946, + "learning_rate": 0.0018851087470692395, + "loss": 2.3236, + "step": 400330 + }, + { + "epoch": 1.547602480246169, + "grad_norm": 0.09816837310791016, + "learning_rate": 0.001883380962103094, + "loss": 2.3293, + "step": 400340 + }, + { + "epoch": 1.5476411374495522, + "grad_norm": 0.10355667024850845, + "learning_rate": 0.0018816784043380076, + "loss": 2.3193, + "step": 400350 + }, + { + "epoch": 1.5476797946529355, + "grad_norm": 0.10169237107038498, + "learning_rate": 0.00188, + "loss": 2.3178, + "step": 400360 + }, + { + "epoch": 1.547718451856319, + "grad_norm": 0.10110478848218918, + "learning_rate": 0.0018783447493940356, + "loss": 2.3239, + "step": 400370 + }, + { + "epoch": 1.5477571090597022, + "grad_norm": 0.10995251685380936, + "learning_rate": 0.0018767117199406204, + "loss": 2.3166, + "step": 400380 + }, + { + "epoch": 1.5477957662630855, + "grad_norm": 0.10122602432966232, + "learning_rate": 0.0018751000400320321, + "loss": 2.3247, + "step": 400390 + }, + { + "epoch": 1.5478344234664687, + "grad_norm": 0.11095928400754929, + "learning_rate": 0.001873508893593265, + "loss": 2.3257, + "step": 400400 + }, + { + "epoch": 1.5478730806698522, + "grad_norm": 0.11088376492261887, + "learning_rate": 0.001871937515251343, + "loss": 2.3242, + "step": 400410 + }, + { + "epoch": 1.5479117378732354, + "grad_norm": 0.10559550672769547, + "learning_rate": 0.0018703851860318427, + "loss": 2.3357, + "step": 400420 + }, + { + "epoch": 1.5479503950766187, + "grad_norm": 0.0967123806476593, + "learning_rate": 0.00186885122951396, + "loss": 2.31, + "step": 400430 + }, + { + "epoch": 1.547989052280002, + "grad_norm": 0.10819984972476959, + "learning_rate": 0.0018673350083857842, + "loss": 2.3237, + "step": 400440 + }, + { + "epoch": 1.5480277094833852, + "grad_norm": 0.08791578561067581, + "learning_rate": 0.0018658359213500127, + "loss": 2.3294, + "step": 400450 + }, + { + "epoch": 1.5480663666867684, + "grad_norm": 0.10057539492845535, + "learning_rate": 0.0018643534003374947, + "loss": 2.3184, + "step": 400460 + }, + { + "epoch": 1.5481050238901517, + "grad_norm": 0.10410565137863159, + "learning_rate": 0.0018628869079919791, + "loss": 2.3487, + "step": 400470 + }, + { + "epoch": 1.548143681093535, + "grad_norm": 0.10269663482904434, + "learning_rate": 0.0018614359353944898, + "loss": 2.3307, + "step": 400480 + }, + { + "epoch": 1.5481823382969182, + "grad_norm": 0.10686153173446655, + "learning_rate": 0.00186, + "loss": 2.3123, + "step": 400490 + }, + { + "epoch": 1.5482209955003015, + "grad_norm": 0.0893033891916275, + "learning_rate": 0.0018585786437626906, + "loss": 2.3257, + "step": 400500 + }, + { + "epoch": 1.5482596527036847, + "grad_norm": 0.10048429667949677, + "learning_rate": 0.001857171431429143, + "loss": 2.3426, + "step": 400510 + }, + { + "epoch": 1.548298309907068, + "grad_norm": 0.1200726330280304, + "learning_rate": 0.0018557779489814404, + "loss": 2.3116, + "step": 400520 + }, + { + "epoch": 1.5483369671104512, + "grad_norm": 0.09477726370096207, + "learning_rate": 0.0018543978022143898, + "loss": 2.3486, + "step": 400530 + }, + { + "epoch": 1.5483756243138347, + "grad_norm": 0.09153016656637192, + "learning_rate": 0.0018530306154330095, + "loss": 2.3381, + "step": 400540 + }, + { + "epoch": 1.548414281517218, + "grad_norm": 0.09223145991563797, + "learning_rate": 0.0018516760302580869, + "loss": 2.3146, + "step": 400550 + }, + { + "epoch": 1.5484529387206012, + "grad_norm": 0.09365306049585342, + "learning_rate": 0.0018503337045290423, + "loss": 2.3223, + "step": 400560 + }, + { + "epoch": 1.5484915959239844, + "grad_norm": 0.09258891642093658, + "learning_rate": 0.001849003311294585, + "loss": 2.3144, + "step": 400570 + }, + { + "epoch": 1.548530253127368, + "grad_norm": 0.09590215235948563, + "learning_rate": 0.001847684537882722, + "loss": 2.3272, + "step": 400580 + }, + { + "epoch": 1.5485689103307512, + "grad_norm": 0.11163296550512314, + "learning_rate": 0.0018463770850426278, + "loss": 2.3233, + "step": 400590 + }, + { + "epoch": 1.5486075675341344, + "grad_norm": 0.08850311487913132, + "learning_rate": 0.0018450806661517035, + "loss": 2.3218, + "step": 400600 + }, + { + "epoch": 1.5486462247375177, + "grad_norm": 0.1200372651219368, + "learning_rate": 0.001843795006481867, + "loss": 2.3257, + "step": 400610 + }, + { + "epoch": 1.548684881940901, + "grad_norm": 0.10512597113847733, + "learning_rate": 0.0018425198425197637, + "loss": 2.3318, + "step": 400620 + }, + { + "epoch": 1.5487235391442842, + "grad_norm": 0.0931624174118042, + "learning_rate": 0.0018412549213361244, + "loss": 2.3145, + "step": 400630 + }, + { + "epoch": 1.5487621963476674, + "grad_norm": 0.08724930137395859, + "learning_rate": 0.00184, + "loss": 2.3231, + "step": 400640 + }, + { + "epoch": 1.5488008535510507, + "grad_norm": 0.13431008160114288, + "learning_rate": 0.001838754845034029, + "loss": 2.3179, + "step": 400650 + }, + { + "epoch": 1.548839510754434, + "grad_norm": 0.11374661326408386, + "learning_rate": 0.0018375192319072808, + "loss": 2.3164, + "step": 400660 + }, + { + "epoch": 1.5488781679578172, + "grad_norm": 0.08780115097761154, + "learning_rate": 0.0018362929445625512, + "loss": 2.3131, + "step": 400670 + }, + { + "epoch": 1.5489168251612004, + "grad_norm": 0.11381842941045761, + "learning_rate": 0.0018350757749752936, + "loss": 2.324, + "step": 400680 + }, + { + "epoch": 1.5489554823645837, + "grad_norm": 0.09610319882631302, + "learning_rate": 0.0018338675227416387, + "loss": 2.3059, + "step": 400690 + }, + { + "epoch": 1.548994139567967, + "grad_norm": 0.10623107850551605, + "learning_rate": 0.001832667994693185, + "loss": 2.3182, + "step": 400700 + }, + { + "epoch": 1.5490327967713504, + "grad_norm": 0.10675205290317535, + "learning_rate": 0.0018314770045364727, + "loss": 2.3274, + "step": 400710 + }, + { + "epoch": 1.5490714539747337, + "grad_norm": 0.09334469586610794, + "learning_rate": 0.0018302943725152286, + "loss": 2.3179, + "step": 400720 + }, + { + "epoch": 1.549110111178117, + "grad_norm": 0.08830467611551285, + "learning_rate": 0.0018291199250936494, + "loss": 2.3191, + "step": 400730 + }, + { + "epoch": 1.5491487683815002, + "grad_norm": 0.09757346659898758, + "learning_rate": 0.0018279534946591474, + "loss": 2.3117, + "step": 400740 + }, + { + "epoch": 1.5491874255848836, + "grad_norm": 0.09814561903476715, + "learning_rate": 0.0018267949192431123, + "loss": 2.313, + "step": 400750 + }, + { + "epoch": 1.549226082788267, + "grad_norm": 0.11958609521389008, + "learning_rate": 0.0018256440422583732, + "loss": 2.3135, + "step": 400760 + }, + { + "epoch": 1.5492647399916502, + "grad_norm": 0.09650695323944092, + "learning_rate": 0.0018245007122521576, + "loss": 2.3139, + "step": 400770 + }, + { + "epoch": 1.5493033971950334, + "grad_norm": 0.09163734316825867, + "learning_rate": 0.0018233647826734433, + "loss": 2.3148, + "step": 400780 + }, + { + "epoch": 1.5493420543984167, + "grad_norm": 0.08900466561317444, + "learning_rate": 0.0018222361116536883, + "loss": 2.322, + "step": 400790 + }, + { + "epoch": 1.5493807116018, + "grad_norm": 0.12508372962474823, + "learning_rate": 0.0018211145618000169, + "loss": 2.3183, + "step": 400800 + }, + { + "epoch": 1.5494193688051832, + "grad_norm": 0.09552543610334396, + "learning_rate": 0.00182, + "loss": 2.3093, + "step": 400810 + }, + { + "epoch": 1.5494580260085664, + "grad_norm": 0.10733716189861298, + "learning_rate": 0.0018188922972372516, + "loss": 2.3158, + "step": 400820 + }, + { + "epoch": 1.5494966832119497, + "grad_norm": 0.09862062335014343, + "learning_rate": 0.001817791328417114, + "loss": 2.3221, + "step": 400830 + }, + { + "epoch": 1.549535340415333, + "grad_norm": 0.08959076553583145, + "learning_rate": 0.0018166969722017666, + "loss": 2.3046, + "step": 400840 + }, + { + "epoch": 1.5495739976187162, + "grad_norm": 0.0865064188838005, + "learning_rate": 0.0018156091108541424, + "loss": 2.3107, + "step": 400850 + }, + { + "epoch": 1.5496126548220994, + "grad_norm": 0.10689177364110947, + "learning_rate": 0.0018145276300900858, + "loss": 2.3257, + "step": 400860 + }, + { + "epoch": 1.5496513120254827, + "grad_norm": 0.10806325078010559, + "learning_rate": 0.0018134524189382239, + "loss": 2.3276, + "step": 400870 + }, + { + "epoch": 1.5496899692288661, + "grad_norm": 0.09787989407777786, + "learning_rate": 0.001812383369607063, + "loss": 2.315, + "step": 400880 + }, + { + "epoch": 1.5497286264322494, + "grad_norm": 0.0919385477900505, + "learning_rate": 0.001811320377358868, + "loss": 2.3366, + "step": 400890 + }, + { + "epoch": 1.5497672836356327, + "grad_norm": 0.09771880507469177, + "learning_rate": 0.0018102633403898974, + "loss": 2.3307, + "step": 400900 + }, + { + "epoch": 1.549805940839016, + "grad_norm": 0.1106455996632576, + "learning_rate": 0.001809212159716611, + "loss": 2.324, + "step": 400910 + }, + { + "epoch": 1.5498445980423994, + "grad_norm": 0.09291300177574158, + "learning_rate": 0.0018081667390674912, + "loss": 2.3126, + "step": 400920 + }, + { + "epoch": 1.5498832552457826, + "grad_norm": 0.11154467612504959, + "learning_rate": 0.001807126984780141, + "loss": 2.3218, + "step": 400930 + }, + { + "epoch": 1.5499219124491659, + "grad_norm": 0.11702819168567657, + "learning_rate": 0.0018060928057033467, + "loss": 2.3198, + "step": 400940 + }, + { + "epoch": 1.5499605696525491, + "grad_norm": 0.08991272002458572, + "learning_rate": 0.001805064113103821, + "loss": 2.3291, + "step": 400950 + }, + { + "epoch": 1.5499992268559324, + "grad_norm": 0.10776031762361526, + "learning_rate": 0.0018040408205773457, + "loss": 2.3171, + "step": 400960 + }, + { + "epoch": 1.5500378840593156, + "grad_norm": 0.09182223677635193, + "learning_rate": 0.001803022843964078, + "loss": 2.3203, + "step": 400970 + }, + { + "epoch": 1.550076541262699, + "grad_norm": 0.11256624758243561, + "learning_rate": 0.0018020101012677667, + "loss": 2.3326, + "step": 400980 + }, + { + "epoch": 1.5501151984660821, + "grad_norm": 0.0917619988322258, + "learning_rate": 0.001801002512578676, + "loss": 2.3226, + "step": 400990 + }, + { + "epoch": 1.5501538556694654, + "grad_norm": 0.10037446767091751, + "learning_rate": 0.0018000000000000002, + "loss": 2.3168, + "step": 401000 + }, + { + "epoch": 1.5501925128728486, + "grad_norm": 0.09557545930147171, + "learning_rate": 0.001799002487577582, + "loss": 2.3071, + "step": 401010 + }, + { + "epoch": 1.550231170076232, + "grad_norm": 0.08818859606981277, + "learning_rate": 0.0017980099012327585, + "loss": 2.3239, + "step": 401020 + }, + { + "epoch": 1.5502698272796152, + "grad_norm": 0.10895640403032303, + "learning_rate": 0.0017970221686981556, + "loss": 2.3325, + "step": 401030 + }, + { + "epoch": 1.5503084844829984, + "grad_norm": 0.10136058926582336, + "learning_rate": 0.0017960392194562886, + "loss": 2.3291, + "step": 401040 + }, + { + "epoch": 1.5503471416863819, + "grad_norm": 0.09887836873531342, + "learning_rate": 0.001795060984680808, + "loss": 2.3346, + "step": 401050 + }, + { + "epoch": 1.5503857988897651, + "grad_norm": 0.1036849245429039, + "learning_rate": 0.00179408739718026, + "loss": 2.3176, + "step": 401060 + }, + { + "epoch": 1.5504244560931484, + "grad_norm": 0.09475167095661163, + "learning_rate": 0.0017931183913442279, + "loss": 2.3148, + "step": 401070 + }, + { + "epoch": 1.5504631132965316, + "grad_norm": 0.10519945621490479, + "learning_rate": 0.0017921539030917347, + "loss": 2.3172, + "step": 401080 + }, + { + "epoch": 1.550501770499915, + "grad_norm": 0.09560395032167435, + "learning_rate": 0.0017911938698217892, + "loss": 2.3086, + "step": 401090 + }, + { + "epoch": 1.5505404277032984, + "grad_norm": 0.09103363007307053, + "learning_rate": 0.0017902382303659696, + "loss": 2.3198, + "step": 401100 + }, + { + "epoch": 1.5505790849066816, + "grad_norm": 0.09273833781480789, + "learning_rate": 0.0017892869249429453, + "loss": 2.3329, + "step": 401110 + }, + { + "epoch": 1.5506177421100649, + "grad_norm": 0.10024043917655945, + "learning_rate": 0.0017883398951148329, + "loss": 2.3195, + "step": 401120 + }, + { + "epoch": 1.5506563993134481, + "grad_norm": 0.1332581788301468, + "learning_rate": 0.0017873970837453072, + "loss": 2.32, + "step": 401130 + }, + { + "epoch": 1.5506950565168314, + "grad_norm": 0.12426207214593887, + "learning_rate": 0.0017864584349593737, + "loss": 2.3221, + "step": 401140 + }, + { + "epoch": 1.5507337137202146, + "grad_norm": 0.10845503956079483, + "learning_rate": 0.0017855238941047278, + "loss": 2.3191, + "step": 401150 + }, + { + "epoch": 1.5507723709235979, + "grad_norm": 0.1018427163362503, + "learning_rate": 0.00178459340771462, + "loss": 2.3215, + "step": 401160 + }, + { + "epoch": 1.5508110281269811, + "grad_norm": 0.13322055339813232, + "learning_rate": 0.0017836669234721607, + "loss": 2.3301, + "step": 401170 + }, + { + "epoch": 1.5508496853303644, + "grad_norm": 0.11589889228343964, + "learning_rate": 0.0017827443901759956, + "loss": 2.3184, + "step": 401180 + }, + { + "epoch": 1.5508883425337476, + "grad_norm": 0.11078567057847977, + "learning_rate": 0.001781825757707286, + "loss": 2.3031, + "step": 401190 + }, + { + "epoch": 1.5509269997371309, + "grad_norm": 0.09361839294433594, + "learning_rate": 0.0017809109769979336, + "loss": 2.3139, + "step": 401200 + }, + { + "epoch": 1.5509656569405141, + "grad_norm": 0.09666917473077774, + "learning_rate": 0.0017800000000000001, + "loss": 2.3225, + "step": 401210 + }, + { + "epoch": 1.5510043141438976, + "grad_norm": 0.1282287836074829, + "learning_rate": 0.0017790927796562548, + "loss": 2.3288, + "step": 401220 + }, + { + "epoch": 1.5510429713472809, + "grad_norm": 0.09642815589904785, + "learning_rate": 0.0017781892698718116, + "loss": 2.31, + "step": 401230 + }, + { + "epoch": 1.5510816285506641, + "grad_norm": 0.09528058767318726, + "learning_rate": 0.0017772894254867993, + "loss": 2.3134, + "step": 401240 + }, + { + "epoch": 1.5511202857540474, + "grad_norm": 0.12327845394611359, + "learning_rate": 0.001776393202250021, + "loss": 2.3202, + "step": 401250 + }, + { + "epoch": 1.5511589429574308, + "grad_norm": 0.09685596823692322, + "learning_rate": 0.0017755005567935637, + "loss": 2.3177, + "step": 401260 + }, + { + "epoch": 1.551197600160814, + "grad_norm": 0.09323841333389282, + "learning_rate": 0.0017746114466083071, + "loss": 2.3288, + "step": 401270 + }, + { + "epoch": 1.5512362573641973, + "grad_norm": 0.09714601933956146, + "learning_rate": 0.0017737258300203047, + "loss": 2.3082, + "step": 401280 + }, + { + "epoch": 1.5512749145675806, + "grad_norm": 0.11459461599588394, + "learning_rate": 0.0017728436661679891, + "loss": 2.3257, + "step": 401290 + }, + { + "epoch": 1.5513135717709639, + "grad_norm": 0.09478688985109329, + "learning_rate": 0.0017719649149801724, + "loss": 2.3119, + "step": 401300 + }, + { + "epoch": 1.551352228974347, + "grad_norm": 0.09680263698101044, + "learning_rate": 0.001771089537154808, + "loss": 2.3094, + "step": 401310 + }, + { + "epoch": 1.5513908861777304, + "grad_norm": 0.09631063044071198, + "learning_rate": 0.0017702174941384788, + "loss": 2.3171, + "step": 401320 + }, + { + "epoch": 1.5514295433811136, + "grad_norm": 0.10044778883457184, + "learning_rate": 0.0017693487481065843, + "loss": 2.3217, + "step": 401330 + }, + { + "epoch": 1.5514682005844969, + "grad_norm": 0.0928507149219513, + "learning_rate": 0.0017684832619441954, + "loss": 2.3125, + "step": 401340 + }, + { + "epoch": 1.5515068577878801, + "grad_norm": 0.08930382877588272, + "learning_rate": 0.001767620999227555, + "loss": 2.3282, + "step": 401350 + }, + { + "epoch": 1.5515455149912634, + "grad_norm": 0.10376087576150894, + "learning_rate": 0.001766761924206188, + "loss": 2.3218, + "step": 401360 + }, + { + "epoch": 1.5515841721946466, + "grad_norm": 0.09389341622591019, + "learning_rate": 0.0017659060017856075, + "loss": 2.3314, + "step": 401370 + }, + { + "epoch": 1.5516228293980299, + "grad_norm": 0.10632017254829407, + "learning_rate": 0.0017650531975105855, + "loss": 2.3161, + "step": 401380 + }, + { + "epoch": 1.5516614866014133, + "grad_norm": 0.10752441734075546, + "learning_rate": 0.0017642034775489682, + "loss": 2.313, + "step": 401390 + }, + { + "epoch": 1.5517001438047966, + "grad_norm": 0.09111400693655014, + "learning_rate": 0.0017633568086760155, + "loss": 2.3151, + "step": 401400 + }, + { + "epoch": 1.5517388010081798, + "grad_norm": 0.11407702416181564, + "learning_rate": 0.0017625131582592417, + "loss": 2.3116, + "step": 401410 + }, + { + "epoch": 1.551777458211563, + "grad_norm": 0.09957265853881836, + "learning_rate": 0.0017616724942437403, + "loss": 2.3201, + "step": 401420 + }, + { + "epoch": 1.5518161154149466, + "grad_norm": 0.11485077440738678, + "learning_rate": 0.001760834785137972, + "loss": 2.3202, + "step": 401430 + }, + { + "epoch": 1.5518547726183298, + "grad_norm": 0.09502989798784256, + "learning_rate": 0.00176, + "loss": 2.3158, + "step": 401440 + }, + { + "epoch": 1.551893429821713, + "grad_norm": 0.10360467433929443, + "learning_rate": 0.0017591681084241542, + "loss": 2.3093, + "step": 401450 + }, + { + "epoch": 1.5519320870250963, + "grad_norm": 0.08504238724708557, + "learning_rate": 0.0017583390805281085, + "loss": 2.3005, + "step": 401460 + }, + { + "epoch": 1.5519707442284796, + "grad_norm": 0.0989072248339653, + "learning_rate": 0.0017575128869403572, + "loss": 2.3084, + "step": 401470 + }, + { + "epoch": 1.5520094014318628, + "grad_norm": 0.08768634498119354, + "learning_rate": 0.0017566894987880713, + "loss": 2.31, + "step": 401480 + }, + { + "epoch": 1.552048058635246, + "grad_norm": 0.11494536697864532, + "learning_rate": 0.001755868887685326, + "loss": 2.3079, + "step": 401490 + }, + { + "epoch": 1.5520867158386293, + "grad_norm": 0.0976579487323761, + "learning_rate": 0.0017550510257216823, + "loss": 2.2971, + "step": 401500 + }, + { + "epoch": 1.5521253730420126, + "grad_norm": 0.09919919073581696, + "learning_rate": 0.0017542358854511098, + "loss": 2.3158, + "step": 401510 + }, + { + "epoch": 1.5521640302453958, + "grad_norm": 0.45339933037757874, + "learning_rate": 0.001753423439881241, + "loss": 2.3116, + "step": 401520 + }, + { + "epoch": 1.552202687448779, + "grad_norm": 0.09647250175476074, + "learning_rate": 0.0017526136624629403, + "loss": 2.3251, + "step": 401530 + }, + { + "epoch": 1.5522413446521623, + "grad_norm": 0.11359827220439911, + "learning_rate": 0.001751806527080183, + "loss": 2.3095, + "step": 401540 + }, + { + "epoch": 1.5522800018555456, + "grad_norm": 0.10807959735393524, + "learning_rate": 0.0017510020080402254, + "loss": 2.3172, + "step": 401550 + }, + { + "epoch": 1.552318659058929, + "grad_norm": 0.09357649832963943, + "learning_rate": 0.001750200080064064, + "loss": 2.3051, + "step": 401560 + }, + { + "epoch": 1.5523573162623123, + "grad_norm": 0.10182651877403259, + "learning_rate": 0.0017494007182771668, + "loss": 2.3103, + "step": 401570 + }, + { + "epoch": 1.5523959734656956, + "grad_norm": 0.09746528416872025, + "learning_rate": 0.0017486038982004693, + "loss": 2.3086, + "step": 401580 + }, + { + "epoch": 1.5524346306690788, + "grad_norm": 0.10169055312871933, + "learning_rate": 0.0017478095957416302, + "loss": 2.3067, + "step": 401590 + }, + { + "epoch": 1.5524732878724623, + "grad_norm": 0.102170929312706, + "learning_rate": 0.0017470177871865297, + "loss": 2.325, + "step": 401600 + }, + { + "epoch": 1.5525119450758456, + "grad_norm": 0.09123319387435913, + "learning_rate": 0.0017462284491910098, + "loss": 2.3178, + "step": 401610 + }, + { + "epoch": 1.5525506022792288, + "grad_norm": 0.09148348122835159, + "learning_rate": 0.001745441558772843, + "loss": 2.31, + "step": 401620 + }, + { + "epoch": 1.552589259482612, + "grad_norm": 0.28315749764442444, + "learning_rate": 0.0017446570933039262, + "loss": 2.324, + "step": 401630 + }, + { + "epoch": 1.5526279166859953, + "grad_norm": 0.09796846657991409, + "learning_rate": 0.001743875030502686, + "loss": 2.3082, + "step": 401640 + }, + { + "epoch": 1.5526665738893786, + "grad_norm": 0.09901988506317139, + "learning_rate": 0.0017430953484266973, + "loss": 2.3334, + "step": 401650 + }, + { + "epoch": 1.5527052310927618, + "grad_norm": 0.08878893405199051, + "learning_rate": 0.0017423180254654976, + "loss": 2.3042, + "step": 401660 + }, + { + "epoch": 1.552743888296145, + "grad_norm": 0.13966940343379974, + "learning_rate": 0.0017415430403335984, + "loss": 2.3115, + "step": 401670 + }, + { + "epoch": 1.5527825454995283, + "grad_norm": 0.10997126996517181, + "learning_rate": 0.0017407703720636856, + "loss": 2.3204, + "step": 401680 + }, + { + "epoch": 1.5528212027029116, + "grad_norm": 0.09582191705703735, + "learning_rate": 0.00174, + "loss": 2.2998, + "step": 401690 + }, + { + "epoch": 1.5528598599062948, + "grad_norm": 0.0954718291759491, + "learning_rate": 0.001739231903791894, + "loss": 2.3069, + "step": 401700 + }, + { + "epoch": 1.552898517109678, + "grad_norm": 0.08937995135784149, + "learning_rate": 0.0017384660633875595, + "loss": 2.3267, + "step": 401710 + }, + { + "epoch": 1.5529371743130613, + "grad_norm": 0.09006324410438538, + "learning_rate": 0.00173770245902792, + "loss": 2.3027, + "step": 401720 + }, + { + "epoch": 1.5529758315164448, + "grad_norm": 0.09693530946969986, + "learning_rate": 0.0017369410712406818, + "loss": 2.3182, + "step": 401730 + }, + { + "epoch": 1.553014488719828, + "grad_norm": 0.11013215780258179, + "learning_rate": 0.0017361818808345416, + "loss": 2.3176, + "step": 401740 + }, + { + "epoch": 1.5530531459232113, + "grad_norm": 0.09539856761693954, + "learning_rate": 0.001735424868893541, + "loss": 2.3102, + "step": 401750 + }, + { + "epoch": 1.5530918031265946, + "grad_norm": 0.10650390386581421, + "learning_rate": 0.001734670016771568, + "loss": 2.3264, + "step": 401760 + }, + { + "epoch": 1.553130460329978, + "grad_norm": 0.10164553672075272, + "learning_rate": 0.0017339173060869986, + "loss": 2.3162, + "step": 401770 + }, + { + "epoch": 1.5531691175333613, + "grad_norm": 0.10432730615139008, + "learning_rate": 0.0017331667187174733, + "loss": 2.2991, + "step": 401780 + }, + { + "epoch": 1.5532077747367445, + "grad_norm": 0.08654264360666275, + "learning_rate": 0.001732418236794807, + "loss": 2.316, + "step": 401790 + }, + { + "epoch": 1.5532464319401278, + "grad_norm": 0.09503353387117386, + "learning_rate": 0.0017316718427000254, + "loss": 2.3102, + "step": 401800 + }, + { + "epoch": 1.553285089143511, + "grad_norm": 0.09561841189861298, + "learning_rate": 0.001730927519058526, + "loss": 2.3267, + "step": 401810 + }, + { + "epoch": 1.5533237463468943, + "grad_norm": 0.11583434045314789, + "learning_rate": 0.001730185248735359, + "loss": 2.306, + "step": 401820 + }, + { + "epoch": 1.5533624035502775, + "grad_norm": 0.1029180958867073, + "learning_rate": 0.0017294450148306263, + "loss": 2.2991, + "step": 401830 + }, + { + "epoch": 1.5534010607536608, + "grad_norm": 0.09783989191055298, + "learning_rate": 0.0017287068006749894, + "loss": 2.32, + "step": 401840 + }, + { + "epoch": 1.553439717957044, + "grad_norm": 0.09479359537363052, + "learning_rate": 0.0017279705898252912, + "loss": 2.305, + "step": 401850 + }, + { + "epoch": 1.5534783751604273, + "grad_norm": 0.08666594326496124, + "learning_rate": 0.001727236366060283, + "loss": 2.3039, + "step": 401860 + }, + { + "epoch": 1.5535170323638106, + "grad_norm": 0.0965995267033577, + "learning_rate": 0.0017265041133764532, + "loss": 2.3118, + "step": 401870 + }, + { + "epoch": 1.5535556895671938, + "grad_norm": 0.10142888128757477, + "learning_rate": 0.0017257738159839582, + "loss": 2.2966, + "step": 401880 + }, + { + "epoch": 1.5535943467705773, + "grad_norm": 0.09101786464452744, + "learning_rate": 0.0017250454583026497, + "loss": 2.3118, + "step": 401890 + }, + { + "epoch": 1.5536330039739605, + "grad_norm": 0.10528852790594101, + "learning_rate": 0.0017243190249581954, + "loss": 2.3094, + "step": 401900 + }, + { + "epoch": 1.5536716611773438, + "grad_norm": 0.09671095758676529, + "learning_rate": 0.0017235945007782949, + "loss": 2.3117, + "step": 401910 + }, + { + "epoch": 1.553710318380727, + "grad_norm": 0.12018919736146927, + "learning_rate": 0.0017228718707889797, + "loss": 2.3165, + "step": 401920 + }, + { + "epoch": 1.5537489755841103, + "grad_norm": 0.10087112337350845, + "learning_rate": 0.001722151120211004, + "loss": 2.3122, + "step": 401930 + }, + { + "epoch": 1.5537876327874938, + "grad_norm": 0.08700679242610931, + "learning_rate": 0.0017214322344563176, + "loss": 2.2984, + "step": 401940 + }, + { + "epoch": 1.553826289990877, + "grad_norm": 0.10135027021169662, + "learning_rate": 0.0017207151991246214, + "loss": 2.3121, + "step": 401950 + }, + { + "epoch": 1.5538649471942603, + "grad_norm": 0.0919208973646164, + "learning_rate": 0.00172, + "loss": 2.3116, + "step": 401960 + }, + { + "epoch": 1.5539036043976435, + "grad_norm": 0.08301553875207901, + "learning_rate": 0.001719286623047636, + "loss": 2.3051, + "step": 401970 + }, + { + "epoch": 1.5539422616010268, + "grad_norm": 0.09961026906967163, + "learning_rate": 0.0017185750544105943, + "loss": 2.3124, + "step": 401980 + }, + { + "epoch": 1.55398091880441, + "grad_norm": 0.1093689426779747, + "learning_rate": 0.0017178652804066823, + "loss": 2.3203, + "step": 401990 + }, + { + "epoch": 1.5540195760077933, + "grad_norm": 0.08834511041641235, + "learning_rate": 0.001717157287525381, + "loss": 2.3168, + "step": 402000 + }, + { + "epoch": 1.5540582332111765, + "grad_norm": 0.09967672824859619, + "learning_rate": 0.0017164510624248434, + "loss": 2.3181, + "step": 402010 + }, + { + "epoch": 1.5540968904145598, + "grad_norm": 0.09733570367097855, + "learning_rate": 0.001715746591928962, + "loss": 2.3099, + "step": 402020 + }, + { + "epoch": 1.554135547617943, + "grad_norm": 0.08811885118484497, + "learning_rate": 0.0017150438630245, + "loss": 2.2935, + "step": 402030 + }, + { + "epoch": 1.5541742048213263, + "grad_norm": 0.10241816192865372, + "learning_rate": 0.001714342862858286, + "loss": 2.3091, + "step": 402040 + }, + { + "epoch": 1.5542128620247095, + "grad_norm": 0.09512423723936081, + "learning_rate": 0.001713643578734473, + "loss": 2.3217, + "step": 402050 + }, + { + "epoch": 1.554251519228093, + "grad_norm": 0.0858379676938057, + "learning_rate": 0.0017129459981118536, + "loss": 2.3169, + "step": 402060 + }, + { + "epoch": 1.5542901764314763, + "grad_norm": 0.08720643073320389, + "learning_rate": 0.001712250108601237, + "loss": 2.3249, + "step": 402070 + }, + { + "epoch": 1.5543288336348595, + "grad_norm": 0.08493568003177643, + "learning_rate": 0.001711555897962881, + "loss": 2.314, + "step": 402080 + }, + { + "epoch": 1.5543674908382428, + "grad_norm": 0.09646010398864746, + "learning_rate": 0.001710863354103981, + "loss": 2.2924, + "step": 402090 + }, + { + "epoch": 1.554406148041626, + "grad_norm": 0.10783302783966064, + "learning_rate": 0.0017101724650762113, + "loss": 2.3187, + "step": 402100 + }, + { + "epoch": 1.5544448052450095, + "grad_norm": 0.09830918163061142, + "learning_rate": 0.001709483219073321, + "loss": 2.3102, + "step": 402110 + }, + { + "epoch": 1.5544834624483927, + "grad_norm": 0.09604904055595398, + "learning_rate": 0.0017087956044287793, + "loss": 2.3204, + "step": 402120 + }, + { + "epoch": 1.554522119651776, + "grad_norm": 0.09845247119665146, + "learning_rate": 0.0017081096096134717, + "loss": 2.3242, + "step": 402130 + }, + { + "epoch": 1.5545607768551593, + "grad_norm": 0.09527433663606644, + "learning_rate": 0.0017074252232334441, + "loss": 2.2961, + "step": 402140 + }, + { + "epoch": 1.5545994340585425, + "grad_norm": 0.10510462522506714, + "learning_rate": 0.0017067424340276964, + "loss": 2.3101, + "step": 402150 + }, + { + "epoch": 1.5546380912619258, + "grad_norm": 0.10998314619064331, + "learning_rate": 0.0017060612308660186, + "loss": 2.3157, + "step": 402160 + }, + { + "epoch": 1.554676748465309, + "grad_norm": 0.09259533137083054, + "learning_rate": 0.0017053816027468754, + "loss": 2.3101, + "step": 402170 + }, + { + "epoch": 1.5547154056686923, + "grad_norm": 0.11178915947675705, + "learning_rate": 0.001704703538795332, + "loss": 2.306, + "step": 402180 + }, + { + "epoch": 1.5547540628720755, + "grad_norm": 0.09446456283330917, + "learning_rate": 0.0017040270282610251, + "loss": 2.2987, + "step": 402190 + }, + { + "epoch": 1.5547927200754588, + "grad_norm": 0.1069209948182106, + "learning_rate": 0.0017033520605161737, + "loss": 2.3044, + "step": 402200 + }, + { + "epoch": 1.554831377278842, + "grad_norm": 0.10097237676382065, + "learning_rate": 0.00170267862505363, + "loss": 2.2995, + "step": 402210 + }, + { + "epoch": 1.5548700344822253, + "grad_norm": 0.09666384011507034, + "learning_rate": 0.0017020067114849732, + "loss": 2.3085, + "step": 402220 + }, + { + "epoch": 1.5549086916856087, + "grad_norm": 0.09053724259138107, + "learning_rate": 0.0017013363095386386, + "loss": 2.313, + "step": 402230 + }, + { + "epoch": 1.554947348888992, + "grad_norm": 0.08768399804830551, + "learning_rate": 0.0017006674090580846, + "loss": 2.2969, + "step": 402240 + }, + { + "epoch": 1.5549860060923753, + "grad_norm": 0.10341167449951172, + "learning_rate": 0.0017, + "loss": 2.3079, + "step": 402250 + }, + { + "epoch": 1.5550246632957585, + "grad_norm": 0.11017939448356628, + "learning_rate": 0.001699334072432542, + "loss": 2.3145, + "step": 402260 + }, + { + "epoch": 1.5550633204991418, + "grad_norm": 0.11902227252721786, + "learning_rate": 0.0016986696165336129, + "loss": 2.3129, + "step": 402270 + }, + { + "epoch": 1.5551019777025252, + "grad_norm": 0.12658636271953583, + "learning_rate": 0.0016980066225891698, + "loss": 2.328, + "step": 402280 + }, + { + "epoch": 1.5551406349059085, + "grad_norm": 0.08775745332241058, + "learning_rate": 0.001697345080991569, + "loss": 2.313, + "step": 402290 + }, + { + "epoch": 1.5551792921092917, + "grad_norm": 0.11420045793056488, + "learning_rate": 0.001696684982237938, + "loss": 2.3107, + "step": 402300 + }, + { + "epoch": 1.555217949312675, + "grad_norm": 0.09320458769798279, + "learning_rate": 0.0016960263169285867, + "loss": 2.3109, + "step": 402310 + }, + { + "epoch": 1.5552566065160582, + "grad_norm": 0.11391550302505493, + "learning_rate": 0.0016953690757654436, + "loss": 2.3109, + "step": 402320 + }, + { + "epoch": 1.5552952637194415, + "grad_norm": 0.09274116158485413, + "learning_rate": 0.001694713249550525, + "loss": 2.3179, + "step": 402330 + }, + { + "epoch": 1.5553339209228247, + "grad_norm": 0.09113335609436035, + "learning_rate": 0.001694058829184433, + "loss": 2.3207, + "step": 402340 + }, + { + "epoch": 1.555372578126208, + "grad_norm": 0.09407570213079453, + "learning_rate": 0.0016934058056648823, + "loss": 2.3197, + "step": 402350 + }, + { + "epoch": 1.5554112353295912, + "grad_norm": 0.11472621560096741, + "learning_rate": 0.0016927541700852559, + "loss": 2.3086, + "step": 402360 + }, + { + "epoch": 1.5554498925329745, + "grad_norm": 0.09536908566951752, + "learning_rate": 0.001692103913633187, + "loss": 2.3043, + "step": 402370 + }, + { + "epoch": 1.5554885497363578, + "grad_norm": 0.09856908768415451, + "learning_rate": 0.0016914550275891698, + "loss": 2.3104, + "step": 402380 + }, + { + "epoch": 1.555527206939741, + "grad_norm": 0.09119411557912827, + "learning_rate": 0.001690807503325194, + "loss": 2.3038, + "step": 402390 + }, + { + "epoch": 1.5555658641431245, + "grad_norm": 0.08942999690771103, + "learning_rate": 0.0016901613323034067, + "loss": 2.3136, + "step": 402400 + }, + { + "epoch": 1.5556045213465077, + "grad_norm": 0.08663064986467361, + "learning_rate": 0.0016895165060747998, + "loss": 2.3072, + "step": 402410 + }, + { + "epoch": 1.555643178549891, + "grad_norm": 0.0936921089887619, + "learning_rate": 0.0016888730162779191, + "loss": 2.3079, + "step": 402420 + }, + { + "epoch": 1.5556818357532742, + "grad_norm": 0.09534984081983566, + "learning_rate": 0.0016882308546376022, + "loss": 2.3013, + "step": 402430 + }, + { + "epoch": 1.5557204929566577, + "grad_norm": 0.10839767009019852, + "learning_rate": 0.001687590012963734, + "loss": 2.299, + "step": 402440 + }, + { + "epoch": 1.555759150160041, + "grad_norm": 0.13321153819561005, + "learning_rate": 0.0016869504831500295, + "loss": 2.3115, + "step": 402450 + }, + { + "epoch": 1.5557978073634242, + "grad_norm": 0.10143892467021942, + "learning_rate": 0.0016863122571728376, + "loss": 2.3137, + "step": 402460 + }, + { + "epoch": 1.5558364645668075, + "grad_norm": 0.11658959090709686, + "learning_rate": 0.0016856753270899658, + "loss": 2.3132, + "step": 402470 + }, + { + "epoch": 1.5558751217701907, + "grad_norm": 0.09141705185174942, + "learning_rate": 0.0016850396850395274, + "loss": 2.317, + "step": 402480 + }, + { + "epoch": 1.555913778973574, + "grad_norm": 0.10423143208026886, + "learning_rate": 0.00168440532323881, + "loss": 2.3036, + "step": 402490 + }, + { + "epoch": 1.5559524361769572, + "grad_norm": 0.11280666291713715, + "learning_rate": 0.001683772233983162, + "loss": 2.31, + "step": 402500 + }, + { + "epoch": 1.5559910933803405, + "grad_norm": 0.11399994045495987, + "learning_rate": 0.0016831404096449027, + "loss": 2.323, + "step": 402510 + }, + { + "epoch": 1.5560297505837237, + "grad_norm": 0.09138819575309753, + "learning_rate": 0.0016825098426722493, + "loss": 2.3094, + "step": 402520 + }, + { + "epoch": 1.556068407787107, + "grad_norm": 0.10838840901851654, + "learning_rate": 0.0016818805255882628, + "loss": 2.3267, + "step": 402530 + }, + { + "epoch": 1.5561070649904902, + "grad_norm": 0.10054635256528854, + "learning_rate": 0.0016812524509898154, + "loss": 2.3192, + "step": 402540 + }, + { + "epoch": 1.5561457221938735, + "grad_norm": 0.1000504270195961, + "learning_rate": 0.0016806256115465738, + "loss": 2.2949, + "step": 402550 + }, + { + "epoch": 1.5561843793972567, + "grad_norm": 0.09790179878473282, + "learning_rate": 0.00168, + "loss": 2.3207, + "step": 402560 + }, + { + "epoch": 1.5562230366006402, + "grad_norm": 0.09651616215705872, + "learning_rate": 0.001679375609162372, + "loss": 2.3064, + "step": 402570 + }, + { + "epoch": 1.5562616938040235, + "grad_norm": 0.09436143189668655, + "learning_rate": 0.0016787524319158199, + "loss": 2.3141, + "step": 402580 + }, + { + "epoch": 1.5563003510074067, + "grad_norm": 0.1127728670835495, + "learning_rate": 0.0016781304612113784, + "loss": 2.3029, + "step": 402590 + }, + { + "epoch": 1.55633900821079, + "grad_norm": 0.10509685426950455, + "learning_rate": 0.001677509690068058, + "loss": 2.3241, + "step": 402600 + }, + { + "epoch": 1.5563776654141734, + "grad_norm": 0.09460047632455826, + "learning_rate": 0.0016768901115719298, + "loss": 2.2929, + "step": 402610 + }, + { + "epoch": 1.5564163226175567, + "grad_norm": 0.09158284217119217, + "learning_rate": 0.0016762717188752272, + "loss": 2.3087, + "step": 402620 + }, + { + "epoch": 1.55645497982094, + "grad_norm": 0.09809747338294983, + "learning_rate": 0.001675654505195463, + "loss": 2.3213, + "step": 402630 + }, + { + "epoch": 1.5564936370243232, + "grad_norm": 0.10416768491268158, + "learning_rate": 0.0016750384638145616, + "loss": 2.3046, + "step": 402640 + }, + { + "epoch": 1.5565322942277064, + "grad_norm": 0.10024569928646088, + "learning_rate": 0.001674423588078006, + "loss": 2.3101, + "step": 402650 + }, + { + "epoch": 1.5565709514310897, + "grad_norm": 0.10588731616735458, + "learning_rate": 0.0016738098713939983, + "loss": 2.3095, + "step": 402660 + }, + { + "epoch": 1.556609608634473, + "grad_norm": 0.12064171582460403, + "learning_rate": 0.0016731973072326362, + "loss": 2.3084, + "step": 402670 + }, + { + "epoch": 1.5566482658378562, + "grad_norm": 0.09845302999019623, + "learning_rate": 0.001672585889125102, + "loss": 2.3035, + "step": 402680 + }, + { + "epoch": 1.5566869230412395, + "grad_norm": 0.10145915299654007, + "learning_rate": 0.0016719756106628654, + "loss": 2.2949, + "step": 402690 + }, + { + "epoch": 1.5567255802446227, + "grad_norm": 0.13840682804584503, + "learning_rate": 0.0016713664654969002, + "loss": 2.3077, + "step": 402700 + }, + { + "epoch": 1.556764237448006, + "grad_norm": 0.09022045880556107, + "learning_rate": 0.0016707584473369134, + "loss": 2.2954, + "step": 402710 + }, + { + "epoch": 1.5568028946513892, + "grad_norm": 0.09547263383865356, + "learning_rate": 0.0016701515499505872, + "loss": 2.2929, + "step": 402720 + }, + { + "epoch": 1.5568415518547725, + "grad_norm": 0.11516858637332916, + "learning_rate": 0.001669545767162834, + "loss": 2.3237, + "step": 402730 + }, + { + "epoch": 1.556880209058156, + "grad_norm": 0.10102009028196335, + "learning_rate": 0.001668941092855063, + "loss": 2.31, + "step": 402740 + }, + { + "epoch": 1.5569188662615392, + "grad_norm": 0.11381220817565918, + "learning_rate": 0.00166833752096446, + "loss": 2.2922, + "step": 402750 + }, + { + "epoch": 1.5569575234649224, + "grad_norm": 0.0834990069270134, + "learning_rate": 0.0016677350454832771, + "loss": 2.2958, + "step": 402760 + }, + { + "epoch": 1.5569961806683057, + "grad_norm": 0.08868042379617691, + "learning_rate": 0.0016671336604581353, + "loss": 2.2997, + "step": 402770 + }, + { + "epoch": 1.5570348378716892, + "grad_norm": 0.09132802486419678, + "learning_rate": 0.0016665333599893387, + "loss": 2.3004, + "step": 402780 + }, + { + "epoch": 1.5570734950750724, + "grad_norm": 0.12312108278274536, + "learning_rate": 0.0016659341382301988, + "loss": 2.2988, + "step": 402790 + }, + { + "epoch": 1.5571121522784557, + "grad_norm": 0.09049829840660095, + "learning_rate": 0.0016653359893863697, + "loss": 2.3147, + "step": 402800 + }, + { + "epoch": 1.557150809481839, + "grad_norm": 0.08735433220863342, + "learning_rate": 0.0016647389077151958, + "loss": 2.2921, + "step": 402810 + }, + { + "epoch": 1.5571894666852222, + "grad_norm": 0.08642420172691345, + "learning_rate": 0.0016641428875250669, + "loss": 2.2984, + "step": 402820 + }, + { + "epoch": 1.5572281238886054, + "grad_norm": 0.10535074025392532, + "learning_rate": 0.0016635479231747856, + "loss": 2.3132, + "step": 402830 + }, + { + "epoch": 1.5572667810919887, + "grad_norm": 0.09041352570056915, + "learning_rate": 0.0016629540090729456, + "loss": 2.312, + "step": 402840 + }, + { + "epoch": 1.557305438295372, + "grad_norm": 0.09955234080553055, + "learning_rate": 0.0016623611396773175, + "loss": 2.2881, + "step": 402850 + }, + { + "epoch": 1.5573440954987552, + "grad_norm": 0.08979614078998566, + "learning_rate": 0.0016617693094942447, + "loss": 2.3217, + "step": 402860 + }, + { + "epoch": 1.5573827527021384, + "grad_norm": 0.08955507725477219, + "learning_rate": 0.0016611785130780517, + "loss": 2.3222, + "step": 402870 + }, + { + "epoch": 1.5574214099055217, + "grad_norm": 0.0945705994963646, + "learning_rate": 0.0016605887450304572, + "loss": 2.3083, + "step": 402880 + }, + { + "epoch": 1.557460067108905, + "grad_norm": 0.10606098920106888, + "learning_rate": 0.0016600000000000002, + "loss": 2.3259, + "step": 402890 + }, + { + "epoch": 1.5574987243122882, + "grad_norm": 0.09341254830360413, + "learning_rate": 0.0016594122726814719, + "loss": 2.3099, + "step": 402900 + }, + { + "epoch": 1.5575373815156717, + "grad_norm": 0.11154890060424805, + "learning_rate": 0.0016588255578153604, + "loss": 2.3172, + "step": 402910 + }, + { + "epoch": 1.557576038719055, + "grad_norm": 0.08776730298995972, + "learning_rate": 0.0016582398501872988, + "loss": 2.3059, + "step": 402920 + }, + { + "epoch": 1.5576146959224382, + "grad_norm": 0.0916043370962143, + "learning_rate": 0.0016576551446275264, + "loss": 2.3114, + "step": 402930 + }, + { + "epoch": 1.5576533531258214, + "grad_norm": 0.10796567797660828, + "learning_rate": 0.0016570714360103552, + "loss": 2.3113, + "step": 402940 + }, + { + "epoch": 1.557692010329205, + "grad_norm": 0.11403820663690567, + "learning_rate": 0.0016564887192536467, + "loss": 2.3049, + "step": 402950 + }, + { + "epoch": 1.5577306675325882, + "grad_norm": 0.1020413413643837, + "learning_rate": 0.001655906989318295, + "loss": 2.311, + "step": 402960 + }, + { + "epoch": 1.5577693247359714, + "grad_norm": 0.10035323351621628, + "learning_rate": 0.0016553262412077184, + "loss": 2.2995, + "step": 402970 + }, + { + "epoch": 1.5578079819393547, + "grad_norm": 0.09566937386989594, + "learning_rate": 0.0016547464699673588, + "loss": 2.3201, + "step": 402980 + }, + { + "epoch": 1.557846639142738, + "grad_norm": 0.09444167464971542, + "learning_rate": 0.0016541676706841883, + "loss": 2.3061, + "step": 402990 + }, + { + "epoch": 1.5578852963461212, + "grad_norm": 0.10356861352920532, + "learning_rate": 0.0016535898384862247, + "loss": 2.3177, + "step": 403000 + }, + { + "epoch": 1.5579239535495044, + "grad_norm": 0.11803413927555084, + "learning_rate": 0.0016530129685420505, + "loss": 2.3072, + "step": 403010 + }, + { + "epoch": 1.5579626107528877, + "grad_norm": 0.09005914628505707, + "learning_rate": 0.0016524370560603448, + "loss": 2.3037, + "step": 403020 + }, + { + "epoch": 1.558001267956271, + "grad_norm": 0.10417237877845764, + "learning_rate": 0.0016518620962894157, + "loss": 2.2866, + "step": 403030 + }, + { + "epoch": 1.5580399251596542, + "grad_norm": 0.10798921436071396, + "learning_rate": 0.0016512880845167461, + "loss": 2.3025, + "step": 403040 + }, + { + "epoch": 1.5580785823630374, + "grad_norm": 0.10037175565958023, + "learning_rate": 0.0016507150160685405, + "loss": 2.3261, + "step": 403050 + }, + { + "epoch": 1.5581172395664207, + "grad_norm": 0.1046285331249237, + "learning_rate": 0.001650142886309282, + "loss": 2.303, + "step": 403060 + }, + { + "epoch": 1.558155896769804, + "grad_norm": 0.09166203439235687, + "learning_rate": 0.0016495716906412955, + "loss": 2.2987, + "step": 403070 + }, + { + "epoch": 1.5581945539731874, + "grad_norm": 0.10187604278326035, + "learning_rate": 0.0016490014245043151, + "loss": 2.3015, + "step": 403080 + }, + { + "epoch": 1.5582332111765707, + "grad_norm": 0.10249179601669312, + "learning_rate": 0.0016484320833750612, + "loss": 2.3023, + "step": 403090 + }, + { + "epoch": 1.558271868379954, + "grad_norm": 0.10240791738033295, + "learning_rate": 0.0016478636627668198, + "loss": 2.3049, + "step": 403100 + }, + { + "epoch": 1.5583105255833372, + "grad_norm": 0.09928867965936661, + "learning_rate": 0.001647296158229032, + "loss": 2.3009, + "step": 403110 + }, + { + "epoch": 1.5583491827867206, + "grad_norm": 0.11111550778150558, + "learning_rate": 0.001646729565346886, + "loss": 2.3116, + "step": 403120 + }, + { + "epoch": 1.5583878399901039, + "grad_norm": 0.10868023335933685, + "learning_rate": 0.0016461638797409172, + "loss": 2.3134, + "step": 403130 + }, + { + "epoch": 1.5584264971934871, + "grad_norm": 0.09917468577623367, + "learning_rate": 0.001645599097066613, + "loss": 2.2945, + "step": 403140 + }, + { + "epoch": 1.5584651543968704, + "grad_norm": 0.10844592750072479, + "learning_rate": 0.001645035213014023, + "loss": 2.3096, + "step": 403150 + }, + { + "epoch": 1.5585038116002536, + "grad_norm": 0.0975717306137085, + "learning_rate": 0.0016444722233073766, + "loss": 2.2947, + "step": 403160 + }, + { + "epoch": 1.558542468803637, + "grad_norm": 0.12086108326911926, + "learning_rate": 0.001643910123704703, + "loss": 2.3201, + "step": 403170 + }, + { + "epoch": 1.5585811260070201, + "grad_norm": 0.1903742551803589, + "learning_rate": 0.00164334890999746, + "loss": 2.3063, + "step": 403180 + }, + { + "epoch": 1.5586197832104034, + "grad_norm": 0.1074889600276947, + "learning_rate": 0.001642788578010165, + "loss": 2.2973, + "step": 403190 + }, + { + "epoch": 1.5586584404137866, + "grad_norm": 0.0952894538640976, + "learning_rate": 0.0016422291236000337, + "loss": 2.3008, + "step": 403200 + }, + { + "epoch": 1.55869709761717, + "grad_norm": 0.08857908099889755, + "learning_rate": 0.0016416705426566217, + "loss": 2.3148, + "step": 403210 + }, + { + "epoch": 1.5587357548205532, + "grad_norm": 0.09984030574560165, + "learning_rate": 0.0016411128311014727, + "loss": 2.3017, + "step": 403220 + }, + { + "epoch": 1.5587744120239364, + "grad_norm": 0.10996051877737045, + "learning_rate": 0.0016405559848877713, + "loss": 2.3167, + "step": 403230 + }, + { + "epoch": 1.5588130692273197, + "grad_norm": 0.09792662411928177, + "learning_rate": 0.0016400000000000002, + "loss": 2.3018, + "step": 403240 + }, + { + "epoch": 1.5588517264307031, + "grad_norm": 0.09244784712791443, + "learning_rate": 0.001639444872453601, + "loss": 2.2989, + "step": 403250 + }, + { + "epoch": 1.5588903836340864, + "grad_norm": 0.09633370488882065, + "learning_rate": 0.0016388905982946443, + "loss": 2.3134, + "step": 403260 + }, + { + "epoch": 1.5589290408374696, + "grad_norm": 0.09883268177509308, + "learning_rate": 0.0016383371735994973, + "loss": 2.2933, + "step": 403270 + }, + { + "epoch": 1.558967698040853, + "grad_norm": 0.09502178430557251, + "learning_rate": 0.0016377845944745032, + "loss": 2.2998, + "step": 403280 + }, + { + "epoch": 1.5590063552442364, + "grad_norm": 0.0832071527838707, + "learning_rate": 0.001637232857055659, + "loss": 2.305, + "step": 403290 + }, + { + "epoch": 1.5590450124476196, + "grad_norm": 0.08292759954929352, + "learning_rate": 0.001636681957508301, + "loss": 2.2914, + "step": 403300 + }, + { + "epoch": 1.5590836696510029, + "grad_norm": 0.1084103211760521, + "learning_rate": 0.001636131892026795, + "loss": 2.3133, + "step": 403310 + }, + { + "epoch": 1.5591223268543861, + "grad_norm": 0.09909511357545853, + "learning_rate": 0.001635582656834228, + "loss": 2.2919, + "step": 403320 + }, + { + "epoch": 1.5591609840577694, + "grad_norm": 0.10739119350910187, + "learning_rate": 0.0016350342481821069, + "loss": 2.3024, + "step": 403330 + }, + { + "epoch": 1.5591996412611526, + "grad_norm": 0.10292712599039078, + "learning_rate": 0.0016344866623500588, + "loss": 2.3043, + "step": 403340 + }, + { + "epoch": 1.5592382984645359, + "grad_norm": 0.09883809834718704, + "learning_rate": 0.0016339398956455376, + "loss": 2.3082, + "step": 403350 + }, + { + "epoch": 1.5592769556679191, + "grad_norm": 0.08739059418439865, + "learning_rate": 0.001633393944403533, + "loss": 2.2986, + "step": 403360 + }, + { + "epoch": 1.5593156128713024, + "grad_norm": 0.12030746042728424, + "learning_rate": 0.0016328488049862835, + "loss": 2.3061, + "step": 403370 + }, + { + "epoch": 1.5593542700746856, + "grad_norm": 0.09531065076589584, + "learning_rate": 0.0016323044737829953, + "loss": 2.3006, + "step": 403380 + }, + { + "epoch": 1.5593929272780689, + "grad_norm": 0.12471679598093033, + "learning_rate": 0.0016317609472095607, + "loss": 2.3053, + "step": 403390 + }, + { + "epoch": 1.5594315844814521, + "grad_norm": 0.11966842412948608, + "learning_rate": 0.0016312182217082846, + "loss": 2.31, + "step": 403400 + }, + { + "epoch": 1.5594702416848354, + "grad_norm": 0.08434103429317474, + "learning_rate": 0.0016306762937476123, + "loss": 2.2828, + "step": 403410 + }, + { + "epoch": 1.5595088988882189, + "grad_norm": 0.09526640921831131, + "learning_rate": 0.0016301351598218615, + "loss": 2.3021, + "step": 403420 + }, + { + "epoch": 1.5595475560916021, + "grad_norm": 0.08554691821336746, + "learning_rate": 0.0016295948164509572, + "loss": 2.3047, + "step": 403430 + }, + { + "epoch": 1.5595862132949854, + "grad_norm": 0.08580779284238815, + "learning_rate": 0.0016290552601801718, + "loss": 2.3107, + "step": 403440 + }, + { + "epoch": 1.5596248704983686, + "grad_norm": 0.10723388195037842, + "learning_rate": 0.0016285164875798658, + "loss": 2.3114, + "step": 403450 + }, + { + "epoch": 1.559663527701752, + "grad_norm": 0.09583356231451035, + "learning_rate": 0.0016279784952452346, + "loss": 2.3199, + "step": 403460 + }, + { + "epoch": 1.5597021849051353, + "grad_norm": 0.08936525881290436, + "learning_rate": 0.0016274412797960569, + "loss": 2.3109, + "step": 403470 + }, + { + "epoch": 1.5597408421085186, + "grad_norm": 0.08797308802604675, + "learning_rate": 0.0016269048378764475, + "loss": 2.3036, + "step": 403480 + }, + { + "epoch": 1.5597794993119019, + "grad_norm": 0.08992758393287659, + "learning_rate": 0.001626369166154612, + "loss": 2.3087, + "step": 403490 + }, + { + "epoch": 1.559818156515285, + "grad_norm": 0.08879110962152481, + "learning_rate": 0.0016258342613226057, + "loss": 2.3088, + "step": 403500 + }, + { + "epoch": 1.5598568137186684, + "grad_norm": 0.09392860531806946, + "learning_rate": 0.0016253001200960961, + "loss": 2.3107, + "step": 403510 + }, + { + "epoch": 1.5598954709220516, + "grad_norm": 0.11898693442344666, + "learning_rate": 0.0016247667392141256, + "loss": 2.3222, + "step": 403520 + }, + { + "epoch": 1.5599341281254349, + "grad_norm": 0.08376120030879974, + "learning_rate": 0.0016242341154388813, + "loss": 2.3027, + "step": 403530 + }, + { + "epoch": 1.5599727853288181, + "grad_norm": 0.11127032339572906, + "learning_rate": 0.0016237022455554645, + "loss": 2.2968, + "step": 403540 + }, + { + "epoch": 1.5600114425322014, + "grad_norm": 0.08576207607984543, + "learning_rate": 0.0016231711263716647, + "loss": 2.3012, + "step": 403550 + }, + { + "epoch": 1.5600500997355846, + "grad_norm": 0.10351715236902237, + "learning_rate": 0.001622640754717736, + "loss": 2.3086, + "step": 403560 + }, + { + "epoch": 1.5600887569389679, + "grad_norm": 0.08792705088853836, + "learning_rate": 0.0016221111274461763, + "loss": 2.3092, + "step": 403570 + }, + { + "epoch": 1.5601274141423511, + "grad_norm": 0.1035882756114006, + "learning_rate": 0.00162158224143151, + "loss": 2.2993, + "step": 403580 + }, + { + "epoch": 1.5601660713457346, + "grad_norm": 0.09966867417097092, + "learning_rate": 0.0016210540935700716, + "loss": 2.3229, + "step": 403590 + }, + { + "epoch": 1.5602047285491178, + "grad_norm": 0.09071838110685349, + "learning_rate": 0.0016205266807797945, + "loss": 2.2982, + "step": 403600 + }, + { + "epoch": 1.560243385752501, + "grad_norm": 0.09316880255937576, + "learning_rate": 0.0016200000000000001, + "loss": 2.3015, + "step": 403610 + }, + { + "epoch": 1.5602820429558844, + "grad_norm": 0.08824972063302994, + "learning_rate": 0.001619474048191191, + "loss": 2.2916, + "step": 403620 + }, + { + "epoch": 1.5603207001592678, + "grad_norm": 0.09666429460048676, + "learning_rate": 0.001618948822334847, + "loss": 2.3021, + "step": 403630 + }, + { + "epoch": 1.560359357362651, + "grad_norm": 0.09371677786111832, + "learning_rate": 0.0016184243194332217, + "loss": 2.3096, + "step": 403640 + }, + { + "epoch": 1.5603980145660343, + "grad_norm": 0.08939579129219055, + "learning_rate": 0.001617900536509144, + "loss": 2.2947, + "step": 403650 + }, + { + "epoch": 1.5604366717694176, + "grad_norm": 0.09367606788873672, + "learning_rate": 0.0016173774706058203, + "loss": 2.32, + "step": 403660 + }, + { + "epoch": 1.5604753289728008, + "grad_norm": 0.08723913878202438, + "learning_rate": 0.0016168551187866395, + "loss": 2.2983, + "step": 403670 + }, + { + "epoch": 1.560513986176184, + "grad_norm": 0.09485051780939102, + "learning_rate": 0.0016163334781349825, + "loss": 2.2985, + "step": 403680 + }, + { + "epoch": 1.5605526433795673, + "grad_norm": 0.09703823179006577, + "learning_rate": 0.001615812545754029, + "loss": 2.2967, + "step": 403690 + }, + { + "epoch": 1.5605913005829506, + "grad_norm": 0.10443862527608871, + "learning_rate": 0.0016152923187665732, + "loss": 2.3232, + "step": 403700 + }, + { + "epoch": 1.5606299577863338, + "grad_norm": 0.09260427206754684, + "learning_rate": 0.0016147727943148357, + "loss": 2.2957, + "step": 403710 + }, + { + "epoch": 1.560668614989717, + "grad_norm": 0.0898246020078659, + "learning_rate": 0.0016142539695602818, + "loss": 2.2967, + "step": 403720 + }, + { + "epoch": 1.5607072721931003, + "grad_norm": 0.10553883761167526, + "learning_rate": 0.0016137358416834406, + "loss": 2.3074, + "step": 403730 + }, + { + "epoch": 1.5607459293964836, + "grad_norm": 0.10429342091083527, + "learning_rate": 0.0016132184078837257, + "loss": 2.3027, + "step": 403740 + }, + { + "epoch": 1.560784586599867, + "grad_norm": 0.09854055196046829, + "learning_rate": 0.0016127016653792582, + "loss": 2.2968, + "step": 403750 + }, + { + "epoch": 1.5608232438032503, + "grad_norm": 0.14768493175506592, + "learning_rate": 0.0016121856114066936, + "loss": 2.3109, + "step": 403760 + }, + { + "epoch": 1.5608619010066336, + "grad_norm": 0.09202374517917633, + "learning_rate": 0.0016116702432210478, + "loss": 2.2951, + "step": 403770 + }, + { + "epoch": 1.5609005582100168, + "grad_norm": 0.0918547585606575, + "learning_rate": 0.0016111555580955283, + "loss": 2.3131, + "step": 403780 + }, + { + "epoch": 1.5609392154134, + "grad_norm": 0.08800943940877914, + "learning_rate": 0.0016106415533213643, + "loss": 2.2971, + "step": 403790 + }, + { + "epoch": 1.5609778726167836, + "grad_norm": 0.10626718401908875, + "learning_rate": 0.0016101282262076415, + "loss": 2.2902, + "step": 403800 + }, + { + "epoch": 1.5610165298201668, + "grad_norm": 0.10609113425016403, + "learning_rate": 0.0016096155740811374, + "loss": 2.3076, + "step": 403810 + }, + { + "epoch": 1.56105518702355, + "grad_norm": 0.09052567183971405, + "learning_rate": 0.0016091035942861587, + "loss": 2.3103, + "step": 403820 + }, + { + "epoch": 1.5610938442269333, + "grad_norm": 0.13472731411457062, + "learning_rate": 0.0016085922841843814, + "loss": 2.3097, + "step": 403830 + }, + { + "epoch": 1.5611325014303166, + "grad_norm": 0.10542362183332443, + "learning_rate": 0.0016080816411546915, + "loss": 2.3079, + "step": 403840 + }, + { + "epoch": 1.5611711586336998, + "grad_norm": 0.1079033687710762, + "learning_rate": 0.0016075716625930283, + "loss": 2.3053, + "step": 403850 + }, + { + "epoch": 1.561209815837083, + "grad_norm": 0.09278237819671631, + "learning_rate": 0.00160706234591223, + "loss": 2.3149, + "step": 403860 + }, + { + "epoch": 1.5612484730404663, + "grad_norm": 0.08637858927249908, + "learning_rate": 0.00160655368854188, + "loss": 2.2866, + "step": 403870 + }, + { + "epoch": 1.5612871302438496, + "grad_norm": 0.10525112599134445, + "learning_rate": 0.0016060456879281558, + "loss": 2.2973, + "step": 403880 + }, + { + "epoch": 1.5613257874472328, + "grad_norm": 0.10020878165960312, + "learning_rate": 0.0016055383415336797, + "loss": 2.303, + "step": 403890 + }, + { + "epoch": 1.561364444650616, + "grad_norm": 0.08318115025758743, + "learning_rate": 0.00160503164683737, + "loss": 2.3193, + "step": 403900 + }, + { + "epoch": 1.5614031018539993, + "grad_norm": 0.11012984067201614, + "learning_rate": 0.0016045256013342965, + "loss": 2.3035, + "step": 403910 + }, + { + "epoch": 1.5614417590573828, + "grad_norm": 0.08762725442647934, + "learning_rate": 0.0016040202025355335, + "loss": 2.3057, + "step": 403920 + }, + { + "epoch": 1.561480416260766, + "grad_norm": 0.09949534386396408, + "learning_rate": 0.00160351544796802, + "loss": 2.2934, + "step": 403930 + }, + { + "epoch": 1.5615190734641493, + "grad_norm": 0.0957961231470108, + "learning_rate": 0.0016030113351744157, + "loss": 2.3151, + "step": 403940 + }, + { + "epoch": 1.5615577306675326, + "grad_norm": 0.08565373718738556, + "learning_rate": 0.0016025078617129642, + "loss": 2.3107, + "step": 403950 + }, + { + "epoch": 1.5615963878709158, + "grad_norm": 0.09716659784317017, + "learning_rate": 0.001602005025157352, + "loss": 2.2956, + "step": 403960 + }, + { + "epoch": 1.5616350450742993, + "grad_norm": 0.10047255456447601, + "learning_rate": 0.0016015028230965745, + "loss": 2.2965, + "step": 403970 + }, + { + "epoch": 1.5616737022776825, + "grad_norm": 0.08772067725658417, + "learning_rate": 0.0016010012531348, + "loss": 2.3126, + "step": 403980 + }, + { + "epoch": 1.5617123594810658, + "grad_norm": 0.09811752289533615, + "learning_rate": 0.0016005003128912365, + "loss": 2.2973, + "step": 403990 + }, + { + "epoch": 1.561751016684449, + "grad_norm": 0.08804940432310104, + "learning_rate": 0.0016, + "loss": 2.3035, + "step": 404000 + }, + { + "epoch": 1.5617896738878323, + "grad_norm": 0.0887695848941803, + "learning_rate": 0.0015995003121099844, + "loss": 2.2913, + "step": 404010 + }, + { + "epoch": 1.5618283310912155, + "grad_norm": 0.09368077665567398, + "learning_rate": 0.0015990012468847317, + "loss": 2.306, + "step": 404020 + }, + { + "epoch": 1.5618669882945988, + "grad_norm": 0.09503257274627686, + "learning_rate": 0.0015985028020023054, + "loss": 2.3125, + "step": 404030 + }, + { + "epoch": 1.561905645497982, + "grad_norm": 0.09424030780792236, + "learning_rate": 0.0015980049751551644, + "loss": 2.3032, + "step": 404040 + }, + { + "epoch": 1.5619443027013653, + "grad_norm": 0.09154917299747467, + "learning_rate": 0.001597507764050038, + "loss": 2.304, + "step": 404050 + }, + { + "epoch": 1.5619829599047486, + "grad_norm": 0.08909681439399719, + "learning_rate": 0.0015970111664078024, + "loss": 2.3203, + "step": 404060 + }, + { + "epoch": 1.5620216171081318, + "grad_norm": 0.09056919813156128, + "learning_rate": 0.0015965151799633597, + "loss": 2.3096, + "step": 404070 + }, + { + "epoch": 1.562060274311515, + "grad_norm": 0.10641703754663467, + "learning_rate": 0.0015960198024655167, + "loss": 2.296, + "step": 404080 + }, + { + "epoch": 1.5620989315148985, + "grad_norm": 0.10420147329568863, + "learning_rate": 0.0015955250316768664, + "loss": 2.2943, + "step": 404090 + }, + { + "epoch": 1.5621375887182818, + "grad_norm": 0.10419346392154694, + "learning_rate": 0.0015950308653736682, + "loss": 2.2908, + "step": 404100 + }, + { + "epoch": 1.562176245921665, + "grad_norm": 0.08935526013374329, + "learning_rate": 0.0015945373013457342, + "loss": 2.3078, + "step": 404110 + }, + { + "epoch": 1.5622149031250483, + "grad_norm": 0.10160025209188461, + "learning_rate": 0.0015940443373963112, + "loss": 2.3096, + "step": 404120 + }, + { + "epoch": 1.5622535603284315, + "grad_norm": 0.11097364872694016, + "learning_rate": 0.0015935519713419684, + "loss": 2.3054, + "step": 404130 + }, + { + "epoch": 1.562292217531815, + "grad_norm": 0.08406781405210495, + "learning_rate": 0.0015930602010124839, + "loss": 2.317, + "step": 404140 + }, + { + "epoch": 1.5623308747351983, + "grad_norm": 0.09016028046607971, + "learning_rate": 0.0015925690242507328, + "loss": 2.3145, + "step": 404150 + }, + { + "epoch": 1.5623695319385815, + "grad_norm": 0.0907144844532013, + "learning_rate": 0.0015920784389125773, + "loss": 2.2806, + "step": 404160 + }, + { + "epoch": 1.5624081891419648, + "grad_norm": 0.09960740059614182, + "learning_rate": 0.0015915884428667573, + "loss": 2.2886, + "step": 404170 + }, + { + "epoch": 1.562446846345348, + "grad_norm": 0.09331320971250534, + "learning_rate": 0.0015910990339947826, + "loss": 2.3056, + "step": 404180 + }, + { + "epoch": 1.5624855035487313, + "grad_norm": 0.10006804764270782, + "learning_rate": 0.0015906102101908256, + "loss": 2.2976, + "step": 404190 + }, + { + "epoch": 1.5625241607521145, + "grad_norm": 0.11300943791866302, + "learning_rate": 0.001590121969361616, + "loss": 2.3179, + "step": 404200 + }, + { + "epoch": 1.5625628179554978, + "grad_norm": 0.09404987841844559, + "learning_rate": 0.0015896343094263363, + "loss": 2.2953, + "step": 404210 + }, + { + "epoch": 1.562601475158881, + "grad_norm": 0.10447025299072266, + "learning_rate": 0.0015891472283165173, + "loss": 2.291, + "step": 404220 + }, + { + "epoch": 1.5626401323622643, + "grad_norm": 0.09680704772472382, + "learning_rate": 0.0015886607239759373, + "loss": 2.2923, + "step": 404230 + }, + { + "epoch": 1.5626787895656475, + "grad_norm": 0.1104864552617073, + "learning_rate": 0.00158817479436052, + "loss": 2.2877, + "step": 404240 + }, + { + "epoch": 1.5627174467690308, + "grad_norm": 0.09455239772796631, + "learning_rate": 0.001587689437438234, + "loss": 2.3164, + "step": 404250 + }, + { + "epoch": 1.5627561039724143, + "grad_norm": 0.09873755276203156, + "learning_rate": 0.001587204651188994, + "loss": 2.2793, + "step": 404260 + }, + { + "epoch": 1.5627947611757975, + "grad_norm": 0.09644533693790436, + "learning_rate": 0.0015867204336045635, + "loss": 2.3147, + "step": 404270 + }, + { + "epoch": 1.5628334183791808, + "grad_norm": 0.10555064678192139, + "learning_rate": 0.0015862367826884561, + "loss": 2.316, + "step": 404280 + }, + { + "epoch": 1.562872075582564, + "grad_norm": 0.09051340818405151, + "learning_rate": 0.0015857536964558406, + "loss": 2.2949, + "step": 404290 + }, + { + "epoch": 1.5629107327859475, + "grad_norm": 0.0887477844953537, + "learning_rate": 0.0015852711729334456, + "loss": 2.3257, + "step": 404300 + }, + { + "epoch": 1.5629493899893308, + "grad_norm": 0.09938947856426239, + "learning_rate": 0.001584789210159466, + "loss": 2.2996, + "step": 404310 + }, + { + "epoch": 1.562988047192714, + "grad_norm": 0.1027822345495224, + "learning_rate": 0.0015843078061834695, + "loss": 2.2955, + "step": 404320 + }, + { + "epoch": 1.5630267043960973, + "grad_norm": 0.09696999937295914, + "learning_rate": 0.001583826959066304, + "loss": 2.3087, + "step": 404330 + }, + { + "epoch": 1.5630653615994805, + "grad_norm": 0.09085328131914139, + "learning_rate": 0.001583346666880007, + "loss": 2.3112, + "step": 404340 + }, + { + "epoch": 1.5631040188028638, + "grad_norm": 0.10693728923797607, + "learning_rate": 0.0015828669277077158, + "loss": 2.3, + "step": 404350 + }, + { + "epoch": 1.563142676006247, + "grad_norm": 0.10152090340852737, + "learning_rate": 0.001582387739643578, + "loss": 2.2922, + "step": 404360 + }, + { + "epoch": 1.5631813332096303, + "grad_norm": 0.11221075803041458, + "learning_rate": 0.0015819091007926627, + "loss": 2.2943, + "step": 404370 + }, + { + "epoch": 1.5632199904130135, + "grad_norm": 0.1135498657822609, + "learning_rate": 0.001581431009270873, + "loss": 2.2907, + "step": 404380 + }, + { + "epoch": 1.5632586476163968, + "grad_norm": 0.10113508999347687, + "learning_rate": 0.0015809534632048607, + "loss": 2.2966, + "step": 404390 + }, + { + "epoch": 1.56329730481978, + "grad_norm": 0.09676952660083771, + "learning_rate": 0.0015804764607319393, + "loss": 2.3061, + "step": 404400 + }, + { + "epoch": 1.5633359620231633, + "grad_norm": 0.10081905871629715, + "learning_rate": 0.00158, + "loss": 2.2943, + "step": 404410 + }, + { + "epoch": 1.5633746192265465, + "grad_norm": 0.11143055558204651, + "learning_rate": 0.0015795240791674273, + "loss": 2.3154, + "step": 404420 + }, + { + "epoch": 1.56341327642993, + "grad_norm": 0.10502201318740845, + "learning_rate": 0.0015790486964030164, + "loss": 2.3043, + "step": 404430 + }, + { + "epoch": 1.5634519336333133, + "grad_norm": 0.08794165402650833, + "learning_rate": 0.0015785738498858906, + "loss": 2.2958, + "step": 404440 + }, + { + "epoch": 1.5634905908366965, + "grad_norm": 0.09427875280380249, + "learning_rate": 0.0015780995378054203, + "loss": 2.2992, + "step": 404450 + }, + { + "epoch": 1.5635292480400798, + "grad_norm": 0.09788183867931366, + "learning_rate": 0.0015776257583611426, + "loss": 2.2962, + "step": 404460 + }, + { + "epoch": 1.5635679052434632, + "grad_norm": 0.11642619967460632, + "learning_rate": 0.0015771525097626805, + "loss": 2.3116, + "step": 404470 + }, + { + "epoch": 1.5636065624468465, + "grad_norm": 0.09682761877775192, + "learning_rate": 0.0015766797902296657, + "loss": 2.3041, + "step": 404480 + }, + { + "epoch": 1.5636452196502297, + "grad_norm": 0.08750241249799728, + "learning_rate": 0.0015762075979916582, + "loss": 2.3025, + "step": 404490 + }, + { + "epoch": 1.563683876853613, + "grad_norm": 0.09719724208116531, + "learning_rate": 0.0015757359312880714, + "loss": 2.3009, + "step": 404500 + }, + { + "epoch": 1.5637225340569962, + "grad_norm": 0.09916465729475021, + "learning_rate": 0.001575264788368094, + "loss": 2.3055, + "step": 404510 + }, + { + "epoch": 1.5637611912603795, + "grad_norm": 0.09297320246696472, + "learning_rate": 0.001574794167490614, + "loss": 2.2828, + "step": 404520 + }, + { + "epoch": 1.5637998484637627, + "grad_norm": 0.10248459875583649, + "learning_rate": 0.0015743240669241448, + "loss": 2.2927, + "step": 404530 + }, + { + "epoch": 1.563838505667146, + "grad_norm": 0.11836646497249603, + "learning_rate": 0.0015738544849467498, + "loss": 2.3037, + "step": 404540 + }, + { + "epoch": 1.5638771628705292, + "grad_norm": 0.10327929258346558, + "learning_rate": 0.0015733854198459692, + "loss": 2.2934, + "step": 404550 + }, + { + "epoch": 1.5639158200739125, + "grad_norm": 0.0815177783370018, + "learning_rate": 0.0015729168699187474, + "loss": 2.3026, + "step": 404560 + }, + { + "epoch": 1.5639544772772958, + "grad_norm": 0.10899542272090912, + "learning_rate": 0.0015724488334713609, + "loss": 2.3002, + "step": 404570 + }, + { + "epoch": 1.563993134480679, + "grad_norm": 0.10903110355138779, + "learning_rate": 0.0015719813088193463, + "loss": 2.3106, + "step": 404580 + }, + { + "epoch": 1.5640317916840623, + "grad_norm": 0.09604477882385254, + "learning_rate": 0.001571514294287429, + "loss": 2.2954, + "step": 404590 + }, + { + "epoch": 1.5640704488874457, + "grad_norm": 0.09511998295783997, + "learning_rate": 0.0015710477882094555, + "loss": 2.2929, + "step": 404600 + }, + { + "epoch": 1.564109106090829, + "grad_norm": 0.08982515335083008, + "learning_rate": 0.0015705817889283223, + "loss": 2.2926, + "step": 404610 + }, + { + "epoch": 1.5641477632942122, + "grad_norm": 0.08804116398096085, + "learning_rate": 0.0015701162947959065, + "loss": 2.307, + "step": 404620 + }, + { + "epoch": 1.5641864204975955, + "grad_norm": 0.0896340161561966, + "learning_rate": 0.0015696513041729998, + "loss": 2.3074, + "step": 404630 + }, + { + "epoch": 1.564225077700979, + "grad_norm": 0.12138937413692474, + "learning_rate": 0.0015691868154292397, + "loss": 2.3095, + "step": 404640 + }, + { + "epoch": 1.5642637349043622, + "grad_norm": 0.10645350068807602, + "learning_rate": 0.0015687228269430436, + "loss": 2.3073, + "step": 404650 + }, + { + "epoch": 1.5643023921077455, + "grad_norm": 0.12402874231338501, + "learning_rate": 0.0015682593371015418, + "loss": 2.2938, + "step": 404660 + }, + { + "epoch": 1.5643410493111287, + "grad_norm": 0.08726049959659576, + "learning_rate": 0.0015677963443005139, + "loss": 2.2876, + "step": 404670 + }, + { + "epoch": 1.564379706514512, + "grad_norm": 0.11048829555511475, + "learning_rate": 0.0015673338469443213, + "loss": 2.2954, + "step": 404680 + }, + { + "epoch": 1.5644183637178952, + "grad_norm": 0.09738937020301819, + "learning_rate": 0.0015668718434458458, + "loss": 2.3104, + "step": 404690 + }, + { + "epoch": 1.5644570209212785, + "grad_norm": 0.09116674214601517, + "learning_rate": 0.001566410332226424, + "loss": 2.2856, + "step": 404700 + }, + { + "epoch": 1.5644956781246617, + "grad_norm": 0.08973516523838043, + "learning_rate": 0.0015659493117157859, + "loss": 2.277, + "step": 404710 + }, + { + "epoch": 1.564534335328045, + "grad_norm": 0.09033355116844177, + "learning_rate": 0.0015654887803519912, + "loss": 2.3133, + "step": 404720 + }, + { + "epoch": 1.5645729925314282, + "grad_norm": 0.1006704568862915, + "learning_rate": 0.001565028736581369, + "loss": 2.2991, + "step": 404730 + }, + { + "epoch": 1.5646116497348115, + "grad_norm": 0.1322348564863205, + "learning_rate": 0.0015645691788584552, + "loss": 2.3152, + "step": 404740 + }, + { + "epoch": 1.5646503069381947, + "grad_norm": 0.08889655768871307, + "learning_rate": 0.0015641101056459328, + "loss": 2.301, + "step": 404750 + }, + { + "epoch": 1.564688964141578, + "grad_norm": 0.09167850762605667, + "learning_rate": 0.0015636515154145714, + "loss": 2.3004, + "step": 404760 + }, + { + "epoch": 1.5647276213449615, + "grad_norm": 0.09554719924926758, + "learning_rate": 0.001563193406643169, + "loss": 2.3018, + "step": 404770 + }, + { + "epoch": 1.5647662785483447, + "grad_norm": 0.09353359788656235, + "learning_rate": 0.001562735777818491, + "loss": 2.2896, + "step": 404780 + }, + { + "epoch": 1.564804935751728, + "grad_norm": 0.1024642139673233, + "learning_rate": 0.0015622786274352143, + "loss": 2.3121, + "step": 404790 + }, + { + "epoch": 1.5648435929551112, + "grad_norm": 0.09727098792791367, + "learning_rate": 0.001561821953995867, + "loss": 2.2878, + "step": 404800 + }, + { + "epoch": 1.5648822501584947, + "grad_norm": 0.09108156710863113, + "learning_rate": 0.0015613657560107738, + "loss": 2.2889, + "step": 404810 + }, + { + "epoch": 1.564920907361878, + "grad_norm": 0.09672145545482635, + "learning_rate": 0.001560910031997997, + "loss": 2.2887, + "step": 404820 + }, + { + "epoch": 1.5649595645652612, + "grad_norm": 0.09327162802219391, + "learning_rate": 0.001560454780483282, + "loss": 2.297, + "step": 404830 + }, + { + "epoch": 1.5649982217686444, + "grad_norm": 0.11216502636671066, + "learning_rate": 0.0015600000000000002, + "loss": 2.2957, + "step": 404840 + }, + { + "epoch": 1.5650368789720277, + "grad_norm": 0.09545884281396866, + "learning_rate": 0.001559545689089095, + "loss": 2.2991, + "step": 404850 + }, + { + "epoch": 1.565075536175411, + "grad_norm": 0.10925882309675217, + "learning_rate": 0.001559091846299028, + "loss": 2.3099, + "step": 404860 + }, + { + "epoch": 1.5651141933787942, + "grad_norm": 0.10249479860067368, + "learning_rate": 0.0015586384701857217, + "loss": 2.2967, + "step": 404870 + }, + { + "epoch": 1.5651528505821775, + "grad_norm": 0.08673015236854553, + "learning_rate": 0.0015581855593125096, + "loss": 2.3004, + "step": 404880 + }, + { + "epoch": 1.5651915077855607, + "grad_norm": 0.09049186110496521, + "learning_rate": 0.0015577331122500805, + "loss": 2.2948, + "step": 404890 + }, + { + "epoch": 1.565230164988944, + "grad_norm": 0.09595070779323578, + "learning_rate": 0.0015572811275764268, + "loss": 2.2978, + "step": 404900 + }, + { + "epoch": 1.5652688221923272, + "grad_norm": 0.09924677759408951, + "learning_rate": 0.0015568296038767932, + "loss": 2.2994, + "step": 404910 + }, + { + "epoch": 1.5653074793957105, + "grad_norm": 0.12818053364753723, + "learning_rate": 0.0015563785397436235, + "loss": 2.2861, + "step": 404920 + }, + { + "epoch": 1.5653461365990937, + "grad_norm": 0.09655303508043289, + "learning_rate": 0.0015559279337765098, + "loss": 2.308, + "step": 404930 + }, + { + "epoch": 1.5653847938024772, + "grad_norm": 0.09840988367795944, + "learning_rate": 0.0015554777845821425, + "loss": 2.3117, + "step": 404940 + }, + { + "epoch": 1.5654234510058604, + "grad_norm": 0.09422668814659119, + "learning_rate": 0.0015550280907742603, + "loss": 2.2959, + "step": 404950 + }, + { + "epoch": 1.5654621082092437, + "grad_norm": 0.09979169815778732, + "learning_rate": 0.0015545788509735983, + "loss": 2.3184, + "step": 404960 + }, + { + "epoch": 1.565500765412627, + "grad_norm": 0.09588532149791718, + "learning_rate": 0.001554130063807841, + "loss": 2.3002, + "step": 404970 + }, + { + "epoch": 1.5655394226160104, + "grad_norm": 0.10011453926563263, + "learning_rate": 0.001553681727911572, + "loss": 2.3092, + "step": 404980 + }, + { + "epoch": 1.5655780798193937, + "grad_norm": 0.09356016665697098, + "learning_rate": 0.0015532338419262267, + "loss": 2.2948, + "step": 404990 + }, + { + "epoch": 1.565616737022777, + "grad_norm": 0.3818550109863281, + "learning_rate": 0.0015527864045000422, + "loss": 2.2887, + "step": 405000 + }, + { + "epoch": 1.5656553942261602, + "grad_norm": 0.08383811265230179, + "learning_rate": 0.0015523394142880123, + "loss": 2.3076, + "step": 405010 + }, + { + "epoch": 1.5656940514295434, + "grad_norm": 0.09441009163856506, + "learning_rate": 0.0015518928699518385, + "loss": 2.3064, + "step": 405020 + }, + { + "epoch": 1.5657327086329267, + "grad_norm": 0.09960923343896866, + "learning_rate": 0.001551446770159884, + "loss": 2.3078, + "step": 405030 + }, + { + "epoch": 1.56577136583631, + "grad_norm": 0.0911037027835846, + "learning_rate": 0.001551001113587127, + "loss": 2.3047, + "step": 405040 + }, + { + "epoch": 1.5658100230396932, + "grad_norm": 0.08394049108028412, + "learning_rate": 0.0015505558989151154, + "loss": 2.3029, + "step": 405050 + }, + { + "epoch": 1.5658486802430764, + "grad_norm": 0.09003666043281555, + "learning_rate": 0.0015501111248319205, + "loss": 2.2986, + "step": 405060 + }, + { + "epoch": 1.5658873374464597, + "grad_norm": 0.1031719297170639, + "learning_rate": 0.001549666790032092, + "loss": 2.3044, + "step": 405070 + }, + { + "epoch": 1.565925994649843, + "grad_norm": 0.10729104280471802, + "learning_rate": 0.0015492228932166144, + "loss": 2.2966, + "step": 405080 + }, + { + "epoch": 1.5659646518532262, + "grad_norm": 0.09601952135562897, + "learning_rate": 0.0015487794330928607, + "loss": 2.3064, + "step": 405090 + }, + { + "epoch": 1.5660033090566094, + "grad_norm": 0.09457894414663315, + "learning_rate": 0.0015483364083745514, + "loss": 2.2983, + "step": 405100 + }, + { + "epoch": 1.566041966259993, + "grad_norm": 0.09483607113361359, + "learning_rate": 0.0015478938177817074, + "loss": 2.3162, + "step": 405110 + }, + { + "epoch": 1.5660806234633762, + "grad_norm": 0.08963212370872498, + "learning_rate": 0.0015474516600406097, + "loss": 2.2948, + "step": 405120 + }, + { + "epoch": 1.5661192806667594, + "grad_norm": 0.10295980423688889, + "learning_rate": 0.001547009933883755, + "loss": 2.2943, + "step": 405130 + }, + { + "epoch": 1.5661579378701427, + "grad_norm": 0.08700409531593323, + "learning_rate": 0.0015465686380498147, + "loss": 2.299, + "step": 405140 + }, + { + "epoch": 1.5661965950735262, + "grad_norm": 0.10050175338983536, + "learning_rate": 0.0015461277712835913, + "loss": 2.319, + "step": 405150 + }, + { + "epoch": 1.5662352522769094, + "grad_norm": 0.09328875690698624, + "learning_rate": 0.0015456873323359782, + "loss": 2.2914, + "step": 405160 + }, + { + "epoch": 1.5662739094802927, + "grad_norm": 0.09559834003448486, + "learning_rate": 0.001545247319963917, + "loss": 2.317, + "step": 405170 + }, + { + "epoch": 1.566312566683676, + "grad_norm": 0.10605069994926453, + "learning_rate": 0.0015448077329303582, + "loss": 2.3129, + "step": 405180 + }, + { + "epoch": 1.5663512238870592, + "grad_norm": 0.09179332107305527, + "learning_rate": 0.0015443685700042193, + "loss": 2.3062, + "step": 405190 + }, + { + "epoch": 1.5663898810904424, + "grad_norm": 0.09346537292003632, + "learning_rate": 0.001543929829960345, + "loss": 2.2932, + "step": 405200 + }, + { + "epoch": 1.5664285382938257, + "grad_norm": 0.09841105341911316, + "learning_rate": 0.001543491511579467, + "loss": 2.3092, + "step": 405210 + }, + { + "epoch": 1.566467195497209, + "grad_norm": 0.09264381974935532, + "learning_rate": 0.0015430536136481657, + "loss": 2.2978, + "step": 405220 + }, + { + "epoch": 1.5665058527005922, + "grad_norm": 0.10439188033342361, + "learning_rate": 0.0015426161349588292, + "loss": 2.3002, + "step": 405230 + }, + { + "epoch": 1.5665445099039754, + "grad_norm": 0.09768210351467133, + "learning_rate": 0.0015421790743096163, + "loss": 2.2982, + "step": 405240 + }, + { + "epoch": 1.5665831671073587, + "grad_norm": 0.11853209882974625, + "learning_rate": 0.001541742430504416, + "loss": 2.3093, + "step": 405250 + }, + { + "epoch": 1.566621824310742, + "grad_norm": 0.10544905066490173, + "learning_rate": 0.0015413062023528115, + "loss": 2.291, + "step": 405260 + }, + { + "epoch": 1.5666604815141252, + "grad_norm": 0.10261768102645874, + "learning_rate": 0.0015408703886700401, + "loss": 2.307, + "step": 405270 + }, + { + "epoch": 1.5666991387175087, + "grad_norm": 0.10163643211126328, + "learning_rate": 0.0015404349882769577, + "loss": 2.3067, + "step": 405280 + }, + { + "epoch": 1.566737795920892, + "grad_norm": 0.10879616439342499, + "learning_rate": 0.0015400000000000001, + "loss": 2.2961, + "step": 405290 + }, + { + "epoch": 1.5667764531242752, + "grad_norm": 0.09426324814558029, + "learning_rate": 0.0015395654226711467, + "loss": 2.2886, + "step": 405300 + }, + { + "epoch": 1.5668151103276584, + "grad_norm": 0.09058332443237305, + "learning_rate": 0.0015391312551278835, + "loss": 2.2944, + "step": 405310 + }, + { + "epoch": 1.5668537675310419, + "grad_norm": 0.09346079081296921, + "learning_rate": 0.0015386974962131683, + "loss": 2.2927, + "step": 405320 + }, + { + "epoch": 1.5668924247344251, + "grad_norm": 0.09960110485553741, + "learning_rate": 0.0015382641447753923, + "loss": 2.3021, + "step": 405330 + }, + { + "epoch": 1.5669310819378084, + "grad_norm": 0.09202473610639572, + "learning_rate": 0.0015378311996683462, + "loss": 2.295, + "step": 405340 + }, + { + "epoch": 1.5669697391411916, + "grad_norm": 0.09738468378782272, + "learning_rate": 0.0015373986597511848, + "loss": 2.2838, + "step": 405350 + }, + { + "epoch": 1.567008396344575, + "grad_norm": 0.0873396024107933, + "learning_rate": 0.001536966523888391, + "loss": 2.3042, + "step": 405360 + }, + { + "epoch": 1.5670470535479581, + "grad_norm": 0.0920250341296196, + "learning_rate": 0.0015365347909497415, + "loss": 2.2925, + "step": 405370 + }, + { + "epoch": 1.5670857107513414, + "grad_norm": 0.09826283156871796, + "learning_rate": 0.001536103459810272, + "loss": 2.3059, + "step": 405380 + }, + { + "epoch": 1.5671243679547247, + "grad_norm": 0.08695420622825623, + "learning_rate": 0.001535672529350244, + "loss": 2.3007, + "step": 405390 + }, + { + "epoch": 1.567163025158108, + "grad_norm": 0.10699399560689926, + "learning_rate": 0.00153524199845511, + "loss": 2.2854, + "step": 405400 + }, + { + "epoch": 1.5672016823614912, + "grad_norm": 0.10169906914234161, + "learning_rate": 0.0015348118660154797, + "loss": 2.2965, + "step": 405410 + }, + { + "epoch": 1.5672403395648744, + "grad_norm": 0.08803411573171616, + "learning_rate": 0.0015343821309270872, + "loss": 2.2862, + "step": 405420 + }, + { + "epoch": 1.5672789967682577, + "grad_norm": 0.11092033982276917, + "learning_rate": 0.0015339527920907583, + "loss": 2.2922, + "step": 405430 + }, + { + "epoch": 1.567317653971641, + "grad_norm": 0.10854346305131912, + "learning_rate": 0.0015335238484123761, + "loss": 2.3048, + "step": 405440 + }, + { + "epoch": 1.5673563111750244, + "grad_norm": 0.09341815859079361, + "learning_rate": 0.00153309529880285, + "loss": 2.3061, + "step": 405450 + }, + { + "epoch": 1.5673949683784076, + "grad_norm": 0.0901259183883667, + "learning_rate": 0.001532667142178083, + "loss": 2.3048, + "step": 405460 + }, + { + "epoch": 1.567433625581791, + "grad_norm": 0.09512338042259216, + "learning_rate": 0.00153223937745894, + "loss": 2.3011, + "step": 405470 + }, + { + "epoch": 1.5674722827851741, + "grad_norm": 0.09469451010227203, + "learning_rate": 0.0015318120035712152, + "loss": 2.2995, + "step": 405480 + }, + { + "epoch": 1.5675109399885576, + "grad_norm": 0.10558336973190308, + "learning_rate": 0.0015313850194456006, + "loss": 2.2988, + "step": 405490 + }, + { + "epoch": 1.5675495971919409, + "grad_norm": 0.08973246067762375, + "learning_rate": 0.0015309584240176572, + "loss": 2.291, + "step": 405500 + }, + { + "epoch": 1.5675882543953241, + "grad_norm": 0.08959843218326569, + "learning_rate": 0.0015305322162277799, + "loss": 2.2912, + "step": 405510 + }, + { + "epoch": 1.5676269115987074, + "grad_norm": 0.12382600456476212, + "learning_rate": 0.001530106395021171, + "loss": 2.3053, + "step": 405520 + }, + { + "epoch": 1.5676655688020906, + "grad_norm": 0.09585974365472794, + "learning_rate": 0.0015296809593478061, + "loss": 2.2898, + "step": 405530 + }, + { + "epoch": 1.5677042260054739, + "grad_norm": 0.09502732008695602, + "learning_rate": 0.001529255908162407, + "loss": 2.3122, + "step": 405540 + }, + { + "epoch": 1.5677428832088571, + "grad_norm": 0.0948452278971672, + "learning_rate": 0.0015288312404244102, + "loss": 2.2958, + "step": 405550 + }, + { + "epoch": 1.5677815404122404, + "grad_norm": 0.11344671249389648, + "learning_rate": 0.0015284069550979362, + "loss": 2.3004, + "step": 405560 + }, + { + "epoch": 1.5678201976156236, + "grad_norm": 0.09710563719272614, + "learning_rate": 0.0015279830511517622, + "loss": 2.2906, + "step": 405570 + }, + { + "epoch": 1.5678588548190069, + "grad_norm": 0.09709560871124268, + "learning_rate": 0.0015275595275592913, + "loss": 2.3023, + "step": 405580 + }, + { + "epoch": 1.5678975120223901, + "grad_norm": 0.11424267292022705, + "learning_rate": 0.0015271363832985245, + "loss": 2.2982, + "step": 405590 + }, + { + "epoch": 1.5679361692257734, + "grad_norm": 0.10697871446609497, + "learning_rate": 0.001526713617352031, + "loss": 2.3051, + "step": 405600 + }, + { + "epoch": 1.5679748264291566, + "grad_norm": 0.09038961678743362, + "learning_rate": 0.0015262912287069197, + "loss": 2.2992, + "step": 405610 + }, + { + "epoch": 1.5680134836325401, + "grad_norm": 0.10185104608535767, + "learning_rate": 0.0015258692163548121, + "loss": 2.304, + "step": 405620 + }, + { + "epoch": 1.5680521408359234, + "grad_norm": 0.09717609733343124, + "learning_rate": 0.0015254475792918132, + "loss": 2.3047, + "step": 405630 + }, + { + "epoch": 1.5680907980393066, + "grad_norm": 0.09980986267328262, + "learning_rate": 0.0015250263165184834, + "loss": 2.2991, + "step": 405640 + }, + { + "epoch": 1.5681294552426899, + "grad_norm": 0.08376102149486542, + "learning_rate": 0.0015246054270398116, + "loss": 2.2921, + "step": 405650 + }, + { + "epoch": 1.5681681124460733, + "grad_norm": 0.10156821459531784, + "learning_rate": 0.0015241849098651871, + "loss": 2.3028, + "step": 405660 + }, + { + "epoch": 1.5682067696494566, + "grad_norm": 0.1134498193860054, + "learning_rate": 0.0015237647640083739, + "loss": 2.2907, + "step": 405670 + }, + { + "epoch": 1.5682454268528399, + "grad_norm": 0.0972135066986084, + "learning_rate": 0.0015233449884874805, + "loss": 2.301, + "step": 405680 + }, + { + "epoch": 1.568284084056223, + "grad_norm": 0.09666749089956284, + "learning_rate": 0.0015229255823249375, + "loss": 2.2921, + "step": 405690 + }, + { + "epoch": 1.5683227412596064, + "grad_norm": 0.10058042407035828, + "learning_rate": 0.001522506544547467, + "loss": 2.3025, + "step": 405700 + }, + { + "epoch": 1.5683613984629896, + "grad_norm": 0.10395345836877823, + "learning_rate": 0.0015220878741860591, + "loss": 2.2992, + "step": 405710 + }, + { + "epoch": 1.5684000556663729, + "grad_norm": 0.10118017345666885, + "learning_rate": 0.0015216695702759439, + "loss": 2.2953, + "step": 405720 + }, + { + "epoch": 1.5684387128697561, + "grad_norm": 0.11035631597042084, + "learning_rate": 0.001521251631856567, + "loss": 2.2899, + "step": 405730 + }, + { + "epoch": 1.5684773700731394, + "grad_norm": 0.1244610846042633, + "learning_rate": 0.0015208340579715624, + "loss": 2.2977, + "step": 405740 + }, + { + "epoch": 1.5685160272765226, + "grad_norm": 0.08581045269966125, + "learning_rate": 0.001520416847668728, + "loss": 2.2982, + "step": 405750 + }, + { + "epoch": 1.5685546844799059, + "grad_norm": 0.09502839297056198, + "learning_rate": 0.00152, + "loss": 2.2957, + "step": 405760 + }, + { + "epoch": 1.5685933416832891, + "grad_norm": 0.08598793298006058, + "learning_rate": 0.0015195835140214276, + "loss": 2.2969, + "step": 405770 + }, + { + "epoch": 1.5686319988866726, + "grad_norm": 0.10448497533798218, + "learning_rate": 0.001519167388793148, + "loss": 2.2906, + "step": 405780 + }, + { + "epoch": 1.5686706560900558, + "grad_norm": 0.10341082513332367, + "learning_rate": 0.0015187516233793614, + "loss": 2.2806, + "step": 405790 + }, + { + "epoch": 1.568709313293439, + "grad_norm": 0.08865875750780106, + "learning_rate": 0.001518336216848308, + "loss": 2.29, + "step": 405800 + }, + { + "epoch": 1.5687479704968224, + "grad_norm": 0.09579867124557495, + "learning_rate": 0.001517921168272242, + "loss": 2.3151, + "step": 405810 + }, + { + "epoch": 1.5687866277002056, + "grad_norm": 0.09985391795635223, + "learning_rate": 0.0015175064767274074, + "loss": 2.3067, + "step": 405820 + }, + { + "epoch": 1.568825284903589, + "grad_norm": 0.0938592478632927, + "learning_rate": 0.0015170921412940147, + "loss": 2.2875, + "step": 405830 + }, + { + "epoch": 1.5688639421069723, + "grad_norm": 0.09133358299732208, + "learning_rate": 0.001516678161056217, + "loss": 2.3052, + "step": 405840 + }, + { + "epoch": 1.5689025993103556, + "grad_norm": 0.23263610899448395, + "learning_rate": 0.0015162645351020872, + "loss": 2.2854, + "step": 405850 + }, + { + "epoch": 1.5689412565137388, + "grad_norm": 0.09544631838798523, + "learning_rate": 0.0015158512625235918, + "loss": 2.2988, + "step": 405860 + }, + { + "epoch": 1.568979913717122, + "grad_norm": 0.0995941236615181, + "learning_rate": 0.0015154383424165713, + "loss": 2.2911, + "step": 405870 + }, + { + "epoch": 1.5690185709205053, + "grad_norm": 0.09293455630540848, + "learning_rate": 0.0015150257738807142, + "loss": 2.3112, + "step": 405880 + }, + { + "epoch": 1.5690572281238886, + "grad_norm": 0.10342123359441757, + "learning_rate": 0.0015146135560195361, + "loss": 2.307, + "step": 405890 + }, + { + "epoch": 1.5690958853272718, + "grad_norm": 0.10460405796766281, + "learning_rate": 0.0015142016879403552, + "loss": 2.2901, + "step": 405900 + }, + { + "epoch": 1.569134542530655, + "grad_norm": 0.09743546694517136, + "learning_rate": 0.0015137901687542713, + "loss": 2.2987, + "step": 405910 + }, + { + "epoch": 1.5691731997340383, + "grad_norm": 0.11557811498641968, + "learning_rate": 0.0015133789975761425, + "loss": 2.3131, + "step": 405920 + }, + { + "epoch": 1.5692118569374216, + "grad_norm": 0.11989576369524002, + "learning_rate": 0.0015129681735245634, + "loss": 2.3038, + "step": 405930 + }, + { + "epoch": 1.5692505141408049, + "grad_norm": 0.09218642115592957, + "learning_rate": 0.0015125576957218426, + "loss": 2.2926, + "step": 405940 + }, + { + "epoch": 1.5692891713441883, + "grad_norm": 0.11180219054222107, + "learning_rate": 0.0015121475632939815, + "loss": 2.2939, + "step": 405950 + }, + { + "epoch": 1.5693278285475716, + "grad_norm": 0.09809017926454544, + "learning_rate": 0.0015117377753706518, + "loss": 2.2901, + "step": 405960 + }, + { + "epoch": 1.5693664857509548, + "grad_norm": 0.08998557180166245, + "learning_rate": 0.0015113283310851753, + "loss": 2.2973, + "step": 405970 + }, + { + "epoch": 1.569405142954338, + "grad_norm": 0.10271146148443222, + "learning_rate": 0.0015109192295745006, + "loss": 2.2886, + "step": 405980 + }, + { + "epoch": 1.5694438001577213, + "grad_norm": 0.10330662876367569, + "learning_rate": 0.0015105104699791835, + "loss": 2.2929, + "step": 405990 + }, + { + "epoch": 1.5694824573611048, + "grad_norm": 0.0977010577917099, + "learning_rate": 0.0015101020514433645, + "loss": 2.3121, + "step": 406000 + }, + { + "epoch": 1.569521114564488, + "grad_norm": 0.12290336936712265, + "learning_rate": 0.0015096939731147497, + "loss": 2.2993, + "step": 406010 + }, + { + "epoch": 1.5695597717678713, + "grad_norm": 0.0925399586558342, + "learning_rate": 0.0015092862341445881, + "loss": 2.2953, + "step": 406020 + }, + { + "epoch": 1.5695984289712546, + "grad_norm": 0.09576588124036789, + "learning_rate": 0.0015088788336876532, + "loss": 2.2992, + "step": 406030 + }, + { + "epoch": 1.5696370861746378, + "grad_norm": 0.1092677116394043, + "learning_rate": 0.0015084717709022198, + "loss": 2.2854, + "step": 406040 + }, + { + "epoch": 1.569675743378021, + "grad_norm": 0.1083604022860527, + "learning_rate": 0.0015080650449500463, + "loss": 2.2948, + "step": 406050 + }, + { + "epoch": 1.5697144005814043, + "grad_norm": 0.09485524892807007, + "learning_rate": 0.0015076586549963532, + "loss": 2.2844, + "step": 406060 + }, + { + "epoch": 1.5697530577847876, + "grad_norm": 0.09845910221338272, + "learning_rate": 0.0015072526002098032, + "loss": 2.2903, + "step": 406070 + }, + { + "epoch": 1.5697917149881708, + "grad_norm": 0.09440339356660843, + "learning_rate": 0.0015068468797624819, + "loss": 2.283, + "step": 406080 + }, + { + "epoch": 1.569830372191554, + "grad_norm": 0.10253866016864777, + "learning_rate": 0.0015064414928298772, + "loss": 2.2678, + "step": 406090 + }, + { + "epoch": 1.5698690293949373, + "grad_norm": 0.09150514751672745, + "learning_rate": 0.0015060364385908614, + "loss": 2.288, + "step": 406100 + }, + { + "epoch": 1.5699076865983206, + "grad_norm": 0.10575828701257706, + "learning_rate": 0.0015056317162276689, + "loss": 2.2965, + "step": 406110 + }, + { + "epoch": 1.569946343801704, + "grad_norm": 0.1001557931303978, + "learning_rate": 0.0015052273249258807, + "loss": 2.3123, + "step": 406120 + }, + { + "epoch": 1.5699850010050873, + "grad_norm": 0.09956227242946625, + "learning_rate": 0.0015048232638744021, + "loss": 2.287, + "step": 406130 + }, + { + "epoch": 1.5700236582084706, + "grad_norm": 0.0972357839345932, + "learning_rate": 0.0015044195322654452, + "loss": 2.2986, + "step": 406140 + }, + { + "epoch": 1.5700623154118538, + "grad_norm": 0.09053231030702591, + "learning_rate": 0.0015040161292945102, + "loss": 2.2851, + "step": 406150 + }, + { + "epoch": 1.5701009726152373, + "grad_norm": 0.0977761521935463, + "learning_rate": 0.0015036130541603656, + "loss": 2.3055, + "step": 406160 + }, + { + "epoch": 1.5701396298186205, + "grad_norm": 0.08875074982643127, + "learning_rate": 0.0015032103060650312, + "loss": 2.3, + "step": 406170 + }, + { + "epoch": 1.5701782870220038, + "grad_norm": 0.10786014795303345, + "learning_rate": 0.001502807884213758, + "loss": 2.3062, + "step": 406180 + }, + { + "epoch": 1.570216944225387, + "grad_norm": 0.09021253883838654, + "learning_rate": 0.001502405787815011, + "loss": 2.2884, + "step": 406190 + }, + { + "epoch": 1.5702556014287703, + "grad_norm": 0.1020994558930397, + "learning_rate": 0.0015020040160804507, + "loss": 2.3191, + "step": 406200 + }, + { + "epoch": 1.5702942586321536, + "grad_norm": 0.08927156031131744, + "learning_rate": 0.0015016025682249156, + "loss": 2.3037, + "step": 406210 + }, + { + "epoch": 1.5703329158355368, + "grad_norm": 0.09674308449029922, + "learning_rate": 0.0015012014434664029, + "loss": 2.2899, + "step": 406220 + }, + { + "epoch": 1.57037157303892, + "grad_norm": 0.10415749251842499, + "learning_rate": 0.0015008006410260526, + "loss": 2.2893, + "step": 406230 + }, + { + "epoch": 1.5704102302423033, + "grad_norm": 0.08874905109405518, + "learning_rate": 0.001500400160128128, + "loss": 2.29, + "step": 406240 + }, + { + "epoch": 1.5704488874456866, + "grad_norm": 0.09588893502950668, + "learning_rate": 0.0015, + "loss": 2.3002, + "step": 406250 + }, + { + "epoch": 1.5704875446490698, + "grad_norm": 0.10821720212697983, + "learning_rate": 0.001499600159872128, + "loss": 2.2989, + "step": 406260 + }, + { + "epoch": 1.570526201852453, + "grad_norm": 0.095454141497612, + "learning_rate": 0.0014992006389780434, + "loss": 2.2943, + "step": 406270 + }, + { + "epoch": 1.5705648590558363, + "grad_norm": 0.0938018187880516, + "learning_rate": 0.0014988014365543334, + "loss": 2.2917, + "step": 406280 + }, + { + "epoch": 1.5706035162592198, + "grad_norm": 0.09858124703168869, + "learning_rate": 0.0014984025518406218, + "loss": 2.2879, + "step": 406290 + }, + { + "epoch": 1.570642173462603, + "grad_norm": 0.10998082906007767, + "learning_rate": 0.0014980039840795547, + "loss": 2.297, + "step": 406300 + }, + { + "epoch": 1.5706808306659863, + "grad_norm": 0.10663913190364838, + "learning_rate": 0.0014976057325167811, + "loss": 2.2791, + "step": 406310 + }, + { + "epoch": 1.5707194878693695, + "grad_norm": 0.08741891384124756, + "learning_rate": 0.0014972077964009384, + "loss": 2.2934, + "step": 406320 + }, + { + "epoch": 1.570758145072753, + "grad_norm": 0.09392169117927551, + "learning_rate": 0.0014968101749836351, + "loss": 2.3063, + "step": 406330 + }, + { + "epoch": 1.5707968022761363, + "grad_norm": 0.09536357969045639, + "learning_rate": 0.0014964128675194332, + "loss": 2.3099, + "step": 406340 + }, + { + "epoch": 1.5708354594795195, + "grad_norm": 0.09603865444660187, + "learning_rate": 0.0014960158732658338, + "loss": 2.3019, + "step": 406350 + }, + { + "epoch": 1.5708741166829028, + "grad_norm": 0.11046824604272842, + "learning_rate": 0.0014956191914832604, + "loss": 2.2862, + "step": 406360 + }, + { + "epoch": 1.570912773886286, + "grad_norm": 0.1022503674030304, + "learning_rate": 0.0014952228214350413, + "loss": 2.2962, + "step": 406370 + }, + { + "epoch": 1.5709514310896693, + "grad_norm": 0.08685960620641708, + "learning_rate": 0.0014948267623873966, + "loss": 2.3009, + "step": 406380 + }, + { + "epoch": 1.5709900882930525, + "grad_norm": 0.10193188488483429, + "learning_rate": 0.0014944310136094186, + "loss": 2.3034, + "step": 406390 + }, + { + "epoch": 1.5710287454964358, + "grad_norm": 0.10660498589277267, + "learning_rate": 0.0014940355743730594, + "loss": 2.304, + "step": 406400 + }, + { + "epoch": 1.571067402699819, + "grad_norm": 0.09901173412799835, + "learning_rate": 0.0014936404439531135, + "loss": 2.3006, + "step": 406410 + }, + { + "epoch": 1.5711060599032023, + "grad_norm": 0.09949599951505661, + "learning_rate": 0.001493245621627203, + "loss": 2.2982, + "step": 406420 + }, + { + "epoch": 1.5711447171065855, + "grad_norm": 0.09687718749046326, + "learning_rate": 0.0014928511066757614, + "loss": 2.2955, + "step": 406430 + }, + { + "epoch": 1.5711833743099688, + "grad_norm": 0.08860356360673904, + "learning_rate": 0.0014924568983820192, + "loss": 2.2951, + "step": 406440 + }, + { + "epoch": 1.571222031513352, + "grad_norm": 0.09513945877552032, + "learning_rate": 0.0014920629960319882, + "loss": 2.2821, + "step": 406450 + }, + { + "epoch": 1.5712606887167355, + "grad_norm": 0.08987179398536682, + "learning_rate": 0.0014916693989144468, + "loss": 2.2918, + "step": 406460 + }, + { + "epoch": 1.5712993459201188, + "grad_norm": 0.08767048269510269, + "learning_rate": 0.001491276106320924, + "loss": 2.3, + "step": 406470 + }, + { + "epoch": 1.571338003123502, + "grad_norm": 0.09782981127500534, + "learning_rate": 0.0014908831175456858, + "loss": 2.2984, + "step": 406480 + }, + { + "epoch": 1.5713766603268853, + "grad_norm": 0.1317455768585205, + "learning_rate": 0.0014904904318857202, + "loss": 2.2922, + "step": 406490 + }, + { + "epoch": 1.5714153175302688, + "grad_norm": 0.0975729376077652, + "learning_rate": 0.0014900980486407215, + "loss": 2.2993, + "step": 406500 + }, + { + "epoch": 1.571453974733652, + "grad_norm": 0.10746076703071594, + "learning_rate": 0.0014897059671130771, + "loss": 2.2949, + "step": 406510 + }, + { + "epoch": 1.5714926319370353, + "grad_norm": 0.10250459611415863, + "learning_rate": 0.0014893141866078518, + "loss": 2.2775, + "step": 406520 + }, + { + "epoch": 1.5715312891404185, + "grad_norm": 0.10624974966049194, + "learning_rate": 0.0014889227064327746, + "loss": 2.2846, + "step": 406530 + }, + { + "epoch": 1.5715699463438018, + "grad_norm": 0.09488651156425476, + "learning_rate": 0.0014885315258982234, + "loss": 2.2932, + "step": 406540 + }, + { + "epoch": 1.571608603547185, + "grad_norm": 0.09613313525915146, + "learning_rate": 0.001488140644317211, + "loss": 2.2969, + "step": 406550 + }, + { + "epoch": 1.5716472607505683, + "grad_norm": 0.10298208147287369, + "learning_rate": 0.0014877500610053721, + "loss": 2.2823, + "step": 406560 + }, + { + "epoch": 1.5716859179539515, + "grad_norm": 0.10946042090654373, + "learning_rate": 0.0014873597752809483, + "loss": 2.3014, + "step": 406570 + }, + { + "epoch": 1.5717245751573348, + "grad_norm": 0.09858959168195724, + "learning_rate": 0.0014869697864647736, + "loss": 2.2867, + "step": 406580 + }, + { + "epoch": 1.571763232360718, + "grad_norm": 0.10874930769205093, + "learning_rate": 0.0014865800938802627, + "loss": 2.3001, + "step": 406590 + }, + { + "epoch": 1.5718018895641013, + "grad_norm": 0.09290513396263123, + "learning_rate": 0.0014861906968533947, + "loss": 2.2731, + "step": 406600 + }, + { + "epoch": 1.5718405467674845, + "grad_norm": 0.09719132632017136, + "learning_rate": 0.0014858015947127024, + "loss": 2.2942, + "step": 406610 + }, + { + "epoch": 1.5718792039708678, + "grad_norm": 0.0922248438000679, + "learning_rate": 0.0014854127867892557, + "loss": 2.2888, + "step": 406620 + }, + { + "epoch": 1.5719178611742513, + "grad_norm": 0.10382457077503204, + "learning_rate": 0.0014850242724166508, + "loss": 2.2822, + "step": 406630 + }, + { + "epoch": 1.5719565183776345, + "grad_norm": 0.10628269612789154, + "learning_rate": 0.001484636050930995, + "loss": 2.2953, + "step": 406640 + }, + { + "epoch": 1.5719951755810178, + "grad_norm": 0.0957474410533905, + "learning_rate": 0.001484248121670895, + "loss": 2.2953, + "step": 406650 + }, + { + "epoch": 1.572033832784401, + "grad_norm": 0.10400521755218506, + "learning_rate": 0.0014838604839774426, + "loss": 2.2836, + "step": 406660 + }, + { + "epoch": 1.5720724899877845, + "grad_norm": 0.09058831632137299, + "learning_rate": 0.0014834731371942019, + "loss": 2.2958, + "step": 406670 + }, + { + "epoch": 1.5721111471911677, + "grad_norm": 0.11464519053697586, + "learning_rate": 0.0014830860806671967, + "loss": 2.3005, + "step": 406680 + }, + { + "epoch": 1.572149804394551, + "grad_norm": 0.09459482133388519, + "learning_rate": 0.0014826993137448973, + "loss": 2.2827, + "step": 406690 + }, + { + "epoch": 1.5721884615979342, + "grad_norm": 0.09405256062746048, + "learning_rate": 0.0014823128357782087, + "loss": 2.2952, + "step": 406700 + }, + { + "epoch": 1.5722271188013175, + "grad_norm": 0.09099555015563965, + "learning_rate": 0.0014819266461204553, + "loss": 2.2966, + "step": 406710 + }, + { + "epoch": 1.5722657760047007, + "grad_norm": 0.11007675528526306, + "learning_rate": 0.0014815407441273712, + "loss": 2.2839, + "step": 406720 + }, + { + "epoch": 1.572304433208084, + "grad_norm": 0.09731370210647583, + "learning_rate": 0.0014811551291570861, + "loss": 2.294, + "step": 406730 + }, + { + "epoch": 1.5723430904114672, + "grad_norm": 0.10459351539611816, + "learning_rate": 0.0014807698005701133, + "loss": 2.2937, + "step": 406740 + }, + { + "epoch": 1.5723817476148505, + "grad_norm": 0.10094854235649109, + "learning_rate": 0.0014803847577293367, + "loss": 2.2839, + "step": 406750 + }, + { + "epoch": 1.5724204048182338, + "grad_norm": 0.0960117056965828, + "learning_rate": 0.00148, + "loss": 2.289, + "step": 406760 + }, + { + "epoch": 1.572459062021617, + "grad_norm": 0.09705298393964767, + "learning_rate": 0.0014796155267496928, + "loss": 2.3085, + "step": 406770 + }, + { + "epoch": 1.5724977192250003, + "grad_norm": 0.11112324148416519, + "learning_rate": 0.0014792313373483386, + "loss": 2.3118, + "step": 406780 + }, + { + "epoch": 1.5725363764283835, + "grad_norm": 0.09867529571056366, + "learning_rate": 0.0014788474311681849, + "loss": 2.3043, + "step": 406790 + }, + { + "epoch": 1.572575033631767, + "grad_norm": 0.09210442006587982, + "learning_rate": 0.001478463807583788, + "loss": 2.3029, + "step": 406800 + }, + { + "epoch": 1.5726136908351502, + "grad_norm": 0.1095161885023117, + "learning_rate": 0.0014780804659720044, + "loss": 2.2978, + "step": 406810 + }, + { + "epoch": 1.5726523480385335, + "grad_norm": 0.10121491551399231, + "learning_rate": 0.001477697405711976, + "loss": 2.2927, + "step": 406820 + }, + { + "epoch": 1.5726910052419167, + "grad_norm": 0.0996260941028595, + "learning_rate": 0.0014773146261851209, + "loss": 2.3018, + "step": 406830 + }, + { + "epoch": 1.5727296624453002, + "grad_norm": 0.0989765152335167, + "learning_rate": 0.0014769321267751193, + "loss": 2.3013, + "step": 406840 + }, + { + "epoch": 1.5727683196486835, + "grad_norm": 0.08970838785171509, + "learning_rate": 0.001476549906867904, + "loss": 2.3049, + "step": 406850 + }, + { + "epoch": 1.5728069768520667, + "grad_norm": 0.08717630058526993, + "learning_rate": 0.0014761679658516482, + "loss": 2.2886, + "step": 406860 + }, + { + "epoch": 1.57284563405545, + "grad_norm": 0.09678196161985397, + "learning_rate": 0.0014757863031167537, + "loss": 2.3171, + "step": 406870 + }, + { + "epoch": 1.5728842912588332, + "grad_norm": 0.09176228940486908, + "learning_rate": 0.00147540491805584, + "loss": 2.2973, + "step": 406880 + }, + { + "epoch": 1.5729229484622165, + "grad_norm": 0.10987565666437149, + "learning_rate": 0.0014750238100637325, + "loss": 2.2988, + "step": 406890 + }, + { + "epoch": 1.5729616056655997, + "grad_norm": 0.10397421568632126, + "learning_rate": 0.0014746429785374522, + "loss": 2.2854, + "step": 406900 + }, + { + "epoch": 1.573000262868983, + "grad_norm": 0.08612383902072906, + "learning_rate": 0.0014742624228762033, + "loss": 2.2989, + "step": 406910 + }, + { + "epoch": 1.5730389200723662, + "grad_norm": 0.0959596112370491, + "learning_rate": 0.0014738821424813637, + "loss": 2.3031, + "step": 406920 + }, + { + "epoch": 1.5730775772757495, + "grad_norm": 0.11785572022199631, + "learning_rate": 0.0014735021367564727, + "loss": 2.2857, + "step": 406930 + }, + { + "epoch": 1.5731162344791327, + "grad_norm": 0.08782157301902771, + "learning_rate": 0.0014731224051072202, + "loss": 2.3039, + "step": 406940 + }, + { + "epoch": 1.573154891682516, + "grad_norm": 0.0940522849559784, + "learning_rate": 0.0014727429469414373, + "loss": 2.2877, + "step": 406950 + }, + { + "epoch": 1.5731935488858992, + "grad_norm": 0.09437448531389236, + "learning_rate": 0.0014723637616690832, + "loss": 2.3009, + "step": 406960 + }, + { + "epoch": 1.5732322060892827, + "grad_norm": 0.08396671712398529, + "learning_rate": 0.0014719848487022365, + "loss": 2.2748, + "step": 406970 + }, + { + "epoch": 1.573270863292666, + "grad_norm": 0.10707056522369385, + "learning_rate": 0.0014716062074550838, + "loss": 2.2777, + "step": 406980 + }, + { + "epoch": 1.5733095204960492, + "grad_norm": 0.09134868532419205, + "learning_rate": 0.0014712278373439087, + "loss": 2.2942, + "step": 406990 + }, + { + "epoch": 1.5733481776994325, + "grad_norm": 0.0936884954571724, + "learning_rate": 0.001470849737787082, + "loss": 2.2889, + "step": 407000 + }, + { + "epoch": 1.573386834902816, + "grad_norm": 0.11258005350828171, + "learning_rate": 0.001470471908205051, + "loss": 2.2963, + "step": 407010 + }, + { + "epoch": 1.5734254921061992, + "grad_norm": 0.10835757851600647, + "learning_rate": 0.0014700943480203293, + "loss": 2.2981, + "step": 407020 + }, + { + "epoch": 1.5734641493095824, + "grad_norm": 0.10515619814395905, + "learning_rate": 0.001469717056657486, + "loss": 2.2831, + "step": 407030 + }, + { + "epoch": 1.5735028065129657, + "grad_norm": 0.1021227315068245, + "learning_rate": 0.0014693400335431361, + "loss": 2.2988, + "step": 407040 + }, + { + "epoch": 1.573541463716349, + "grad_norm": 0.08599024266004562, + "learning_rate": 0.00146896327810593, + "loss": 2.2934, + "step": 407050 + }, + { + "epoch": 1.5735801209197322, + "grad_norm": 0.12350116670131683, + "learning_rate": 0.001468586789776543, + "loss": 2.2927, + "step": 407060 + }, + { + "epoch": 1.5736187781231155, + "grad_norm": 0.09155186265707016, + "learning_rate": 0.0014682105679876666, + "loss": 2.2795, + "step": 407070 + }, + { + "epoch": 1.5736574353264987, + "grad_norm": 0.0983167216181755, + "learning_rate": 0.0014678346121739972, + "loss": 2.2943, + "step": 407080 + }, + { + "epoch": 1.573696092529882, + "grad_norm": 0.10952533036470413, + "learning_rate": 0.001467458921772226, + "loss": 2.2762, + "step": 407090 + }, + { + "epoch": 1.5737347497332652, + "grad_norm": 0.09205260127782822, + "learning_rate": 0.0014670834962210312, + "loss": 2.3022, + "step": 407100 + }, + { + "epoch": 1.5737734069366485, + "grad_norm": 0.10734140127897263, + "learning_rate": 0.001466708334961065, + "loss": 2.3061, + "step": 407110 + }, + { + "epoch": 1.5738120641400317, + "grad_norm": 0.10416164994239807, + "learning_rate": 0.0014663334374349466, + "loss": 2.3031, + "step": 407120 + }, + { + "epoch": 1.573850721343415, + "grad_norm": 0.1047743633389473, + "learning_rate": 0.0014659588030872524, + "loss": 2.2928, + "step": 407130 + }, + { + "epoch": 1.5738893785467984, + "grad_norm": 0.11782661080360413, + "learning_rate": 0.0014655844313645045, + "loss": 2.3004, + "step": 407140 + }, + { + "epoch": 1.5739280357501817, + "grad_norm": 0.1171165481209755, + "learning_rate": 0.0014652103217151625, + "loss": 2.2923, + "step": 407150 + }, + { + "epoch": 1.573966692953565, + "grad_norm": 0.09023956954479218, + "learning_rate": 0.0014648364735896139, + "loss": 2.2792, + "step": 407160 + }, + { + "epoch": 1.5740053501569482, + "grad_norm": 0.10098619759082794, + "learning_rate": 0.0014644628864401645, + "loss": 2.3014, + "step": 407170 + }, + { + "epoch": 1.5740440073603317, + "grad_norm": 0.1009693369269371, + "learning_rate": 0.0014640895597210295, + "loss": 2.2983, + "step": 407180 + }, + { + "epoch": 1.574082664563715, + "grad_norm": 0.1049528419971466, + "learning_rate": 0.0014637164928883231, + "loss": 2.2988, + "step": 407190 + }, + { + "epoch": 1.5741213217670982, + "grad_norm": 0.09989991039037704, + "learning_rate": 0.0014633436854000505, + "loss": 2.2972, + "step": 407200 + }, + { + "epoch": 1.5741599789704814, + "grad_norm": 0.1667187511920929, + "learning_rate": 0.001462971136716098, + "loss": 2.2917, + "step": 407210 + }, + { + "epoch": 1.5741986361738647, + "grad_norm": 0.09764105081558228, + "learning_rate": 0.0014625988462982238, + "loss": 2.2825, + "step": 407220 + }, + { + "epoch": 1.574237293377248, + "grad_norm": 0.10742088407278061, + "learning_rate": 0.0014622268136100498, + "loss": 2.2903, + "step": 407230 + }, + { + "epoch": 1.5742759505806312, + "grad_norm": 0.09315811842679977, + "learning_rate": 0.0014618550381170516, + "loss": 2.2988, + "step": 407240 + }, + { + "epoch": 1.5743146077840144, + "grad_norm": 0.10519949346780777, + "learning_rate": 0.0014614835192865497, + "loss": 2.2998, + "step": 407250 + }, + { + "epoch": 1.5743532649873977, + "grad_norm": 0.09778866916894913, + "learning_rate": 0.0014611122565877008, + "loss": 2.2996, + "step": 407260 + }, + { + "epoch": 1.574391922190781, + "grad_norm": 0.09602082520723343, + "learning_rate": 0.0014607412494914893, + "loss": 2.2962, + "step": 407270 + }, + { + "epoch": 1.5744305793941642, + "grad_norm": 0.09230657666921616, + "learning_rate": 0.0014603704974707184, + "loss": 2.2843, + "step": 407280 + }, + { + "epoch": 1.5744692365975475, + "grad_norm": 0.09628087282180786, + "learning_rate": 0.00146, + "loss": 2.3029, + "step": 407290 + }, + { + "epoch": 1.5745078938009307, + "grad_norm": 0.08596788346767426, + "learning_rate": 0.0014596297565557484, + "loss": 2.2843, + "step": 407300 + }, + { + "epoch": 1.5745465510043142, + "grad_norm": 0.0886722058057785, + "learning_rate": 0.001459259766616169, + "loss": 2.287, + "step": 407310 + }, + { + "epoch": 1.5745852082076974, + "grad_norm": 0.10400716960430145, + "learning_rate": 0.0014588900296612529, + "loss": 2.2906, + "step": 407320 + }, + { + "epoch": 1.5746238654110807, + "grad_norm": 0.10699665546417236, + "learning_rate": 0.0014585205451727648, + "loss": 2.2916, + "step": 407330 + }, + { + "epoch": 1.574662522614464, + "grad_norm": 0.08873599767684937, + "learning_rate": 0.0014581513126342375, + "loss": 2.2822, + "step": 407340 + }, + { + "epoch": 1.5747011798178474, + "grad_norm": 0.092472143471241, + "learning_rate": 0.0014577823315309617, + "loss": 2.2867, + "step": 407350 + }, + { + "epoch": 1.5747398370212307, + "grad_norm": 0.09492716938257217, + "learning_rate": 0.0014574136013499786, + "loss": 2.2963, + "step": 407360 + }, + { + "epoch": 1.574778494224614, + "grad_norm": 0.111127570271492, + "learning_rate": 0.0014570451215800708, + "loss": 2.3091, + "step": 407370 + }, + { + "epoch": 1.5748171514279972, + "grad_norm": 0.09847905486822128, + "learning_rate": 0.001456676891711755, + "loss": 2.2934, + "step": 407380 + }, + { + "epoch": 1.5748558086313804, + "grad_norm": 0.10108734667301178, + "learning_rate": 0.0014563089112372726, + "loss": 2.2872, + "step": 407390 + }, + { + "epoch": 1.5748944658347637, + "grad_norm": 0.10206575691699982, + "learning_rate": 0.001455941179650582, + "loss": 2.2894, + "step": 407400 + }, + { + "epoch": 1.574933123038147, + "grad_norm": 0.09543667733669281, + "learning_rate": 0.001455573696447352, + "loss": 2.2996, + "step": 407410 + }, + { + "epoch": 1.5749717802415302, + "grad_norm": 0.09524298459291458, + "learning_rate": 0.0014552064611249506, + "loss": 2.2928, + "step": 407420 + }, + { + "epoch": 1.5750104374449134, + "grad_norm": 0.10034104436635971, + "learning_rate": 0.0014548394731824397, + "loss": 2.2875, + "step": 407430 + }, + { + "epoch": 1.5750490946482967, + "grad_norm": 0.25245344638824463, + "learning_rate": 0.001454472732120566, + "loss": 2.2905, + "step": 407440 + }, + { + "epoch": 1.57508775185168, + "grad_norm": 0.10932547599077225, + "learning_rate": 0.0014541062374417528, + "loss": 2.2833, + "step": 407450 + }, + { + "epoch": 1.5751264090550632, + "grad_norm": 0.10273553431034088, + "learning_rate": 0.0014537399886500935, + "loss": 2.298, + "step": 407460 + }, + { + "epoch": 1.5751650662584464, + "grad_norm": 0.09069519490003586, + "learning_rate": 0.001453373985251342, + "loss": 2.3058, + "step": 407470 + }, + { + "epoch": 1.57520372346183, + "grad_norm": 0.10794001072645187, + "learning_rate": 0.0014530082267529064, + "loss": 2.2977, + "step": 407480 + }, + { + "epoch": 1.5752423806652132, + "grad_norm": 0.08729327470064163, + "learning_rate": 0.0014526427126638397, + "loss": 2.2933, + "step": 407490 + }, + { + "epoch": 1.5752810378685964, + "grad_norm": 0.10574730485677719, + "learning_rate": 0.001452277442494834, + "loss": 2.2885, + "step": 407500 + }, + { + "epoch": 1.5753196950719797, + "grad_norm": 0.09718071669340134, + "learning_rate": 0.0014519124157582111, + "loss": 2.2903, + "step": 407510 + }, + { + "epoch": 1.5753583522753631, + "grad_norm": 0.09844125062227249, + "learning_rate": 0.0014515476319679164, + "loss": 2.2993, + "step": 407520 + }, + { + "epoch": 1.5753970094787464, + "grad_norm": 0.095311738550663, + "learning_rate": 0.0014511830906395102, + "loss": 2.2791, + "step": 407530 + }, + { + "epoch": 1.5754356666821296, + "grad_norm": 0.10125110298395157, + "learning_rate": 0.001450818791290161, + "loss": 2.2881, + "step": 407540 + }, + { + "epoch": 1.575474323885513, + "grad_norm": 0.10146543383598328, + "learning_rate": 0.0014504547334386368, + "loss": 2.2922, + "step": 407550 + }, + { + "epoch": 1.5755129810888961, + "grad_norm": 0.46111249923706055, + "learning_rate": 0.001450090916605299, + "loss": 2.2808, + "step": 407560 + }, + { + "epoch": 1.5755516382922794, + "grad_norm": 0.09353338181972504, + "learning_rate": 0.0014497273403120958, + "loss": 2.2934, + "step": 407570 + }, + { + "epoch": 1.5755902954956627, + "grad_norm": 0.1222909465432167, + "learning_rate": 0.0014493640040825516, + "loss": 2.312, + "step": 407580 + }, + { + "epoch": 1.575628952699046, + "grad_norm": 0.11227816343307495, + "learning_rate": 0.0014490009074417636, + "loss": 2.2977, + "step": 407590 + }, + { + "epoch": 1.5756676099024292, + "grad_norm": 0.10007626563310623, + "learning_rate": 0.001448638049916391, + "loss": 2.287, + "step": 407600 + }, + { + "epoch": 1.5757062671058124, + "grad_norm": 0.09337951242923737, + "learning_rate": 0.0014482754310346513, + "loss": 2.2769, + "step": 407610 + }, + { + "epoch": 1.5757449243091957, + "grad_norm": 0.10422071069478989, + "learning_rate": 0.0014479130503263096, + "loss": 2.2881, + "step": 407620 + }, + { + "epoch": 1.575783581512579, + "grad_norm": 0.11618789285421371, + "learning_rate": 0.0014475509073226746, + "loss": 2.298, + "step": 407630 + }, + { + "epoch": 1.5758222387159624, + "grad_norm": 0.09515954554080963, + "learning_rate": 0.00144718900155659, + "loss": 2.301, + "step": 407640 + }, + { + "epoch": 1.5758608959193456, + "grad_norm": 0.10632378607988358, + "learning_rate": 0.0014468273325624267, + "loss": 2.2911, + "step": 407650 + }, + { + "epoch": 1.575899553122729, + "grad_norm": 0.09790574014186859, + "learning_rate": 0.001446465899876078, + "loss": 2.2898, + "step": 407660 + }, + { + "epoch": 1.5759382103261121, + "grad_norm": 0.115874744951725, + "learning_rate": 0.0014461047030349507, + "loss": 2.2863, + "step": 407670 + }, + { + "epoch": 1.5759768675294954, + "grad_norm": 0.1001160591840744, + "learning_rate": 0.0014457437415779594, + "loss": 2.3023, + "step": 407680 + }, + { + "epoch": 1.5760155247328789, + "grad_norm": 0.2014889270067215, + "learning_rate": 0.0014453830150455182, + "loss": 2.2921, + "step": 407690 + }, + { + "epoch": 1.5760541819362621, + "grad_norm": 0.08689361810684204, + "learning_rate": 0.0014450225229795358, + "loss": 2.2836, + "step": 407700 + }, + { + "epoch": 1.5760928391396454, + "grad_norm": 0.10067226737737656, + "learning_rate": 0.0014446622649234074, + "loss": 2.2921, + "step": 407710 + }, + { + "epoch": 1.5761314963430286, + "grad_norm": 0.10704955458641052, + "learning_rate": 0.0014443022404220078, + "loss": 2.288, + "step": 407720 + }, + { + "epoch": 1.5761701535464119, + "grad_norm": 0.09658665955066681, + "learning_rate": 0.0014439424490216863, + "loss": 2.2724, + "step": 407730 + }, + { + "epoch": 1.5762088107497951, + "grad_norm": 0.11580196022987366, + "learning_rate": 0.0014435828902702576, + "loss": 2.2839, + "step": 407740 + }, + { + "epoch": 1.5762474679531784, + "grad_norm": 0.09257831424474716, + "learning_rate": 0.0014432235637169978, + "loss": 2.2959, + "step": 407750 + }, + { + "epoch": 1.5762861251565616, + "grad_norm": 0.0986739918589592, + "learning_rate": 0.0014428644689126351, + "loss": 2.2858, + "step": 407760 + }, + { + "epoch": 1.5763247823599449, + "grad_norm": 0.10795556008815765, + "learning_rate": 0.0014425056054093459, + "loss": 2.3015, + "step": 407770 + }, + { + "epoch": 1.5763634395633281, + "grad_norm": 0.10407692193984985, + "learning_rate": 0.001442146972760746, + "loss": 2.3058, + "step": 407780 + }, + { + "epoch": 1.5764020967667114, + "grad_norm": 0.09564802050590515, + "learning_rate": 0.0014417885705218856, + "loss": 2.2994, + "step": 407790 + }, + { + "epoch": 1.5764407539700946, + "grad_norm": 0.11026576906442642, + "learning_rate": 0.0014414303982492424, + "loss": 2.2939, + "step": 407800 + }, + { + "epoch": 1.5764794111734781, + "grad_norm": 0.09176908433437347, + "learning_rate": 0.0014410724555007151, + "loss": 2.2954, + "step": 407810 + }, + { + "epoch": 1.5765180683768614, + "grad_norm": 0.10155240446329117, + "learning_rate": 0.0014407147418356175, + "loss": 2.275, + "step": 407820 + }, + { + "epoch": 1.5765567255802446, + "grad_norm": 0.10282189399003983, + "learning_rate": 0.001440357256814671, + "loss": 2.2997, + "step": 407830 + }, + { + "epoch": 1.5765953827836279, + "grad_norm": 0.12512604892253876, + "learning_rate": 0.0014399999999999999, + "loss": 2.2979, + "step": 407840 + }, + { + "epoch": 1.5766340399870111, + "grad_norm": 0.09527765214443207, + "learning_rate": 0.001439642970955124, + "loss": 2.2951, + "step": 407850 + }, + { + "epoch": 1.5766726971903946, + "grad_norm": 0.09358199685811996, + "learning_rate": 0.0014392861692449526, + "loss": 2.3014, + "step": 407860 + }, + { + "epoch": 1.5767113543937779, + "grad_norm": 0.1274263858795166, + "learning_rate": 0.0014389295944357786, + "loss": 2.2863, + "step": 407870 + }, + { + "epoch": 1.576750011597161, + "grad_norm": 0.09673239290714264, + "learning_rate": 0.001438573246095272, + "loss": 2.2956, + "step": 407880 + }, + { + "epoch": 1.5767886688005444, + "grad_norm": 0.09876884520053864, + "learning_rate": 0.0014382171237924744, + "loss": 2.2907, + "step": 407890 + }, + { + "epoch": 1.5768273260039276, + "grad_norm": 0.09205851703882217, + "learning_rate": 0.0014378612270977923, + "loss": 2.2921, + "step": 407900 + }, + { + "epoch": 1.5768659832073109, + "grad_norm": 0.09754933416843414, + "learning_rate": 0.0014375055555829906, + "loss": 2.298, + "step": 407910 + }, + { + "epoch": 1.5769046404106941, + "grad_norm": 0.08937043696641922, + "learning_rate": 0.0014371501088211885, + "loss": 2.2888, + "step": 407920 + }, + { + "epoch": 1.5769432976140774, + "grad_norm": 0.09569250047206879, + "learning_rate": 0.001436794886386851, + "loss": 2.286, + "step": 407930 + }, + { + "epoch": 1.5769819548174606, + "grad_norm": 0.0856354683637619, + "learning_rate": 0.001436439887855785, + "loss": 2.2919, + "step": 407940 + }, + { + "epoch": 1.5770206120208439, + "grad_norm": 0.10027945786714554, + "learning_rate": 0.0014360851128051326, + "loss": 2.2967, + "step": 407950 + }, + { + "epoch": 1.5770592692242271, + "grad_norm": 0.0965295284986496, + "learning_rate": 0.0014357305608133646, + "loss": 2.2972, + "step": 407960 + }, + { + "epoch": 1.5770979264276104, + "grad_norm": 0.10624401271343231, + "learning_rate": 0.0014353762314602758, + "loss": 2.2902, + "step": 407970 + }, + { + "epoch": 1.5771365836309938, + "grad_norm": 0.1368337869644165, + "learning_rate": 0.0014350221243269786, + "loss": 2.2711, + "step": 407980 + }, + { + "epoch": 1.577175240834377, + "grad_norm": 0.09370030462741852, + "learning_rate": 0.0014346682389958971, + "loss": 2.3014, + "step": 407990 + }, + { + "epoch": 1.5772138980377604, + "grad_norm": 0.09466104954481125, + "learning_rate": 0.0014343145750507621, + "loss": 2.2876, + "step": 408000 + }, + { + "epoch": 1.5772525552411436, + "grad_norm": 0.08923348039388657, + "learning_rate": 0.0014339611320766039, + "loss": 2.3028, + "step": 408010 + }, + { + "epoch": 1.5772912124445269, + "grad_norm": 0.093915194272995, + "learning_rate": 0.0014336079096597483, + "loss": 2.3049, + "step": 408020 + }, + { + "epoch": 1.5773298696479103, + "grad_norm": 0.09254784882068634, + "learning_rate": 0.0014332549073878098, + "loss": 2.2877, + "step": 408030 + }, + { + "epoch": 1.5773685268512936, + "grad_norm": 0.09824127703905106, + "learning_rate": 0.001432902124849687, + "loss": 2.3099, + "step": 408040 + }, + { + "epoch": 1.5774071840546768, + "grad_norm": 0.10029337555170059, + "learning_rate": 0.0014325495616355557, + "loss": 2.2993, + "step": 408050 + }, + { + "epoch": 1.57744584125806, + "grad_norm": 0.11360572278499603, + "learning_rate": 0.0014321972173368645, + "loss": 2.3052, + "step": 408060 + }, + { + "epoch": 1.5774844984614433, + "grad_norm": 0.10877586901187897, + "learning_rate": 0.0014318450915463284, + "loss": 2.3072, + "step": 408070 + }, + { + "epoch": 1.5775231556648266, + "grad_norm": 0.11041352897882462, + "learning_rate": 0.0014314931838579242, + "loss": 2.288, + "step": 408080 + }, + { + "epoch": 1.5775618128682098, + "grad_norm": 0.10026522725820541, + "learning_rate": 0.0014311414938668846, + "loss": 2.2958, + "step": 408090 + }, + { + "epoch": 1.577600470071593, + "grad_norm": 0.09852489084005356, + "learning_rate": 0.0014307900211696918, + "loss": 2.298, + "step": 408100 + }, + { + "epoch": 1.5776391272749763, + "grad_norm": 0.08970458805561066, + "learning_rate": 0.0014304387653640744, + "loss": 2.2928, + "step": 408110 + }, + { + "epoch": 1.5776777844783596, + "grad_norm": 0.11003737896680832, + "learning_rate": 0.0014300877260489998, + "loss": 2.302, + "step": 408120 + }, + { + "epoch": 1.5777164416817429, + "grad_norm": 0.11057864129543304, + "learning_rate": 0.0014297369028246699, + "loss": 2.294, + "step": 408130 + }, + { + "epoch": 1.577755098885126, + "grad_norm": 0.0905907079577446, + "learning_rate": 0.0014293862952925159, + "loss": 2.2921, + "step": 408140 + }, + { + "epoch": 1.5777937560885096, + "grad_norm": 0.09885738790035248, + "learning_rate": 0.0014290359030551922, + "loss": 2.3063, + "step": 408150 + }, + { + "epoch": 1.5778324132918928, + "grad_norm": 0.10116096585988998, + "learning_rate": 0.001428685725716572, + "loss": 2.2841, + "step": 408160 + }, + { + "epoch": 1.577871070495276, + "grad_norm": 0.0916747897863388, + "learning_rate": 0.001428335762881742, + "loss": 2.2867, + "step": 408170 + }, + { + "epoch": 1.5779097276986593, + "grad_norm": 0.09478262811899185, + "learning_rate": 0.0014279860141569963, + "loss": 2.2881, + "step": 408180 + }, + { + "epoch": 1.5779483849020428, + "grad_norm": 0.11914552003145218, + "learning_rate": 0.0014276364791498326, + "loss": 2.2943, + "step": 408190 + }, + { + "epoch": 1.577987042105426, + "grad_norm": 0.10792306065559387, + "learning_rate": 0.0014272871574689458, + "loss": 2.2968, + "step": 408200 + }, + { + "epoch": 1.5780256993088093, + "grad_norm": 0.0978836938738823, + "learning_rate": 0.0014269380487242239, + "loss": 2.3056, + "step": 408210 + }, + { + "epoch": 1.5780643565121926, + "grad_norm": 0.08757525682449341, + "learning_rate": 0.0014265891525267421, + "loss": 2.3039, + "step": 408220 + }, + { + "epoch": 1.5781030137155758, + "grad_norm": 0.12824532389640808, + "learning_rate": 0.001426240468488758, + "loss": 2.2852, + "step": 408230 + }, + { + "epoch": 1.578141670918959, + "grad_norm": 0.11376450955867767, + "learning_rate": 0.0014258919962237071, + "loss": 2.2895, + "step": 408240 + }, + { + "epoch": 1.5781803281223423, + "grad_norm": 0.09492777287960052, + "learning_rate": 0.0014255437353461972, + "loss": 2.2876, + "step": 408250 + }, + { + "epoch": 1.5782189853257256, + "grad_norm": 0.09079192578792572, + "learning_rate": 0.0014251956854720034, + "loss": 2.2903, + "step": 408260 + }, + { + "epoch": 1.5782576425291088, + "grad_norm": 0.10566417127847672, + "learning_rate": 0.0014248478462180639, + "loss": 2.2921, + "step": 408270 + }, + { + "epoch": 1.578296299732492, + "grad_norm": 0.09935545921325684, + "learning_rate": 0.0014245002172024737, + "loss": 2.2963, + "step": 408280 + }, + { + "epoch": 1.5783349569358753, + "grad_norm": 0.1073034405708313, + "learning_rate": 0.0014241527980444813, + "loss": 2.3041, + "step": 408290 + }, + { + "epoch": 1.5783736141392586, + "grad_norm": 0.10224999487400055, + "learning_rate": 0.0014238055883644828, + "loss": 2.2713, + "step": 408300 + }, + { + "epoch": 1.5784122713426418, + "grad_norm": 0.11747825890779495, + "learning_rate": 0.0014234585877840173, + "loss": 2.2992, + "step": 408310 + }, + { + "epoch": 1.5784509285460253, + "grad_norm": 0.09979795664548874, + "learning_rate": 0.0014231117959257618, + "loss": 2.2943, + "step": 408320 + }, + { + "epoch": 1.5784895857494086, + "grad_norm": 0.084749236702919, + "learning_rate": 0.0014227652124135276, + "loss": 2.3049, + "step": 408330 + }, + { + "epoch": 1.5785282429527918, + "grad_norm": 0.10331593453884125, + "learning_rate": 0.001422418836872254, + "loss": 2.2902, + "step": 408340 + }, + { + "epoch": 1.578566900156175, + "grad_norm": 0.1007453128695488, + "learning_rate": 0.0014220726689280045, + "loss": 2.2821, + "step": 408350 + }, + { + "epoch": 1.5786055573595585, + "grad_norm": 0.09320829808712006, + "learning_rate": 0.0014217267082079618, + "loss": 2.2779, + "step": 408360 + }, + { + "epoch": 1.5786442145629418, + "grad_norm": 0.09189892560243607, + "learning_rate": 0.0014213809543404227, + "loss": 2.2908, + "step": 408370 + }, + { + "epoch": 1.578682871766325, + "grad_norm": 0.11019908636808395, + "learning_rate": 0.0014210354069547951, + "loss": 2.2777, + "step": 408380 + }, + { + "epoch": 1.5787215289697083, + "grad_norm": 0.10784642398357391, + "learning_rate": 0.0014206900656815904, + "loss": 2.2997, + "step": 408390 + }, + { + "epoch": 1.5787601861730916, + "grad_norm": 0.10673866420984268, + "learning_rate": 0.0014203449301524226, + "loss": 2.284, + "step": 408400 + }, + { + "epoch": 1.5787988433764748, + "grad_norm": 0.10720127075910568, + "learning_rate": 0.00142, + "loss": 2.2901, + "step": 408410 + }, + { + "epoch": 1.578837500579858, + "grad_norm": 0.09876653552055359, + "learning_rate": 0.0014196552748581238, + "loss": 2.2898, + "step": 408420 + }, + { + "epoch": 1.5788761577832413, + "grad_norm": 0.11836722493171692, + "learning_rate": 0.001419310754361681, + "loss": 2.2762, + "step": 408430 + }, + { + "epoch": 1.5789148149866246, + "grad_norm": 0.10221022367477417, + "learning_rate": 0.001418966438146642, + "loss": 2.309, + "step": 408440 + }, + { + "epoch": 1.5789534721900078, + "grad_norm": 0.11221145838499069, + "learning_rate": 0.0014186223258500546, + "loss": 2.2921, + "step": 408450 + }, + { + "epoch": 1.578992129393391, + "grad_norm": 0.10702753812074661, + "learning_rate": 0.0014182784171100408, + "loss": 2.2923, + "step": 408460 + }, + { + "epoch": 1.5790307865967743, + "grad_norm": 0.09933892637491226, + "learning_rate": 0.00141793471156579, + "loss": 2.2882, + "step": 408470 + }, + { + "epoch": 1.5790694438001576, + "grad_norm": 0.09902329742908478, + "learning_rate": 0.0014175912088575586, + "loss": 2.2871, + "step": 408480 + }, + { + "epoch": 1.579108101003541, + "grad_norm": 0.10397355258464813, + "learning_rate": 0.0014172479086266615, + "loss": 2.2816, + "step": 408490 + }, + { + "epoch": 1.5791467582069243, + "grad_norm": 0.10957317054271698, + "learning_rate": 0.0014169048105154701, + "loss": 2.2812, + "step": 408500 + }, + { + "epoch": 1.5791854154103075, + "grad_norm": 0.1058499738574028, + "learning_rate": 0.0014165619141674073, + "loss": 2.2871, + "step": 408510 + }, + { + "epoch": 1.5792240726136908, + "grad_norm": 0.11140215396881104, + "learning_rate": 0.0014162192192269433, + "loss": 2.277, + "step": 408520 + }, + { + "epoch": 1.5792627298170743, + "grad_norm": 0.0999557301402092, + "learning_rate": 0.0014158767253395906, + "loss": 2.283, + "step": 408530 + }, + { + "epoch": 1.5793013870204575, + "grad_norm": 0.09867963194847107, + "learning_rate": 0.0014155344321519017, + "loss": 2.2882, + "step": 408540 + }, + { + "epoch": 1.5793400442238408, + "grad_norm": 0.09761340171098709, + "learning_rate": 0.0014151923393114624, + "loss": 2.2788, + "step": 408550 + }, + { + "epoch": 1.579378701427224, + "grad_norm": 0.10846646130084991, + "learning_rate": 0.0014148504464668882, + "loss": 2.2889, + "step": 408560 + }, + { + "epoch": 1.5794173586306073, + "grad_norm": 0.1139591857790947, + "learning_rate": 0.0014145087532678221, + "loss": 2.292, + "step": 408570 + }, + { + "epoch": 1.5794560158339905, + "grad_norm": 0.0976622998714447, + "learning_rate": 0.0014141672593649278, + "loss": 2.2901, + "step": 408580 + }, + { + "epoch": 1.5794946730373738, + "grad_norm": 0.08976734429597855, + "learning_rate": 0.0014138259644098862, + "loss": 2.2818, + "step": 408590 + }, + { + "epoch": 1.579533330240757, + "grad_norm": 0.09310808777809143, + "learning_rate": 0.001413484868055393, + "loss": 2.2754, + "step": 408600 + }, + { + "epoch": 1.5795719874441403, + "grad_norm": 0.09846369922161102, + "learning_rate": 0.0014131439699551515, + "loss": 2.2906, + "step": 408610 + }, + { + "epoch": 1.5796106446475235, + "grad_norm": 0.08984551578760147, + "learning_rate": 0.001412803269763872, + "loss": 2.2803, + "step": 408620 + }, + { + "epoch": 1.5796493018509068, + "grad_norm": 0.10137151181697845, + "learning_rate": 0.0014124627671372647, + "loss": 2.3008, + "step": 408630 + }, + { + "epoch": 1.57968795905429, + "grad_norm": 0.11374299973249435, + "learning_rate": 0.0014121224617320372, + "loss": 2.2979, + "step": 408640 + }, + { + "epoch": 1.5797266162576733, + "grad_norm": 0.08813643455505371, + "learning_rate": 0.0014117823532058904, + "loss": 2.3102, + "step": 408650 + }, + { + "epoch": 1.5797652734610568, + "grad_norm": 0.0898527130484581, + "learning_rate": 0.0014114424412175136, + "loss": 2.2905, + "step": 408660 + }, + { + "epoch": 1.57980393066444, + "grad_norm": 0.10921207070350647, + "learning_rate": 0.0014111027254265819, + "loss": 2.2815, + "step": 408670 + }, + { + "epoch": 1.5798425878678233, + "grad_norm": 0.09411787986755371, + "learning_rate": 0.0014107632054937507, + "loss": 2.2744, + "step": 408680 + }, + { + "epoch": 1.5798812450712065, + "grad_norm": 0.09054916352033615, + "learning_rate": 0.0014104238810806532, + "loss": 2.2873, + "step": 408690 + }, + { + "epoch": 1.57991990227459, + "grad_norm": 0.1259244829416275, + "learning_rate": 0.001410084751849895, + "loss": 2.2749, + "step": 408700 + }, + { + "epoch": 1.5799585594779733, + "grad_norm": 0.10252106934785843, + "learning_rate": 0.0014097458174650516, + "loss": 2.2847, + "step": 408710 + }, + { + "epoch": 1.5799972166813565, + "grad_norm": 0.12871307134628296, + "learning_rate": 0.001409407077590664, + "loss": 2.2955, + "step": 408720 + }, + { + "epoch": 1.5800358738847398, + "grad_norm": 0.10662032663822174, + "learning_rate": 0.001409068531892234, + "loss": 2.284, + "step": 408730 + }, + { + "epoch": 1.580074531088123, + "grad_norm": 0.09653251618146896, + "learning_rate": 0.0014087301800362207, + "loss": 2.277, + "step": 408740 + }, + { + "epoch": 1.5801131882915063, + "grad_norm": 0.10052668303251266, + "learning_rate": 0.0014083920216900383, + "loss": 2.2755, + "step": 408750 + }, + { + "epoch": 1.5801518454948895, + "grad_norm": 0.11152070760726929, + "learning_rate": 0.0014080540565220502, + "loss": 2.2881, + "step": 408760 + }, + { + "epoch": 1.5801905026982728, + "grad_norm": 0.11330562084913254, + "learning_rate": 0.001407716284201566, + "loss": 2.3062, + "step": 408770 + }, + { + "epoch": 1.580229159901656, + "grad_norm": 0.09445352852344513, + "learning_rate": 0.001407378704398838, + "loss": 2.2682, + "step": 408780 + }, + { + "epoch": 1.5802678171050393, + "grad_norm": 0.09261759370565414, + "learning_rate": 0.0014070413167850565, + "loss": 2.293, + "step": 408790 + }, + { + "epoch": 1.5803064743084225, + "grad_norm": 0.1160760149359703, + "learning_rate": 0.001406704121032347, + "loss": 2.3078, + "step": 408800 + }, + { + "epoch": 1.5803451315118058, + "grad_norm": 0.093442402780056, + "learning_rate": 0.001406367116813767, + "loss": 2.284, + "step": 408810 + }, + { + "epoch": 1.580383788715189, + "grad_norm": 0.09904216229915619, + "learning_rate": 0.0014060303038033002, + "loss": 2.2989, + "step": 408820 + }, + { + "epoch": 1.5804224459185725, + "grad_norm": 0.09945961833000183, + "learning_rate": 0.001405693681675855, + "loss": 2.2893, + "step": 408830 + }, + { + "epoch": 1.5804611031219558, + "grad_norm": 0.09675027430057526, + "learning_rate": 0.0014053572501072599, + "loss": 2.2987, + "step": 408840 + }, + { + "epoch": 1.580499760325339, + "grad_norm": 0.09257011860609055, + "learning_rate": 0.0014050210087742593, + "loss": 2.2906, + "step": 408850 + }, + { + "epoch": 1.5805384175287223, + "grad_norm": 0.08864189684391022, + "learning_rate": 0.0014046849573545112, + "loss": 2.279, + "step": 408860 + }, + { + "epoch": 1.5805770747321057, + "grad_norm": 0.0994875431060791, + "learning_rate": 0.001404349095526583, + "loss": 2.2808, + "step": 408870 + }, + { + "epoch": 1.580615731935489, + "grad_norm": 0.09842891246080399, + "learning_rate": 0.0014040134229699464, + "loss": 2.2925, + "step": 408880 + }, + { + "epoch": 1.5806543891388722, + "grad_norm": 0.10454820096492767, + "learning_rate": 0.001403677939364977, + "loss": 2.2945, + "step": 408890 + }, + { + "epoch": 1.5806930463422555, + "grad_norm": 0.09167510271072388, + "learning_rate": 0.0014033426443929482, + "loss": 2.2758, + "step": 408900 + }, + { + "epoch": 1.5807317035456387, + "grad_norm": 0.09263806790113449, + "learning_rate": 0.001403007537736028, + "loss": 2.2821, + "step": 408910 + }, + { + "epoch": 1.580770360749022, + "grad_norm": 0.09206433594226837, + "learning_rate": 0.0014026726190772768, + "loss": 2.2835, + "step": 408920 + }, + { + "epoch": 1.5808090179524052, + "grad_norm": 0.10548428446054459, + "learning_rate": 0.0014023378881006425, + "loss": 2.2998, + "step": 408930 + }, + { + "epoch": 1.5808476751557885, + "grad_norm": 0.10245232284069061, + "learning_rate": 0.0014020033444909579, + "loss": 2.2964, + "step": 408940 + }, + { + "epoch": 1.5808863323591718, + "grad_norm": 0.09368015080690384, + "learning_rate": 0.0014016689879339363, + "loss": 2.2897, + "step": 408950 + }, + { + "epoch": 1.580924989562555, + "grad_norm": 0.10228240489959717, + "learning_rate": 0.0014013348181161694, + "loss": 2.2901, + "step": 408960 + }, + { + "epoch": 1.5809636467659383, + "grad_norm": 0.08766623586416245, + "learning_rate": 0.0014010008347251227, + "loss": 2.2727, + "step": 408970 + }, + { + "epoch": 1.5810023039693215, + "grad_norm": 0.11325518786907196, + "learning_rate": 0.001400667037449132, + "loss": 2.2948, + "step": 408980 + }, + { + "epoch": 1.5810409611727048, + "grad_norm": 0.0871184691786766, + "learning_rate": 0.0014003334259774021, + "loss": 2.2982, + "step": 408990 + }, + { + "epoch": 1.5810796183760882, + "grad_norm": 0.17203271389007568, + "learning_rate": 0.0014, + "loss": 2.2945, + "step": 409000 + }, + { + "epoch": 1.5811182755794715, + "grad_norm": 0.11297266185283661, + "learning_rate": 0.0013996667592078547, + "loss": 2.2977, + "step": 409010 + }, + { + "epoch": 1.5811569327828547, + "grad_norm": 0.11587999761104584, + "learning_rate": 0.0013993337032927517, + "loss": 2.294, + "step": 409020 + }, + { + "epoch": 1.581195589986238, + "grad_norm": 0.10281231999397278, + "learning_rate": 0.001399000831947331, + "loss": 2.2813, + "step": 409030 + }, + { + "epoch": 1.5812342471896215, + "grad_norm": 0.10410250723361969, + "learning_rate": 0.0013986681448650837, + "loss": 2.2818, + "step": 409040 + }, + { + "epoch": 1.5812729043930047, + "grad_norm": 0.09489083290100098, + "learning_rate": 0.001398335641740347, + "loss": 2.2981, + "step": 409050 + }, + { + "epoch": 1.581311561596388, + "grad_norm": 0.09412014484405518, + "learning_rate": 0.0013980033222683037, + "loss": 2.2875, + "step": 409060 + }, + { + "epoch": 1.5813502187997712, + "grad_norm": 0.11623632162809372, + "learning_rate": 0.0013976711861449762, + "loss": 2.2739, + "step": 409070 + }, + { + "epoch": 1.5813888760031545, + "grad_norm": 0.09715640544891357, + "learning_rate": 0.0013973392330672255, + "loss": 2.2781, + "step": 409080 + }, + { + "epoch": 1.5814275332065377, + "grad_norm": 0.09456169605255127, + "learning_rate": 0.0013970074627327467, + "loss": 2.2757, + "step": 409090 + }, + { + "epoch": 1.581466190409921, + "grad_norm": 0.11020629107952118, + "learning_rate": 0.0013966758748400658, + "loss": 2.2932, + "step": 409100 + }, + { + "epoch": 1.5815048476133042, + "grad_norm": 0.11536996811628342, + "learning_rate": 0.0013963444690885374, + "loss": 2.291, + "step": 409110 + }, + { + "epoch": 1.5815435048166875, + "grad_norm": 0.1016986295580864, + "learning_rate": 0.00139601324517834, + "loss": 2.2991, + "step": 409120 + }, + { + "epoch": 1.5815821620200707, + "grad_norm": 0.09667601436376572, + "learning_rate": 0.0013956822028104749, + "loss": 2.2762, + "step": 409130 + }, + { + "epoch": 1.581620819223454, + "grad_norm": 0.10011686384677887, + "learning_rate": 0.0013953513416867609, + "loss": 2.292, + "step": 409140 + }, + { + "epoch": 1.5816594764268372, + "grad_norm": 0.09894595295190811, + "learning_rate": 0.001395020661509833, + "loss": 2.2953, + "step": 409150 + }, + { + "epoch": 1.5816981336302205, + "grad_norm": 0.09758591651916504, + "learning_rate": 0.0013946901619831377, + "loss": 2.2848, + "step": 409160 + }, + { + "epoch": 1.581736790833604, + "grad_norm": 0.12541015446186066, + "learning_rate": 0.0013943598428109313, + "loss": 2.2877, + "step": 409170 + }, + { + "epoch": 1.5817754480369872, + "grad_norm": 0.09081506729125977, + "learning_rate": 0.0013940297036982754, + "loss": 2.3061, + "step": 409180 + }, + { + "epoch": 1.5818141052403705, + "grad_norm": 0.09523743391036987, + "learning_rate": 0.0013936997443510352, + "loss": 2.2783, + "step": 409190 + }, + { + "epoch": 1.5818527624437537, + "grad_norm": 0.09542723000049591, + "learning_rate": 0.0013933699644758759, + "loss": 2.2789, + "step": 409200 + }, + { + "epoch": 1.5818914196471372, + "grad_norm": 0.11139500141143799, + "learning_rate": 0.0013930403637802593, + "loss": 2.2724, + "step": 409210 + }, + { + "epoch": 1.5819300768505205, + "grad_norm": 0.08887927234172821, + "learning_rate": 0.0013927109419724411, + "loss": 2.3028, + "step": 409220 + }, + { + "epoch": 1.5819687340539037, + "grad_norm": 0.09690530598163605, + "learning_rate": 0.0013923816987614678, + "loss": 2.2806, + "step": 409230 + }, + { + "epoch": 1.582007391257287, + "grad_norm": 0.09255881607532501, + "learning_rate": 0.0013920526338571735, + "loss": 2.2911, + "step": 409240 + }, + { + "epoch": 1.5820460484606702, + "grad_norm": 0.1247158870100975, + "learning_rate": 0.001391723746970178, + "loss": 2.2646, + "step": 409250 + }, + { + "epoch": 1.5820847056640535, + "grad_norm": 0.08813361078500748, + "learning_rate": 0.0013913950378118824, + "loss": 2.2785, + "step": 409260 + }, + { + "epoch": 1.5821233628674367, + "grad_norm": 0.09330994635820389, + "learning_rate": 0.0013910665060944667, + "loss": 2.2819, + "step": 409270 + }, + { + "epoch": 1.58216202007082, + "grad_norm": 0.2143334448337555, + "learning_rate": 0.0013907381515308873, + "loss": 2.2856, + "step": 409280 + }, + { + "epoch": 1.5822006772742032, + "grad_norm": 0.11374535411596298, + "learning_rate": 0.0013904099738348732, + "loss": 2.2919, + "step": 409290 + }, + { + "epoch": 1.5822393344775865, + "grad_norm": 0.0991811528801918, + "learning_rate": 0.0013900819727209238, + "loss": 2.2894, + "step": 409300 + }, + { + "epoch": 1.5822779916809697, + "grad_norm": 0.0955258384346962, + "learning_rate": 0.0013897541479043058, + "loss": 2.3012, + "step": 409310 + }, + { + "epoch": 1.582316648884353, + "grad_norm": 0.09650768339633942, + "learning_rate": 0.0013894264991010502, + "loss": 2.2867, + "step": 409320 + }, + { + "epoch": 1.5823553060877362, + "grad_norm": 0.09201765805482864, + "learning_rate": 0.0013890990260279495, + "loss": 2.2851, + "step": 409330 + }, + { + "epoch": 1.5823939632911197, + "grad_norm": 0.0908934623003006, + "learning_rate": 0.0013887717284025549, + "loss": 2.2791, + "step": 409340 + }, + { + "epoch": 1.582432620494503, + "grad_norm": 0.10206924378871918, + "learning_rate": 0.0013884446059431736, + "loss": 2.2748, + "step": 409350 + }, + { + "epoch": 1.5824712776978862, + "grad_norm": 0.09511981904506683, + "learning_rate": 0.0013881176583688658, + "loss": 2.2912, + "step": 409360 + }, + { + "epoch": 1.5825099349012695, + "grad_norm": 0.08898471295833588, + "learning_rate": 0.0013877908853994411, + "loss": 2.3048, + "step": 409370 + }, + { + "epoch": 1.582548592104653, + "grad_norm": 0.09409672766923904, + "learning_rate": 0.0013874642867554578, + "loss": 2.2753, + "step": 409380 + }, + { + "epoch": 1.5825872493080362, + "grad_norm": 0.1006675586104393, + "learning_rate": 0.0013871378621582175, + "loss": 2.2878, + "step": 409390 + }, + { + "epoch": 1.5826259065114194, + "grad_norm": 0.11397848278284073, + "learning_rate": 0.0013868116113297643, + "loss": 2.2598, + "step": 409400 + }, + { + "epoch": 1.5826645637148027, + "grad_norm": 0.10111487656831741, + "learning_rate": 0.0013864855339928814, + "loss": 2.2876, + "step": 409410 + }, + { + "epoch": 1.582703220918186, + "grad_norm": 0.11789622157812119, + "learning_rate": 0.0013861596298710879, + "loss": 2.2848, + "step": 409420 + }, + { + "epoch": 1.5827418781215692, + "grad_norm": 0.0989585891366005, + "learning_rate": 0.0013858338986886365, + "loss": 2.2922, + "step": 409430 + }, + { + "epoch": 1.5827805353249524, + "grad_norm": 0.10369249433279037, + "learning_rate": 0.0013855083401705115, + "loss": 2.2769, + "step": 409440 + }, + { + "epoch": 1.5828191925283357, + "grad_norm": 0.09713876992464066, + "learning_rate": 0.001385182954042424, + "loss": 2.2898, + "step": 409450 + }, + { + "epoch": 1.582857849731719, + "grad_norm": 0.11114867031574249, + "learning_rate": 0.001384857740030812, + "loss": 2.2846, + "step": 409460 + }, + { + "epoch": 1.5828965069351022, + "grad_norm": 0.1006435751914978, + "learning_rate": 0.0013845326978628352, + "loss": 2.2703, + "step": 409470 + }, + { + "epoch": 1.5829351641384855, + "grad_norm": 0.11165027320384979, + "learning_rate": 0.001384207827266374, + "loss": 2.2909, + "step": 409480 + }, + { + "epoch": 1.5829738213418687, + "grad_norm": 0.09109925478696823, + "learning_rate": 0.0013838831279700254, + "loss": 2.3026, + "step": 409490 + }, + { + "epoch": 1.5830124785452522, + "grad_norm": 0.10089856386184692, + "learning_rate": 0.0013835585997031023, + "loss": 2.2982, + "step": 409500 + }, + { + "epoch": 1.5830511357486354, + "grad_norm": 0.114852674305439, + "learning_rate": 0.001383234242195629, + "loss": 2.2853, + "step": 409510 + }, + { + "epoch": 1.5830897929520187, + "grad_norm": 0.09972550719976425, + "learning_rate": 0.0013829100551783395, + "loss": 2.2905, + "step": 409520 + }, + { + "epoch": 1.583128450155402, + "grad_norm": 0.11271734535694122, + "learning_rate": 0.0013825860383826747, + "loss": 2.2872, + "step": 409530 + }, + { + "epoch": 1.5831671073587852, + "grad_norm": 0.10400784015655518, + "learning_rate": 0.00138226219154078, + "loss": 2.2949, + "step": 409540 + }, + { + "epoch": 1.5832057645621687, + "grad_norm": 0.08777232468128204, + "learning_rate": 0.0013819385143855024, + "loss": 2.2789, + "step": 409550 + }, + { + "epoch": 1.583244421765552, + "grad_norm": 0.1051279604434967, + "learning_rate": 0.0013816150066503878, + "loss": 2.2968, + "step": 409560 + }, + { + "epoch": 1.5832830789689352, + "grad_norm": 0.10784800350666046, + "learning_rate": 0.0013812916680696792, + "loss": 2.2763, + "step": 409570 + }, + { + "epoch": 1.5833217361723184, + "grad_norm": 0.09149152040481567, + "learning_rate": 0.0013809684983783134, + "loss": 2.2875, + "step": 409580 + }, + { + "epoch": 1.5833603933757017, + "grad_norm": 0.10545124113559723, + "learning_rate": 0.001380645497311919, + "loss": 2.2985, + "step": 409590 + }, + { + "epoch": 1.583399050579085, + "grad_norm": 0.09649606794118881, + "learning_rate": 0.0013803226646068132, + "loss": 2.2923, + "step": 409600 + }, + { + "epoch": 1.5834377077824682, + "grad_norm": 0.09470316767692566, + "learning_rate": 0.00138, + "loss": 2.2941, + "step": 409610 + }, + { + "epoch": 1.5834763649858514, + "grad_norm": 0.10625059902667999, + "learning_rate": 0.0013796775032291673, + "loss": 2.2805, + "step": 409620 + }, + { + "epoch": 1.5835150221892347, + "grad_norm": 0.09362804144620895, + "learning_rate": 0.001379355174032684, + "loss": 2.2774, + "step": 409630 + }, + { + "epoch": 1.583553679392618, + "grad_norm": 0.09844432771205902, + "learning_rate": 0.001379033012149599, + "loss": 2.2887, + "step": 409640 + }, + { + "epoch": 1.5835923365960012, + "grad_norm": 0.10673537850379944, + "learning_rate": 0.0013787110173196374, + "loss": 2.2756, + "step": 409650 + }, + { + "epoch": 1.5836309937993844, + "grad_norm": 0.09506750851869583, + "learning_rate": 0.0013783891892831979, + "loss": 2.2761, + "step": 409660 + }, + { + "epoch": 1.583669651002768, + "grad_norm": 0.10086861997842789, + "learning_rate": 0.0013780675277813518, + "loss": 2.2968, + "step": 409670 + }, + { + "epoch": 1.5837083082061512, + "grad_norm": 0.1022254228591919, + "learning_rate": 0.0013777460325558382, + "loss": 2.2836, + "step": 409680 + }, + { + "epoch": 1.5837469654095344, + "grad_norm": 0.09821146726608276, + "learning_rate": 0.0013774247033490647, + "loss": 2.2702, + "step": 409690 + }, + { + "epoch": 1.5837856226129177, + "grad_norm": 0.10450205951929092, + "learning_rate": 0.0013771035399041025, + "loss": 2.2992, + "step": 409700 + }, + { + "epoch": 1.583824279816301, + "grad_norm": 0.10975898802280426, + "learning_rate": 0.0013767825419646847, + "loss": 2.2944, + "step": 409710 + }, + { + "epoch": 1.5838629370196844, + "grad_norm": 0.10892923921346664, + "learning_rate": 0.0013764617092752044, + "loss": 2.3034, + "step": 409720 + }, + { + "epoch": 1.5839015942230676, + "grad_norm": 0.10283569991588593, + "learning_rate": 0.001376141041580711, + "loss": 2.285, + "step": 409730 + }, + { + "epoch": 1.583940251426451, + "grad_norm": 0.10661765933036804, + "learning_rate": 0.0013758205386269107, + "loss": 2.2894, + "step": 409740 + }, + { + "epoch": 1.5839789086298341, + "grad_norm": 0.10816767066717148, + "learning_rate": 0.0013755002001601601, + "loss": 2.2795, + "step": 409750 + }, + { + "epoch": 1.5840175658332174, + "grad_norm": 0.106157585978508, + "learning_rate": 0.0013751800259274676, + "loss": 2.2982, + "step": 409760 + }, + { + "epoch": 1.5840562230366007, + "grad_norm": 0.09719347953796387, + "learning_rate": 0.0013748600156764886, + "loss": 2.2863, + "step": 409770 + }, + { + "epoch": 1.584094880239984, + "grad_norm": 0.11564947664737701, + "learning_rate": 0.0013745401691555243, + "loss": 2.2661, + "step": 409780 + }, + { + "epoch": 1.5841335374433672, + "grad_norm": 0.09925565123558044, + "learning_rate": 0.0013742204861135194, + "loss": 2.2797, + "step": 409790 + }, + { + "epoch": 1.5841721946467504, + "grad_norm": 0.09485266357660294, + "learning_rate": 0.001373900966300059, + "loss": 2.2858, + "step": 409800 + }, + { + "epoch": 1.5842108518501337, + "grad_norm": 0.09301532804965973, + "learning_rate": 0.001373581609465367, + "loss": 2.2859, + "step": 409810 + }, + { + "epoch": 1.584249509053517, + "grad_norm": 0.13172714412212372, + "learning_rate": 0.0013732624153603042, + "loss": 2.2829, + "step": 409820 + }, + { + "epoch": 1.5842881662569002, + "grad_norm": 0.1107850968837738, + "learning_rate": 0.0013729433837363647, + "loss": 2.2935, + "step": 409830 + }, + { + "epoch": 1.5843268234602836, + "grad_norm": 0.09742755442857742, + "learning_rate": 0.001372624514345675, + "loss": 2.2824, + "step": 409840 + }, + { + "epoch": 1.584365480663667, + "grad_norm": 0.10189301520586014, + "learning_rate": 0.0013723058069409913, + "loss": 2.29, + "step": 409850 + }, + { + "epoch": 1.5844041378670501, + "grad_norm": 0.10698041319847107, + "learning_rate": 0.0013719872612756967, + "loss": 2.2877, + "step": 409860 + }, + { + "epoch": 1.5844427950704334, + "grad_norm": 0.10542524605989456, + "learning_rate": 0.0013716688771037997, + "loss": 2.2889, + "step": 409870 + }, + { + "epoch": 1.5844814522738166, + "grad_norm": 0.1208849772810936, + "learning_rate": 0.0013713506541799317, + "loss": 2.2875, + "step": 409880 + }, + { + "epoch": 1.5845201094772001, + "grad_norm": 0.09475509077310562, + "learning_rate": 0.0013710325922593445, + "loss": 2.2951, + "step": 409890 + }, + { + "epoch": 1.5845587666805834, + "grad_norm": 0.09734809398651123, + "learning_rate": 0.0013707146910979092, + "loss": 2.289, + "step": 409900 + }, + { + "epoch": 1.5845974238839666, + "grad_norm": 0.0958809107542038, + "learning_rate": 0.0013703969504521123, + "loss": 2.2839, + "step": 409910 + }, + { + "epoch": 1.5846360810873499, + "grad_norm": 0.10502952337265015, + "learning_rate": 0.001370079370079055, + "loss": 2.2833, + "step": 409920 + }, + { + "epoch": 1.5846747382907331, + "grad_norm": 0.10497497767210007, + "learning_rate": 0.0013697619497364509, + "loss": 2.2751, + "step": 409930 + }, + { + "epoch": 1.5847133954941164, + "grad_norm": 0.09893753379583359, + "learning_rate": 0.0013694446891826223, + "loss": 2.2889, + "step": 409940 + }, + { + "epoch": 1.5847520526974996, + "grad_norm": 0.10483641177415848, + "learning_rate": 0.0013691275881764998, + "loss": 2.2961, + "step": 409950 + }, + { + "epoch": 1.5847907099008829, + "grad_norm": 0.09729575365781784, + "learning_rate": 0.00136881064647762, + "loss": 2.2839, + "step": 409960 + }, + { + "epoch": 1.5848293671042661, + "grad_norm": 0.11494239419698715, + "learning_rate": 0.0013684938638461222, + "loss": 2.2947, + "step": 409970 + }, + { + "epoch": 1.5848680243076494, + "grad_norm": 0.108024001121521, + "learning_rate": 0.0013681772400427475, + "loss": 2.2968, + "step": 409980 + }, + { + "epoch": 1.5849066815110326, + "grad_norm": 0.10436026751995087, + "learning_rate": 0.0013678607748288357, + "loss": 2.2759, + "step": 409990 + }, + { + "epoch": 1.584945338714416, + "grad_norm": 0.10196762531995773, + "learning_rate": 0.001367544467966324, + "loss": 2.2939, + "step": 410000 + }, + { + "epoch": 1.5849839959177994, + "grad_norm": 0.10346586257219315, + "learning_rate": 0.001367228319217745, + "loss": 2.2845, + "step": 410010 + }, + { + "epoch": 1.5850226531211826, + "grad_norm": 0.09651974588632584, + "learning_rate": 0.0013669123283462235, + "loss": 2.2963, + "step": 410020 + }, + { + "epoch": 1.5850613103245659, + "grad_norm": 0.09264250844717026, + "learning_rate": 0.0013665964951154754, + "loss": 2.2735, + "step": 410030 + }, + { + "epoch": 1.5850999675279491, + "grad_norm": 0.09752018004655838, + "learning_rate": 0.0013662808192898056, + "loss": 2.2901, + "step": 410040 + }, + { + "epoch": 1.5851386247313326, + "grad_norm": 0.10400497913360596, + "learning_rate": 0.0013659653006341057, + "loss": 2.2782, + "step": 410050 + }, + { + "epoch": 1.5851772819347159, + "grad_norm": 0.11008419096469879, + "learning_rate": 0.0013656499389138519, + "loss": 2.2888, + "step": 410060 + }, + { + "epoch": 1.585215939138099, + "grad_norm": 0.11386552453041077, + "learning_rate": 0.0013653347338951028, + "loss": 2.2865, + "step": 410070 + }, + { + "epoch": 1.5852545963414824, + "grad_norm": 0.18960657715797424, + "learning_rate": 0.0013650196853444983, + "loss": 2.2879, + "step": 410080 + }, + { + "epoch": 1.5852932535448656, + "grad_norm": 0.09678232669830322, + "learning_rate": 0.0013647047930292564, + "loss": 2.2783, + "step": 410090 + }, + { + "epoch": 1.5853319107482489, + "grad_norm": 0.096040740609169, + "learning_rate": 0.0013643900567171718, + "loss": 2.2924, + "step": 410100 + }, + { + "epoch": 1.5853705679516321, + "grad_norm": 0.20530691742897034, + "learning_rate": 0.0013640754761766144, + "loss": 2.2836, + "step": 410110 + }, + { + "epoch": 1.5854092251550154, + "grad_norm": 0.10444552451372147, + "learning_rate": 0.0013637610511765255, + "loss": 2.2679, + "step": 410120 + }, + { + "epoch": 1.5854478823583986, + "grad_norm": 0.132937952876091, + "learning_rate": 0.001363446781486418, + "loss": 2.2928, + "step": 410130 + }, + { + "epoch": 1.5854865395617819, + "grad_norm": 0.09830232709646225, + "learning_rate": 0.0013631326668763738, + "loss": 2.2902, + "step": 410140 + }, + { + "epoch": 1.5855251967651651, + "grad_norm": 0.09446366876363754, + "learning_rate": 0.0013628187071170403, + "loss": 2.2991, + "step": 410150 + }, + { + "epoch": 1.5855638539685484, + "grad_norm": 0.0938754454255104, + "learning_rate": 0.001362504901979631, + "loss": 2.2895, + "step": 410160 + }, + { + "epoch": 1.5856025111719316, + "grad_norm": 0.1105690747499466, + "learning_rate": 0.0013621912512359212, + "loss": 2.2879, + "step": 410170 + }, + { + "epoch": 1.585641168375315, + "grad_norm": 0.09766722470521927, + "learning_rate": 0.0013618777546582472, + "loss": 2.2927, + "step": 410180 + }, + { + "epoch": 1.5856798255786984, + "grad_norm": 0.1109689325094223, + "learning_rate": 0.001361564412019505, + "loss": 2.2862, + "step": 410190 + }, + { + "epoch": 1.5857184827820816, + "grad_norm": 0.10538505017757416, + "learning_rate": 0.0013612512230931474, + "loss": 2.2708, + "step": 410200 + }, + { + "epoch": 1.5857571399854649, + "grad_norm": 0.09841452538967133, + "learning_rate": 0.001360938187653182, + "loss": 2.2811, + "step": 410210 + }, + { + "epoch": 1.5857957971888483, + "grad_norm": 0.09635888785123825, + "learning_rate": 0.0013606253054741688, + "loss": 2.2762, + "step": 410220 + }, + { + "epoch": 1.5858344543922316, + "grad_norm": 0.09994711726903915, + "learning_rate": 0.001360312576331221, + "loss": 2.2843, + "step": 410230 + }, + { + "epoch": 1.5858731115956148, + "grad_norm": 0.11057305335998535, + "learning_rate": 0.0013599999999999999, + "loss": 2.2859, + "step": 410240 + }, + { + "epoch": 1.585911768798998, + "grad_norm": 0.10168281197547913, + "learning_rate": 0.0013596875762567151, + "loss": 2.3044, + "step": 410250 + }, + { + "epoch": 1.5859504260023813, + "grad_norm": 0.10190840065479279, + "learning_rate": 0.0013593753048781214, + "loss": 2.2784, + "step": 410260 + }, + { + "epoch": 1.5859890832057646, + "grad_norm": 0.09425269812345505, + "learning_rate": 0.0013590631856415173, + "loss": 2.2899, + "step": 410270 + }, + { + "epoch": 1.5860277404091478, + "grad_norm": 0.11108811944723129, + "learning_rate": 0.0013587512183247442, + "loss": 2.2865, + "step": 410280 + }, + { + "epoch": 1.586066397612531, + "grad_norm": 0.12007686495780945, + "learning_rate": 0.0013584394027061823, + "loss": 2.2944, + "step": 410290 + }, + { + "epoch": 1.5861050548159144, + "grad_norm": 0.11422277241945267, + "learning_rate": 0.0013581277385647517, + "loss": 2.2944, + "step": 410300 + }, + { + "epoch": 1.5861437120192976, + "grad_norm": 0.10217493772506714, + "learning_rate": 0.0013578162256799072, + "loss": 2.2854, + "step": 410310 + }, + { + "epoch": 1.5861823692226809, + "grad_norm": 0.10135794430971146, + "learning_rate": 0.0013575048638316397, + "loss": 2.2994, + "step": 410320 + }, + { + "epoch": 1.586221026426064, + "grad_norm": 0.10746722668409348, + "learning_rate": 0.0013571936528004721, + "loss": 2.2899, + "step": 410330 + }, + { + "epoch": 1.5862596836294474, + "grad_norm": 0.09228982776403427, + "learning_rate": 0.001356882592367459, + "loss": 2.291, + "step": 410340 + }, + { + "epoch": 1.5862983408328308, + "grad_norm": 0.12576334178447723, + "learning_rate": 0.0013565716823141837, + "loss": 2.2806, + "step": 410350 + }, + { + "epoch": 1.586336998036214, + "grad_norm": 0.11180675029754639, + "learning_rate": 0.0013562609224227566, + "loss": 2.3051, + "step": 410360 + }, + { + "epoch": 1.5863756552395973, + "grad_norm": 0.0983600988984108, + "learning_rate": 0.0013559503124758153, + "loss": 2.2917, + "step": 410370 + }, + { + "epoch": 1.5864143124429806, + "grad_norm": 0.09851762652397156, + "learning_rate": 0.0013556398522565195, + "loss": 2.2819, + "step": 410380 + }, + { + "epoch": 1.586452969646364, + "grad_norm": 0.09128008782863617, + "learning_rate": 0.001355329541548552, + "loss": 2.2816, + "step": 410390 + }, + { + "epoch": 1.5864916268497473, + "grad_norm": 0.09307295829057693, + "learning_rate": 0.001355019380136116, + "loss": 2.2751, + "step": 410400 + }, + { + "epoch": 1.5865302840531306, + "grad_norm": 0.1199588030576706, + "learning_rate": 0.0013547093678039329, + "loss": 2.2911, + "step": 410410 + }, + { + "epoch": 1.5865689412565138, + "grad_norm": 0.10031304508447647, + "learning_rate": 0.0013543995043372413, + "loss": 2.2987, + "step": 410420 + }, + { + "epoch": 1.586607598459897, + "grad_norm": 0.1117083951830864, + "learning_rate": 0.0013540897895217942, + "loss": 2.2861, + "step": 410430 + }, + { + "epoch": 1.5866462556632803, + "grad_norm": 0.0932147353887558, + "learning_rate": 0.0013537802231438595, + "loss": 2.2821, + "step": 410440 + }, + { + "epoch": 1.5866849128666636, + "grad_norm": 0.10109856724739075, + "learning_rate": 0.0013534708049902156, + "loss": 2.2933, + "step": 410450 + }, + { + "epoch": 1.5867235700700468, + "grad_norm": 0.09498569369316101, + "learning_rate": 0.0013531615348481507, + "loss": 2.2964, + "step": 410460 + }, + { + "epoch": 1.58676222727343, + "grad_norm": 0.10946396738290787, + "learning_rate": 0.0013528524125054626, + "loss": 2.2835, + "step": 410470 + }, + { + "epoch": 1.5868008844768133, + "grad_norm": 0.09790240973234177, + "learning_rate": 0.0013525434377504543, + "loss": 2.2838, + "step": 410480 + }, + { + "epoch": 1.5868395416801966, + "grad_norm": 0.10081296414136887, + "learning_rate": 0.001352234610371934, + "loss": 2.2906, + "step": 410490 + }, + { + "epoch": 1.5868781988835798, + "grad_norm": 0.11622887849807739, + "learning_rate": 0.001351925930159214, + "loss": 2.2996, + "step": 410500 + }, + { + "epoch": 1.586916856086963, + "grad_norm": 0.09005551785230637, + "learning_rate": 0.001351617396902107, + "loss": 2.2913, + "step": 410510 + }, + { + "epoch": 1.5869555132903466, + "grad_norm": 0.10132360458374023, + "learning_rate": 0.0013513090103909257, + "loss": 2.2762, + "step": 410520 + }, + { + "epoch": 1.5869941704937298, + "grad_norm": 0.09307138621807098, + "learning_rate": 0.001351000770416482, + "loss": 2.2842, + "step": 410530 + }, + { + "epoch": 1.587032827697113, + "grad_norm": 0.19938762485980988, + "learning_rate": 0.001350692676770083, + "loss": 2.2708, + "step": 410540 + }, + { + "epoch": 1.5870714849004963, + "grad_norm": 0.09920799732208252, + "learning_rate": 0.0013503847292435315, + "loss": 2.291, + "step": 410550 + }, + { + "epoch": 1.5871101421038798, + "grad_norm": 0.09430687129497528, + "learning_rate": 0.0013500769276291232, + "loss": 2.2867, + "step": 410560 + }, + { + "epoch": 1.587148799307263, + "grad_norm": 0.1142541691660881, + "learning_rate": 0.0013497692717196457, + "loss": 2.3058, + "step": 410570 + }, + { + "epoch": 1.5871874565106463, + "grad_norm": 0.09234173595905304, + "learning_rate": 0.001349461761308376, + "loss": 2.2839, + "step": 410580 + }, + { + "epoch": 1.5872261137140296, + "grad_norm": 0.12318877130746841, + "learning_rate": 0.0013491543961890809, + "loss": 2.2912, + "step": 410590 + }, + { + "epoch": 1.5872647709174128, + "grad_norm": 0.11205442249774933, + "learning_rate": 0.0013488471761560117, + "loss": 2.2894, + "step": 410600 + }, + { + "epoch": 1.587303428120796, + "grad_norm": 0.11573982238769531, + "learning_rate": 0.001348540101003907, + "loss": 2.2769, + "step": 410610 + }, + { + "epoch": 1.5873420853241793, + "grad_norm": 0.12379911541938782, + "learning_rate": 0.001348233170527987, + "loss": 2.2867, + "step": 410620 + }, + { + "epoch": 1.5873807425275626, + "grad_norm": 0.09459318220615387, + "learning_rate": 0.0013479263845239558, + "loss": 2.2827, + "step": 410630 + }, + { + "epoch": 1.5874193997309458, + "grad_norm": 0.09499311447143555, + "learning_rate": 0.0013476197427879965, + "loss": 2.2919, + "step": 410640 + }, + { + "epoch": 1.587458056934329, + "grad_norm": 0.09781172126531601, + "learning_rate": 0.0013473132451167712, + "loss": 2.2859, + "step": 410650 + }, + { + "epoch": 1.5874967141377123, + "grad_norm": 0.09685497730970383, + "learning_rate": 0.0013470068913074198, + "loss": 2.2797, + "step": 410660 + }, + { + "epoch": 1.5875353713410956, + "grad_norm": 0.0856289267539978, + "learning_rate": 0.0013467006811575572, + "loss": 2.2763, + "step": 410670 + }, + { + "epoch": 1.5875740285444788, + "grad_norm": 0.0968082994222641, + "learning_rate": 0.0013463946144652724, + "loss": 2.2785, + "step": 410680 + }, + { + "epoch": 1.5876126857478623, + "grad_norm": 0.11211162805557251, + "learning_rate": 0.0013460886910291274, + "loss": 2.2963, + "step": 410690 + }, + { + "epoch": 1.5876513429512455, + "grad_norm": 0.26160678267478943, + "learning_rate": 0.001345782910648155, + "loss": 2.2934, + "step": 410700 + }, + { + "epoch": 1.5876900001546288, + "grad_norm": 0.09896879643201828, + "learning_rate": 0.0013454772731218572, + "loss": 2.2792, + "step": 410710 + }, + { + "epoch": 1.587728657358012, + "grad_norm": 0.10798089951276779, + "learning_rate": 0.001345171778250204, + "loss": 2.3055, + "step": 410720 + }, + { + "epoch": 1.5877673145613955, + "grad_norm": 0.10475075244903564, + "learning_rate": 0.0013448664258336321, + "loss": 2.2867, + "step": 410730 + }, + { + "epoch": 1.5878059717647788, + "grad_norm": 0.108814537525177, + "learning_rate": 0.0013445612156730426, + "loss": 2.2868, + "step": 410740 + }, + { + "epoch": 1.587844628968162, + "grad_norm": 0.11034875363111496, + "learning_rate": 0.0013442561475698002, + "loss": 2.2904, + "step": 410750 + }, + { + "epoch": 1.5878832861715453, + "grad_norm": 0.0990104079246521, + "learning_rate": 0.001343951221325731, + "loss": 2.3047, + "step": 410760 + }, + { + "epoch": 1.5879219433749285, + "grad_norm": 0.0992002859711647, + "learning_rate": 0.0013436464367431226, + "loss": 2.2871, + "step": 410770 + }, + { + "epoch": 1.5879606005783118, + "grad_norm": 0.10469389706850052, + "learning_rate": 0.0013433417936247198, + "loss": 2.2893, + "step": 410780 + }, + { + "epoch": 1.587999257781695, + "grad_norm": 0.10755585879087448, + "learning_rate": 0.0013430372917737263, + "loss": 2.2819, + "step": 410790 + }, + { + "epoch": 1.5880379149850783, + "grad_norm": 0.16936883330345154, + "learning_rate": 0.0013427329309938006, + "loss": 2.2872, + "step": 410800 + }, + { + "epoch": 1.5880765721884615, + "grad_norm": 0.10266046226024628, + "learning_rate": 0.001342428711089056, + "loss": 2.2775, + "step": 410810 + }, + { + "epoch": 1.5881152293918448, + "grad_norm": 0.09038469195365906, + "learning_rate": 0.001342124631864059, + "loss": 2.2807, + "step": 410820 + }, + { + "epoch": 1.588153886595228, + "grad_norm": 0.10589548945426941, + "learning_rate": 0.0013418206931238266, + "loss": 2.2908, + "step": 410830 + }, + { + "epoch": 1.5881925437986113, + "grad_norm": 0.08776730298995972, + "learning_rate": 0.001341516894673827, + "loss": 2.2934, + "step": 410840 + }, + { + "epoch": 1.5882312010019946, + "grad_norm": 0.10342015326023102, + "learning_rate": 0.001341213236319976, + "loss": 2.2711, + "step": 410850 + }, + { + "epoch": 1.588269858205378, + "grad_norm": 0.09962315857410431, + "learning_rate": 0.0013409097178686369, + "loss": 2.2739, + "step": 410860 + }, + { + "epoch": 1.5883085154087613, + "grad_norm": 0.12369918823242188, + "learning_rate": 0.0013406063391266185, + "loss": 2.2901, + "step": 410870 + }, + { + "epoch": 1.5883471726121445, + "grad_norm": 0.09774774312973022, + "learning_rate": 0.0013403030999011745, + "loss": 2.273, + "step": 410880 + }, + { + "epoch": 1.5883858298155278, + "grad_norm": 0.10486874729394913, + "learning_rate": 0.0013399999999999998, + "loss": 2.2856, + "step": 410890 + }, + { + "epoch": 1.5884244870189113, + "grad_norm": 0.12358658760786057, + "learning_rate": 0.0013396970392312328, + "loss": 2.2988, + "step": 410900 + }, + { + "epoch": 1.5884631442222945, + "grad_norm": 0.10743393003940582, + "learning_rate": 0.0013393942174034503, + "loss": 2.2783, + "step": 410910 + }, + { + "epoch": 1.5885018014256778, + "grad_norm": 0.10001206398010254, + "learning_rate": 0.0013390915343256678, + "loss": 2.2816, + "step": 410920 + }, + { + "epoch": 1.588540458629061, + "grad_norm": 0.10591772943735123, + "learning_rate": 0.0013387889898073383, + "loss": 2.2981, + "step": 410930 + }, + { + "epoch": 1.5885791158324443, + "grad_norm": 0.10507027804851532, + "learning_rate": 0.001338486583658351, + "loss": 2.293, + "step": 410940 + }, + { + "epoch": 1.5886177730358275, + "grad_norm": 0.13516682386398315, + "learning_rate": 0.0013381843156890282, + "loss": 2.2735, + "step": 410950 + }, + { + "epoch": 1.5886564302392108, + "grad_norm": 0.10282962769269943, + "learning_rate": 0.001337882185710126, + "loss": 2.2705, + "step": 410960 + }, + { + "epoch": 1.588695087442594, + "grad_norm": 0.10241902619600296, + "learning_rate": 0.0013375801935328322, + "loss": 2.2836, + "step": 410970 + }, + { + "epoch": 1.5887337446459773, + "grad_norm": 0.09397570788860321, + "learning_rate": 0.0013372783389687644, + "loss": 2.2804, + "step": 410980 + }, + { + "epoch": 1.5887724018493605, + "grad_norm": 0.09933995455503464, + "learning_rate": 0.001336976621829969, + "loss": 2.2846, + "step": 410990 + }, + { + "epoch": 1.5888110590527438, + "grad_norm": 0.23156529664993286, + "learning_rate": 0.00133667504192892, + "loss": 2.2898, + "step": 411000 + }, + { + "epoch": 1.588849716256127, + "grad_norm": 0.09423259645700455, + "learning_rate": 0.0013363735990785178, + "loss": 2.2851, + "step": 411010 + }, + { + "epoch": 1.5888883734595103, + "grad_norm": 0.10134994238615036, + "learning_rate": 0.0013360722930920868, + "loss": 2.2957, + "step": 411020 + }, + { + "epoch": 1.5889270306628938, + "grad_norm": 0.1059289425611496, + "learning_rate": 0.0013357711237833754, + "loss": 2.2816, + "step": 411030 + }, + { + "epoch": 1.588965687866277, + "grad_norm": 0.2090606540441513, + "learning_rate": 0.001335470090966554, + "loss": 2.3035, + "step": 411040 + }, + { + "epoch": 1.5890043450696603, + "grad_norm": 0.15536662936210632, + "learning_rate": 0.0013351691944562135, + "loss": 2.2997, + "step": 411050 + }, + { + "epoch": 1.5890430022730435, + "grad_norm": 0.10510016977787018, + "learning_rate": 0.0013348684340673644, + "loss": 2.2988, + "step": 411060 + }, + { + "epoch": 1.589081659476427, + "grad_norm": 0.4065359830856323, + "learning_rate": 0.001334567809615435, + "loss": 2.2891, + "step": 411070 + }, + { + "epoch": 1.5891203166798102, + "grad_norm": 0.09470411390066147, + "learning_rate": 0.0013342673209162706, + "loss": 2.3082, + "step": 411080 + }, + { + "epoch": 1.5891589738831935, + "grad_norm": 0.09722394496202469, + "learning_rate": 0.0013339669677861314, + "loss": 2.2815, + "step": 411090 + }, + { + "epoch": 1.5891976310865767, + "grad_norm": 0.10346391797065735, + "learning_rate": 0.0013336667500416929, + "loss": 2.2876, + "step": 411100 + }, + { + "epoch": 1.58923628828996, + "grad_norm": 0.09680181741714478, + "learning_rate": 0.0013333666675000417, + "loss": 2.2978, + "step": 411110 + }, + { + "epoch": 1.5892749454933432, + "grad_norm": 0.1011996865272522, + "learning_rate": 0.0013330667199786773, + "loss": 2.289, + "step": 411120 + }, + { + "epoch": 1.5893136026967265, + "grad_norm": 0.1092514917254448, + "learning_rate": 0.0013327669072955089, + "loss": 2.2919, + "step": 411130 + }, + { + "epoch": 1.5893522599001098, + "grad_norm": 0.09436289221048355, + "learning_rate": 0.0013324672292688545, + "loss": 2.2845, + "step": 411140 + }, + { + "epoch": 1.589390917103493, + "grad_norm": 0.09714016318321228, + "learning_rate": 0.00133216768571744, + "loss": 2.2978, + "step": 411150 + }, + { + "epoch": 1.5894295743068763, + "grad_norm": 0.10386151075363159, + "learning_rate": 0.0013318682764603973, + "loss": 2.2672, + "step": 411160 + }, + { + "epoch": 1.5894682315102595, + "grad_norm": 0.09310182183980942, + "learning_rate": 0.001331569001317264, + "loss": 2.2861, + "step": 411170 + }, + { + "epoch": 1.5895068887136428, + "grad_norm": 0.10692202299833298, + "learning_rate": 0.0013312698601079807, + "loss": 2.2751, + "step": 411180 + }, + { + "epoch": 1.589545545917026, + "grad_norm": 0.09382941573858261, + "learning_rate": 0.0013309708526528907, + "loss": 2.2891, + "step": 411190 + }, + { + "epoch": 1.5895842031204095, + "grad_norm": 0.10170342028141022, + "learning_rate": 0.0013306719787727396, + "loss": 2.2846, + "step": 411200 + }, + { + "epoch": 1.5896228603237927, + "grad_norm": 0.11549654603004456, + "learning_rate": 0.0013303732382886717, + "loss": 2.2846, + "step": 411210 + }, + { + "epoch": 1.589661517527176, + "grad_norm": 0.11813855916261673, + "learning_rate": 0.001330074631022231, + "loss": 2.263, + "step": 411220 + }, + { + "epoch": 1.5897001747305592, + "grad_norm": 0.09434445947408676, + "learning_rate": 0.0013297761567953583, + "loss": 2.276, + "step": 411230 + }, + { + "epoch": 1.5897388319339427, + "grad_norm": 0.09521712362766266, + "learning_rate": 0.0013294778154303915, + "loss": 2.2783, + "step": 411240 + }, + { + "epoch": 1.589777489137326, + "grad_norm": 0.10191566497087479, + "learning_rate": 0.001329179606750063, + "loss": 2.2815, + "step": 411250 + }, + { + "epoch": 1.5898161463407092, + "grad_norm": 0.10600905120372772, + "learning_rate": 0.0013288815305774993, + "loss": 2.2789, + "step": 411260 + }, + { + "epoch": 1.5898548035440925, + "grad_norm": 0.10898283123970032, + "learning_rate": 0.0013285835867362192, + "loss": 2.2853, + "step": 411270 + }, + { + "epoch": 1.5898934607474757, + "grad_norm": 0.4227880835533142, + "learning_rate": 0.0013282857750501337, + "loss": 2.2716, + "step": 411280 + }, + { + "epoch": 1.589932117950859, + "grad_norm": 0.10346785932779312, + "learning_rate": 0.0013279880953435423, + "loss": 2.2924, + "step": 411290 + }, + { + "epoch": 1.5899707751542422, + "grad_norm": 0.10272988677024841, + "learning_rate": 0.0013276905474411357, + "loss": 2.2859, + "step": 411300 + }, + { + "epoch": 1.5900094323576255, + "grad_norm": 0.17632514238357544, + "learning_rate": 0.0013273931311679906, + "loss": 2.2894, + "step": 411310 + }, + { + "epoch": 1.5900480895610087, + "grad_norm": 0.10075627267360687, + "learning_rate": 0.0013270958463495713, + "loss": 2.2829, + "step": 411320 + }, + { + "epoch": 1.590086746764392, + "grad_norm": 0.10793676227331161, + "learning_rate": 0.0013267986928117267, + "loss": 2.2933, + "step": 411330 + }, + { + "epoch": 1.5901254039677752, + "grad_norm": 0.10403290390968323, + "learning_rate": 0.0013265016703806905, + "loss": 2.2683, + "step": 411340 + }, + { + "epoch": 1.5901640611711585, + "grad_norm": 0.08908011019229889, + "learning_rate": 0.0013262047788830792, + "loss": 2.2938, + "step": 411350 + }, + { + "epoch": 1.5902027183745417, + "grad_norm": 0.09441845118999481, + "learning_rate": 0.0013259080181458912, + "loss": 2.2854, + "step": 411360 + }, + { + "epoch": 1.5902413755779252, + "grad_norm": 0.10546926409006119, + "learning_rate": 0.0013256113879965056, + "loss": 2.2922, + "step": 411370 + }, + { + "epoch": 1.5902800327813085, + "grad_norm": 0.1103266179561615, + "learning_rate": 0.00132531488826268, + "loss": 2.277, + "step": 411380 + }, + { + "epoch": 1.5903186899846917, + "grad_norm": 0.0980391874909401, + "learning_rate": 0.001325018518772552, + "loss": 2.3028, + "step": 411390 + }, + { + "epoch": 1.590357347188075, + "grad_norm": 0.1028299555182457, + "learning_rate": 0.0013247222793546347, + "loss": 2.3042, + "step": 411400 + }, + { + "epoch": 1.5903960043914585, + "grad_norm": 0.0907234251499176, + "learning_rate": 0.0013244261698378186, + "loss": 2.2924, + "step": 411410 + }, + { + "epoch": 1.5904346615948417, + "grad_norm": 0.10806559771299362, + "learning_rate": 0.001324130190051368, + "loss": 2.2932, + "step": 411420 + }, + { + "epoch": 1.590473318798225, + "grad_norm": 0.0910990834236145, + "learning_rate": 0.0013238343398249213, + "loss": 2.2821, + "step": 411430 + }, + { + "epoch": 1.5905119760016082, + "grad_norm": 0.09880875796079636, + "learning_rate": 0.0013235386189884896, + "loss": 2.2785, + "step": 411440 + }, + { + "epoch": 1.5905506332049915, + "grad_norm": 0.09702501446008682, + "learning_rate": 0.0013232430273724548, + "loss": 2.2757, + "step": 411450 + }, + { + "epoch": 1.5905892904083747, + "grad_norm": 0.09511641412973404, + "learning_rate": 0.0013229475648075697, + "loss": 2.2857, + "step": 411460 + }, + { + "epoch": 1.590627947611758, + "grad_norm": 0.10028154402971268, + "learning_rate": 0.001322652231124956, + "loss": 2.2678, + "step": 411470 + }, + { + "epoch": 1.5906666048151412, + "grad_norm": 0.10853612422943115, + "learning_rate": 0.0013223570261561032, + "loss": 2.2888, + "step": 411480 + }, + { + "epoch": 1.5907052620185245, + "grad_norm": 0.3342318832874298, + "learning_rate": 0.0013220619497328683, + "loss": 2.2837, + "step": 411490 + }, + { + "epoch": 1.5907439192219077, + "grad_norm": 0.092925526201725, + "learning_rate": 0.0013217670016874732, + "loss": 2.2834, + "step": 411500 + }, + { + "epoch": 1.590782576425291, + "grad_norm": 0.09471990168094635, + "learning_rate": 0.001321472181852505, + "loss": 2.2605, + "step": 411510 + }, + { + "epoch": 1.5908212336286742, + "grad_norm": 0.10889468342065811, + "learning_rate": 0.0013211774900609144, + "loss": 2.2817, + "step": 411520 + }, + { + "epoch": 1.5908598908320577, + "grad_norm": 0.10209710150957108, + "learning_rate": 0.0013208829261460143, + "loss": 2.2977, + "step": 411530 + }, + { + "epoch": 1.590898548035441, + "grad_norm": 0.09678677469491959, + "learning_rate": 0.0013205884899414788, + "loss": 2.2805, + "step": 411540 + }, + { + "epoch": 1.5909372052388242, + "grad_norm": 0.12248372286558151, + "learning_rate": 0.0013202941812813429, + "loss": 2.2852, + "step": 411550 + }, + { + "epoch": 1.5909758624422075, + "grad_norm": 0.09738996624946594, + "learning_rate": 0.00132, + "loss": 2.3028, + "step": 411560 + }, + { + "epoch": 1.5910145196455907, + "grad_norm": 0.09860974550247192, + "learning_rate": 0.0013197059459322022, + "loss": 2.2742, + "step": 411570 + }, + { + "epoch": 1.5910531768489742, + "grad_norm": 0.11459953337907791, + "learning_rate": 0.0013194120189130578, + "loss": 2.2892, + "step": 411580 + }, + { + "epoch": 1.5910918340523574, + "grad_norm": 0.11038514971733093, + "learning_rate": 0.001319118218778032, + "loss": 2.2809, + "step": 411590 + }, + { + "epoch": 1.5911304912557407, + "grad_norm": 0.09345544129610062, + "learning_rate": 0.001318824545362944, + "loss": 2.2818, + "step": 411600 + }, + { + "epoch": 1.591169148459124, + "grad_norm": 0.08866851031780243, + "learning_rate": 0.001318530998503967, + "loss": 2.2901, + "step": 411610 + }, + { + "epoch": 1.5912078056625072, + "grad_norm": 0.09870915859937668, + "learning_rate": 0.0013182375780376274, + "loss": 2.2757, + "step": 411620 + }, + { + "epoch": 1.5912464628658904, + "grad_norm": 0.09726520627737045, + "learning_rate": 0.0013179442838008027, + "loss": 2.2711, + "step": 411630 + }, + { + "epoch": 1.5912851200692737, + "grad_norm": 0.11688517779111862, + "learning_rate": 0.0013176511156307207, + "loss": 2.276, + "step": 411640 + }, + { + "epoch": 1.591323777272657, + "grad_norm": 0.11012086272239685, + "learning_rate": 0.0013173580733649595, + "loss": 2.289, + "step": 411650 + }, + { + "epoch": 1.5913624344760402, + "grad_norm": 0.08703190833330154, + "learning_rate": 0.001317065156841445, + "loss": 2.2957, + "step": 411660 + }, + { + "epoch": 1.5914010916794235, + "grad_norm": 0.15332461893558502, + "learning_rate": 0.0013167723658984512, + "loss": 2.3, + "step": 411670 + }, + { + "epoch": 1.5914397488828067, + "grad_norm": 0.09997732192277908, + "learning_rate": 0.0013164797003745977, + "loss": 2.2693, + "step": 411680 + }, + { + "epoch": 1.59147840608619, + "grad_norm": 0.10567644983530045, + "learning_rate": 0.0013161871601088495, + "loss": 2.291, + "step": 411690 + }, + { + "epoch": 1.5915170632895734, + "grad_norm": 0.09762544929981232, + "learning_rate": 0.0013158947449405172, + "loss": 2.2918, + "step": 411700 + }, + { + "epoch": 1.5915557204929567, + "grad_norm": 0.10543996840715408, + "learning_rate": 0.001315602454709253, + "loss": 2.2713, + "step": 411710 + }, + { + "epoch": 1.59159437769634, + "grad_norm": 0.09037619829177856, + "learning_rate": 0.0013153102892550525, + "loss": 2.2706, + "step": 411720 + }, + { + "epoch": 1.5916330348997232, + "grad_norm": 0.10732545703649521, + "learning_rate": 0.0013150182484182515, + "loss": 2.2967, + "step": 411730 + }, + { + "epoch": 1.5916716921031064, + "grad_norm": 0.09629809856414795, + "learning_rate": 0.0013147263320395274, + "loss": 2.2672, + "step": 411740 + }, + { + "epoch": 1.59171034930649, + "grad_norm": 0.10680815577507019, + "learning_rate": 0.0013144345399598955, + "loss": 2.2666, + "step": 411750 + }, + { + "epoch": 1.5917490065098732, + "grad_norm": 0.09527281671762466, + "learning_rate": 0.0013141428720207103, + "loss": 2.2815, + "step": 411760 + }, + { + "epoch": 1.5917876637132564, + "grad_norm": 0.10793960839509964, + "learning_rate": 0.0013138513280636624, + "loss": 2.2803, + "step": 411770 + }, + { + "epoch": 1.5918263209166397, + "grad_norm": 0.10628779232501984, + "learning_rate": 0.0013135599079307796, + "loss": 2.287, + "step": 411780 + }, + { + "epoch": 1.591864978120023, + "grad_norm": 0.10582350194454193, + "learning_rate": 0.0013132686114644243, + "loss": 2.2832, + "step": 411790 + }, + { + "epoch": 1.5919036353234062, + "grad_norm": 0.11519595235586166, + "learning_rate": 0.0013129774385072933, + "loss": 2.2925, + "step": 411800 + }, + { + "epoch": 1.5919422925267894, + "grad_norm": 0.09863905608654022, + "learning_rate": 0.0013126863889024168, + "loss": 2.2756, + "step": 411810 + }, + { + "epoch": 1.5919809497301727, + "grad_norm": 0.10615672171115875, + "learning_rate": 0.0013123954624931567, + "loss": 2.2781, + "step": 411820 + }, + { + "epoch": 1.592019606933556, + "grad_norm": 0.09898418933153152, + "learning_rate": 0.0013121046591232065, + "loss": 2.2748, + "step": 411830 + }, + { + "epoch": 1.5920582641369392, + "grad_norm": 0.09342687577009201, + "learning_rate": 0.00131181397863659, + "loss": 2.2863, + "step": 411840 + }, + { + "epoch": 1.5920969213403224, + "grad_norm": 0.0946342870593071, + "learning_rate": 0.0013115234208776597, + "loss": 2.274, + "step": 411850 + }, + { + "epoch": 1.5921355785437057, + "grad_norm": 0.09903334826231003, + "learning_rate": 0.0013112329856910976, + "loss": 2.2742, + "step": 411860 + }, + { + "epoch": 1.5921742357470892, + "grad_norm": 0.10328184813261032, + "learning_rate": 0.0013109426729219114, + "loss": 2.285, + "step": 411870 + }, + { + "epoch": 1.5922128929504724, + "grad_norm": 0.09361221641302109, + "learning_rate": 0.0013106524824154366, + "loss": 2.2823, + "step": 411880 + }, + { + "epoch": 1.5922515501538557, + "grad_norm": 0.0951387882232666, + "learning_rate": 0.0013103624140173335, + "loss": 2.284, + "step": 411890 + }, + { + "epoch": 1.592290207357239, + "grad_norm": 0.10114741325378418, + "learning_rate": 0.0013100724675735864, + "loss": 2.2861, + "step": 411900 + }, + { + "epoch": 1.5923288645606224, + "grad_norm": 0.10498538613319397, + "learning_rate": 0.0013097826429305042, + "loss": 2.2775, + "step": 411910 + }, + { + "epoch": 1.5923675217640056, + "grad_norm": 0.092182457447052, + "learning_rate": 0.0013094929399347173, + "loss": 2.2895, + "step": 411920 + }, + { + "epoch": 1.592406178967389, + "grad_norm": 0.11279166489839554, + "learning_rate": 0.0013092033584331784, + "loss": 2.2872, + "step": 411930 + }, + { + "epoch": 1.5924448361707721, + "grad_norm": 0.09732379764318466, + "learning_rate": 0.0013089138982731604, + "loss": 2.2741, + "step": 411940 + }, + { + "epoch": 1.5924834933741554, + "grad_norm": 0.09594089537858963, + "learning_rate": 0.001308624559302256, + "loss": 2.2718, + "step": 411950 + }, + { + "epoch": 1.5925221505775387, + "grad_norm": 0.1188618466258049, + "learning_rate": 0.0013083353413683768, + "loss": 2.2821, + "step": 411960 + }, + { + "epoch": 1.592560807780922, + "grad_norm": 0.11669903993606567, + "learning_rate": 0.0013080462443197524, + "loss": 2.287, + "step": 411970 + }, + { + "epoch": 1.5925994649843052, + "grad_norm": 0.10351302474737167, + "learning_rate": 0.0013077572680049287, + "loss": 2.2905, + "step": 411980 + }, + { + "epoch": 1.5926381221876884, + "grad_norm": 0.3517273962497711, + "learning_rate": 0.0013074684122727684, + "loss": 2.2672, + "step": 411990 + }, + { + "epoch": 1.5926767793910717, + "grad_norm": 0.11604971438646317, + "learning_rate": 0.0013071796769724491, + "loss": 2.2852, + "step": 412000 + }, + { + "epoch": 1.592715436594455, + "grad_norm": 0.09867674112319946, + "learning_rate": 0.0013068910619534618, + "loss": 2.2912, + "step": 412010 + }, + { + "epoch": 1.5927540937978382, + "grad_norm": 0.09733185172080994, + "learning_rate": 0.0013066025670656115, + "loss": 2.2827, + "step": 412020 + }, + { + "epoch": 1.5927927510012214, + "grad_norm": 0.10976596921682358, + "learning_rate": 0.0013063141921590149, + "loss": 2.2807, + "step": 412030 + }, + { + "epoch": 1.592831408204605, + "grad_norm": 0.10603948682546616, + "learning_rate": 0.001306025937084101, + "loss": 2.2792, + "step": 412040 + }, + { + "epoch": 1.5928700654079881, + "grad_norm": 0.10735761374235153, + "learning_rate": 0.0013057378016916089, + "loss": 2.2661, + "step": 412050 + }, + { + "epoch": 1.5929087226113714, + "grad_norm": 0.09300761669874191, + "learning_rate": 0.0013054497858325865, + "loss": 2.2894, + "step": 412060 + }, + { + "epoch": 1.5929473798147546, + "grad_norm": 0.10261404514312744, + "learning_rate": 0.0013051618893583916, + "loss": 2.2913, + "step": 412070 + }, + { + "epoch": 1.5929860370181381, + "grad_norm": 0.1160757839679718, + "learning_rate": 0.0013048741121206894, + "loss": 2.2692, + "step": 412080 + }, + { + "epoch": 1.5930246942215214, + "grad_norm": 0.11559846252202988, + "learning_rate": 0.0013045864539714515, + "loss": 2.2747, + "step": 412090 + }, + { + "epoch": 1.5930633514249046, + "grad_norm": 0.10721922665834427, + "learning_rate": 0.0013042989147629567, + "loss": 2.2882, + "step": 412100 + }, + { + "epoch": 1.5931020086282879, + "grad_norm": 0.10484657436609268, + "learning_rate": 0.0013040114943477874, + "loss": 2.2707, + "step": 412110 + }, + { + "epoch": 1.5931406658316711, + "grad_norm": 0.1271674782037735, + "learning_rate": 0.0013037241925788314, + "loss": 2.2859, + "step": 412120 + }, + { + "epoch": 1.5931793230350544, + "grad_norm": 0.09756782650947571, + "learning_rate": 0.0013034370093092803, + "loss": 2.2738, + "step": 412130 + }, + { + "epoch": 1.5932179802384376, + "grad_norm": 0.09413274377584457, + "learning_rate": 0.0013031499443926261, + "loss": 2.2885, + "step": 412140 + }, + { + "epoch": 1.5932566374418209, + "grad_norm": 0.09922010451555252, + "learning_rate": 0.0013028629976826651, + "loss": 2.2891, + "step": 412150 + }, + { + "epoch": 1.5932952946452041, + "grad_norm": 0.10199158638715744, + "learning_rate": 0.0013025761690334922, + "loss": 2.2676, + "step": 412160 + }, + { + "epoch": 1.5933339518485874, + "grad_norm": 0.11042314022779465, + "learning_rate": 0.0013022894582995037, + "loss": 2.2784, + "step": 412170 + }, + { + "epoch": 1.5933726090519706, + "grad_norm": 0.0946459174156189, + "learning_rate": 0.001302002865335394, + "loss": 2.266, + "step": 412180 + }, + { + "epoch": 1.593411266255354, + "grad_norm": 0.09742408990859985, + "learning_rate": 0.0013017163899961563, + "loss": 2.28, + "step": 412190 + }, + { + "epoch": 1.5934499234587371, + "grad_norm": 0.10281988233327866, + "learning_rate": 0.0013014300321370809, + "loss": 2.2715, + "step": 412200 + }, + { + "epoch": 1.5934885806621206, + "grad_norm": 0.09957725554704666, + "learning_rate": 0.001301143791613754, + "loss": 2.2696, + "step": 412210 + }, + { + "epoch": 1.5935272378655039, + "grad_norm": 0.10443780571222305, + "learning_rate": 0.0013008576682820587, + "loss": 2.2815, + "step": 412220 + }, + { + "epoch": 1.5935658950688871, + "grad_norm": 0.12150289118289948, + "learning_rate": 0.0013005716619981715, + "loss": 2.2797, + "step": 412230 + }, + { + "epoch": 1.5936045522722704, + "grad_norm": 0.09694153815507889, + "learning_rate": 0.001300285772618564, + "loss": 2.2784, + "step": 412240 + }, + { + "epoch": 1.5936432094756539, + "grad_norm": 0.10975735634565353, + "learning_rate": 0.0013000000000000002, + "loss": 2.2676, + "step": 412250 + }, + { + "epoch": 1.593681866679037, + "grad_norm": 0.11566482484340668, + "learning_rate": 0.0012997143439995363, + "loss": 2.2765, + "step": 412260 + }, + { + "epoch": 1.5937205238824204, + "grad_norm": 0.0927523523569107, + "learning_rate": 0.0012994288044745202, + "loss": 2.2901, + "step": 412270 + }, + { + "epoch": 1.5937591810858036, + "grad_norm": 0.10696090012788773, + "learning_rate": 0.0012991433812825908, + "loss": 2.2845, + "step": 412280 + }, + { + "epoch": 1.5937978382891869, + "grad_norm": 0.10401918739080429, + "learning_rate": 0.001298858074281676, + "loss": 2.2804, + "step": 412290 + }, + { + "epoch": 1.5938364954925701, + "grad_norm": 0.09173014760017395, + "learning_rate": 0.0012985728833299927, + "loss": 2.2877, + "step": 412300 + }, + { + "epoch": 1.5938751526959534, + "grad_norm": 0.0979364663362503, + "learning_rate": 0.0012982878082860468, + "loss": 2.2779, + "step": 412310 + }, + { + "epoch": 1.5939138098993366, + "grad_norm": 0.10152798146009445, + "learning_rate": 0.0012980028490086305, + "loss": 2.2735, + "step": 412320 + }, + { + "epoch": 1.5939524671027199, + "grad_norm": 0.09956847876310349, + "learning_rate": 0.0012977180053568224, + "loss": 2.2834, + "step": 412330 + }, + { + "epoch": 1.5939911243061031, + "grad_norm": 0.115941122174263, + "learning_rate": 0.0012974332771899882, + "loss": 2.284, + "step": 412340 + }, + { + "epoch": 1.5940297815094864, + "grad_norm": 0.11504446715116501, + "learning_rate": 0.0012971486643677769, + "loss": 2.2872, + "step": 412350 + }, + { + "epoch": 1.5940684387128696, + "grad_norm": 0.12059466540813446, + "learning_rate": 0.0012968641667501222, + "loss": 2.2706, + "step": 412360 + }, + { + "epoch": 1.5941070959162529, + "grad_norm": 0.11342114210128784, + "learning_rate": 0.0012965797841972412, + "loss": 2.2714, + "step": 412370 + }, + { + "epoch": 1.5941457531196364, + "grad_norm": 0.09073950350284576, + "learning_rate": 0.0012962955165696328, + "loss": 2.2844, + "step": 412380 + }, + { + "epoch": 1.5941844103230196, + "grad_norm": 0.0957561805844307, + "learning_rate": 0.0012960113637280784, + "loss": 2.2792, + "step": 412390 + }, + { + "epoch": 1.5942230675264029, + "grad_norm": 0.11272169649600983, + "learning_rate": 0.0012957273255336397, + "loss": 2.2842, + "step": 412400 + }, + { + "epoch": 1.5942617247297861, + "grad_norm": 0.10272194445133209, + "learning_rate": 0.0012954434018476583, + "loss": 2.2899, + "step": 412410 + }, + { + "epoch": 1.5943003819331696, + "grad_norm": 0.0964650809764862, + "learning_rate": 0.001295159592531756, + "loss": 2.2863, + "step": 412420 + }, + { + "epoch": 1.5943390391365528, + "grad_norm": 0.09082885831594467, + "learning_rate": 0.0012948758974478322, + "loss": 2.2722, + "step": 412430 + }, + { + "epoch": 1.594377696339936, + "grad_norm": 0.10105688869953156, + "learning_rate": 0.001294592316458064, + "loss": 2.2836, + "step": 412440 + }, + { + "epoch": 1.5944163535433193, + "grad_norm": 0.11495474725961685, + "learning_rate": 0.001294308849424906, + "loss": 2.2826, + "step": 412450 + }, + { + "epoch": 1.5944550107467026, + "grad_norm": 0.09655379503965378, + "learning_rate": 0.0012940254962110881, + "loss": 2.265, + "step": 412460 + }, + { + "epoch": 1.5944936679500858, + "grad_norm": 0.10862415283918381, + "learning_rate": 0.001293742256679617, + "loss": 2.305, + "step": 412470 + }, + { + "epoch": 1.594532325153469, + "grad_norm": 0.10821930319070816, + "learning_rate": 0.0012934591306937723, + "loss": 2.2805, + "step": 412480 + }, + { + "epoch": 1.5945709823568524, + "grad_norm": 0.10074169933795929, + "learning_rate": 0.0012931761181171084, + "loss": 2.2778, + "step": 412490 + }, + { + "epoch": 1.5946096395602356, + "grad_norm": 0.10636480897665024, + "learning_rate": 0.0012928932188134526, + "loss": 2.2731, + "step": 412500 + }, + { + "epoch": 1.5946482967636189, + "grad_norm": 0.09842386096715927, + "learning_rate": 0.0012926104326469044, + "loss": 2.2806, + "step": 412510 + }, + { + "epoch": 1.594686953967002, + "grad_norm": 0.10499393194913864, + "learning_rate": 0.0012923277594818346, + "loss": 2.2826, + "step": 412520 + }, + { + "epoch": 1.5947256111703854, + "grad_norm": 0.0923650711774826, + "learning_rate": 0.0012920451991828856, + "loss": 2.2768, + "step": 412530 + }, + { + "epoch": 1.5947642683737686, + "grad_norm": 0.09121539443731308, + "learning_rate": 0.001291762751614969, + "loss": 2.2732, + "step": 412540 + }, + { + "epoch": 1.594802925577152, + "grad_norm": 0.12280796468257904, + "learning_rate": 0.0012914804166432661, + "loss": 2.2729, + "step": 412550 + }, + { + "epoch": 1.5948415827805353, + "grad_norm": 0.10645350813865662, + "learning_rate": 0.0012911981941332261, + "loss": 2.2759, + "step": 412560 + }, + { + "epoch": 1.5948802399839186, + "grad_norm": 0.08899696171283722, + "learning_rate": 0.0012909160839505668, + "loss": 2.2877, + "step": 412570 + }, + { + "epoch": 1.5949188971873018, + "grad_norm": 0.08800100535154343, + "learning_rate": 0.001290634085961272, + "loss": 2.2736, + "step": 412580 + }, + { + "epoch": 1.5949575543906853, + "grad_norm": 0.10100112110376358, + "learning_rate": 0.001290352200031593, + "loss": 2.2794, + "step": 412590 + }, + { + "epoch": 1.5949962115940686, + "grad_norm": 0.1053532063961029, + "learning_rate": 0.0012900704260280463, + "loss": 2.2769, + "step": 412600 + }, + { + "epoch": 1.5950348687974518, + "grad_norm": 0.11090871691703796, + "learning_rate": 0.001289788763817412, + "loss": 2.2763, + "step": 412610 + }, + { + "epoch": 1.595073526000835, + "grad_norm": 0.0996427908539772, + "learning_rate": 0.0012895072132667355, + "loss": 2.2865, + "step": 412620 + }, + { + "epoch": 1.5951121832042183, + "grad_norm": 0.10796479880809784, + "learning_rate": 0.0012892257742433254, + "loss": 2.2742, + "step": 412630 + }, + { + "epoch": 1.5951508404076016, + "grad_norm": 0.11622989177703857, + "learning_rate": 0.0012889444466147528, + "loss": 2.2768, + "step": 412640 + }, + { + "epoch": 1.5951894976109848, + "grad_norm": 0.09687823057174683, + "learning_rate": 0.0012886632302488504, + "loss": 2.2826, + "step": 412650 + }, + { + "epoch": 1.595228154814368, + "grad_norm": 0.107315793633461, + "learning_rate": 0.0012883821250137123, + "loss": 2.2936, + "step": 412660 + }, + { + "epoch": 1.5952668120177513, + "grad_norm": 0.11312927305698395, + "learning_rate": 0.0012881011307776923, + "loss": 2.2905, + "step": 412670 + }, + { + "epoch": 1.5953054692211346, + "grad_norm": 0.09998205304145813, + "learning_rate": 0.0012878202474094057, + "loss": 2.2802, + "step": 412680 + }, + { + "epoch": 1.5953441264245178, + "grad_norm": 0.10146909207105637, + "learning_rate": 0.001287539474777725, + "loss": 2.2754, + "step": 412690 + }, + { + "epoch": 1.595382783627901, + "grad_norm": 0.09956265985965729, + "learning_rate": 0.0012872588127517816, + "loss": 2.2828, + "step": 412700 + }, + { + "epoch": 1.5954214408312843, + "grad_norm": 0.1043073907494545, + "learning_rate": 0.0012869782612009643, + "loss": 2.2712, + "step": 412710 + }, + { + "epoch": 1.5954600980346678, + "grad_norm": 0.11289764195680618, + "learning_rate": 0.0012866978199949197, + "loss": 2.2858, + "step": 412720 + }, + { + "epoch": 1.595498755238051, + "grad_norm": 0.10688609629869461, + "learning_rate": 0.0012864174890035492, + "loss": 2.26, + "step": 412730 + }, + { + "epoch": 1.5955374124414343, + "grad_norm": 0.1139904335141182, + "learning_rate": 0.00128613726809701, + "loss": 2.2785, + "step": 412740 + }, + { + "epoch": 1.5955760696448176, + "grad_norm": 0.10517367720603943, + "learning_rate": 0.0012858571571457151, + "loss": 2.2816, + "step": 412750 + }, + { + "epoch": 1.595614726848201, + "grad_norm": 0.10349394381046295, + "learning_rate": 0.0012855771560203299, + "loss": 2.2903, + "step": 412760 + }, + { + "epoch": 1.5956533840515843, + "grad_norm": 0.09697014838457108, + "learning_rate": 0.0012852972645917746, + "loss": 2.2865, + "step": 412770 + }, + { + "epoch": 1.5956920412549676, + "grad_norm": 0.10311219096183777, + "learning_rate": 0.0012850174827312212, + "loss": 2.2963, + "step": 412780 + }, + { + "epoch": 1.5957306984583508, + "grad_norm": 0.10511083900928497, + "learning_rate": 0.0012847378103100933, + "loss": 2.2981, + "step": 412790 + }, + { + "epoch": 1.595769355661734, + "grad_norm": 0.09088359028100967, + "learning_rate": 0.0012844582472000675, + "loss": 2.2919, + "step": 412800 + }, + { + "epoch": 1.5958080128651173, + "grad_norm": 0.10324206948280334, + "learning_rate": 0.001284178793273069, + "loss": 2.2829, + "step": 412810 + }, + { + "epoch": 1.5958466700685006, + "grad_norm": 0.09309279173612595, + "learning_rate": 0.0012838994484012738, + "loss": 2.2731, + "step": 412820 + }, + { + "epoch": 1.5958853272718838, + "grad_norm": 0.17614911496639252, + "learning_rate": 0.0012836202124571075, + "loss": 2.268, + "step": 412830 + }, + { + "epoch": 1.595923984475267, + "grad_norm": 0.10486125200986862, + "learning_rate": 0.0012833410853132431, + "loss": 2.2921, + "step": 412840 + }, + { + "epoch": 1.5959626416786503, + "grad_norm": 0.11186165362596512, + "learning_rate": 0.0012830620668426032, + "loss": 2.2847, + "step": 412850 + }, + { + "epoch": 1.5960012988820336, + "grad_norm": 0.12183138728141785, + "learning_rate": 0.001282783156918356, + "loss": 2.2755, + "step": 412860 + }, + { + "epoch": 1.5960399560854168, + "grad_norm": 0.0917850136756897, + "learning_rate": 0.001282504355413916, + "loss": 2.2678, + "step": 412870 + }, + { + "epoch": 1.5960786132888, + "grad_norm": 0.10441316664218903, + "learning_rate": 0.0012822256622029456, + "loss": 2.2848, + "step": 412880 + }, + { + "epoch": 1.5961172704921835, + "grad_norm": 0.09781082719564438, + "learning_rate": 0.0012819470771593504, + "loss": 2.2803, + "step": 412890 + }, + { + "epoch": 1.5961559276955668, + "grad_norm": 0.09336867183446884, + "learning_rate": 0.001281668600157281, + "loss": 2.2777, + "step": 412900 + }, + { + "epoch": 1.59619458489895, + "grad_norm": 0.11104751378297806, + "learning_rate": 0.001281390231071133, + "loss": 2.2826, + "step": 412910 + }, + { + "epoch": 1.5962332421023333, + "grad_norm": 0.09125860035419464, + "learning_rate": 0.001281111969775543, + "loss": 2.2865, + "step": 412920 + }, + { + "epoch": 1.5962718993057168, + "grad_norm": 0.10214760154485703, + "learning_rate": 0.001280833816145392, + "loss": 2.2782, + "step": 412930 + }, + { + "epoch": 1.5963105565091, + "grad_norm": 0.10561887919902802, + "learning_rate": 0.0012805557700558022, + "loss": 2.2736, + "step": 412940 + }, + { + "epoch": 1.5963492137124833, + "grad_norm": 0.10510638356208801, + "learning_rate": 0.0012802778313821368, + "loss": 2.2831, + "step": 412950 + }, + { + "epoch": 1.5963878709158665, + "grad_norm": 0.11140754073858261, + "learning_rate": 0.00128, + "loss": 2.279, + "step": 412960 + }, + { + "epoch": 1.5964265281192498, + "grad_norm": 0.10227959603071213, + "learning_rate": 0.0012797222757852356, + "loss": 2.2798, + "step": 412970 + }, + { + "epoch": 1.596465185322633, + "grad_norm": 0.10155359655618668, + "learning_rate": 0.0012794446586139273, + "loss": 2.2788, + "step": 412980 + }, + { + "epoch": 1.5965038425260163, + "grad_norm": 0.09626945108175278, + "learning_rate": 0.001279167148362396, + "loss": 2.2714, + "step": 412990 + }, + { + "epoch": 1.5965424997293995, + "grad_norm": 0.1154874861240387, + "learning_rate": 0.0012788897449072022, + "loss": 2.2803, + "step": 413000 + }, + { + "epoch": 1.5965811569327828, + "grad_norm": 0.09753113240003586, + "learning_rate": 0.0012786124481251426, + "loss": 2.2998, + "step": 413010 + }, + { + "epoch": 1.596619814136166, + "grad_norm": 0.10309763252735138, + "learning_rate": 0.0012783352578932515, + "loss": 2.2726, + "step": 413020 + }, + { + "epoch": 1.5966584713395493, + "grad_norm": 0.10636594146490097, + "learning_rate": 0.0012780581740887982, + "loss": 2.2824, + "step": 413030 + }, + { + "epoch": 1.5966971285429326, + "grad_norm": 0.11068262904882431, + "learning_rate": 0.0012777811965892884, + "loss": 2.2909, + "step": 413040 + }, + { + "epoch": 1.5967357857463158, + "grad_norm": 0.10145969688892365, + "learning_rate": 0.0012775043252724622, + "loss": 2.2845, + "step": 413050 + }, + { + "epoch": 1.5967744429496993, + "grad_norm": 0.10044920444488525, + "learning_rate": 0.0012772275600162939, + "loss": 2.2767, + "step": 413060 + }, + { + "epoch": 1.5968131001530825, + "grad_norm": 0.10166584700345993, + "learning_rate": 0.0012769509006989913, + "loss": 2.2787, + "step": 413070 + }, + { + "epoch": 1.5968517573564658, + "grad_norm": 0.1131749376654625, + "learning_rate": 0.001276674347198995, + "loss": 2.287, + "step": 413080 + }, + { + "epoch": 1.596890414559849, + "grad_norm": 0.09794627130031586, + "learning_rate": 0.0012763978993949783, + "loss": 2.2853, + "step": 413090 + }, + { + "epoch": 1.5969290717632325, + "grad_norm": 0.10854090750217438, + "learning_rate": 0.0012761215571658459, + "loss": 2.2833, + "step": 413100 + }, + { + "epoch": 1.5969677289666158, + "grad_norm": 0.09260708838701248, + "learning_rate": 0.0012758453203907329, + "loss": 2.2825, + "step": 413110 + }, + { + "epoch": 1.597006386169999, + "grad_norm": 0.10994840413331985, + "learning_rate": 0.0012755691889490067, + "loss": 2.2724, + "step": 413120 + }, + { + "epoch": 1.5970450433733823, + "grad_norm": 0.10354938358068466, + "learning_rate": 0.0012752931627202627, + "loss": 2.275, + "step": 413130 + }, + { + "epoch": 1.5970837005767655, + "grad_norm": 0.09041547030210495, + "learning_rate": 0.0012750172415843257, + "loss": 2.2847, + "step": 413140 + }, + { + "epoch": 1.5971223577801488, + "grad_norm": 0.10713107883930206, + "learning_rate": 0.0012747414254212501, + "loss": 2.3039, + "step": 413150 + }, + { + "epoch": 1.597161014983532, + "grad_norm": 0.11910802125930786, + "learning_rate": 0.0012744657141113178, + "loss": 2.2999, + "step": 413160 + }, + { + "epoch": 1.5971996721869153, + "grad_norm": 0.0957915261387825, + "learning_rate": 0.0012741901075350376, + "loss": 2.2702, + "step": 413170 + }, + { + "epoch": 1.5972383293902985, + "grad_norm": 0.09311871975660324, + "learning_rate": 0.001273914605573146, + "loss": 2.2805, + "step": 413180 + }, + { + "epoch": 1.5972769865936818, + "grad_norm": 0.0910448431968689, + "learning_rate": 0.001273639208106605, + "loss": 2.2822, + "step": 413190 + }, + { + "epoch": 1.597315643797065, + "grad_norm": 0.0983869656920433, + "learning_rate": 0.0012733639150166018, + "loss": 2.2788, + "step": 413200 + }, + { + "epoch": 1.5973543010004483, + "grad_norm": 0.08953586965799332, + "learning_rate": 0.0012730887261845501, + "loss": 2.2892, + "step": 413210 + }, + { + "epoch": 1.5973929582038315, + "grad_norm": 0.0971858873963356, + "learning_rate": 0.0012728136414920863, + "loss": 2.2721, + "step": 413220 + }, + { + "epoch": 1.597431615407215, + "grad_norm": 0.09747296571731567, + "learning_rate": 0.0012725386608210716, + "loss": 2.2784, + "step": 413230 + }, + { + "epoch": 1.5974702726105983, + "grad_norm": 0.10249760001897812, + "learning_rate": 0.00127226378405359, + "loss": 2.2793, + "step": 413240 + }, + { + "epoch": 1.5975089298139815, + "grad_norm": 0.09742303937673569, + "learning_rate": 0.0012719890110719483, + "loss": 2.2895, + "step": 413250 + }, + { + "epoch": 1.5975475870173648, + "grad_norm": 0.09713740646839142, + "learning_rate": 0.001271714341758675, + "loss": 2.2726, + "step": 413260 + }, + { + "epoch": 1.5975862442207482, + "grad_norm": 0.1044480949640274, + "learning_rate": 0.0012714397759965206, + "loss": 2.2858, + "step": 413270 + }, + { + "epoch": 1.5976249014241315, + "grad_norm": 0.09574960172176361, + "learning_rate": 0.001271165313668456, + "loss": 2.285, + "step": 413280 + }, + { + "epoch": 1.5976635586275147, + "grad_norm": 0.1157715916633606, + "learning_rate": 0.0012708909546576726, + "loss": 2.289, + "step": 413290 + }, + { + "epoch": 1.597702215830898, + "grad_norm": 0.09488074481487274, + "learning_rate": 0.0012706166988475812, + "loss": 2.2866, + "step": 413300 + }, + { + "epoch": 1.5977408730342813, + "grad_norm": 0.1098017692565918, + "learning_rate": 0.001270342546121812, + "loss": 2.276, + "step": 413310 + }, + { + "epoch": 1.5977795302376645, + "grad_norm": 0.10246486961841583, + "learning_rate": 0.0012700684963642135, + "loss": 2.2728, + "step": 413320 + }, + { + "epoch": 1.5978181874410478, + "grad_norm": 0.10948675870895386, + "learning_rate": 0.0012697945494588529, + "loss": 2.2733, + "step": 413330 + }, + { + "epoch": 1.597856844644431, + "grad_norm": 0.10623447597026825, + "learning_rate": 0.0012695207052900131, + "loss": 2.2814, + "step": 413340 + }, + { + "epoch": 1.5978955018478143, + "grad_norm": 0.09846518188714981, + "learning_rate": 0.0012692469637421958, + "loss": 2.2762, + "step": 413350 + }, + { + "epoch": 1.5979341590511975, + "grad_norm": 0.11750591546297073, + "learning_rate": 0.0012689733247001173, + "loss": 2.27, + "step": 413360 + }, + { + "epoch": 1.5979728162545808, + "grad_norm": 0.10157160460948944, + "learning_rate": 0.001268699788048711, + "loss": 2.2894, + "step": 413370 + }, + { + "epoch": 1.598011473457964, + "grad_norm": 0.10318373888731003, + "learning_rate": 0.0012684263536731249, + "loss": 2.283, + "step": 413380 + }, + { + "epoch": 1.5980501306613475, + "grad_norm": 0.09734541177749634, + "learning_rate": 0.0012681530214587204, + "loss": 2.2726, + "step": 413390 + }, + { + "epoch": 1.5980887878647307, + "grad_norm": 0.09607987105846405, + "learning_rate": 0.001267879791291075, + "loss": 2.2841, + "step": 413400 + }, + { + "epoch": 1.598127445068114, + "grad_norm": 0.10669342428445816, + "learning_rate": 0.0012676066630559779, + "loss": 2.2788, + "step": 413410 + }, + { + "epoch": 1.5981661022714972, + "grad_norm": 0.0970144048333168, + "learning_rate": 0.001267333636639432, + "loss": 2.2853, + "step": 413420 + }, + { + "epoch": 1.5982047594748805, + "grad_norm": 0.09008067846298218, + "learning_rate": 0.0012670607119276522, + "loss": 2.2842, + "step": 413430 + }, + { + "epoch": 1.598243416678264, + "grad_norm": 0.11209733784198761, + "learning_rate": 0.0012667878888070656, + "loss": 2.2788, + "step": 413440 + }, + { + "epoch": 1.5982820738816472, + "grad_norm": 0.09946850687265396, + "learning_rate": 0.0012665151671643101, + "loss": 2.2864, + "step": 413450 + }, + { + "epoch": 1.5983207310850305, + "grad_norm": 0.09585539996623993, + "learning_rate": 0.0012662425468862343, + "loss": 2.2808, + "step": 413460 + }, + { + "epoch": 1.5983593882884137, + "grad_norm": 0.11974449455738068, + "learning_rate": 0.0012659700278598973, + "loss": 2.2809, + "step": 413470 + }, + { + "epoch": 1.598398045491797, + "grad_norm": 0.11818915605545044, + "learning_rate": 0.0012656976099725671, + "loss": 2.2805, + "step": 413480 + }, + { + "epoch": 1.5984367026951802, + "grad_norm": 0.1089639961719513, + "learning_rate": 0.0012654252931117217, + "loss": 2.2755, + "step": 413490 + }, + { + "epoch": 1.5984753598985635, + "grad_norm": 0.13089609146118164, + "learning_rate": 0.0012651530771650465, + "loss": 2.2763, + "step": 413500 + }, + { + "epoch": 1.5985140171019467, + "grad_norm": 0.11262860149145126, + "learning_rate": 0.0012648809620204359, + "loss": 2.2805, + "step": 413510 + }, + { + "epoch": 1.59855267430533, + "grad_norm": 0.11303657293319702, + "learning_rate": 0.0012646089475659905, + "loss": 2.2887, + "step": 413520 + }, + { + "epoch": 1.5985913315087132, + "grad_norm": 0.09849189966917038, + "learning_rate": 0.001264337033690019, + "loss": 2.276, + "step": 413530 + }, + { + "epoch": 1.5986299887120965, + "grad_norm": 0.10323601216077805, + "learning_rate": 0.001264065220281036, + "loss": 2.2874, + "step": 413540 + }, + { + "epoch": 1.5986686459154797, + "grad_norm": 0.10031892359256744, + "learning_rate": 0.0012637935072277616, + "loss": 2.2652, + "step": 413550 + }, + { + "epoch": 1.5987073031188632, + "grad_norm": 0.3931354284286499, + "learning_rate": 0.0012635218944191213, + "loss": 2.284, + "step": 413560 + }, + { + "epoch": 1.5987459603222465, + "grad_norm": 0.09942755848169327, + "learning_rate": 0.0012632503817442456, + "loss": 2.2645, + "step": 413570 + }, + { + "epoch": 1.5987846175256297, + "grad_norm": 0.1002105176448822, + "learning_rate": 0.001262978969092469, + "loss": 2.2841, + "step": 413580 + }, + { + "epoch": 1.598823274729013, + "grad_norm": 0.10870186984539032, + "learning_rate": 0.0012627076563533298, + "loss": 2.267, + "step": 413590 + }, + { + "epoch": 1.5988619319323962, + "grad_norm": 0.10717492550611496, + "learning_rate": 0.001262436443416569, + "loss": 2.262, + "step": 413600 + }, + { + "epoch": 1.5989005891357797, + "grad_norm": 0.10230233520269394, + "learning_rate": 0.0012621653301721314, + "loss": 2.2821, + "step": 413610 + }, + { + "epoch": 1.598939246339163, + "grad_norm": 0.1118675246834755, + "learning_rate": 0.0012618943165101629, + "loss": 2.287, + "step": 413620 + }, + { + "epoch": 1.5989779035425462, + "grad_norm": 0.10283380001783371, + "learning_rate": 0.0012616234023210106, + "loss": 2.268, + "step": 413630 + }, + { + "epoch": 1.5990165607459295, + "grad_norm": 0.10829443484544754, + "learning_rate": 0.0012613525874952244, + "loss": 2.2735, + "step": 413640 + }, + { + "epoch": 1.5990552179493127, + "grad_norm": 0.09989216923713684, + "learning_rate": 0.0012610818719235534, + "loss": 2.2734, + "step": 413650 + }, + { + "epoch": 1.599093875152696, + "grad_norm": 0.12643863260746002, + "learning_rate": 0.0012608112554969469, + "loss": 2.2787, + "step": 413660 + }, + { + "epoch": 1.5991325323560792, + "grad_norm": 0.09596065431833267, + "learning_rate": 0.0012605407381065538, + "loss": 2.2773, + "step": 413670 + }, + { + "epoch": 1.5991711895594625, + "grad_norm": 0.11484631896018982, + "learning_rate": 0.001260270319643723, + "loss": 2.2758, + "step": 413680 + }, + { + "epoch": 1.5992098467628457, + "grad_norm": 0.10234335064888, + "learning_rate": 0.00126, + "loss": 2.2901, + "step": 413690 + }, + { + "epoch": 1.599248503966229, + "grad_norm": 0.09941039234399796, + "learning_rate": 0.00125972977906713, + "loss": 2.2768, + "step": 413700 + }, + { + "epoch": 1.5992871611696122, + "grad_norm": 0.09414675086736679, + "learning_rate": 0.0012594596567370552, + "loss": 2.2717, + "step": 413710 + }, + { + "epoch": 1.5993258183729955, + "grad_norm": 0.10195624828338623, + "learning_rate": 0.0012591896329019147, + "loss": 2.2717, + "step": 413720 + }, + { + "epoch": 1.599364475576379, + "grad_norm": 0.11093621701002121, + "learning_rate": 0.001258919707454044, + "loss": 2.2896, + "step": 413730 + }, + { + "epoch": 1.5994031327797622, + "grad_norm": 0.11402977257966995, + "learning_rate": 0.0012586498802859745, + "loss": 2.2758, + "step": 413740 + }, + { + "epoch": 1.5994417899831455, + "grad_norm": 0.09677516669034958, + "learning_rate": 0.0012583801512904339, + "loss": 2.2798, + "step": 413750 + }, + { + "epoch": 1.5994804471865287, + "grad_norm": 0.1029655858874321, + "learning_rate": 0.0012581105203603436, + "loss": 2.2876, + "step": 413760 + }, + { + "epoch": 1.599519104389912, + "grad_norm": 0.10037294775247574, + "learning_rate": 0.0012578409873888212, + "loss": 2.2785, + "step": 413770 + }, + { + "epoch": 1.5995577615932954, + "grad_norm": 0.10133747011423111, + "learning_rate": 0.0012575715522691767, + "loss": 2.2804, + "step": 413780 + }, + { + "epoch": 1.5995964187966787, + "grad_norm": 0.10408452898263931, + "learning_rate": 0.0012573022148949144, + "loss": 2.2828, + "step": 413790 + }, + { + "epoch": 1.599635076000062, + "grad_norm": 0.10227343440055847, + "learning_rate": 0.0012570329751597316, + "loss": 2.2861, + "step": 413800 + }, + { + "epoch": 1.5996737332034452, + "grad_norm": 0.1007285937666893, + "learning_rate": 0.0012567638329575182, + "loss": 2.2801, + "step": 413810 + }, + { + "epoch": 1.5997123904068284, + "grad_norm": 0.1031252071261406, + "learning_rate": 0.001256494788182356, + "loss": 2.2789, + "step": 413820 + }, + { + "epoch": 1.5997510476102117, + "grad_norm": 0.1067824587225914, + "learning_rate": 0.0012562258407285182, + "loss": 2.3046, + "step": 413830 + }, + { + "epoch": 1.599789704813595, + "grad_norm": 0.10092274099588394, + "learning_rate": 0.001255956990490469, + "loss": 2.2862, + "step": 413840 + }, + { + "epoch": 1.5998283620169782, + "grad_norm": 0.10653362423181534, + "learning_rate": 0.0012556882373628642, + "loss": 2.2808, + "step": 413850 + }, + { + "epoch": 1.5998670192203615, + "grad_norm": 0.09871282428503036, + "learning_rate": 0.0012554195812405488, + "loss": 2.2773, + "step": 413860 + }, + { + "epoch": 1.5999056764237447, + "grad_norm": 0.1083545833826065, + "learning_rate": 0.001255151022018557, + "loss": 2.2712, + "step": 413870 + }, + { + "epoch": 1.599944333627128, + "grad_norm": 0.10615554451942444, + "learning_rate": 0.0012548825595921139, + "loss": 2.2687, + "step": 413880 + }, + { + "epoch": 1.5999829908305112, + "grad_norm": 0.10352876782417297, + "learning_rate": 0.001254614193856631, + "loss": 2.2818, + "step": 413890 + }, + { + "epoch": 1.6000216480338947, + "grad_norm": 0.11283598840236664, + "learning_rate": 0.00125434592470771, + "loss": 2.2948, + "step": 413900 + }, + { + "epoch": 1.600060305237278, + "grad_norm": 0.11240514367818832, + "learning_rate": 0.0012540777520411394, + "loss": 2.2909, + "step": 413910 + }, + { + "epoch": 1.6000989624406612, + "grad_norm": 0.10302004963159561, + "learning_rate": 0.0012538096757528949, + "loss": 2.275, + "step": 413920 + }, + { + "epoch": 1.6001376196440444, + "grad_norm": 0.11001778393983841, + "learning_rate": 0.001253541695739139, + "loss": 2.2815, + "step": 413930 + }, + { + "epoch": 1.600176276847428, + "grad_norm": 0.09422043710947037, + "learning_rate": 0.0012532738118962213, + "loss": 2.281, + "step": 413940 + }, + { + "epoch": 1.6002149340508112, + "grad_norm": 0.1052865982055664, + "learning_rate": 0.0012530060241206762, + "loss": 2.2635, + "step": 413950 + }, + { + "epoch": 1.6002535912541944, + "grad_norm": 0.09602361172437668, + "learning_rate": 0.0012527383323092237, + "loss": 2.2773, + "step": 413960 + }, + { + "epoch": 1.6002922484575777, + "grad_norm": 0.0933568999171257, + "learning_rate": 0.0012524707363587696, + "loss": 2.2783, + "step": 413970 + }, + { + "epoch": 1.600330905660961, + "grad_norm": 0.10403438657522202, + "learning_rate": 0.0012522032361664034, + "loss": 2.2874, + "step": 413980 + }, + { + "epoch": 1.6003695628643442, + "grad_norm": 0.1254464089870453, + "learning_rate": 0.0012519358316293982, + "loss": 2.2858, + "step": 413990 + }, + { + "epoch": 1.6004082200677274, + "grad_norm": 0.10663452744483948, + "learning_rate": 0.0012516685226452117, + "loss": 2.2685, + "step": 414000 + }, + { + "epoch": 1.6004468772711107, + "grad_norm": 0.09303441643714905, + "learning_rate": 0.0012514013091114839, + "loss": 2.2671, + "step": 414010 + }, + { + "epoch": 1.600485534474494, + "grad_norm": 0.12453543394804001, + "learning_rate": 0.0012511341909260379, + "loss": 2.2814, + "step": 414020 + }, + { + "epoch": 1.6005241916778772, + "grad_norm": 0.1046123132109642, + "learning_rate": 0.0012508671679868782, + "loss": 2.2908, + "step": 414030 + }, + { + "epoch": 1.6005628488812604, + "grad_norm": 0.10266414284706116, + "learning_rate": 0.0012506002401921922, + "loss": 2.2784, + "step": 414040 + }, + { + "epoch": 1.6006015060846437, + "grad_norm": 0.09736956655979156, + "learning_rate": 0.0012503334074403477, + "loss": 2.2799, + "step": 414050 + }, + { + "epoch": 1.600640163288027, + "grad_norm": 0.10303352773189545, + "learning_rate": 0.001250066669629893, + "loss": 2.2809, + "step": 414060 + }, + { + "epoch": 1.6006788204914104, + "grad_norm": 0.10491194576025009, + "learning_rate": 0.001249800026659558, + "loss": 2.2847, + "step": 414070 + }, + { + "epoch": 1.6007174776947937, + "grad_norm": 0.10172902047634125, + "learning_rate": 0.0012495334784282512, + "loss": 2.2779, + "step": 414080 + }, + { + "epoch": 1.600756134898177, + "grad_norm": 0.2797784209251404, + "learning_rate": 0.0012492670248350616, + "loss": 2.2656, + "step": 414090 + }, + { + "epoch": 1.6007947921015602, + "grad_norm": 0.1073826253414154, + "learning_rate": 0.0012490006657792565, + "loss": 2.2824, + "step": 414100 + }, + { + "epoch": 1.6008334493049436, + "grad_norm": 0.10700371861457825, + "learning_rate": 0.0012487344011602821, + "loss": 2.2809, + "step": 414110 + }, + { + "epoch": 1.600872106508327, + "grad_norm": 0.11427832394838333, + "learning_rate": 0.0012484682308777626, + "loss": 2.2699, + "step": 414120 + }, + { + "epoch": 1.6009107637117101, + "grad_norm": 0.10496781021356583, + "learning_rate": 0.0012482021548315, + "loss": 2.2802, + "step": 414130 + }, + { + "epoch": 1.6009494209150934, + "grad_norm": 0.11209113150835037, + "learning_rate": 0.0012479361729214734, + "loss": 2.2817, + "step": 414140 + }, + { + "epoch": 1.6009880781184767, + "grad_norm": 0.09979293495416641, + "learning_rate": 0.001247670285047839, + "loss": 2.2669, + "step": 414150 + }, + { + "epoch": 1.60102673532186, + "grad_norm": 0.10232799500226974, + "learning_rate": 0.0012474044911109288, + "loss": 2.2704, + "step": 414160 + }, + { + "epoch": 1.6010653925252432, + "grad_norm": 0.1055445596575737, + "learning_rate": 0.0012471387910112518, + "loss": 2.2804, + "step": 414170 + }, + { + "epoch": 1.6011040497286264, + "grad_norm": 0.11574835330247879, + "learning_rate": 0.0012468731846494907, + "loss": 2.2749, + "step": 414180 + }, + { + "epoch": 1.6011427069320097, + "grad_norm": 0.0972663164138794, + "learning_rate": 0.0012466076719265056, + "loss": 2.2759, + "step": 414190 + }, + { + "epoch": 1.601181364135393, + "grad_norm": 0.09286480396986008, + "learning_rate": 0.0012463422527433292, + "loss": 2.2798, + "step": 414200 + }, + { + "epoch": 1.6012200213387762, + "grad_norm": 0.14326174557209015, + "learning_rate": 0.0012460769270011697, + "loss": 2.2817, + "step": 414210 + }, + { + "epoch": 1.6012586785421594, + "grad_norm": 0.11282878369092941, + "learning_rate": 0.001245811694601408, + "loss": 2.2926, + "step": 414220 + }, + { + "epoch": 1.6012973357455427, + "grad_norm": 0.10788289457559586, + "learning_rate": 0.0012455465554455994, + "loss": 2.2846, + "step": 414230 + }, + { + "epoch": 1.6013359929489261, + "grad_norm": 0.09294237941503525, + "learning_rate": 0.0012452815094354717, + "loss": 2.2862, + "step": 414240 + }, + { + "epoch": 1.6013746501523094, + "grad_norm": 0.10845671594142914, + "learning_rate": 0.0012450165564729253, + "loss": 2.2683, + "step": 414250 + }, + { + "epoch": 1.6014133073556927, + "grad_norm": 0.10894111543893814, + "learning_rate": 0.001244751696460032, + "loss": 2.2686, + "step": 414260 + }, + { + "epoch": 1.601451964559076, + "grad_norm": 0.09936366230249405, + "learning_rate": 0.0012444869292990359, + "loss": 2.2792, + "step": 414270 + }, + { + "epoch": 1.6014906217624594, + "grad_norm": 0.09494331479072571, + "learning_rate": 0.0012442222548923528, + "loss": 2.2858, + "step": 414280 + }, + { + "epoch": 1.6015292789658426, + "grad_norm": 0.09906581044197083, + "learning_rate": 0.001243957673142568, + "loss": 2.2755, + "step": 414290 + }, + { + "epoch": 1.6015679361692259, + "grad_norm": 0.10921325534582138, + "learning_rate": 0.0012436931839524385, + "loss": 2.2702, + "step": 414300 + }, + { + "epoch": 1.6016065933726091, + "grad_norm": 0.105479396879673, + "learning_rate": 0.0012434287872248903, + "loss": 2.2867, + "step": 414310 + }, + { + "epoch": 1.6016452505759924, + "grad_norm": 0.10393494367599487, + "learning_rate": 0.00124316448286302, + "loss": 2.281, + "step": 414320 + }, + { + "epoch": 1.6016839077793756, + "grad_norm": 0.10371506959199905, + "learning_rate": 0.001242900270770092, + "loss": 2.2936, + "step": 414330 + }, + { + "epoch": 1.601722564982759, + "grad_norm": 0.11971278488636017, + "learning_rate": 0.0012426361508495404, + "loss": 2.2801, + "step": 414340 + }, + { + "epoch": 1.6017612221861421, + "grad_norm": 0.10474719107151031, + "learning_rate": 0.0012423721230049676, + "loss": 2.291, + "step": 414350 + }, + { + "epoch": 1.6017998793895254, + "grad_norm": 0.10546832531690598, + "learning_rate": 0.0012421081871401435, + "loss": 2.2873, + "step": 414360 + }, + { + "epoch": 1.6018385365929086, + "grad_norm": 0.11091950535774231, + "learning_rate": 0.0012418443431590053, + "loss": 2.273, + "step": 414370 + }, + { + "epoch": 1.601877193796292, + "grad_norm": 0.09752365201711655, + "learning_rate": 0.0012415805909656583, + "loss": 2.277, + "step": 414380 + }, + { + "epoch": 1.6019158509996752, + "grad_norm": 0.09656338393688202, + "learning_rate": 0.0012413169304643736, + "loss": 2.2818, + "step": 414390 + }, + { + "epoch": 1.6019545082030584, + "grad_norm": 0.12549050152301788, + "learning_rate": 0.001241053361559589, + "loss": 2.2766, + "step": 414400 + }, + { + "epoch": 1.6019931654064419, + "grad_norm": 0.0944322869181633, + "learning_rate": 0.0012407898841559077, + "loss": 2.2892, + "step": 414410 + }, + { + "epoch": 1.6020318226098251, + "grad_norm": 0.10391388088464737, + "learning_rate": 0.001240526498158099, + "loss": 2.2892, + "step": 414420 + }, + { + "epoch": 1.6020704798132084, + "grad_norm": 0.09657981991767883, + "learning_rate": 0.001240263203471097, + "loss": 2.287, + "step": 414430 + }, + { + "epoch": 1.6021091370165916, + "grad_norm": 0.09660341590642929, + "learning_rate": 0.00124, + "loss": 2.2946, + "step": 414440 + }, + { + "epoch": 1.602147794219975, + "grad_norm": 0.39730313420295715, + "learning_rate": 0.0012397368876500715, + "loss": 2.2819, + "step": 414450 + }, + { + "epoch": 1.6021864514233584, + "grad_norm": 0.10600872337818146, + "learning_rate": 0.0012394738663267383, + "loss": 2.2668, + "step": 414460 + }, + { + "epoch": 1.6022251086267416, + "grad_norm": 0.09585840255022049, + "learning_rate": 0.0012392109359355907, + "loss": 2.2738, + "step": 414470 + }, + { + "epoch": 1.6022637658301249, + "grad_norm": 0.09625507146120071, + "learning_rate": 0.001238948096382382, + "loss": 2.2827, + "step": 414480 + }, + { + "epoch": 1.6023024230335081, + "grad_norm": 0.10400725156068802, + "learning_rate": 0.001238685347573029, + "loss": 2.2772, + "step": 414490 + }, + { + "epoch": 1.6023410802368914, + "grad_norm": 0.1072283685207367, + "learning_rate": 0.0012384226894136092, + "loss": 2.2799, + "step": 414500 + }, + { + "epoch": 1.6023797374402746, + "grad_norm": 0.11016058921813965, + "learning_rate": 0.0012381601218103637, + "loss": 2.2494, + "step": 414510 + }, + { + "epoch": 1.6024183946436579, + "grad_norm": 0.11033624410629272, + "learning_rate": 0.0012378976446696939, + "loss": 2.267, + "step": 414520 + }, + { + "epoch": 1.6024570518470411, + "grad_norm": 0.1050303652882576, + "learning_rate": 0.0012376352578981633, + "loss": 2.286, + "step": 414530 + }, + { + "epoch": 1.6024957090504244, + "grad_norm": 0.11567538976669312, + "learning_rate": 0.0012373729614024952, + "loss": 2.2639, + "step": 414540 + }, + { + "epoch": 1.6025343662538076, + "grad_norm": 0.10231651365756989, + "learning_rate": 0.001237110755089574, + "loss": 2.2814, + "step": 414550 + }, + { + "epoch": 1.6025730234571909, + "grad_norm": 0.09145838022232056, + "learning_rate": 0.0012368486388664435, + "loss": 2.2815, + "step": 414560 + }, + { + "epoch": 1.6026116806605741, + "grad_norm": 0.09755510091781616, + "learning_rate": 0.0012365866126403074, + "loss": 2.2707, + "step": 414570 + }, + { + "epoch": 1.6026503378639576, + "grad_norm": 0.10884889215230942, + "learning_rate": 0.0012363246763185285, + "loss": 2.2893, + "step": 414580 + }, + { + "epoch": 1.6026889950673409, + "grad_norm": 0.11061963438987732, + "learning_rate": 0.001236062829808629, + "loss": 2.2809, + "step": 414590 + }, + { + "epoch": 1.6027276522707241, + "grad_norm": 0.1166672632098198, + "learning_rate": 0.001235801073018288, + "loss": 2.2647, + "step": 414600 + }, + { + "epoch": 1.6027663094741074, + "grad_norm": 0.10163895040750504, + "learning_rate": 0.0012355394058553442, + "loss": 2.2558, + "step": 414610 + }, + { + "epoch": 1.6028049666774908, + "grad_norm": 0.11530463397502899, + "learning_rate": 0.0012352778282277935, + "loss": 2.2682, + "step": 414620 + }, + { + "epoch": 1.602843623880874, + "grad_norm": 0.10394533723592758, + "learning_rate": 0.001235016340043789, + "loss": 2.2795, + "step": 414630 + }, + { + "epoch": 1.6028822810842573, + "grad_norm": 0.09342104196548462, + "learning_rate": 0.0012347549412116403, + "loss": 2.2847, + "step": 414640 + }, + { + "epoch": 1.6029209382876406, + "grad_norm": 0.09775515645742416, + "learning_rate": 0.0012344936316398146, + "loss": 2.2587, + "step": 414650 + }, + { + "epoch": 1.6029595954910238, + "grad_norm": 0.18582019209861755, + "learning_rate": 0.001234232411236934, + "loss": 2.2786, + "step": 414660 + }, + { + "epoch": 1.602998252694407, + "grad_norm": 0.09853124618530273, + "learning_rate": 0.0012339712799117777, + "loss": 2.2731, + "step": 414670 + }, + { + "epoch": 1.6030369098977904, + "grad_norm": 0.11088378727436066, + "learning_rate": 0.0012337102375732792, + "loss": 2.298, + "step": 414680 + }, + { + "epoch": 1.6030755671011736, + "grad_norm": 0.11615308374166489, + "learning_rate": 0.001233449284130528, + "loss": 2.2655, + "step": 414690 + }, + { + "epoch": 1.6031142243045569, + "grad_norm": 0.09753936529159546, + "learning_rate": 0.0012331884194927674, + "loss": 2.264, + "step": 414700 + }, + { + "epoch": 1.60315288150794, + "grad_norm": 0.11125210672616959, + "learning_rate": 0.0012329276435693957, + "loss": 2.2854, + "step": 414710 + }, + { + "epoch": 1.6031915387113234, + "grad_norm": 0.10580608248710632, + "learning_rate": 0.001232666956269965, + "loss": 2.2761, + "step": 414720 + }, + { + "epoch": 1.6032301959147066, + "grad_norm": 0.1012212336063385, + "learning_rate": 0.0012324063575041807, + "loss": 2.2729, + "step": 414730 + }, + { + "epoch": 1.6032688531180899, + "grad_norm": 0.09648676961660385, + "learning_rate": 0.0012321458471819013, + "loss": 2.2899, + "step": 414740 + }, + { + "epoch": 1.6033075103214733, + "grad_norm": 0.09944088011980057, + "learning_rate": 0.001231885425213139, + "loss": 2.2746, + "step": 414750 + }, + { + "epoch": 1.6033461675248566, + "grad_norm": 0.1056373193860054, + "learning_rate": 0.0012316250915080582, + "loss": 2.2798, + "step": 414760 + }, + { + "epoch": 1.6033848247282398, + "grad_norm": 0.10777273774147034, + "learning_rate": 0.0012313648459769747, + "loss": 2.2701, + "step": 414770 + }, + { + "epoch": 1.603423481931623, + "grad_norm": 0.10863542556762695, + "learning_rate": 0.0012311046885303564, + "loss": 2.2884, + "step": 414780 + }, + { + "epoch": 1.6034621391350066, + "grad_norm": 0.10409296303987503, + "learning_rate": 0.0012308446190788236, + "loss": 2.2668, + "step": 414790 + }, + { + "epoch": 1.6035007963383898, + "grad_norm": 0.10630719363689423, + "learning_rate": 0.0012305846375331461, + "loss": 2.2815, + "step": 414800 + }, + { + "epoch": 1.603539453541773, + "grad_norm": 0.10058703273534775, + "learning_rate": 0.0012303247438042457, + "loss": 2.2879, + "step": 414810 + }, + { + "epoch": 1.6035781107451563, + "grad_norm": 0.09760040044784546, + "learning_rate": 0.001230064937803194, + "loss": 2.2757, + "step": 414820 + }, + { + "epoch": 1.6036167679485396, + "grad_norm": 0.09717744588851929, + "learning_rate": 0.0012298052194412118, + "loss": 2.2789, + "step": 414830 + }, + { + "epoch": 1.6036554251519228, + "grad_norm": 0.10187872499227524, + "learning_rate": 0.0012295455886296711, + "loss": 2.2813, + "step": 414840 + }, + { + "epoch": 1.603694082355306, + "grad_norm": 0.11174602806568146, + "learning_rate": 0.0012292860452800922, + "loss": 2.264, + "step": 414850 + }, + { + "epoch": 1.6037327395586893, + "grad_norm": 0.11284361034631729, + "learning_rate": 0.0012290265893041448, + "loss": 2.2585, + "step": 414860 + }, + { + "epoch": 1.6037713967620726, + "grad_norm": 0.09597291797399521, + "learning_rate": 0.0012287672206136465, + "loss": 2.2831, + "step": 414870 + }, + { + "epoch": 1.6038100539654558, + "grad_norm": 0.0953512042760849, + "learning_rate": 0.0012285079391205636, + "loss": 2.2606, + "step": 414880 + }, + { + "epoch": 1.603848711168839, + "grad_norm": 0.1113586574792862, + "learning_rate": 0.0012282487447370105, + "loss": 2.2814, + "step": 414890 + }, + { + "epoch": 1.6038873683722223, + "grad_norm": 0.11221566796302795, + "learning_rate": 0.0012279896373752486, + "loss": 2.272, + "step": 414900 + }, + { + "epoch": 1.6039260255756056, + "grad_norm": 0.10260015726089478, + "learning_rate": 0.001227730616947687, + "loss": 2.2744, + "step": 414910 + }, + { + "epoch": 1.603964682778989, + "grad_norm": 0.1190873309969902, + "learning_rate": 0.0012274716833668813, + "loss": 2.2702, + "step": 414920 + }, + { + "epoch": 1.6040033399823723, + "grad_norm": 0.10727227479219437, + "learning_rate": 0.0012272128365455337, + "loss": 2.2882, + "step": 414930 + }, + { + "epoch": 1.6040419971857556, + "grad_norm": 0.12654146552085876, + "learning_rate": 0.0012269540763964924, + "loss": 2.2745, + "step": 414940 + }, + { + "epoch": 1.6040806543891388, + "grad_norm": 0.09809217602014542, + "learning_rate": 0.0012266954028327518, + "loss": 2.2914, + "step": 414950 + }, + { + "epoch": 1.6041193115925223, + "grad_norm": 0.10526150465011597, + "learning_rate": 0.0012264368157674514, + "loss": 2.2849, + "step": 414960 + }, + { + "epoch": 1.6041579687959056, + "grad_norm": 0.0995633453130722, + "learning_rate": 0.0012261783151138758, + "loss": 2.2896, + "step": 414970 + }, + { + "epoch": 1.6041966259992888, + "grad_norm": 0.09011335670948029, + "learning_rate": 0.001225919900785455, + "loss": 2.2663, + "step": 414980 + }, + { + "epoch": 1.604235283202672, + "grad_norm": 0.10598836839199066, + "learning_rate": 0.0012256615726957624, + "loss": 2.2811, + "step": 414990 + }, + { + "epoch": 1.6042739404060553, + "grad_norm": 0.1029854342341423, + "learning_rate": 0.0012254033307585166, + "loss": 2.2946, + "step": 415000 + }, + { + "epoch": 1.6043125976094386, + "grad_norm": 0.09580104798078537, + "learning_rate": 0.0012251451748875794, + "loss": 2.2908, + "step": 415010 + }, + { + "epoch": 1.6043512548128218, + "grad_norm": 0.10150299221277237, + "learning_rate": 0.0012248871049969558, + "loss": 2.2644, + "step": 415020 + }, + { + "epoch": 1.604389912016205, + "grad_norm": 0.09846926480531693, + "learning_rate": 0.001224629121000795, + "loss": 2.2755, + "step": 415030 + }, + { + "epoch": 1.6044285692195883, + "grad_norm": 0.11625342816114426, + "learning_rate": 0.0012243712228133875, + "loss": 2.2873, + "step": 415040 + }, + { + "epoch": 1.6044672264229716, + "grad_norm": 0.11542271077632904, + "learning_rate": 0.0012241134103491672, + "loss": 2.2749, + "step": 415050 + }, + { + "epoch": 1.6045058836263548, + "grad_norm": 0.09932534396648407, + "learning_rate": 0.0012238556835227098, + "loss": 2.2735, + "step": 415060 + }, + { + "epoch": 1.604544540829738, + "grad_norm": 0.10799705982208252, + "learning_rate": 0.0012235980422487332, + "loss": 2.2614, + "step": 415070 + }, + { + "epoch": 1.6045831980331213, + "grad_norm": 0.11131885647773743, + "learning_rate": 0.001223340486442096, + "loss": 2.278, + "step": 415080 + }, + { + "epoch": 1.6046218552365048, + "grad_norm": 0.10251007974147797, + "learning_rate": 0.0012230830160177987, + "loss": 2.2725, + "step": 415090 + }, + { + "epoch": 1.604660512439888, + "grad_norm": 0.09627630561590195, + "learning_rate": 0.001222825630890982, + "loss": 2.2852, + "step": 415100 + }, + { + "epoch": 1.6046991696432713, + "grad_norm": 0.12653441727161407, + "learning_rate": 0.0012225683309769276, + "loss": 2.2731, + "step": 415110 + }, + { + "epoch": 1.6047378268466546, + "grad_norm": 0.09953426569700241, + "learning_rate": 0.0012223111161910568, + "loss": 2.2678, + "step": 415120 + }, + { + "epoch": 1.604776484050038, + "grad_norm": 0.10526780039072037, + "learning_rate": 0.001222053986448931, + "loss": 2.2708, + "step": 415130 + }, + { + "epoch": 1.6048151412534213, + "grad_norm": 0.10229865461587906, + "learning_rate": 0.001221796941666251, + "loss": 2.2776, + "step": 415140 + }, + { + "epoch": 1.6048537984568045, + "grad_norm": 0.0909830704331398, + "learning_rate": 0.0012215399817588576, + "loss": 2.2825, + "step": 415150 + }, + { + "epoch": 1.6048924556601878, + "grad_norm": 0.12754322588443756, + "learning_rate": 0.0012212831066427286, + "loss": 2.2681, + "step": 415160 + }, + { + "epoch": 1.604931112863571, + "grad_norm": 0.10755956172943115, + "learning_rate": 0.0012210263162339822, + "loss": 2.2771, + "step": 415170 + }, + { + "epoch": 1.6049697700669543, + "grad_norm": 0.08844668418169022, + "learning_rate": 0.001220769610448874, + "loss": 2.2871, + "step": 415180 + }, + { + "epoch": 1.6050084272703375, + "grad_norm": 0.11943338066339493, + "learning_rate": 0.001220512989203797, + "loss": 2.2607, + "step": 415190 + }, + { + "epoch": 1.6050470844737208, + "grad_norm": 0.10932467132806778, + "learning_rate": 0.001220256452415283, + "loss": 2.2764, + "step": 415200 + }, + { + "epoch": 1.605085741677104, + "grad_norm": 0.11282321810722351, + "learning_rate": 0.00122, + "loss": 2.2761, + "step": 415210 + }, + { + "epoch": 1.6051243988804873, + "grad_norm": 0.09715761244297028, + "learning_rate": 0.0012197436318747536, + "loss": 2.2705, + "step": 415220 + }, + { + "epoch": 1.6051630560838706, + "grad_norm": 0.11072029173374176, + "learning_rate": 0.0012194873479564859, + "loss": 2.2825, + "step": 415230 + }, + { + "epoch": 1.6052017132872538, + "grad_norm": 0.12762323021888733, + "learning_rate": 0.0012192311481622746, + "loss": 2.2746, + "step": 415240 + }, + { + "epoch": 1.6052403704906373, + "grad_norm": 0.10402945429086685, + "learning_rate": 0.0012189750324093347, + "loss": 2.2752, + "step": 415250 + }, + { + "epoch": 1.6052790276940205, + "grad_norm": 0.11042294651269913, + "learning_rate": 0.0012187190006150157, + "loss": 2.2655, + "step": 415260 + }, + { + "epoch": 1.6053176848974038, + "grad_norm": 0.11077842116355896, + "learning_rate": 0.0012184630526968032, + "loss": 2.2568, + "step": 415270 + }, + { + "epoch": 1.605356342100787, + "grad_norm": 0.12798729538917542, + "learning_rate": 0.0012182071885723173, + "loss": 2.2744, + "step": 415280 + }, + { + "epoch": 1.6053949993041703, + "grad_norm": 0.10232830047607422, + "learning_rate": 0.001217951408159314, + "loss": 2.2643, + "step": 415290 + }, + { + "epoch": 1.6054336565075538, + "grad_norm": 0.1272798478603363, + "learning_rate": 0.001217695711375682, + "loss": 2.2706, + "step": 415300 + }, + { + "epoch": 1.605472313710937, + "grad_norm": 0.10634199529886246, + "learning_rate": 0.0012174400981394458, + "loss": 2.2836, + "step": 415310 + }, + { + "epoch": 1.6055109709143203, + "grad_norm": 0.09858424961566925, + "learning_rate": 0.0012171845683687627, + "loss": 2.2855, + "step": 415320 + }, + { + "epoch": 1.6055496281177035, + "grad_norm": 0.10499022156000137, + "learning_rate": 0.0012169291219819242, + "loss": 2.267, + "step": 415330 + }, + { + "epoch": 1.6055882853210868, + "grad_norm": 0.09204845130443573, + "learning_rate": 0.0012166737588973544, + "loss": 2.2823, + "step": 415340 + }, + { + "epoch": 1.60562694252447, + "grad_norm": 0.1029248759150505, + "learning_rate": 0.0012164184790336107, + "loss": 2.2772, + "step": 415350 + }, + { + "epoch": 1.6056655997278533, + "grad_norm": 0.10894620418548584, + "learning_rate": 0.001216163282309383, + "loss": 2.2736, + "step": 415360 + }, + { + "epoch": 1.6057042569312365, + "grad_norm": 0.11481393128633499, + "learning_rate": 0.0012159081686434936, + "loss": 2.28, + "step": 415370 + }, + { + "epoch": 1.6057429141346198, + "grad_norm": 0.09570683538913727, + "learning_rate": 0.001215653137954897, + "loss": 2.2621, + "step": 415380 + }, + { + "epoch": 1.605781571338003, + "grad_norm": 0.11351441591978073, + "learning_rate": 0.0012153981901626787, + "loss": 2.256, + "step": 415390 + }, + { + "epoch": 1.6058202285413863, + "grad_norm": 0.10348304361104965, + "learning_rate": 0.0012151433251860566, + "loss": 2.2847, + "step": 415400 + }, + { + "epoch": 1.6058588857447695, + "grad_norm": 0.1195356547832489, + "learning_rate": 0.0012148885429443792, + "loss": 2.285, + "step": 415410 + }, + { + "epoch": 1.605897542948153, + "grad_norm": 0.10650033503770828, + "learning_rate": 0.0012146338433571256, + "loss": 2.2921, + "step": 415420 + }, + { + "epoch": 1.6059362001515363, + "grad_norm": 0.10261499881744385, + "learning_rate": 0.0012143792263439059, + "loss": 2.289, + "step": 415430 + }, + { + "epoch": 1.6059748573549195, + "grad_norm": 0.1088041141629219, + "learning_rate": 0.00121412469182446, + "loss": 2.262, + "step": 415440 + }, + { + "epoch": 1.6060135145583028, + "grad_norm": 0.09664658457040787, + "learning_rate": 0.0012138702397186583, + "loss": 2.2742, + "step": 415450 + }, + { + "epoch": 1.606052171761686, + "grad_norm": 0.10386968404054642, + "learning_rate": 0.0012136158699465002, + "loss": 2.2706, + "step": 415460 + }, + { + "epoch": 1.6060908289650695, + "grad_norm": 0.10762540996074677, + "learning_rate": 0.0012133615824281145, + "loss": 2.2706, + "step": 415470 + }, + { + "epoch": 1.6061294861684527, + "grad_norm": 0.098637655377388, + "learning_rate": 0.00121310737708376, + "loss": 2.2957, + "step": 415480 + }, + { + "epoch": 1.606168143371836, + "grad_norm": 0.12009122967720032, + "learning_rate": 0.0012128532538338227, + "loss": 2.273, + "step": 415490 + }, + { + "epoch": 1.6062068005752193, + "grad_norm": 0.1068306490778923, + "learning_rate": 0.001212599212598819, + "loss": 2.2643, + "step": 415500 + }, + { + "epoch": 1.6062454577786025, + "grad_norm": 0.10786595940589905, + "learning_rate": 0.0012123452532993913, + "loss": 2.28, + "step": 415510 + }, + { + "epoch": 1.6062841149819858, + "grad_norm": 0.09172281622886658, + "learning_rate": 0.0012120913758563118, + "loss": 2.28, + "step": 415520 + }, + { + "epoch": 1.606322772185369, + "grad_norm": 0.10430020838975906, + "learning_rate": 0.0012118375801904789, + "loss": 2.2769, + "step": 415530 + }, + { + "epoch": 1.6063614293887523, + "grad_norm": 0.10138547420501709, + "learning_rate": 0.0012115838662229191, + "loss": 2.273, + "step": 415540 + }, + { + "epoch": 1.6064000865921355, + "grad_norm": 0.1290520429611206, + "learning_rate": 0.0012113302338747861, + "loss": 2.2781, + "step": 415550 + }, + { + "epoch": 1.6064387437955188, + "grad_norm": 0.11046361923217773, + "learning_rate": 0.0012110766830673594, + "loss": 2.2859, + "step": 415560 + }, + { + "epoch": 1.606477400998902, + "grad_norm": 0.09649594128131866, + "learning_rate": 0.001210823213722046, + "loss": 2.2544, + "step": 415570 + }, + { + "epoch": 1.6065160582022853, + "grad_norm": 0.1014927476644516, + "learning_rate": 0.001210569825760378, + "loss": 2.273, + "step": 415580 + }, + { + "epoch": 1.6065547154056687, + "grad_norm": 0.11194448918104172, + "learning_rate": 0.0012103165191040147, + "loss": 2.2812, + "step": 415590 + }, + { + "epoch": 1.606593372609052, + "grad_norm": 0.09651295840740204, + "learning_rate": 0.00121006329367474, + "loss": 2.2823, + "step": 415600 + }, + { + "epoch": 1.6066320298124352, + "grad_norm": 0.09813214838504791, + "learning_rate": 0.0012098101493944636, + "loss": 2.2667, + "step": 415610 + }, + { + "epoch": 1.6066706870158185, + "grad_norm": 0.14182907342910767, + "learning_rate": 0.0012095570861852198, + "loss": 2.277, + "step": 415620 + }, + { + "epoch": 1.6067093442192018, + "grad_norm": 0.10931894183158875, + "learning_rate": 0.0012093041039691684, + "loss": 2.2732, + "step": 415630 + }, + { + "epoch": 1.6067480014225852, + "grad_norm": 0.09297894686460495, + "learning_rate": 0.0012090512026685925, + "loss": 2.2752, + "step": 415640 + }, + { + "epoch": 1.6067866586259685, + "grad_norm": 0.10945281386375427, + "learning_rate": 0.0012087983822059007, + "loss": 2.2743, + "step": 415650 + }, + { + "epoch": 1.6068253158293517, + "grad_norm": 0.10067486017942429, + "learning_rate": 0.001208545642503625, + "loss": 2.2632, + "step": 415660 + }, + { + "epoch": 1.606863973032735, + "grad_norm": 0.19785423576831818, + "learning_rate": 0.0012082929834844206, + "loss": 2.2771, + "step": 415670 + }, + { + "epoch": 1.6069026302361182, + "grad_norm": 0.10551826655864716, + "learning_rate": 0.001208040405071067, + "loss": 2.2693, + "step": 415680 + }, + { + "epoch": 1.6069412874395015, + "grad_norm": 0.10171528160572052, + "learning_rate": 0.0012077879071864658, + "loss": 2.2591, + "step": 415690 + }, + { + "epoch": 1.6069799446428847, + "grad_norm": 0.1067056804895401, + "learning_rate": 0.0012075354897536422, + "loss": 2.2809, + "step": 415700 + }, + { + "epoch": 1.607018601846268, + "grad_norm": 0.0996486097574234, + "learning_rate": 0.0012072831526957436, + "loss": 2.2799, + "step": 415710 + }, + { + "epoch": 1.6070572590496512, + "grad_norm": 0.10620211809873581, + "learning_rate": 0.0012070308959360395, + "loss": 2.2734, + "step": 415720 + }, + { + "epoch": 1.6070959162530345, + "grad_norm": 0.1143997311592102, + "learning_rate": 0.0012067787193979225, + "loss": 2.2835, + "step": 415730 + }, + { + "epoch": 1.6071345734564177, + "grad_norm": 0.10100308805704117, + "learning_rate": 0.0012065266230049051, + "loss": 2.2762, + "step": 415740 + }, + { + "epoch": 1.607173230659801, + "grad_norm": 0.09416519105434418, + "learning_rate": 0.0012062746066806226, + "loss": 2.2799, + "step": 415750 + }, + { + "epoch": 1.6072118878631845, + "grad_norm": 0.1051805317401886, + "learning_rate": 0.0012060226703488317, + "loss": 2.2753, + "step": 415760 + }, + { + "epoch": 1.6072505450665677, + "grad_norm": 0.09900598227977753, + "learning_rate": 0.0012057708139334088, + "loss": 2.2765, + "step": 415770 + }, + { + "epoch": 1.607289202269951, + "grad_norm": 0.10039504617452621, + "learning_rate": 0.0012055190373583518, + "loss": 2.2885, + "step": 415780 + }, + { + "epoch": 1.6073278594733342, + "grad_norm": 0.11165333539247513, + "learning_rate": 0.0012052673405477789, + "loss": 2.2762, + "step": 415790 + }, + { + "epoch": 1.6073665166767177, + "grad_norm": 0.09489556401968002, + "learning_rate": 0.0012050157234259286, + "loss": 2.2751, + "step": 415800 + }, + { + "epoch": 1.607405173880101, + "grad_norm": 0.10287255793809891, + "learning_rate": 0.0012047641859171583, + "loss": 2.2809, + "step": 415810 + }, + { + "epoch": 1.6074438310834842, + "grad_norm": 0.1100325882434845, + "learning_rate": 0.0012045127279459464, + "loss": 2.2672, + "step": 415820 + }, + { + "epoch": 1.6074824882868675, + "grad_norm": 0.10662521421909332, + "learning_rate": 0.0012042613494368895, + "loss": 2.2614, + "step": 415830 + }, + { + "epoch": 1.6075211454902507, + "grad_norm": 0.12481796741485596, + "learning_rate": 0.001204010050314704, + "loss": 2.2778, + "step": 415840 + }, + { + "epoch": 1.607559802693634, + "grad_norm": 0.1037929430603981, + "learning_rate": 0.0012037588305042247, + "loss": 2.2737, + "step": 415850 + }, + { + "epoch": 1.6075984598970172, + "grad_norm": 0.25705114006996155, + "learning_rate": 0.0012035076899304048, + "loss": 2.2819, + "step": 415860 + }, + { + "epoch": 1.6076371171004005, + "grad_norm": 0.09950485825538635, + "learning_rate": 0.0012032566285183164, + "loss": 2.2747, + "step": 415870 + }, + { + "epoch": 1.6076757743037837, + "grad_norm": 0.10227572917938232, + "learning_rate": 0.001203005646193149, + "loss": 2.2741, + "step": 415880 + }, + { + "epoch": 1.607714431507167, + "grad_norm": 0.10218802094459534, + "learning_rate": 0.00120275474288021, + "loss": 2.2791, + "step": 415890 + }, + { + "epoch": 1.6077530887105502, + "grad_norm": 0.1049424335360527, + "learning_rate": 0.0012025039185049246, + "loss": 2.2683, + "step": 415900 + }, + { + "epoch": 1.6077917459139335, + "grad_norm": 0.10968547314405441, + "learning_rate": 0.001202253172992835, + "loss": 2.2662, + "step": 415910 + }, + { + "epoch": 1.6078304031173167, + "grad_norm": 0.11751758307218552, + "learning_rate": 0.0012020025062695998, + "loss": 2.2607, + "step": 415920 + }, + { + "epoch": 1.6078690603207002, + "grad_norm": 0.09624996781349182, + "learning_rate": 0.001201751918260996, + "loss": 2.275, + "step": 415930 + }, + { + "epoch": 1.6079077175240835, + "grad_norm": 0.11658187955617905, + "learning_rate": 0.001201501408892915, + "loss": 2.2897, + "step": 415940 + }, + { + "epoch": 1.6079463747274667, + "grad_norm": 0.09554409235715866, + "learning_rate": 0.0012012509780913656, + "loss": 2.2782, + "step": 415950 + }, + { + "epoch": 1.60798503193085, + "grad_norm": 0.12213790416717529, + "learning_rate": 0.001201000625782473, + "loss": 2.2756, + "step": 415960 + }, + { + "epoch": 1.6080236891342334, + "grad_norm": 0.1274234801530838, + "learning_rate": 0.0012007503518924767, + "loss": 2.2652, + "step": 415970 + }, + { + "epoch": 1.6080623463376167, + "grad_norm": 0.10022182762622833, + "learning_rate": 0.0012005001563477327, + "loss": 2.272, + "step": 415980 + }, + { + "epoch": 1.608101003541, + "grad_norm": 0.10145244747400284, + "learning_rate": 0.001200250039074712, + "loss": 2.2771, + "step": 415990 + }, + { + "epoch": 1.6081396607443832, + "grad_norm": 0.1067088395357132, + "learning_rate": 0.0012, + "loss": 2.2807, + "step": 416000 + }, + { + "epoch": 1.6081783179477664, + "grad_norm": 0.10339207947254181, + "learning_rate": 0.0011997500390502978, + "loss": 2.2696, + "step": 416010 + }, + { + "epoch": 1.6082169751511497, + "grad_norm": 0.1001124158501625, + "learning_rate": 0.0011995001561524198, + "loss": 2.2686, + "step": 416020 + }, + { + "epoch": 1.608255632354533, + "grad_norm": 0.120066799223423, + "learning_rate": 0.001199250351233296, + "loss": 2.2708, + "step": 416030 + }, + { + "epoch": 1.6082942895579162, + "grad_norm": 0.10201053321361542, + "learning_rate": 0.0011990006242199684, + "loss": 2.267, + "step": 416040 + }, + { + "epoch": 1.6083329467612995, + "grad_norm": 0.09921201318502426, + "learning_rate": 0.0011987509750395948, + "loss": 2.2634, + "step": 416050 + }, + { + "epoch": 1.6083716039646827, + "grad_norm": 0.10539157688617706, + "learning_rate": 0.0011985014036194448, + "loss": 2.2806, + "step": 416060 + }, + { + "epoch": 1.608410261168066, + "grad_norm": 0.10070233047008514, + "learning_rate": 0.0011982519098869022, + "loss": 2.2711, + "step": 416070 + }, + { + "epoch": 1.6084489183714492, + "grad_norm": 0.09526374936103821, + "learning_rate": 0.0011980024937694631, + "loss": 2.2893, + "step": 416080 + }, + { + "epoch": 1.6084875755748325, + "grad_norm": 0.09702368080615997, + "learning_rate": 0.001197753155194737, + "loss": 2.258, + "step": 416090 + }, + { + "epoch": 1.608526232778216, + "grad_norm": 0.09795309603214264, + "learning_rate": 0.0011975038940904448, + "loss": 2.2763, + "step": 416100 + }, + { + "epoch": 1.6085648899815992, + "grad_norm": 0.10636353492736816, + "learning_rate": 0.0011972547103844208, + "loss": 2.2737, + "step": 416110 + }, + { + "epoch": 1.6086035471849824, + "grad_norm": 0.1189536526799202, + "learning_rate": 0.0011970056040046108, + "loss": 2.268, + "step": 416120 + }, + { + "epoch": 1.6086422043883657, + "grad_norm": 0.10652356594800949, + "learning_rate": 0.001196756574879072, + "loss": 2.2684, + "step": 416130 + }, + { + "epoch": 1.6086808615917492, + "grad_norm": 0.12403170019388199, + "learning_rate": 0.0011965076229359735, + "loss": 2.265, + "step": 416140 + }, + { + "epoch": 1.6087195187951324, + "grad_norm": 0.09871499240398407, + "learning_rate": 0.0011962587481035953, + "loss": 2.2798, + "step": 416150 + }, + { + "epoch": 1.6087581759985157, + "grad_norm": 0.10580728948116302, + "learning_rate": 0.0011960099503103287, + "loss": 2.2648, + "step": 416160 + }, + { + "epoch": 1.608796833201899, + "grad_norm": 0.10762586444616318, + "learning_rate": 0.001195761229484676, + "loss": 2.2817, + "step": 416170 + }, + { + "epoch": 1.6088354904052822, + "grad_norm": 0.10945604741573334, + "learning_rate": 0.0011955125855552494, + "loss": 2.2607, + "step": 416180 + }, + { + "epoch": 1.6088741476086654, + "grad_norm": 0.1232224777340889, + "learning_rate": 0.0011952640184507716, + "loss": 2.2782, + "step": 416190 + }, + { + "epoch": 1.6089128048120487, + "grad_norm": 0.10195323079824448, + "learning_rate": 0.0011950155281000758, + "loss": 2.2713, + "step": 416200 + }, + { + "epoch": 1.608951462015432, + "grad_norm": 0.10153280198574066, + "learning_rate": 0.0011947671144321042, + "loss": 2.2703, + "step": 416210 + }, + { + "epoch": 1.6089901192188152, + "grad_norm": 0.09289544075727463, + "learning_rate": 0.0011945187773759094, + "loss": 2.248, + "step": 416220 + }, + { + "epoch": 1.6090287764221984, + "grad_norm": 0.09814459830522537, + "learning_rate": 0.0011942705168606525, + "loss": 2.267, + "step": 416230 + }, + { + "epoch": 1.6090674336255817, + "grad_norm": 0.11871287971735, + "learning_rate": 0.0011940223328156048, + "loss": 2.2877, + "step": 416240 + }, + { + "epoch": 1.609106090828965, + "grad_norm": 0.09919058531522751, + "learning_rate": 0.0011937742251701452, + "loss": 2.2533, + "step": 416250 + }, + { + "epoch": 1.6091447480323482, + "grad_norm": 0.10049314051866531, + "learning_rate": 0.001193526193853762, + "loss": 2.2548, + "step": 416260 + }, + { + "epoch": 1.6091834052357317, + "grad_norm": 0.1021869033575058, + "learning_rate": 0.0011932782387960516, + "loss": 2.2675, + "step": 416270 + }, + { + "epoch": 1.609222062439115, + "grad_norm": 0.09958551824092865, + "learning_rate": 0.0011930303599267194, + "loss": 2.2816, + "step": 416280 + }, + { + "epoch": 1.6092607196424982, + "grad_norm": 0.09233255684375763, + "learning_rate": 0.0011927825571755775, + "loss": 2.274, + "step": 416290 + }, + { + "epoch": 1.6092993768458814, + "grad_norm": 0.10324325412511826, + "learning_rate": 0.001192534830472546, + "loss": 2.2708, + "step": 416300 + }, + { + "epoch": 1.609338034049265, + "grad_norm": 0.11270972341299057, + "learning_rate": 0.0011922871797476532, + "loss": 2.2767, + "step": 416310 + }, + { + "epoch": 1.6093766912526482, + "grad_norm": 0.09859174489974976, + "learning_rate": 0.0011920396049310339, + "loss": 2.265, + "step": 416320 + }, + { + "epoch": 1.6094153484560314, + "grad_norm": 0.11577589809894562, + "learning_rate": 0.00119179210595293, + "loss": 2.277, + "step": 416330 + }, + { + "epoch": 1.6094540056594147, + "grad_norm": 0.10810605436563492, + "learning_rate": 0.0011915446827436905, + "loss": 2.2668, + "step": 416340 + }, + { + "epoch": 1.609492662862798, + "grad_norm": 0.11018019914627075, + "learning_rate": 0.0011912973352337709, + "loss": 2.2652, + "step": 416350 + }, + { + "epoch": 1.6095313200661812, + "grad_norm": 0.10128393024206161, + "learning_rate": 0.0011910500633537328, + "loss": 2.2706, + "step": 416360 + }, + { + "epoch": 1.6095699772695644, + "grad_norm": 0.10257522016763687, + "learning_rate": 0.0011908028670342436, + "loss": 2.2729, + "step": 416370 + }, + { + "epoch": 1.6096086344729477, + "grad_norm": 0.10266554355621338, + "learning_rate": 0.0011905557462060777, + "loss": 2.2685, + "step": 416380 + }, + { + "epoch": 1.609647291676331, + "grad_norm": 0.10134238749742508, + "learning_rate": 0.0011903087008001136, + "loss": 2.2722, + "step": 416390 + }, + { + "epoch": 1.6096859488797142, + "grad_norm": 0.09120500087738037, + "learning_rate": 0.0011900617307473364, + "loss": 2.2741, + "step": 416400 + }, + { + "epoch": 1.6097246060830974, + "grad_norm": 0.10055642575025558, + "learning_rate": 0.0011898148359788363, + "loss": 2.2759, + "step": 416410 + }, + { + "epoch": 1.6097632632864807, + "grad_norm": 0.11470640450716019, + "learning_rate": 0.0011895680164258076, + "loss": 2.2621, + "step": 416420 + }, + { + "epoch": 1.609801920489864, + "grad_norm": 0.10170623660087585, + "learning_rate": 0.0011893212720195502, + "loss": 2.2847, + "step": 416430 + }, + { + "epoch": 1.6098405776932474, + "grad_norm": 0.09756741672754288, + "learning_rate": 0.0011890746026914686, + "loss": 2.2654, + "step": 416440 + }, + { + "epoch": 1.6098792348966307, + "grad_norm": 0.0965120941400528, + "learning_rate": 0.0011888280083730703, + "loss": 2.2714, + "step": 416450 + }, + { + "epoch": 1.609917892100014, + "grad_norm": 0.11919204145669937, + "learning_rate": 0.0011885814889959683, + "loss": 2.2869, + "step": 416460 + }, + { + "epoch": 1.6099565493033972, + "grad_norm": 0.1065000370144844, + "learning_rate": 0.001188335044491879, + "loss": 2.2792, + "step": 416470 + }, + { + "epoch": 1.6099952065067806, + "grad_norm": 0.13623447716236115, + "learning_rate": 0.0011880886747926225, + "loss": 2.2593, + "step": 416480 + }, + { + "epoch": 1.6100338637101639, + "grad_norm": 0.10131317377090454, + "learning_rate": 0.001187842379830122, + "loss": 2.2975, + "step": 416490 + }, + { + "epoch": 1.6100725209135471, + "grad_norm": 0.10462559759616852, + "learning_rate": 0.001187596159536404, + "loss": 2.254, + "step": 416500 + }, + { + "epoch": 1.6101111781169304, + "grad_norm": 0.0960330218076706, + "learning_rate": 0.0011873500138435981, + "loss": 2.2833, + "step": 416510 + }, + { + "epoch": 1.6101498353203136, + "grad_norm": 0.10928565263748169, + "learning_rate": 0.0011871039426839369, + "loss": 2.2635, + "step": 416520 + }, + { + "epoch": 1.610188492523697, + "grad_norm": 0.10919661819934845, + "learning_rate": 0.0011868579459897552, + "loss": 2.2692, + "step": 416530 + }, + { + "epoch": 1.6102271497270801, + "grad_norm": 0.09764064848423004, + "learning_rate": 0.00118661202369349, + "loss": 2.2708, + "step": 416540 + }, + { + "epoch": 1.6102658069304634, + "grad_norm": 0.10673554241657257, + "learning_rate": 0.0011863661757276806, + "loss": 2.2755, + "step": 416550 + }, + { + "epoch": 1.6103044641338466, + "grad_norm": 0.09112170338630676, + "learning_rate": 0.001186120402024968, + "loss": 2.2722, + "step": 416560 + }, + { + "epoch": 1.61034312133723, + "grad_norm": 0.09223476052284241, + "learning_rate": 0.0011858747025180953, + "loss": 2.2769, + "step": 416570 + }, + { + "epoch": 1.6103817785406132, + "grad_norm": 0.10158587247133255, + "learning_rate": 0.0011856290771399068, + "loss": 2.2778, + "step": 416580 + }, + { + "epoch": 1.6104204357439964, + "grad_norm": 0.11441430449485779, + "learning_rate": 0.001185383525823348, + "loss": 2.2623, + "step": 416590 + }, + { + "epoch": 1.6104590929473797, + "grad_norm": 0.10831929743289948, + "learning_rate": 0.0011851380485014653, + "loss": 2.2732, + "step": 416600 + }, + { + "epoch": 1.6104977501507631, + "grad_norm": 0.09476850181818008, + "learning_rate": 0.0011848926451074066, + "loss": 2.2768, + "step": 416610 + }, + { + "epoch": 1.6105364073541464, + "grad_norm": 0.11065233498811722, + "learning_rate": 0.0011846473155744197, + "loss": 2.2563, + "step": 416620 + }, + { + "epoch": 1.6105750645575296, + "grad_norm": 0.10710212588310242, + "learning_rate": 0.0011844020598358526, + "loss": 2.2628, + "step": 416630 + }, + { + "epoch": 1.6106137217609129, + "grad_norm": 0.10404687374830246, + "learning_rate": 0.0011841568778251546, + "loss": 2.2683, + "step": 416640 + }, + { + "epoch": 1.6106523789642964, + "grad_norm": 0.11419782042503357, + "learning_rate": 0.0011839117694758734, + "loss": 2.2712, + "step": 416650 + }, + { + "epoch": 1.6106910361676796, + "grad_norm": 0.10590862482786179, + "learning_rate": 0.001183666734721658, + "loss": 2.2856, + "step": 416660 + }, + { + "epoch": 1.6107296933710629, + "grad_norm": 0.1061839759349823, + "learning_rate": 0.001183421773496256, + "loss": 2.2688, + "step": 416670 + }, + { + "epoch": 1.6107683505744461, + "grad_norm": 0.09947504848241806, + "learning_rate": 0.0011831768857335145, + "loss": 2.264, + "step": 416680 + }, + { + "epoch": 1.6108070077778294, + "grad_norm": 0.0981912761926651, + "learning_rate": 0.0011829320713673801, + "loss": 2.2641, + "step": 416690 + }, + { + "epoch": 1.6108456649812126, + "grad_norm": 0.09732093662023544, + "learning_rate": 0.0011826873303318979, + "loss": 2.2709, + "step": 416700 + }, + { + "epoch": 1.6108843221845959, + "grad_norm": 0.10180546343326569, + "learning_rate": 0.0011824426625612122, + "loss": 2.2958, + "step": 416710 + }, + { + "epoch": 1.6109229793879791, + "grad_norm": 0.08668989688158035, + "learning_rate": 0.0011821980679895651, + "loss": 2.2715, + "step": 416720 + }, + { + "epoch": 1.6109616365913624, + "grad_norm": 0.1114187091588974, + "learning_rate": 0.0011819535465512976, + "loss": 2.2713, + "step": 416730 + }, + { + "epoch": 1.6110002937947456, + "grad_norm": 0.09553922712802887, + "learning_rate": 0.0011817090981808486, + "loss": 2.2822, + "step": 416740 + }, + { + "epoch": 1.6110389509981289, + "grad_norm": 0.10593093186616898, + "learning_rate": 0.001181464722812755, + "loss": 2.2658, + "step": 416750 + }, + { + "epoch": 1.6110776082015121, + "grad_norm": 0.11995851248502731, + "learning_rate": 0.0011812204203816513, + "loss": 2.2758, + "step": 416760 + }, + { + "epoch": 1.6111162654048954, + "grad_norm": 0.10872121900320053, + "learning_rate": 0.0011809761908222695, + "loss": 2.2619, + "step": 416770 + }, + { + "epoch": 1.6111549226082789, + "grad_norm": 0.11107048392295837, + "learning_rate": 0.0011807320340694383, + "loss": 2.2679, + "step": 416780 + }, + { + "epoch": 1.6111935798116621, + "grad_norm": 0.10314419865608215, + "learning_rate": 0.001180487950058085, + "loss": 2.2709, + "step": 416790 + }, + { + "epoch": 1.6112322370150454, + "grad_norm": 0.10333782434463501, + "learning_rate": 0.001180243938723232, + "loss": 2.2552, + "step": 416800 + }, + { + "epoch": 1.6112708942184286, + "grad_norm": 0.09844791144132614, + "learning_rate": 0.0011800000000000003, + "loss": 2.2751, + "step": 416810 + }, + { + "epoch": 1.611309551421812, + "grad_norm": 0.10671578347682953, + "learning_rate": 0.0011797561338236049, + "loss": 2.2686, + "step": 416820 + }, + { + "epoch": 1.6113482086251953, + "grad_norm": 0.1007014662027359, + "learning_rate": 0.0011795123401293593, + "loss": 2.2702, + "step": 416830 + }, + { + "epoch": 1.6113868658285786, + "grad_norm": 0.10503365844488144, + "learning_rate": 0.0011792686188526725, + "loss": 2.2804, + "step": 416840 + }, + { + "epoch": 1.6114255230319618, + "grad_norm": 0.09722265601158142, + "learning_rate": 0.0011790249699290482, + "loss": 2.2669, + "step": 416850 + }, + { + "epoch": 1.611464180235345, + "grad_norm": 0.09146405011415482, + "learning_rate": 0.0011787813932940877, + "loss": 2.2751, + "step": 416860 + }, + { + "epoch": 1.6115028374387284, + "grad_norm": 0.13335338234901428, + "learning_rate": 0.0011785378888834858, + "loss": 2.2839, + "step": 416870 + }, + { + "epoch": 1.6115414946421116, + "grad_norm": 0.09907438606023788, + "learning_rate": 0.0011782944566330344, + "loss": 2.2623, + "step": 416880 + }, + { + "epoch": 1.6115801518454949, + "grad_norm": 0.12023543566465378, + "learning_rate": 0.0011780510964786192, + "loss": 2.2806, + "step": 416890 + }, + { + "epoch": 1.611618809048878, + "grad_norm": 0.10835972428321838, + "learning_rate": 0.0011778078083562213, + "loss": 2.2843, + "step": 416900 + }, + { + "epoch": 1.6116574662522614, + "grad_norm": 0.09845547378063202, + "learning_rate": 0.0011775645922019165, + "loss": 2.2552, + "step": 416910 + }, + { + "epoch": 1.6116961234556446, + "grad_norm": 0.10376187413930893, + "learning_rate": 0.0011773214479518749, + "loss": 2.2728, + "step": 416920 + }, + { + "epoch": 1.6117347806590279, + "grad_norm": 0.10232806205749512, + "learning_rate": 0.0011770783755423607, + "loss": 2.2799, + "step": 416930 + }, + { + "epoch": 1.6117734378624111, + "grad_norm": 0.09905198216438293, + "learning_rate": 0.0011768353749097328, + "loss": 2.2745, + "step": 416940 + }, + { + "epoch": 1.6118120950657946, + "grad_norm": 0.1061449944972992, + "learning_rate": 0.001176592445990444, + "loss": 2.2773, + "step": 416950 + }, + { + "epoch": 1.6118507522691778, + "grad_norm": 0.10233290493488312, + "learning_rate": 0.00117634958872104, + "loss": 2.2644, + "step": 416960 + }, + { + "epoch": 1.611889409472561, + "grad_norm": 0.10700671374797821, + "learning_rate": 0.0011761068030381608, + "loss": 2.263, + "step": 416970 + }, + { + "epoch": 1.6119280666759443, + "grad_norm": 0.11679814755916595, + "learning_rate": 0.0011758640888785395, + "loss": 2.2845, + "step": 416980 + }, + { + "epoch": 1.6119667238793278, + "grad_norm": 0.09567833691835403, + "learning_rate": 0.0011756214461790023, + "loss": 2.2856, + "step": 416990 + }, + { + "epoch": 1.612005381082711, + "grad_norm": 0.1070074737071991, + "learning_rate": 0.001175378874876468, + "loss": 2.2666, + "step": 417000 + }, + { + "epoch": 1.6120440382860943, + "grad_norm": 0.10622667521238327, + "learning_rate": 0.0011751363749079489, + "loss": 2.2872, + "step": 417010 + }, + { + "epoch": 1.6120826954894776, + "grad_norm": 0.10399883985519409, + "learning_rate": 0.0011748939462105494, + "loss": 2.2621, + "step": 417020 + }, + { + "epoch": 1.6121213526928608, + "grad_norm": 0.11849434673786163, + "learning_rate": 0.0011746515887214662, + "loss": 2.2672, + "step": 417030 + }, + { + "epoch": 1.612160009896244, + "grad_norm": 0.10450667142868042, + "learning_rate": 0.0011744093023779883, + "loss": 2.2891, + "step": 417040 + }, + { + "epoch": 1.6121986670996273, + "grad_norm": 0.10449767112731934, + "learning_rate": 0.001174167087117497, + "loss": 2.2762, + "step": 417050 + }, + { + "epoch": 1.6122373243030106, + "grad_norm": 0.1145240068435669, + "learning_rate": 0.0011739249428774645, + "loss": 2.2736, + "step": 417060 + }, + { + "epoch": 1.6122759815063938, + "grad_norm": 0.10412728041410446, + "learning_rate": 0.001173682869595456, + "loss": 2.2601, + "step": 417070 + }, + { + "epoch": 1.612314638709777, + "grad_norm": 0.11344483494758606, + "learning_rate": 0.001173440867209127, + "loss": 2.2651, + "step": 417080 + }, + { + "epoch": 1.6123532959131603, + "grad_norm": 0.10842296481132507, + "learning_rate": 0.0011731989356562245, + "loss": 2.2543, + "step": 417090 + }, + { + "epoch": 1.6123919531165436, + "grad_norm": 0.10091166198253632, + "learning_rate": 0.0011729570748745869, + "loss": 2.2493, + "step": 417100 + }, + { + "epoch": 1.6124306103199268, + "grad_norm": 0.10709764808416367, + "learning_rate": 0.0011727152848021425, + "loss": 2.2795, + "step": 417110 + }, + { + "epoch": 1.6124692675233103, + "grad_norm": 0.11410240828990936, + "learning_rate": 0.001172473565376912, + "loss": 2.2663, + "step": 417120 + }, + { + "epoch": 1.6125079247266936, + "grad_norm": 0.1093989834189415, + "learning_rate": 0.001172231916537005, + "loss": 2.2734, + "step": 417130 + }, + { + "epoch": 1.6125465819300768, + "grad_norm": 0.11429251730442047, + "learning_rate": 0.0011719903382206218, + "loss": 2.2613, + "step": 417140 + }, + { + "epoch": 1.61258523913346, + "grad_norm": 0.102109394967556, + "learning_rate": 0.0011717488303660537, + "loss": 2.2867, + "step": 417150 + }, + { + "epoch": 1.6126238963368436, + "grad_norm": 0.09235858172178268, + "learning_rate": 0.0011715073929116809, + "loss": 2.2673, + "step": 417160 + }, + { + "epoch": 1.6126625535402268, + "grad_norm": 0.10976700484752655, + "learning_rate": 0.0011712660257959734, + "loss": 2.2827, + "step": 417170 + }, + { + "epoch": 1.61270121074361, + "grad_norm": 0.10967014729976654, + "learning_rate": 0.0011710247289574917, + "loss": 2.2583, + "step": 417180 + }, + { + "epoch": 1.6127398679469933, + "grad_norm": 0.10642175376415253, + "learning_rate": 0.001170783502334885, + "loss": 2.2621, + "step": 417190 + }, + { + "epoch": 1.6127785251503766, + "grad_norm": 0.09669990092515945, + "learning_rate": 0.0011705423458668914, + "loss": 2.2699, + "step": 417200 + }, + { + "epoch": 1.6128171823537598, + "grad_norm": 0.12274808436632156, + "learning_rate": 0.0011703012594923385, + "loss": 2.2674, + "step": 417210 + }, + { + "epoch": 1.612855839557143, + "grad_norm": 0.11121085286140442, + "learning_rate": 0.001170060243150143, + "loss": 2.2629, + "step": 417220 + }, + { + "epoch": 1.6128944967605263, + "grad_norm": 0.10337743163108826, + "learning_rate": 0.0011698192967793096, + "loss": 2.2863, + "step": 417230 + }, + { + "epoch": 1.6129331539639096, + "grad_norm": 0.09829306602478027, + "learning_rate": 0.0011695784203189321, + "loss": 2.2731, + "step": 417240 + }, + { + "epoch": 1.6129718111672928, + "grad_norm": 0.09615987539291382, + "learning_rate": 0.0011693376137081927, + "loss": 2.2701, + "step": 417250 + }, + { + "epoch": 1.613010468370676, + "grad_norm": 0.09579098224639893, + "learning_rate": 0.0011690968768863605, + "loss": 2.2729, + "step": 417260 + }, + { + "epoch": 1.6130491255740593, + "grad_norm": 0.10610536485910416, + "learning_rate": 0.0011688562097927943, + "loss": 2.2658, + "step": 417270 + }, + { + "epoch": 1.6130877827774428, + "grad_norm": 0.11483661830425262, + "learning_rate": 0.0011686156123669389, + "loss": 2.2666, + "step": 417280 + }, + { + "epoch": 1.613126439980826, + "grad_norm": 0.16898894309997559, + "learning_rate": 0.0011683750845483286, + "loss": 2.3003, + "step": 417290 + }, + { + "epoch": 1.6131650971842093, + "grad_norm": 0.11532501876354218, + "learning_rate": 0.0011681346262765833, + "loss": 2.2769, + "step": 417300 + }, + { + "epoch": 1.6132037543875926, + "grad_norm": 0.09754514694213867, + "learning_rate": 0.0011678942374914111, + "loss": 2.2691, + "step": 417310 + }, + { + "epoch": 1.6132424115909758, + "grad_norm": 0.12471870332956314, + "learning_rate": 0.0011676539181326075, + "loss": 2.2793, + "step": 417320 + }, + { + "epoch": 1.6132810687943593, + "grad_norm": 0.10189971327781677, + "learning_rate": 0.0011674136681400543, + "loss": 2.2641, + "step": 417330 + }, + { + "epoch": 1.6133197259977425, + "grad_norm": 0.09957670420408249, + "learning_rate": 0.0011671734874537194, + "loss": 2.2789, + "step": 417340 + }, + { + "epoch": 1.6133583832011258, + "grad_norm": 0.0903426930308342, + "learning_rate": 0.001166933376013659, + "loss": 2.2812, + "step": 417350 + }, + { + "epoch": 1.613397040404509, + "grad_norm": 0.10298719257116318, + "learning_rate": 0.0011666933337600137, + "loss": 2.2713, + "step": 417360 + }, + { + "epoch": 1.6134356976078923, + "grad_norm": 0.10459873825311661, + "learning_rate": 0.0011664533606330118, + "loss": 2.2795, + "step": 417370 + }, + { + "epoch": 1.6134743548112755, + "grad_norm": 0.15128566324710846, + "learning_rate": 0.0011662134565729666, + "loss": 2.2543, + "step": 417380 + }, + { + "epoch": 1.6135130120146588, + "grad_norm": 0.09230190515518188, + "learning_rate": 0.0011659736215202782, + "loss": 2.2753, + "step": 417390 + }, + { + "epoch": 1.613551669218042, + "grad_norm": 0.10914508253335953, + "learning_rate": 0.0011657338554154318, + "loss": 2.2791, + "step": 417400 + }, + { + "epoch": 1.6135903264214253, + "grad_norm": 0.11341839283704758, + "learning_rate": 0.0011654941581989973, + "loss": 2.2741, + "step": 417410 + }, + { + "epoch": 1.6136289836248086, + "grad_norm": 0.09619162231683731, + "learning_rate": 0.001165254529811632, + "loss": 2.2733, + "step": 417420 + }, + { + "epoch": 1.6136676408281918, + "grad_norm": 0.12042240798473358, + "learning_rate": 0.001165014970194076, + "loss": 2.2602, + "step": 417430 + }, + { + "epoch": 1.613706298031575, + "grad_norm": 0.09932298958301544, + "learning_rate": 0.0011647754792871558, + "loss": 2.2867, + "step": 417440 + }, + { + "epoch": 1.6137449552349585, + "grad_norm": 0.1054065153002739, + "learning_rate": 0.001164536057031783, + "loss": 2.2724, + "step": 417450 + }, + { + "epoch": 1.6137836124383418, + "grad_norm": 0.12224634736776352, + "learning_rate": 0.001164296703368953, + "loss": 2.2763, + "step": 417460 + }, + { + "epoch": 1.613822269641725, + "grad_norm": 0.10424677282571793, + "learning_rate": 0.0011640574182397454, + "loss": 2.2655, + "step": 417470 + }, + { + "epoch": 1.6138609268451083, + "grad_norm": 0.10834372043609619, + "learning_rate": 0.001163818201585325, + "loss": 2.263, + "step": 417480 + }, + { + "epoch": 1.6138995840484915, + "grad_norm": 0.10484948009252548, + "learning_rate": 0.0011635790533469407, + "loss": 2.278, + "step": 417490 + }, + { + "epoch": 1.613938241251875, + "grad_norm": 0.12994012236595154, + "learning_rate": 0.0011633399734659244, + "loss": 2.298, + "step": 417500 + }, + { + "epoch": 1.6139768984552583, + "grad_norm": 0.09662307798862457, + "learning_rate": 0.001163100961883693, + "loss": 2.2545, + "step": 417510 + }, + { + "epoch": 1.6140155556586415, + "grad_norm": 0.10316623002290726, + "learning_rate": 0.001162862018541746, + "loss": 2.2839, + "step": 417520 + }, + { + "epoch": 1.6140542128620248, + "grad_norm": 0.12849442660808563, + "learning_rate": 0.001162623143381667, + "loss": 2.2764, + "step": 417530 + }, + { + "epoch": 1.614092870065408, + "grad_norm": 0.11165708303451538, + "learning_rate": 0.0011623843363451232, + "loss": 2.273, + "step": 417540 + }, + { + "epoch": 1.6141315272687913, + "grad_norm": 0.11638086289167404, + "learning_rate": 0.0011621455973738635, + "loss": 2.2857, + "step": 417550 + }, + { + "epoch": 1.6141701844721745, + "grad_norm": 0.10104557871818542, + "learning_rate": 0.0011619069264097216, + "loss": 2.2802, + "step": 417560 + }, + { + "epoch": 1.6142088416755578, + "grad_norm": 0.09986381232738495, + "learning_rate": 0.0011616683233946126, + "loss": 2.2755, + "step": 417570 + }, + { + "epoch": 1.614247498878941, + "grad_norm": 0.10763323307037354, + "learning_rate": 0.0011614297882705346, + "loss": 2.2704, + "step": 417580 + }, + { + "epoch": 1.6142861560823243, + "grad_norm": 0.10754591226577759, + "learning_rate": 0.0011611913209795693, + "loss": 2.2666, + "step": 417590 + }, + { + "epoch": 1.6143248132857075, + "grad_norm": 0.10428612679243088, + "learning_rate": 0.0011609529214638788, + "loss": 2.2616, + "step": 417600 + }, + { + "epoch": 1.6143634704890908, + "grad_norm": 0.13924959301948547, + "learning_rate": 0.0011607145896657086, + "loss": 2.2711, + "step": 417610 + }, + { + "epoch": 1.6144021276924743, + "grad_norm": 0.11408555507659912, + "learning_rate": 0.0011604763255273858, + "loss": 2.2925, + "step": 417620 + }, + { + "epoch": 1.6144407848958575, + "grad_norm": 0.10074104368686676, + "learning_rate": 0.0011602381289913192, + "loss": 2.2502, + "step": 417630 + }, + { + "epoch": 1.6144794420992408, + "grad_norm": 0.10464302450418472, + "learning_rate": 0.0011600000000000002, + "loss": 2.296, + "step": 417640 + }, + { + "epoch": 1.614518099302624, + "grad_norm": 0.10674621909856796, + "learning_rate": 0.001159761938496, + "loss": 2.261, + "step": 417650 + }, + { + "epoch": 1.6145567565060075, + "grad_norm": 0.10986209660768509, + "learning_rate": 0.0011595239444219722, + "loss": 2.2839, + "step": 417660 + }, + { + "epoch": 1.6145954137093907, + "grad_norm": 0.11070261895656586, + "learning_rate": 0.0011592860177206518, + "loss": 2.2749, + "step": 417670 + }, + { + "epoch": 1.614634070912774, + "grad_norm": 0.10384651273488998, + "learning_rate": 0.0011590481583348545, + "loss": 2.2741, + "step": 417680 + }, + { + "epoch": 1.6146727281161573, + "grad_norm": 0.09847825020551682, + "learning_rate": 0.0011588103662074764, + "loss": 2.2594, + "step": 417690 + }, + { + "epoch": 1.6147113853195405, + "grad_norm": 0.1030513197183609, + "learning_rate": 0.001158572641281495, + "loss": 2.2655, + "step": 417700 + }, + { + "epoch": 1.6147500425229238, + "grad_norm": 0.11066064238548279, + "learning_rate": 0.0011583349834999675, + "loss": 2.2739, + "step": 417710 + }, + { + "epoch": 1.614788699726307, + "grad_norm": 0.11787351965904236, + "learning_rate": 0.0011580973928060326, + "loss": 2.2642, + "step": 417720 + }, + { + "epoch": 1.6148273569296903, + "grad_norm": 0.09980590641498566, + "learning_rate": 0.001157859869142908, + "loss": 2.2598, + "step": 417730 + }, + { + "epoch": 1.6148660141330735, + "grad_norm": 0.10738690942525864, + "learning_rate": 0.0011576224124538925, + "loss": 2.2705, + "step": 417740 + }, + { + "epoch": 1.6149046713364568, + "grad_norm": 0.10659084469079971, + "learning_rate": 0.0011573850226823642, + "loss": 2.266, + "step": 417750 + }, + { + "epoch": 1.61494332853984, + "grad_norm": 0.11544018238782883, + "learning_rate": 0.001157147699771781, + "loss": 2.272, + "step": 417760 + }, + { + "epoch": 1.6149819857432233, + "grad_norm": 0.0990079790353775, + "learning_rate": 0.0011569104436656803, + "loss": 2.2523, + "step": 417770 + }, + { + "epoch": 1.6150206429466065, + "grad_norm": 0.10228414833545685, + "learning_rate": 0.0011566732543076795, + "loss": 2.274, + "step": 417780 + }, + { + "epoch": 1.61505930014999, + "grad_norm": 0.11280784010887146, + "learning_rate": 0.0011564361316414745, + "loss": 2.2764, + "step": 417790 + }, + { + "epoch": 1.6150979573533732, + "grad_norm": 0.10561858117580414, + "learning_rate": 0.0011561990756108406, + "loss": 2.2863, + "step": 417800 + }, + { + "epoch": 1.6151366145567565, + "grad_norm": 0.1057622954249382, + "learning_rate": 0.0011559620861596323, + "loss": 2.267, + "step": 417810 + }, + { + "epoch": 1.6151752717601398, + "grad_norm": 0.10904049128293991, + "learning_rate": 0.0011557251632317827, + "loss": 2.2701, + "step": 417820 + }, + { + "epoch": 1.6152139289635232, + "grad_norm": 0.116305410861969, + "learning_rate": 0.0011554883067713035, + "loss": 2.2764, + "step": 417830 + }, + { + "epoch": 1.6152525861669065, + "grad_norm": 0.11812672764062881, + "learning_rate": 0.0011552515167222851, + "loss": 2.2764, + "step": 417840 + }, + { + "epoch": 1.6152912433702897, + "grad_norm": 0.10141696780920029, + "learning_rate": 0.0011550147930288956, + "loss": 2.2673, + "step": 417850 + }, + { + "epoch": 1.615329900573673, + "grad_norm": 0.11404278874397278, + "learning_rate": 0.0011547781356353825, + "loss": 2.2878, + "step": 417860 + }, + { + "epoch": 1.6153685577770562, + "grad_norm": 0.10533790290355682, + "learning_rate": 0.00115454154448607, + "loss": 2.2804, + "step": 417870 + }, + { + "epoch": 1.6154072149804395, + "grad_norm": 0.09895123541355133, + "learning_rate": 0.0011543050195253611, + "loss": 2.2665, + "step": 417880 + }, + { + "epoch": 1.6154458721838227, + "grad_norm": 0.10912610590457916, + "learning_rate": 0.001154068560697736, + "loss": 2.263, + "step": 417890 + }, + { + "epoch": 1.615484529387206, + "grad_norm": 0.11409979313611984, + "learning_rate": 0.0011538321679477527, + "loss": 2.289, + "step": 417900 + }, + { + "epoch": 1.6155231865905892, + "grad_norm": 0.09628956764936447, + "learning_rate": 0.0011535958412200468, + "loss": 2.2741, + "step": 417910 + }, + { + "epoch": 1.6155618437939725, + "grad_norm": 0.10939304530620575, + "learning_rate": 0.001153359580459331, + "loss": 2.2604, + "step": 417920 + }, + { + "epoch": 1.6156005009973557, + "grad_norm": 0.1213667020201683, + "learning_rate": 0.0011531233856103948, + "loss": 2.2608, + "step": 417930 + }, + { + "epoch": 1.615639158200739, + "grad_norm": 0.25866174697875977, + "learning_rate": 0.0011528872566181052, + "loss": 2.2706, + "step": 417940 + }, + { + "epoch": 1.6156778154041223, + "grad_norm": 0.10094214230775833, + "learning_rate": 0.0011526511934274058, + "loss": 2.2852, + "step": 417950 + }, + { + "epoch": 1.6157164726075057, + "grad_norm": 0.1102001741528511, + "learning_rate": 0.0011524151959833163, + "loss": 2.2738, + "step": 417960 + }, + { + "epoch": 1.615755129810889, + "grad_norm": 0.09849420934915543, + "learning_rate": 0.0011521792642309344, + "loss": 2.2626, + "step": 417970 + }, + { + "epoch": 1.6157937870142722, + "grad_norm": 0.09583023935556412, + "learning_rate": 0.0011519433981154324, + "loss": 2.2702, + "step": 417980 + }, + { + "epoch": 1.6158324442176555, + "grad_norm": 0.10503140091896057, + "learning_rate": 0.00115170759758206, + "loss": 2.2781, + "step": 417990 + }, + { + "epoch": 1.615871101421039, + "grad_norm": 0.10275324434041977, + "learning_rate": 0.001151471862576143, + "loss": 2.2715, + "step": 418000 + }, + { + "epoch": 1.6159097586244222, + "grad_norm": 0.10475151240825653, + "learning_rate": 0.0011512361930430823, + "loss": 2.2748, + "step": 418010 + }, + { + "epoch": 1.6159484158278055, + "grad_norm": 0.10688836872577667, + "learning_rate": 0.001151000588928355, + "loss": 2.2759, + "step": 418020 + }, + { + "epoch": 1.6159870730311887, + "grad_norm": 0.10008478909730911, + "learning_rate": 0.0011507650501775143, + "loss": 2.2548, + "step": 418030 + }, + { + "epoch": 1.616025730234572, + "grad_norm": 0.11711792647838593, + "learning_rate": 0.0011505295767361878, + "loss": 2.2767, + "step": 418040 + }, + { + "epoch": 1.6160643874379552, + "grad_norm": 0.1123252585530281, + "learning_rate": 0.0011502941685500798, + "loss": 2.2696, + "step": 418050 + }, + { + "epoch": 1.6161030446413385, + "grad_norm": 0.10946837067604065, + "learning_rate": 0.0011500588255649688, + "loss": 2.2647, + "step": 418060 + }, + { + "epoch": 1.6161417018447217, + "grad_norm": 0.11430559307336807, + "learning_rate": 0.0011498235477267087, + "loss": 2.266, + "step": 418070 + }, + { + "epoch": 1.616180359048105, + "grad_norm": 0.11472238600254059, + "learning_rate": 0.001149588334981228, + "loss": 2.2631, + "step": 418080 + }, + { + "epoch": 1.6162190162514882, + "grad_norm": 0.10393355786800385, + "learning_rate": 0.0011493531872745306, + "loss": 2.2642, + "step": 418090 + }, + { + "epoch": 1.6162576734548715, + "grad_norm": 0.10616232454776764, + "learning_rate": 0.0011491181045526942, + "loss": 2.2711, + "step": 418100 + }, + { + "epoch": 1.6162963306582547, + "grad_norm": 0.10849283635616302, + "learning_rate": 0.0011488830867618716, + "loss": 2.2722, + "step": 418110 + }, + { + "epoch": 1.616334987861638, + "grad_norm": 0.11171620339155197, + "learning_rate": 0.0011486481338482894, + "loss": 2.2597, + "step": 418120 + }, + { + "epoch": 1.6163736450650215, + "grad_norm": 0.10275192558765411, + "learning_rate": 0.0011484132457582495, + "loss": 2.2737, + "step": 418130 + }, + { + "epoch": 1.6164123022684047, + "grad_norm": 0.1024751290678978, + "learning_rate": 0.001148178422438126, + "loss": 2.2794, + "step": 418140 + }, + { + "epoch": 1.616450959471788, + "grad_norm": 0.1121668741106987, + "learning_rate": 0.0011479436638343683, + "loss": 2.2672, + "step": 418150 + }, + { + "epoch": 1.6164896166751712, + "grad_norm": 0.4614737033843994, + "learning_rate": 0.0011477089698934994, + "loss": 2.2657, + "step": 418160 + }, + { + "epoch": 1.6165282738785547, + "grad_norm": 0.11732994765043259, + "learning_rate": 0.0011474743405621153, + "loss": 2.2642, + "step": 418170 + }, + { + "epoch": 1.616566931081938, + "grad_norm": 0.10660345107316971, + "learning_rate": 0.001147239775786886, + "loss": 2.2844, + "step": 418180 + }, + { + "epoch": 1.6166055882853212, + "grad_norm": 0.10075422376394272, + "learning_rate": 0.0011470052755145552, + "loss": 2.271, + "step": 418190 + }, + { + "epoch": 1.6166442454887044, + "grad_norm": 0.11237199604511261, + "learning_rate": 0.0011467708396919382, + "loss": 2.26, + "step": 418200 + }, + { + "epoch": 1.6166829026920877, + "grad_norm": 0.11487163603305817, + "learning_rate": 0.0011465364682659255, + "loss": 2.2726, + "step": 418210 + }, + { + "epoch": 1.616721559895471, + "grad_norm": 0.09262558817863464, + "learning_rate": 0.001146302161183478, + "loss": 2.2807, + "step": 418220 + }, + { + "epoch": 1.6167602170988542, + "grad_norm": 0.09925800561904907, + "learning_rate": 0.0011460679183916322, + "loss": 2.2694, + "step": 418230 + }, + { + "epoch": 1.6167988743022375, + "grad_norm": 0.11094321310520172, + "learning_rate": 0.001145833739837495, + "loss": 2.2695, + "step": 418240 + }, + { + "epoch": 1.6168375315056207, + "grad_norm": 0.09434594959020615, + "learning_rate": 0.0011455996254682468, + "loss": 2.2744, + "step": 418250 + }, + { + "epoch": 1.616876188709004, + "grad_norm": 0.10965374112129211, + "learning_rate": 0.00114536557523114, + "loss": 2.264, + "step": 418260 + }, + { + "epoch": 1.6169148459123872, + "grad_norm": 0.10408959537744522, + "learning_rate": 0.0011451315890734994, + "loss": 2.2776, + "step": 418270 + }, + { + "epoch": 1.6169535031157705, + "grad_norm": 0.10442472994327545, + "learning_rate": 0.001144897666942722, + "loss": 2.2777, + "step": 418280 + }, + { + "epoch": 1.6169921603191537, + "grad_norm": 0.10826219618320465, + "learning_rate": 0.0011446638087862762, + "loss": 2.263, + "step": 418290 + }, + { + "epoch": 1.6170308175225372, + "grad_norm": 0.11713138967752457, + "learning_rate": 0.0011444300145517024, + "loss": 2.2698, + "step": 418300 + }, + { + "epoch": 1.6170694747259204, + "grad_norm": 0.1076996773481369, + "learning_rate": 0.0011441962841866134, + "loss": 2.2774, + "step": 418310 + }, + { + "epoch": 1.6171081319293037, + "grad_norm": 0.1485646516084671, + "learning_rate": 0.001143962617638692, + "loss": 2.2644, + "step": 418320 + }, + { + "epoch": 1.617146789132687, + "grad_norm": 0.10950548946857452, + "learning_rate": 0.0011437290148556942, + "loss": 2.2821, + "step": 418330 + }, + { + "epoch": 1.6171854463360704, + "grad_norm": 0.10513289272785187, + "learning_rate": 0.0011434954757854458, + "loss": 2.2699, + "step": 418340 + }, + { + "epoch": 1.6172241035394537, + "grad_norm": 0.1014241948723793, + "learning_rate": 0.0011432620003758442, + "loss": 2.2755, + "step": 418350 + }, + { + "epoch": 1.617262760742837, + "grad_norm": 0.09708770364522934, + "learning_rate": 0.0011430285885748578, + "loss": 2.2565, + "step": 418360 + }, + { + "epoch": 1.6173014179462202, + "grad_norm": 0.09347762167453766, + "learning_rate": 0.0011427952403305264, + "loss": 2.2654, + "step": 418370 + }, + { + "epoch": 1.6173400751496034, + "grad_norm": 0.09401445835828781, + "learning_rate": 0.0011425619555909595, + "loss": 2.2756, + "step": 418380 + }, + { + "epoch": 1.6173787323529867, + "grad_norm": 0.09593868255615234, + "learning_rate": 0.0011423287343043374, + "loss": 2.2673, + "step": 418390 + }, + { + "epoch": 1.61741738955637, + "grad_norm": 0.09695480018854141, + "learning_rate": 0.0011420955764189114, + "loss": 2.2621, + "step": 418400 + }, + { + "epoch": 1.6174560467597532, + "grad_norm": 0.10635862499475479, + "learning_rate": 0.0011418624818830025, + "loss": 2.2752, + "step": 418410 + }, + { + "epoch": 1.6174947039631364, + "grad_norm": 0.0985569879412651, + "learning_rate": 0.0011416294506450028, + "loss": 2.2679, + "step": 418420 + }, + { + "epoch": 1.6175333611665197, + "grad_norm": 0.10103638470172882, + "learning_rate": 0.0011413964826533726, + "loss": 2.2672, + "step": 418430 + }, + { + "epoch": 1.617572018369903, + "grad_norm": 0.11381982266902924, + "learning_rate": 0.0011411635778566444, + "loss": 2.2776, + "step": 418440 + }, + { + "epoch": 1.6176106755732862, + "grad_norm": 0.09681716561317444, + "learning_rate": 0.0011409307362034186, + "loss": 2.2727, + "step": 418450 + }, + { + "epoch": 1.6176493327766694, + "grad_norm": 0.10214970260858536, + "learning_rate": 0.0011406979576423666, + "loss": 2.2743, + "step": 418460 + }, + { + "epoch": 1.617687989980053, + "grad_norm": 0.10435772687196732, + "learning_rate": 0.001140465242122228, + "loss": 2.2689, + "step": 418470 + }, + { + "epoch": 1.6177266471834362, + "grad_norm": 0.10388590395450592, + "learning_rate": 0.0011402325895918128, + "loss": 2.2792, + "step": 418480 + }, + { + "epoch": 1.6177653043868194, + "grad_norm": 0.0986008271574974, + "learning_rate": 0.0011400000000000002, + "loss": 2.2697, + "step": 418490 + }, + { + "epoch": 1.6178039615902027, + "grad_norm": 0.09806541353464127, + "learning_rate": 0.0011397674732957373, + "loss": 2.2812, + "step": 418500 + }, + { + "epoch": 1.6178426187935862, + "grad_norm": 0.11309289932250977, + "learning_rate": 0.0011395350094280418, + "loss": 2.254, + "step": 418510 + }, + { + "epoch": 1.6178812759969694, + "grad_norm": 0.09836818277835846, + "learning_rate": 0.0011393026083459997, + "loss": 2.2732, + "step": 418520 + }, + { + "epoch": 1.6179199332003527, + "grad_norm": 0.09785986691713333, + "learning_rate": 0.0011390702699987647, + "loss": 2.2674, + "step": 418530 + }, + { + "epoch": 1.617958590403736, + "grad_norm": 0.11060678213834763, + "learning_rate": 0.0011388379943355607, + "loss": 2.2843, + "step": 418540 + }, + { + "epoch": 1.6179972476071192, + "grad_norm": 0.10308713465929031, + "learning_rate": 0.001138605781305679, + "loss": 2.2775, + "step": 418550 + }, + { + "epoch": 1.6180359048105024, + "grad_norm": 0.10552119463682175, + "learning_rate": 0.0011383736308584794, + "loss": 2.2556, + "step": 418560 + }, + { + "epoch": 1.6180745620138857, + "grad_norm": 0.10252942144870758, + "learning_rate": 0.0011381415429433904, + "loss": 2.2814, + "step": 418570 + }, + { + "epoch": 1.618113219217269, + "grad_norm": 0.09974144399166107, + "learning_rate": 0.0011379095175099079, + "loss": 2.2785, + "step": 418580 + }, + { + "epoch": 1.6181518764206522, + "grad_norm": 0.1105218306183815, + "learning_rate": 0.001137677554507596, + "loss": 2.2559, + "step": 418590 + }, + { + "epoch": 1.6181905336240354, + "grad_norm": 0.09096836298704147, + "learning_rate": 0.0011374456538860872, + "loss": 2.2802, + "step": 418600 + }, + { + "epoch": 1.6182291908274187, + "grad_norm": 0.10733989626169205, + "learning_rate": 0.0011372138155950804, + "loss": 2.2743, + "step": 418610 + }, + { + "epoch": 1.618267848030802, + "grad_norm": 0.09750019013881683, + "learning_rate": 0.0011369820395843433, + "loss": 2.2576, + "step": 418620 + }, + { + "epoch": 1.6183065052341852, + "grad_norm": 0.10365524888038635, + "learning_rate": 0.0011367503258037104, + "loss": 2.2797, + "step": 418630 + }, + { + "epoch": 1.6183451624375687, + "grad_norm": 0.11139106005430222, + "learning_rate": 0.0011365186742030838, + "loss": 2.2517, + "step": 418640 + }, + { + "epoch": 1.618383819640952, + "grad_norm": 0.12801063060760498, + "learning_rate": 0.0011362870847324326, + "loss": 2.2695, + "step": 418650 + }, + { + "epoch": 1.6184224768443352, + "grad_norm": 0.11003414541482925, + "learning_rate": 0.001136055557341793, + "loss": 2.2759, + "step": 418660 + }, + { + "epoch": 1.6184611340477184, + "grad_norm": 0.10692291706800461, + "learning_rate": 0.001135824091981268, + "loss": 2.267, + "step": 418670 + }, + { + "epoch": 1.6184997912511019, + "grad_norm": 0.09166417270898819, + "learning_rate": 0.0011355926886010277, + "loss": 2.2655, + "step": 418680 + }, + { + "epoch": 1.6185384484544851, + "grad_norm": 0.10425940155982971, + "learning_rate": 0.0011353613471513087, + "loss": 2.2675, + "step": 418690 + }, + { + "epoch": 1.6185771056578684, + "grad_norm": 0.10555826127529144, + "learning_rate": 0.0011351300675824139, + "loss": 2.2808, + "step": 418700 + }, + { + "epoch": 1.6186157628612516, + "grad_norm": 0.0973772332072258, + "learning_rate": 0.0011348988498447133, + "loss": 2.2829, + "step": 418710 + }, + { + "epoch": 1.618654420064635, + "grad_norm": 0.11358707398176193, + "learning_rate": 0.0011346676938886425, + "loss": 2.2616, + "step": 418720 + }, + { + "epoch": 1.6186930772680181, + "grad_norm": 0.10206224024295807, + "learning_rate": 0.0011344365996647041, + "loss": 2.2683, + "step": 418730 + }, + { + "epoch": 1.6187317344714014, + "grad_norm": 0.10238737612962723, + "learning_rate": 0.0011342055671234655, + "loss": 2.2721, + "step": 418740 + }, + { + "epoch": 1.6187703916747846, + "grad_norm": 0.10538827627897263, + "learning_rate": 0.0011339745962155613, + "loss": 2.2629, + "step": 418750 + }, + { + "epoch": 1.618809048878168, + "grad_norm": 0.10559193789958954, + "learning_rate": 0.0011337436868916915, + "loss": 2.2549, + "step": 418760 + }, + { + "epoch": 1.6188477060815512, + "grad_norm": 0.11526539921760559, + "learning_rate": 0.0011335128391026213, + "loss": 2.2805, + "step": 418770 + }, + { + "epoch": 1.6188863632849344, + "grad_norm": 0.1010223776102066, + "learning_rate": 0.001133282052799182, + "loss": 2.2703, + "step": 418780 + }, + { + "epoch": 1.6189250204883177, + "grad_norm": 0.11898636072874069, + "learning_rate": 0.0011330513279322703, + "loss": 2.2674, + "step": 418790 + }, + { + "epoch": 1.618963677691701, + "grad_norm": 0.11917950958013535, + "learning_rate": 0.001132820664452848, + "loss": 2.268, + "step": 418800 + }, + { + "epoch": 1.6190023348950844, + "grad_norm": 0.10815060138702393, + "learning_rate": 0.0011325900623119422, + "loss": 2.2786, + "step": 418810 + }, + { + "epoch": 1.6190409920984676, + "grad_norm": 0.09814000129699707, + "learning_rate": 0.0011323595214606454, + "loss": 2.2581, + "step": 418820 + }, + { + "epoch": 1.6190796493018509, + "grad_norm": 0.09670665860176086, + "learning_rate": 0.0011321290418501147, + "loss": 2.2665, + "step": 418830 + }, + { + "epoch": 1.6191183065052341, + "grad_norm": 0.11535345762968063, + "learning_rate": 0.0011318986234315719, + "loss": 2.2738, + "step": 418840 + }, + { + "epoch": 1.6191569637086176, + "grad_norm": 0.12406040728092194, + "learning_rate": 0.0011316682661563035, + "loss": 2.281, + "step": 418850 + }, + { + "epoch": 1.6191956209120009, + "grad_norm": 0.10750404745340347, + "learning_rate": 0.0011314379699756615, + "loss": 2.2669, + "step": 418860 + }, + { + "epoch": 1.6192342781153841, + "grad_norm": 0.10614997148513794, + "learning_rate": 0.0011312077348410612, + "loss": 2.2741, + "step": 418870 + }, + { + "epoch": 1.6192729353187674, + "grad_norm": 0.11189042776823044, + "learning_rate": 0.0011309775607039826, + "loss": 2.2819, + "step": 418880 + }, + { + "epoch": 1.6193115925221506, + "grad_norm": 0.10751363635063171, + "learning_rate": 0.001130747447515971, + "loss": 2.2717, + "step": 418890 + }, + { + "epoch": 1.6193502497255339, + "grad_norm": 0.10153806954622269, + "learning_rate": 0.0011305173952286336, + "loss": 2.2667, + "step": 418900 + }, + { + "epoch": 1.6193889069289171, + "grad_norm": 0.10106009989976883, + "learning_rate": 0.001130287403793644, + "loss": 2.2728, + "step": 418910 + }, + { + "epoch": 1.6194275641323004, + "grad_norm": 0.10318268090486526, + "learning_rate": 0.0011300574731627382, + "loss": 2.2667, + "step": 418920 + }, + { + "epoch": 1.6194662213356836, + "grad_norm": 0.11694622784852982, + "learning_rate": 0.0011298276032877164, + "loss": 2.263, + "step": 418930 + }, + { + "epoch": 1.6195048785390669, + "grad_norm": 0.10246624052524567, + "learning_rate": 0.0011295977941204423, + "loss": 2.2536, + "step": 418940 + }, + { + "epoch": 1.6195435357424501, + "grad_norm": 0.10643582791090012, + "learning_rate": 0.0011293680456128433, + "loss": 2.2751, + "step": 418950 + }, + { + "epoch": 1.6195821929458334, + "grad_norm": 0.09867900609970093, + "learning_rate": 0.0011291383577169106, + "loss": 2.2763, + "step": 418960 + }, + { + "epoch": 1.6196208501492166, + "grad_norm": 0.11613151431083679, + "learning_rate": 0.0011289087303846972, + "loss": 2.2676, + "step": 418970 + }, + { + "epoch": 1.6196595073526001, + "grad_norm": 0.10688113421201706, + "learning_rate": 0.0011286791635683215, + "loss": 2.2639, + "step": 418980 + }, + { + "epoch": 1.6196981645559834, + "grad_norm": 0.1168067455291748, + "learning_rate": 0.001128449657219963, + "loss": 2.2835, + "step": 418990 + }, + { + "epoch": 1.6197368217593666, + "grad_norm": 0.11430569738149643, + "learning_rate": 0.0011282202112918654, + "loss": 2.2703, + "step": 419000 + }, + { + "epoch": 1.6197754789627499, + "grad_norm": 0.09900334477424622, + "learning_rate": 0.0011279908257363345, + "loss": 2.2908, + "step": 419010 + }, + { + "epoch": 1.6198141361661333, + "grad_norm": 0.09957217425107956, + "learning_rate": 0.0011277615005057392, + "loss": 2.2657, + "step": 419020 + }, + { + "epoch": 1.6198527933695166, + "grad_norm": 0.1115746945142746, + "learning_rate": 0.0011275322355525103, + "loss": 2.2734, + "step": 419030 + }, + { + "epoch": 1.6198914505728998, + "grad_norm": 0.1849929392337799, + "learning_rate": 0.0011273030308291427, + "loss": 2.2604, + "step": 419040 + }, + { + "epoch": 1.619930107776283, + "grad_norm": 0.1009269431233406, + "learning_rate": 0.0011270738862881923, + "loss": 2.2667, + "step": 419050 + }, + { + "epoch": 1.6199687649796664, + "grad_norm": 0.11893726885318756, + "learning_rate": 0.0011268448018822773, + "loss": 2.2531, + "step": 419060 + }, + { + "epoch": 1.6200074221830496, + "grad_norm": 0.09925411641597748, + "learning_rate": 0.001126615777564078, + "loss": 2.2606, + "step": 419070 + }, + { + "epoch": 1.6200460793864329, + "grad_norm": 0.09997145086526871, + "learning_rate": 0.001126386813286338, + "loss": 2.2594, + "step": 419080 + }, + { + "epoch": 1.620084736589816, + "grad_norm": 0.10257407277822495, + "learning_rate": 0.001126157909001861, + "loss": 2.272, + "step": 419090 + }, + { + "epoch": 1.6201233937931994, + "grad_norm": 0.11000483483076096, + "learning_rate": 0.0011259290646635138, + "loss": 2.2661, + "step": 419100 + }, + { + "epoch": 1.6201620509965826, + "grad_norm": 0.10379195958375931, + "learning_rate": 0.0011257002802242243, + "loss": 2.2706, + "step": 419110 + }, + { + "epoch": 1.6202007081999659, + "grad_norm": 0.10364500433206558, + "learning_rate": 0.001125471555636982, + "loss": 2.2732, + "step": 419120 + }, + { + "epoch": 1.6202393654033491, + "grad_norm": 0.09665635973215103, + "learning_rate": 0.0011252428908548385, + "loss": 2.2565, + "step": 419130 + }, + { + "epoch": 1.6202780226067326, + "grad_norm": 0.11705910414457321, + "learning_rate": 0.0011250142858309057, + "loss": 2.2614, + "step": 419140 + }, + { + "epoch": 1.6203166798101158, + "grad_norm": 0.10699387639760971, + "learning_rate": 0.0011247857405183575, + "loss": 2.2674, + "step": 419150 + }, + { + "epoch": 1.620355337013499, + "grad_norm": 0.11837508529424667, + "learning_rate": 0.0011245572548704286, + "loss": 2.261, + "step": 419160 + }, + { + "epoch": 1.6203939942168823, + "grad_norm": 0.10276247560977936, + "learning_rate": 0.0011243288288404144, + "loss": 2.2798, + "step": 419170 + }, + { + "epoch": 1.6204326514202656, + "grad_norm": 0.10083647817373276, + "learning_rate": 0.0011241004623816725, + "loss": 2.2655, + "step": 419180 + }, + { + "epoch": 1.620471308623649, + "grad_norm": 0.10014968365430832, + "learning_rate": 0.0011238721554476198, + "loss": 2.267, + "step": 419190 + }, + { + "epoch": 1.6205099658270323, + "grad_norm": 0.10161503404378891, + "learning_rate": 0.0011236439079917342, + "loss": 2.2813, + "step": 419200 + }, + { + "epoch": 1.6205486230304156, + "grad_norm": 0.11093269288539886, + "learning_rate": 0.001123415719967555, + "loss": 2.2858, + "step": 419210 + }, + { + "epoch": 1.6205872802337988, + "grad_norm": 0.10087086260318756, + "learning_rate": 0.001123187591328681, + "loss": 2.2699, + "step": 419220 + }, + { + "epoch": 1.620625937437182, + "grad_norm": 0.11747071892023087, + "learning_rate": 0.001122959522028772, + "loss": 2.2757, + "step": 419230 + }, + { + "epoch": 1.6206645946405653, + "grad_norm": 0.11029363423585892, + "learning_rate": 0.0011227315120215478, + "loss": 2.2642, + "step": 419240 + }, + { + "epoch": 1.6207032518439486, + "grad_norm": 0.12040729075670242, + "learning_rate": 0.0011225035612607876, + "loss": 2.2605, + "step": 419250 + }, + { + "epoch": 1.6207419090473318, + "grad_norm": 0.11009768396615982, + "learning_rate": 0.0011222756697003324, + "loss": 2.2727, + "step": 419260 + }, + { + "epoch": 1.620780566250715, + "grad_norm": 0.11284641921520233, + "learning_rate": 0.0011220478372940812, + "loss": 2.2584, + "step": 419270 + }, + { + "epoch": 1.6208192234540983, + "grad_norm": 0.10155908763408661, + "learning_rate": 0.0011218200639959942, + "loss": 2.2634, + "step": 419280 + }, + { + "epoch": 1.6208578806574816, + "grad_norm": 0.10522062331438065, + "learning_rate": 0.0011215923497600898, + "loss": 2.2669, + "step": 419290 + }, + { + "epoch": 1.6208965378608648, + "grad_norm": 0.09858293831348419, + "learning_rate": 0.001121364694540448, + "loss": 2.2692, + "step": 419300 + }, + { + "epoch": 1.6209351950642483, + "grad_norm": 0.11517447233200073, + "learning_rate": 0.001121137098291207, + "loss": 2.2639, + "step": 419310 + }, + { + "epoch": 1.6209738522676316, + "grad_norm": 0.10266264528036118, + "learning_rate": 0.0011209095609665636, + "loss": 2.2645, + "step": 419320 + }, + { + "epoch": 1.6210125094710148, + "grad_norm": 0.1053464263677597, + "learning_rate": 0.0011206820825207756, + "loss": 2.2579, + "step": 419330 + }, + { + "epoch": 1.621051166674398, + "grad_norm": 0.09718520939350128, + "learning_rate": 0.001120454662908159, + "loss": 2.2619, + "step": 419340 + }, + { + "epoch": 1.6210898238777813, + "grad_norm": 0.13670788705348969, + "learning_rate": 0.0011202273020830891, + "loss": 2.2653, + "step": 419350 + }, + { + "epoch": 1.6211284810811648, + "grad_norm": 0.09898941963911057, + "learning_rate": 0.0011200000000000001, + "loss": 2.2638, + "step": 419360 + }, + { + "epoch": 1.621167138284548, + "grad_norm": 0.10254188627004623, + "learning_rate": 0.0011197727566133846, + "loss": 2.2611, + "step": 419370 + }, + { + "epoch": 1.6212057954879313, + "grad_norm": 0.12132681906223297, + "learning_rate": 0.0011195455718777945, + "loss": 2.2588, + "step": 419380 + }, + { + "epoch": 1.6212444526913146, + "grad_norm": 0.10235419124364853, + "learning_rate": 0.0011193184457478403, + "loss": 2.2632, + "step": 419390 + }, + { + "epoch": 1.6212831098946978, + "grad_norm": 0.08859186619520187, + "learning_rate": 0.0011190913781781904, + "loss": 2.2639, + "step": 419400 + }, + { + "epoch": 1.621321767098081, + "grad_norm": 0.10409567505121231, + "learning_rate": 0.0011188643691235724, + "loss": 2.2676, + "step": 419410 + }, + { + "epoch": 1.6213604243014643, + "grad_norm": 0.10689257085323334, + "learning_rate": 0.0011186374185387718, + "loss": 2.2685, + "step": 419420 + }, + { + "epoch": 1.6213990815048476, + "grad_norm": 0.09902604669332504, + "learning_rate": 0.0011184105263786323, + "loss": 2.2599, + "step": 419430 + }, + { + "epoch": 1.6214377387082308, + "grad_norm": 0.10523606836795807, + "learning_rate": 0.001118183692598056, + "loss": 2.2822, + "step": 419440 + }, + { + "epoch": 1.621476395911614, + "grad_norm": 0.11436944454908371, + "learning_rate": 0.0011179569171520024, + "loss": 2.2699, + "step": 419450 + }, + { + "epoch": 1.6215150531149973, + "grad_norm": 0.10277509689331055, + "learning_rate": 0.001117730199995489, + "loss": 2.2901, + "step": 419460 + }, + { + "epoch": 1.6215537103183806, + "grad_norm": 0.0945189818739891, + "learning_rate": 0.0011175035410835917, + "loss": 2.2642, + "step": 419470 + }, + { + "epoch": 1.621592367521764, + "grad_norm": 0.10312645882368088, + "learning_rate": 0.0011172769403714436, + "loss": 2.255, + "step": 419480 + }, + { + "epoch": 1.6216310247251473, + "grad_norm": 0.10012654960155487, + "learning_rate": 0.0011170503978142354, + "loss": 2.2658, + "step": 419490 + }, + { + "epoch": 1.6216696819285306, + "grad_norm": 0.11879194527864456, + "learning_rate": 0.0011168239133672153, + "loss": 2.2703, + "step": 419500 + }, + { + "epoch": 1.6217083391319138, + "grad_norm": 0.10942061245441437, + "learning_rate": 0.001116597486985689, + "loss": 2.278, + "step": 419510 + }, + { + "epoch": 1.621746996335297, + "grad_norm": 0.10702095180749893, + "learning_rate": 0.0011163711186250192, + "loss": 2.2676, + "step": 419520 + }, + { + "epoch": 1.6217856535386805, + "grad_norm": 0.09381652623414993, + "learning_rate": 0.0011161448082406258, + "loss": 2.2658, + "step": 419530 + }, + { + "epoch": 1.6218243107420638, + "grad_norm": 0.11745423823595047, + "learning_rate": 0.0011159185557879862, + "loss": 2.2641, + "step": 419540 + }, + { + "epoch": 1.621862967945447, + "grad_norm": 0.09949396550655365, + "learning_rate": 0.0011156923612226342, + "loss": 2.2657, + "step": 419550 + }, + { + "epoch": 1.6219016251488303, + "grad_norm": 0.10507374256849289, + "learning_rate": 0.0011154662245001607, + "loss": 2.2696, + "step": 419560 + }, + { + "epoch": 1.6219402823522135, + "grad_norm": 0.10793768614530563, + "learning_rate": 0.0011152401455762134, + "loss": 2.2578, + "step": 419570 + }, + { + "epoch": 1.6219789395555968, + "grad_norm": 0.10195645689964294, + "learning_rate": 0.0011150141244064967, + "loss": 2.2606, + "step": 419580 + }, + { + "epoch": 1.62201759675898, + "grad_norm": 0.09891880303621292, + "learning_rate": 0.001114788160946771, + "loss": 2.2521, + "step": 419590 + }, + { + "epoch": 1.6220562539623633, + "grad_norm": 0.1110696867108345, + "learning_rate": 0.0011145622551528539, + "loss": 2.2876, + "step": 419600 + }, + { + "epoch": 1.6220949111657466, + "grad_norm": 0.10356144607067108, + "learning_rate": 0.0011143364069806188, + "loss": 2.2686, + "step": 419610 + }, + { + "epoch": 1.6221335683691298, + "grad_norm": 0.09793224185705185, + "learning_rate": 0.001114110616385996, + "loss": 2.2732, + "step": 419620 + }, + { + "epoch": 1.622172225572513, + "grad_norm": 0.21252916753292084, + "learning_rate": 0.001113884883324971, + "loss": 2.2568, + "step": 419630 + }, + { + "epoch": 1.6222108827758963, + "grad_norm": 0.5498322248458862, + "learning_rate": 0.0011136592077535866, + "loss": 2.2856, + "step": 419640 + }, + { + "epoch": 1.6222495399792798, + "grad_norm": 0.10912423580884933, + "learning_rate": 0.00111343358962794, + "loss": 2.2727, + "step": 419650 + }, + { + "epoch": 1.622288197182663, + "grad_norm": 0.1227998435497284, + "learning_rate": 0.0011132080289041855, + "loss": 2.2589, + "step": 419660 + }, + { + "epoch": 1.6223268543860463, + "grad_norm": 0.10604656487703323, + "learning_rate": 0.0011129825255385326, + "loss": 2.2862, + "step": 419670 + }, + { + "epoch": 1.6223655115894295, + "grad_norm": 0.1027936339378357, + "learning_rate": 0.0011127570794872469, + "loss": 2.2722, + "step": 419680 + }, + { + "epoch": 1.622404168792813, + "grad_norm": 0.09441990405321121, + "learning_rate": 0.0011125316907066485, + "loss": 2.2821, + "step": 419690 + }, + { + "epoch": 1.6224428259961963, + "grad_norm": 0.11164455115795135, + "learning_rate": 0.0011123063591531141, + "loss": 2.2644, + "step": 419700 + }, + { + "epoch": 1.6224814831995795, + "grad_norm": 0.11239205300807953, + "learning_rate": 0.0011120810847830755, + "loss": 2.2709, + "step": 419710 + }, + { + "epoch": 1.6225201404029628, + "grad_norm": 0.10172021389007568, + "learning_rate": 0.0011118558675530196, + "loss": 2.2622, + "step": 419720 + }, + { + "epoch": 1.622558797606346, + "grad_norm": 0.11356845498085022, + "learning_rate": 0.0011116307074194877, + "loss": 2.2746, + "step": 419730 + }, + { + "epoch": 1.6225974548097293, + "grad_norm": 0.11192918568849564, + "learning_rate": 0.0011114056043390775, + "loss": 2.2632, + "step": 419740 + }, + { + "epoch": 1.6226361120131125, + "grad_norm": 0.10720881819725037, + "learning_rate": 0.001111180558268441, + "loss": 2.2551, + "step": 419750 + }, + { + "epoch": 1.6226747692164958, + "grad_norm": 0.10844343900680542, + "learning_rate": 0.0011109555691642852, + "loss": 2.2425, + "step": 419760 + }, + { + "epoch": 1.622713426419879, + "grad_norm": 0.09997853636741638, + "learning_rate": 0.0011107306369833715, + "loss": 2.2721, + "step": 419770 + }, + { + "epoch": 1.6227520836232623, + "grad_norm": 0.11213473230600357, + "learning_rate": 0.0011105057616825165, + "loss": 2.2665, + "step": 419780 + }, + { + "epoch": 1.6227907408266455, + "grad_norm": 0.11055673658847809, + "learning_rate": 0.0011102809432185913, + "loss": 2.2698, + "step": 419790 + }, + { + "epoch": 1.6228293980300288, + "grad_norm": 0.13885144889354706, + "learning_rate": 0.0011100561815485205, + "loss": 2.2631, + "step": 419800 + }, + { + "epoch": 1.622868055233412, + "grad_norm": 0.09793928265571594, + "learning_rate": 0.0011098314766292846, + "loss": 2.269, + "step": 419810 + }, + { + "epoch": 1.6229067124367955, + "grad_norm": 0.09951359778642654, + "learning_rate": 0.0011096068284179176, + "loss": 2.2551, + "step": 419820 + }, + { + "epoch": 1.6229453696401788, + "grad_norm": 0.11357228457927704, + "learning_rate": 0.0011093822368715073, + "loss": 2.2581, + "step": 419830 + }, + { + "epoch": 1.622984026843562, + "grad_norm": 0.11530306935310364, + "learning_rate": 0.0011091577019471965, + "loss": 2.2807, + "step": 419840 + }, + { + "epoch": 1.6230226840469453, + "grad_norm": 0.1709367036819458, + "learning_rate": 0.0011089332236021814, + "loss": 2.2832, + "step": 419850 + }, + { + "epoch": 1.6230613412503287, + "grad_norm": 0.10396969318389893, + "learning_rate": 0.0011087088017937123, + "loss": 2.2638, + "step": 419860 + }, + { + "epoch": 1.623099998453712, + "grad_norm": 0.0975300669670105, + "learning_rate": 0.0011084844364790932, + "loss": 2.2631, + "step": 419870 + }, + { + "epoch": 1.6231386556570953, + "grad_norm": 0.09769925475120544, + "learning_rate": 0.001108260127615682, + "loss": 2.2639, + "step": 419880 + }, + { + "epoch": 1.6231773128604785, + "grad_norm": 0.10420463979244232, + "learning_rate": 0.0011080358751608896, + "loss": 2.2704, + "step": 419890 + }, + { + "epoch": 1.6232159700638618, + "grad_norm": 0.10101456940174103, + "learning_rate": 0.0011078116790721815, + "loss": 2.2717, + "step": 419900 + }, + { + "epoch": 1.623254627267245, + "grad_norm": 0.09862508624792099, + "learning_rate": 0.0011075875393070757, + "loss": 2.268, + "step": 419910 + }, + { + "epoch": 1.6232932844706283, + "grad_norm": 0.10348057001829147, + "learning_rate": 0.001107363455823144, + "loss": 2.2682, + "step": 419920 + }, + { + "epoch": 1.6233319416740115, + "grad_norm": 0.09531467407941818, + "learning_rate": 0.0011071394285780114, + "loss": 2.2713, + "step": 419930 + }, + { + "epoch": 1.6233705988773948, + "grad_norm": 0.10556498169898987, + "learning_rate": 0.0011069154575293555, + "loss": 2.287, + "step": 419940 + }, + { + "epoch": 1.623409256080778, + "grad_norm": 0.10589485615491867, + "learning_rate": 0.0011066915426349083, + "loss": 2.2685, + "step": 419950 + }, + { + "epoch": 1.6234479132841613, + "grad_norm": 0.10087093710899353, + "learning_rate": 0.001106467683852453, + "loss": 2.2755, + "step": 419960 + }, + { + "epoch": 1.6234865704875445, + "grad_norm": 0.1181773990392685, + "learning_rate": 0.001106243881139827, + "loss": 2.2547, + "step": 419970 + }, + { + "epoch": 1.6235252276909278, + "grad_norm": 0.11872897297143936, + "learning_rate": 0.0011060201344549197, + "loss": 2.2633, + "step": 419980 + }, + { + "epoch": 1.6235638848943112, + "grad_norm": 0.10826791822910309, + "learning_rate": 0.0011057964437556737, + "loss": 2.2715, + "step": 419990 + }, + { + "epoch": 1.6236025420976945, + "grad_norm": 0.11399926245212555, + "learning_rate": 0.0011055728090000843, + "loss": 2.2629, + "step": 420000 + }, + { + "epoch": 1.6236411993010778, + "grad_norm": 0.10690005123615265, + "learning_rate": 0.0011053492301461986, + "loss": 2.2877, + "step": 420010 + }, + { + "epoch": 1.623679856504461, + "grad_norm": 0.09525414556264877, + "learning_rate": 0.0011051257071521164, + "loss": 2.2659, + "step": 420020 + }, + { + "epoch": 1.6237185137078445, + "grad_norm": 0.11313311755657196, + "learning_rate": 0.0011049022399759903, + "loss": 2.26, + "step": 420030 + }, + { + "epoch": 1.6237571709112277, + "grad_norm": 0.09924615174531937, + "learning_rate": 0.0011046788285760246, + "loss": 2.2647, + "step": 420040 + }, + { + "epoch": 1.623795828114611, + "grad_norm": 0.11179231852293015, + "learning_rate": 0.0011044554729104754, + "loss": 2.2733, + "step": 420050 + }, + { + "epoch": 1.6238344853179942, + "grad_norm": 0.11675616353750229, + "learning_rate": 0.0011042321729376524, + "loss": 2.2761, + "step": 420060 + }, + { + "epoch": 1.6238731425213775, + "grad_norm": 0.10813635587692261, + "learning_rate": 0.0011040089286159151, + "loss": 2.2721, + "step": 420070 + }, + { + "epoch": 1.6239117997247607, + "grad_norm": 0.10933609306812286, + "learning_rate": 0.0011037857399036769, + "loss": 2.2706, + "step": 420080 + }, + { + "epoch": 1.623950456928144, + "grad_norm": 0.10850953310728073, + "learning_rate": 0.0011035626067594012, + "loss": 2.2633, + "step": 420090 + }, + { + "epoch": 1.6239891141315272, + "grad_norm": 0.10027583688497543, + "learning_rate": 0.0011033395291416043, + "loss": 2.2656, + "step": 420100 + }, + { + "epoch": 1.6240277713349105, + "grad_norm": 0.12081553786993027, + "learning_rate": 0.0011031165070088536, + "loss": 2.2705, + "step": 420110 + }, + { + "epoch": 1.6240664285382937, + "grad_norm": 0.10444925725460052, + "learning_rate": 0.001102893540319768, + "loss": 2.2702, + "step": 420120 + }, + { + "epoch": 1.624105085741677, + "grad_norm": 0.09450006484985352, + "learning_rate": 0.0011026706290330178, + "loss": 2.2757, + "step": 420130 + }, + { + "epoch": 1.6241437429450603, + "grad_norm": 0.10357379168272018, + "learning_rate": 0.001102447773107325, + "loss": 2.2538, + "step": 420140 + }, + { + "epoch": 1.6241824001484435, + "grad_norm": 0.10619291663169861, + "learning_rate": 0.0011022249725014623, + "loss": 2.2647, + "step": 420150 + }, + { + "epoch": 1.624221057351827, + "grad_norm": 0.10615566372871399, + "learning_rate": 0.0011020022271742542, + "loss": 2.2627, + "step": 420160 + }, + { + "epoch": 1.6242597145552102, + "grad_norm": 0.10659230500459671, + "learning_rate": 0.0011017795370845752, + "loss": 2.2682, + "step": 420170 + }, + { + "epoch": 1.6242983717585935, + "grad_norm": 0.113923080265522, + "learning_rate": 0.0011015569021913519, + "loss": 2.2535, + "step": 420180 + }, + { + "epoch": 1.6243370289619767, + "grad_norm": 0.15182152390480042, + "learning_rate": 0.0011013343224535613, + "loss": 2.2496, + "step": 420190 + }, + { + "epoch": 1.6243756861653602, + "grad_norm": 0.10359169542789459, + "learning_rate": 0.0011011117978302308, + "loss": 2.2733, + "step": 420200 + }, + { + "epoch": 1.6244143433687435, + "grad_norm": 0.12292884290218353, + "learning_rate": 0.0011008893282804392, + "loss": 2.2629, + "step": 420210 + }, + { + "epoch": 1.6244530005721267, + "grad_norm": 0.09783503413200378, + "learning_rate": 0.0011006669137633154, + "loss": 2.2841, + "step": 420220 + }, + { + "epoch": 1.62449165777551, + "grad_norm": 0.10937552899122238, + "learning_rate": 0.0011004445542380393, + "loss": 2.2746, + "step": 420230 + }, + { + "epoch": 1.6245303149788932, + "grad_norm": 0.10353318601846695, + "learning_rate": 0.0011002222496638407, + "loss": 2.2522, + "step": 420240 + }, + { + "epoch": 1.6245689721822765, + "grad_norm": 0.10546938329935074, + "learning_rate": 0.0011, + "loss": 2.2612, + "step": 420250 + }, + { + "epoch": 1.6246076293856597, + "grad_norm": 0.10323894023895264, + "learning_rate": 0.0010997778052058483, + "loss": 2.2683, + "step": 420260 + }, + { + "epoch": 1.624646286589043, + "grad_norm": 0.10143966972827911, + "learning_rate": 0.0010995556652407656, + "loss": 2.2774, + "step": 420270 + }, + { + "epoch": 1.6246849437924262, + "grad_norm": 0.10892364382743835, + "learning_rate": 0.0010993335800641838, + "loss": 2.2676, + "step": 420280 + }, + { + "epoch": 1.6247236009958095, + "grad_norm": 0.10089428722858429, + "learning_rate": 0.0010991115496355834, + "loss": 2.2923, + "step": 420290 + }, + { + "epoch": 1.6247622581991927, + "grad_norm": 0.10978174954652786, + "learning_rate": 0.0010988895739144954, + "loss": 2.2562, + "step": 420300 + }, + { + "epoch": 1.624800915402576, + "grad_norm": 0.11797042936086655, + "learning_rate": 0.0010986676528605003, + "loss": 2.2604, + "step": 420310 + }, + { + "epoch": 1.6248395726059592, + "grad_norm": 0.10992129147052765, + "learning_rate": 0.0010984457864332284, + "loss": 2.2747, + "step": 420320 + }, + { + "epoch": 1.6248782298093427, + "grad_norm": 0.12120220065116882, + "learning_rate": 0.0010982239745923603, + "loss": 2.2727, + "step": 420330 + }, + { + "epoch": 1.624916887012726, + "grad_norm": 0.10300073027610779, + "learning_rate": 0.0010980022172976255, + "loss": 2.2783, + "step": 420340 + }, + { + "epoch": 1.6249555442161092, + "grad_norm": 0.11692921817302704, + "learning_rate": 0.0010977805145088034, + "loss": 2.274, + "step": 420350 + }, + { + "epoch": 1.6249942014194925, + "grad_norm": 0.12124430388212204, + "learning_rate": 0.0010975588661857217, + "loss": 2.2814, + "step": 420360 + }, + { + "epoch": 1.625032858622876, + "grad_norm": 0.10215267539024353, + "learning_rate": 0.0010973372722882595, + "loss": 2.271, + "step": 420370 + }, + { + "epoch": 1.6250715158262592, + "grad_norm": 0.10612752288579941, + "learning_rate": 0.001097115732776343, + "loss": 2.2673, + "step": 420380 + }, + { + "epoch": 1.6251101730296424, + "grad_norm": 0.09550405293703079, + "learning_rate": 0.001096894247609949, + "loss": 2.271, + "step": 420390 + }, + { + "epoch": 1.6251488302330257, + "grad_norm": 0.1077994704246521, + "learning_rate": 0.001096672816749103, + "loss": 2.2609, + "step": 420400 + }, + { + "epoch": 1.625187487436409, + "grad_norm": 0.12608639895915985, + "learning_rate": 0.0010964514401538789, + "loss": 2.243, + "step": 420410 + }, + { + "epoch": 1.6252261446397922, + "grad_norm": 0.10839331150054932, + "learning_rate": 0.0010962301177843998, + "loss": 2.2636, + "step": 420420 + }, + { + "epoch": 1.6252648018431755, + "grad_norm": 0.11711254715919495, + "learning_rate": 0.0010960088496008381, + "loss": 2.2559, + "step": 420430 + }, + { + "epoch": 1.6253034590465587, + "grad_norm": 0.10556913167238235, + "learning_rate": 0.0010957876355634148, + "loss": 2.2625, + "step": 420440 + }, + { + "epoch": 1.625342116249942, + "grad_norm": 0.11005790531635284, + "learning_rate": 0.001095566475632399, + "loss": 2.2638, + "step": 420450 + }, + { + "epoch": 1.6253807734533252, + "grad_norm": 0.121341273188591, + "learning_rate": 0.0010953453697681087, + "loss": 2.2733, + "step": 420460 + }, + { + "epoch": 1.6254194306567085, + "grad_norm": 0.12211687862873077, + "learning_rate": 0.0010951243179309106, + "loss": 2.2707, + "step": 420470 + }, + { + "epoch": 1.6254580878600917, + "grad_norm": 0.10479617863893509, + "learning_rate": 0.0010949033200812193, + "loss": 2.2619, + "step": 420480 + }, + { + "epoch": 1.625496745063475, + "grad_norm": 0.11355694383382797, + "learning_rate": 0.001094682376179498, + "loss": 2.268, + "step": 420490 + }, + { + "epoch": 1.6255354022668584, + "grad_norm": 0.11647163331508636, + "learning_rate": 0.0010944614861862583, + "loss": 2.2681, + "step": 420500 + }, + { + "epoch": 1.6255740594702417, + "grad_norm": 0.11764030158519745, + "learning_rate": 0.00109424065006206, + "loss": 2.2733, + "step": 420510 + }, + { + "epoch": 1.625612716673625, + "grad_norm": 0.1114291176199913, + "learning_rate": 0.00109401986776751, + "loss": 2.2679, + "step": 420520 + }, + { + "epoch": 1.6256513738770082, + "grad_norm": 0.1021851897239685, + "learning_rate": 0.0010937991392632647, + "loss": 2.2714, + "step": 420530 + }, + { + "epoch": 1.6256900310803917, + "grad_norm": 0.10080353170633316, + "learning_rate": 0.0010935784645100272, + "loss": 2.2602, + "step": 420540 + }, + { + "epoch": 1.625728688283775, + "grad_norm": 0.10580401122570038, + "learning_rate": 0.001093357843468549, + "loss": 2.2692, + "step": 420550 + }, + { + "epoch": 1.6257673454871582, + "grad_norm": 0.1270064115524292, + "learning_rate": 0.0010931372760996294, + "loss": 2.2556, + "step": 420560 + }, + { + "epoch": 1.6258060026905414, + "grad_norm": 0.10214035212993622, + "learning_rate": 0.0010929167623641147, + "loss": 2.2545, + "step": 420570 + }, + { + "epoch": 1.6258446598939247, + "grad_norm": 0.10115115344524384, + "learning_rate": 0.0010926963022228997, + "loss": 2.274, + "step": 420580 + }, + { + "epoch": 1.625883317097308, + "grad_norm": 0.09679996967315674, + "learning_rate": 0.0010924758956369258, + "loss": 2.2664, + "step": 420590 + }, + { + "epoch": 1.6259219743006912, + "grad_norm": 0.10778463631868362, + "learning_rate": 0.0010922555425671825, + "loss": 2.2791, + "step": 420600 + }, + { + "epoch": 1.6259606315040744, + "grad_norm": 0.1045646145939827, + "learning_rate": 0.0010920352429747066, + "loss": 2.2572, + "step": 420610 + }, + { + "epoch": 1.6259992887074577, + "grad_norm": 0.1000080555677414, + "learning_rate": 0.0010918149968205818, + "loss": 2.2739, + "step": 420620 + }, + { + "epoch": 1.626037945910841, + "grad_norm": 0.10625805705785751, + "learning_rate": 0.0010915948040659388, + "loss": 2.2732, + "step": 420630 + }, + { + "epoch": 1.6260766031142242, + "grad_norm": 0.09651408344507217, + "learning_rate": 0.0010913746646719561, + "loss": 2.2644, + "step": 420640 + }, + { + "epoch": 1.6261152603176074, + "grad_norm": 0.10682700574398041, + "learning_rate": 0.0010911545785998592, + "loss": 2.2696, + "step": 420650 + }, + { + "epoch": 1.6261539175209907, + "grad_norm": 0.10344824939966202, + "learning_rate": 0.001090934545810919, + "loss": 2.2726, + "step": 420660 + }, + { + "epoch": 1.6261925747243742, + "grad_norm": 0.09731447696685791, + "learning_rate": 0.0010907145662664556, + "loss": 2.2615, + "step": 420670 + }, + { + "epoch": 1.6262312319277574, + "grad_norm": 0.09883551299571991, + "learning_rate": 0.0010904946399278343, + "loss": 2.267, + "step": 420680 + }, + { + "epoch": 1.6262698891311407, + "grad_norm": 0.12334094941616058, + "learning_rate": 0.0010902747667564671, + "loss": 2.2599, + "step": 420690 + }, + { + "epoch": 1.626308546334524, + "grad_norm": 0.12517641484737396, + "learning_rate": 0.001090054946713814, + "loss": 2.2673, + "step": 420700 + }, + { + "epoch": 1.6263472035379074, + "grad_norm": 0.10702036321163177, + "learning_rate": 0.0010898351797613798, + "loss": 2.2733, + "step": 420710 + }, + { + "epoch": 1.6263858607412907, + "grad_norm": 0.11024714261293411, + "learning_rate": 0.0010896154658607167, + "loss": 2.271, + "step": 420720 + }, + { + "epoch": 1.626424517944674, + "grad_norm": 0.10311411321163177, + "learning_rate": 0.0010893958049734232, + "loss": 2.2775, + "step": 420730 + }, + { + "epoch": 1.6264631751480572, + "grad_norm": 0.1197841688990593, + "learning_rate": 0.001089176197061144, + "loss": 2.2853, + "step": 420740 + }, + { + "epoch": 1.6265018323514404, + "grad_norm": 0.10621661692857742, + "learning_rate": 0.0010889566420855701, + "loss": 2.2552, + "step": 420750 + }, + { + "epoch": 1.6265404895548237, + "grad_norm": 0.10532528907060623, + "learning_rate": 0.0010887371400084386, + "loss": 2.2599, + "step": 420760 + }, + { + "epoch": 1.626579146758207, + "grad_norm": 0.12722547352313995, + "learning_rate": 0.001088517690791533, + "loss": 2.2556, + "step": 420770 + }, + { + "epoch": 1.6266178039615902, + "grad_norm": 0.1100541278719902, + "learning_rate": 0.0010882982943966814, + "loss": 2.2662, + "step": 420780 + }, + { + "epoch": 1.6266564611649734, + "grad_norm": 0.10416977852582932, + "learning_rate": 0.0010880789507857602, + "loss": 2.2624, + "step": 420790 + }, + { + "epoch": 1.6266951183683567, + "grad_norm": 0.11241607367992401, + "learning_rate": 0.0010878596599206895, + "loss": 2.2601, + "step": 420800 + }, + { + "epoch": 1.62673377557174, + "grad_norm": 0.1130867525935173, + "learning_rate": 0.0010876404217634366, + "loss": 2.265, + "step": 420810 + }, + { + "epoch": 1.6267724327751232, + "grad_norm": 0.11152119189500809, + "learning_rate": 0.0010874212362760134, + "loss": 2.2722, + "step": 420820 + }, + { + "epoch": 1.6268110899785064, + "grad_norm": 0.12177984416484833, + "learning_rate": 0.001087202103420478, + "loss": 2.281, + "step": 420830 + }, + { + "epoch": 1.62684974718189, + "grad_norm": 0.1110476478934288, + "learning_rate": 0.0010869830231589339, + "loss": 2.2636, + "step": 420840 + }, + { + "epoch": 1.6268884043852732, + "grad_norm": 0.11386765539646149, + "learning_rate": 0.0010867639954535303, + "loss": 2.2469, + "step": 420850 + }, + { + "epoch": 1.6269270615886564, + "grad_norm": 0.11763129383325577, + "learning_rate": 0.001086545020266461, + "loss": 2.2629, + "step": 420860 + }, + { + "epoch": 1.6269657187920397, + "grad_norm": 0.10655208677053452, + "learning_rate": 0.0010863260975599665, + "loss": 2.2679, + "step": 420870 + }, + { + "epoch": 1.6270043759954231, + "grad_norm": 0.09815648943185806, + "learning_rate": 0.001086107227296331, + "loss": 2.2798, + "step": 420880 + }, + { + "epoch": 1.6270430331988064, + "grad_norm": 0.10880669951438904, + "learning_rate": 0.001085888409437885, + "loss": 2.2588, + "step": 420890 + }, + { + "epoch": 1.6270816904021896, + "grad_norm": 0.09757796674966812, + "learning_rate": 0.001085669643947003, + "loss": 2.2763, + "step": 420900 + }, + { + "epoch": 1.627120347605573, + "grad_norm": 0.11888976395130157, + "learning_rate": 0.0010854509307861057, + "loss": 2.2519, + "step": 420910 + }, + { + "epoch": 1.6271590048089561, + "grad_norm": 0.1087271049618721, + "learning_rate": 0.0010852322699176585, + "loss": 2.2682, + "step": 420920 + }, + { + "epoch": 1.6271976620123394, + "grad_norm": 0.10679204016923904, + "learning_rate": 0.0010850136613041702, + "loss": 2.2729, + "step": 420930 + }, + { + "epoch": 1.6272363192157226, + "grad_norm": 0.11084762960672379, + "learning_rate": 0.001084795104908196, + "loss": 2.2588, + "step": 420940 + }, + { + "epoch": 1.627274976419106, + "grad_norm": 0.09975182265043259, + "learning_rate": 0.0010845766006923354, + "loss": 2.2691, + "step": 420950 + }, + { + "epoch": 1.6273136336224892, + "grad_norm": 0.10565955936908722, + "learning_rate": 0.0010843581486192323, + "loss": 2.2632, + "step": 420960 + }, + { + "epoch": 1.6273522908258724, + "grad_norm": 0.10742072016000748, + "learning_rate": 0.0010841397486515752, + "loss": 2.2743, + "step": 420970 + }, + { + "epoch": 1.6273909480292557, + "grad_norm": 0.10366009920835495, + "learning_rate": 0.001083921400752097, + "loss": 2.2742, + "step": 420980 + }, + { + "epoch": 1.627429605232639, + "grad_norm": 0.11645475775003433, + "learning_rate": 0.0010837031048835754, + "loss": 2.263, + "step": 420990 + }, + { + "epoch": 1.6274682624360224, + "grad_norm": 0.11208124458789825, + "learning_rate": 0.0010834848610088322, + "loss": 2.2624, + "step": 421000 + }, + { + "epoch": 1.6275069196394056, + "grad_norm": 0.11434199661016464, + "learning_rate": 0.0010832666690907327, + "loss": 2.2651, + "step": 421010 + }, + { + "epoch": 1.6275455768427889, + "grad_norm": 0.13794845342636108, + "learning_rate": 0.0010830485290921881, + "loss": 2.2684, + "step": 421020 + }, + { + "epoch": 1.6275842340461721, + "grad_norm": 0.10673683136701584, + "learning_rate": 0.001082830440976152, + "loss": 2.2703, + "step": 421030 + }, + { + "epoch": 1.6276228912495554, + "grad_norm": 0.11057291179895401, + "learning_rate": 0.001082612404705623, + "loss": 2.2524, + "step": 421040 + }, + { + "epoch": 1.6276615484529389, + "grad_norm": 0.097368985414505, + "learning_rate": 0.001082394420243643, + "loss": 2.2665, + "step": 421050 + }, + { + "epoch": 1.6277002056563221, + "grad_norm": 0.10936520248651505, + "learning_rate": 0.0010821764875532987, + "loss": 2.2589, + "step": 421060 + }, + { + "epoch": 1.6277388628597054, + "grad_norm": 0.1091497614979744, + "learning_rate": 0.0010819586065977198, + "loss": 2.2682, + "step": 421070 + }, + { + "epoch": 1.6277775200630886, + "grad_norm": 0.10452881455421448, + "learning_rate": 0.0010817407773400802, + "loss": 2.252, + "step": 421080 + }, + { + "epoch": 1.6278161772664719, + "grad_norm": 0.1058262288570404, + "learning_rate": 0.0010815229997435973, + "loss": 2.2656, + "step": 421090 + }, + { + "epoch": 1.6278548344698551, + "grad_norm": 0.10648338496685028, + "learning_rate": 0.0010813052737715318, + "loss": 2.2762, + "step": 421100 + }, + { + "epoch": 1.6278934916732384, + "grad_norm": 0.1072186678647995, + "learning_rate": 0.0010810875993871884, + "loss": 2.2659, + "step": 421110 + }, + { + "epoch": 1.6279321488766216, + "grad_norm": 0.10891768336296082, + "learning_rate": 0.0010808699765539154, + "loss": 2.2588, + "step": 421120 + }, + { + "epoch": 1.6279708060800049, + "grad_norm": 0.09759233146905899, + "learning_rate": 0.0010806524052351038, + "loss": 2.2605, + "step": 421130 + }, + { + "epoch": 1.6280094632833881, + "grad_norm": 0.10803782194852829, + "learning_rate": 0.0010804348853941882, + "loss": 2.255, + "step": 421140 + }, + { + "epoch": 1.6280481204867714, + "grad_norm": 0.10355690866708755, + "learning_rate": 0.0010802174169946466, + "loss": 2.2568, + "step": 421150 + }, + { + "epoch": 1.6280867776901546, + "grad_norm": 0.09561111778020859, + "learning_rate": 0.00108, + "loss": 2.2736, + "step": 421160 + }, + { + "epoch": 1.6281254348935381, + "grad_norm": 0.1167539432644844, + "learning_rate": 0.0010797826343738129, + "loss": 2.2633, + "step": 421170 + }, + { + "epoch": 1.6281640920969214, + "grad_norm": 0.11473812162876129, + "learning_rate": 0.001079565320079692, + "loss": 2.2681, + "step": 421180 + }, + { + "epoch": 1.6282027493003046, + "grad_norm": 0.11049634218215942, + "learning_rate": 0.0010793480570812878, + "loss": 2.2605, + "step": 421190 + }, + { + "epoch": 1.6282414065036879, + "grad_norm": 0.10097356140613556, + "learning_rate": 0.001079130845342293, + "loss": 2.2665, + "step": 421200 + }, + { + "epoch": 1.6282800637070711, + "grad_norm": 0.1207822859287262, + "learning_rate": 0.0010789136848264435, + "loss": 2.2594, + "step": 421210 + }, + { + "epoch": 1.6283187209104546, + "grad_norm": 0.11211457848548889, + "learning_rate": 0.0010786965754975184, + "loss": 2.2519, + "step": 421220 + }, + { + "epoch": 1.6283573781138379, + "grad_norm": 0.10926574468612671, + "learning_rate": 0.0010784795173193382, + "loss": 2.2675, + "step": 421230 + }, + { + "epoch": 1.628396035317221, + "grad_norm": 0.11232150346040726, + "learning_rate": 0.001078262510255767, + "loss": 2.2647, + "step": 421240 + }, + { + "epoch": 1.6284346925206044, + "grad_norm": 0.1060982421040535, + "learning_rate": 0.0010780455542707113, + "loss": 2.2656, + "step": 421250 + }, + { + "epoch": 1.6284733497239876, + "grad_norm": 0.11022268980741501, + "learning_rate": 0.0010778286493281198, + "loss": 2.2537, + "step": 421260 + }, + { + "epoch": 1.6285120069273709, + "grad_norm": 0.10144762694835663, + "learning_rate": 0.0010776117953919834, + "loss": 2.2738, + "step": 421270 + }, + { + "epoch": 1.628550664130754, + "grad_norm": 0.11215156316757202, + "learning_rate": 0.0010773949924263364, + "loss": 2.2656, + "step": 421280 + }, + { + "epoch": 1.6285893213341374, + "grad_norm": 0.1022103950381279, + "learning_rate": 0.0010771782403952539, + "loss": 2.258, + "step": 421290 + }, + { + "epoch": 1.6286279785375206, + "grad_norm": 0.10472244024276733, + "learning_rate": 0.0010769615392628537, + "loss": 2.2595, + "step": 421300 + }, + { + "epoch": 1.6286666357409039, + "grad_norm": 0.09746657311916351, + "learning_rate": 0.0010767448889932967, + "loss": 2.2646, + "step": 421310 + }, + { + "epoch": 1.6287052929442871, + "grad_norm": 0.12648841738700867, + "learning_rate": 0.0010765282895507844, + "loss": 2.2643, + "step": 421320 + }, + { + "epoch": 1.6287439501476704, + "grad_norm": 0.10947658121585846, + "learning_rate": 0.0010763117408995608, + "loss": 2.2662, + "step": 421330 + }, + { + "epoch": 1.6287826073510538, + "grad_norm": 0.10089129209518433, + "learning_rate": 0.0010760952430039121, + "loss": 2.2622, + "step": 421340 + }, + { + "epoch": 1.628821264554437, + "grad_norm": 0.10407378524541855, + "learning_rate": 0.0010758787958281663, + "loss": 2.2571, + "step": 421350 + }, + { + "epoch": 1.6288599217578204, + "grad_norm": 0.10484025627374649, + "learning_rate": 0.0010756623993366924, + "loss": 2.2617, + "step": 421360 + }, + { + "epoch": 1.6288985789612036, + "grad_norm": 0.09679654240608215, + "learning_rate": 0.0010754460534939025, + "loss": 2.2711, + "step": 421370 + }, + { + "epoch": 1.6289372361645869, + "grad_norm": 0.10423847287893295, + "learning_rate": 0.0010752297582642487, + "loss": 2.278, + "step": 421380 + }, + { + "epoch": 1.6289758933679703, + "grad_norm": 0.1128976121544838, + "learning_rate": 0.001075013513612226, + "loss": 2.269, + "step": 421390 + }, + { + "epoch": 1.6290145505713536, + "grad_norm": 0.10138994455337524, + "learning_rate": 0.0010747973195023696, + "loss": 2.2543, + "step": 421400 + }, + { + "epoch": 1.6290532077747368, + "grad_norm": 0.1152886152267456, + "learning_rate": 0.001074581175899258, + "loss": 2.2649, + "step": 421410 + }, + { + "epoch": 1.62909186497812, + "grad_norm": 0.11228503286838531, + "learning_rate": 0.001074365082767509, + "loss": 2.2435, + "step": 421420 + }, + { + "epoch": 1.6291305221815033, + "grad_norm": 0.10597655922174454, + "learning_rate": 0.0010741490400717835, + "loss": 2.2545, + "step": 421430 + }, + { + "epoch": 1.6291691793848866, + "grad_norm": 0.11497277766466141, + "learning_rate": 0.0010739330477767819, + "loss": 2.2544, + "step": 421440 + }, + { + "epoch": 1.6292078365882698, + "grad_norm": 0.10805842280387878, + "learning_rate": 0.0010737171058472473, + "loss": 2.2745, + "step": 421450 + }, + { + "epoch": 1.629246493791653, + "grad_norm": 0.09347794204950333, + "learning_rate": 0.0010735012142479624, + "loss": 2.2508, + "step": 421460 + }, + { + "epoch": 1.6292851509950363, + "grad_norm": 0.10947436839342117, + "learning_rate": 0.0010732853729437527, + "loss": 2.2613, + "step": 421470 + }, + { + "epoch": 1.6293238081984196, + "grad_norm": 0.10064587742090225, + "learning_rate": 0.0010730695818994827, + "loss": 2.27, + "step": 421480 + }, + { + "epoch": 1.6293624654018029, + "grad_norm": 0.10791762918233871, + "learning_rate": 0.0010728538410800592, + "loss": 2.2668, + "step": 421490 + }, + { + "epoch": 1.629401122605186, + "grad_norm": 0.11491987109184265, + "learning_rate": 0.0010726381504504296, + "loss": 2.2609, + "step": 421500 + }, + { + "epoch": 1.6294397798085696, + "grad_norm": 0.11298567801713943, + "learning_rate": 0.0010724225099755815, + "loss": 2.2602, + "step": 421510 + }, + { + "epoch": 1.6294784370119528, + "grad_norm": 0.10923970490694046, + "learning_rate": 0.0010722069196205438, + "loss": 2.2569, + "step": 421520 + }, + { + "epoch": 1.629517094215336, + "grad_norm": 0.10620459914207458, + "learning_rate": 0.0010719913793503855, + "loss": 2.2606, + "step": 421530 + }, + { + "epoch": 1.6295557514187193, + "grad_norm": 0.10068482905626297, + "learning_rate": 0.0010717758891302168, + "loss": 2.2579, + "step": 421540 + }, + { + "epoch": 1.6295944086221028, + "grad_norm": 0.10469921678304672, + "learning_rate": 0.0010715604489251872, + "loss": 2.2707, + "step": 421550 + }, + { + "epoch": 1.629633065825486, + "grad_norm": 0.10165794938802719, + "learning_rate": 0.001071345058700488, + "loss": 2.2648, + "step": 421560 + }, + { + "epoch": 1.6296717230288693, + "grad_norm": 0.10368034243583679, + "learning_rate": 0.0010711297184213505, + "loss": 2.2566, + "step": 421570 + }, + { + "epoch": 1.6297103802322526, + "grad_norm": 1.6353310346603394, + "learning_rate": 0.0010709144280530451, + "loss": 2.2658, + "step": 421580 + }, + { + "epoch": 1.6297490374356358, + "grad_norm": 0.1396324336528778, + "learning_rate": 0.0010706991875608844, + "loss": 2.2754, + "step": 421590 + }, + { + "epoch": 1.629787694639019, + "grad_norm": 0.26416754722595215, + "learning_rate": 0.00107048399691022, + "loss": 2.2707, + "step": 421600 + }, + { + "epoch": 1.6298263518424023, + "grad_norm": 0.10929203778505325, + "learning_rate": 0.0010702688560664433, + "loss": 2.2718, + "step": 421610 + }, + { + "epoch": 1.6298650090457856, + "grad_norm": 0.10347554087638855, + "learning_rate": 0.0010700537649949865, + "loss": 2.2622, + "step": 421620 + }, + { + "epoch": 1.6299036662491688, + "grad_norm": 0.10963423550128937, + "learning_rate": 0.0010698387236613212, + "loss": 2.2673, + "step": 421630 + }, + { + "epoch": 1.629942323452552, + "grad_norm": 0.10632467269897461, + "learning_rate": 0.0010696237320309595, + "loss": 2.2679, + "step": 421640 + }, + { + "epoch": 1.6299809806559353, + "grad_norm": 0.1066928431391716, + "learning_rate": 0.0010694087900694528, + "loss": 2.2515, + "step": 421650 + }, + { + "epoch": 1.6300196378593186, + "grad_norm": 0.10093319416046143, + "learning_rate": 0.0010691938977423923, + "loss": 2.2593, + "step": 421660 + }, + { + "epoch": 1.6300582950627018, + "grad_norm": 0.11082525551319122, + "learning_rate": 0.0010689790550154094, + "loss": 2.2578, + "step": 421670 + }, + { + "epoch": 1.6300969522660853, + "grad_norm": 0.10855924338102341, + "learning_rate": 0.0010687642618541747, + "loss": 2.2655, + "step": 421680 + }, + { + "epoch": 1.6301356094694686, + "grad_norm": 0.1240989938378334, + "learning_rate": 0.0010685495182243985, + "loss": 2.2705, + "step": 421690 + }, + { + "epoch": 1.6301742666728518, + "grad_norm": 0.1026359423995018, + "learning_rate": 0.001068334824091831, + "loss": 2.2647, + "step": 421700 + }, + { + "epoch": 1.630212923876235, + "grad_norm": 0.11498430371284485, + "learning_rate": 0.0010681201794222606, + "loss": 2.2564, + "step": 421710 + }, + { + "epoch": 1.6302515810796185, + "grad_norm": 0.11296871304512024, + "learning_rate": 0.0010679055841815166, + "loss": 2.2658, + "step": 421720 + }, + { + "epoch": 1.6302902382830018, + "grad_norm": 0.10595811158418655, + "learning_rate": 0.0010676910383354667, + "loss": 2.2702, + "step": 421730 + }, + { + "epoch": 1.630328895486385, + "grad_norm": 0.10797400027513504, + "learning_rate": 0.0010674765418500188, + "loss": 2.2681, + "step": 421740 + }, + { + "epoch": 1.6303675526897683, + "grad_norm": 0.11341440677642822, + "learning_rate": 0.0010672620946911185, + "loss": 2.2671, + "step": 421750 + }, + { + "epoch": 1.6304062098931515, + "grad_norm": 0.107243113219738, + "learning_rate": 0.001067047696824752, + "loss": 2.2596, + "step": 421760 + }, + { + "epoch": 1.6304448670965348, + "grad_norm": 0.10347296297550201, + "learning_rate": 0.0010668333482169437, + "loss": 2.2473, + "step": 421770 + }, + { + "epoch": 1.630483524299918, + "grad_norm": 0.12594357132911682, + "learning_rate": 0.0010666190488337573, + "loss": 2.2627, + "step": 421780 + }, + { + "epoch": 1.6305221815033013, + "grad_norm": 0.1074180155992508, + "learning_rate": 0.0010664047986412954, + "loss": 2.2652, + "step": 421790 + }, + { + "epoch": 1.6305608387066846, + "grad_norm": 0.11287180334329605, + "learning_rate": 0.0010661905976056998, + "loss": 2.2577, + "step": 421800 + }, + { + "epoch": 1.6305994959100678, + "grad_norm": 0.11506509780883789, + "learning_rate": 0.0010659764456931507, + "loss": 2.2769, + "step": 421810 + }, + { + "epoch": 1.630638153113451, + "grad_norm": 0.1361812800168991, + "learning_rate": 0.001065762342869867, + "loss": 2.2674, + "step": 421820 + }, + { + "epoch": 1.6306768103168343, + "grad_norm": 0.11115347594022751, + "learning_rate": 0.0010655482891021068, + "loss": 2.2571, + "step": 421830 + }, + { + "epoch": 1.6307154675202176, + "grad_norm": 0.11083522439002991, + "learning_rate": 0.0010653342843561662, + "loss": 2.2825, + "step": 421840 + }, + { + "epoch": 1.630754124723601, + "grad_norm": 0.10724809020757675, + "learning_rate": 0.0010651203285983806, + "loss": 2.2671, + "step": 421850 + }, + { + "epoch": 1.6307927819269843, + "grad_norm": 0.09956831485033035, + "learning_rate": 0.0010649064217951234, + "loss": 2.2543, + "step": 421860 + }, + { + "epoch": 1.6308314391303675, + "grad_norm": 0.09386299550533295, + "learning_rate": 0.0010646925639128063, + "loss": 2.2573, + "step": 421870 + }, + { + "epoch": 1.6308700963337508, + "grad_norm": 0.10659188777208328, + "learning_rate": 0.00106447875491788, + "loss": 2.2649, + "step": 421880 + }, + { + "epoch": 1.6309087535371343, + "grad_norm": 0.10014776140451431, + "learning_rate": 0.0010642649947768333, + "loss": 2.2568, + "step": 421890 + }, + { + "epoch": 1.6309474107405175, + "grad_norm": 0.1105884313583374, + "learning_rate": 0.0010640512834561929, + "loss": 2.2558, + "step": 421900 + }, + { + "epoch": 1.6309860679439008, + "grad_norm": 0.1113891750574112, + "learning_rate": 0.001063837620922524, + "loss": 2.2705, + "step": 421910 + }, + { + "epoch": 1.631024725147284, + "grad_norm": 0.1098250225186348, + "learning_rate": 0.00106362400714243, + "loss": 2.2734, + "step": 421920 + }, + { + "epoch": 1.6310633823506673, + "grad_norm": 0.10185971111059189, + "learning_rate": 0.0010634104420825525, + "loss": 2.2689, + "step": 421930 + }, + { + "epoch": 1.6311020395540505, + "grad_norm": 0.10226941853761673, + "learning_rate": 0.0010631969257095704, + "loss": 2.2564, + "step": 421940 + }, + { + "epoch": 1.6311406967574338, + "grad_norm": 0.11283935606479645, + "learning_rate": 0.0010629834579902017, + "loss": 2.2797, + "step": 421950 + }, + { + "epoch": 1.631179353960817, + "grad_norm": 0.10663514584302902, + "learning_rate": 0.0010627700388912014, + "loss": 2.27, + "step": 421960 + }, + { + "epoch": 1.6312180111642003, + "grad_norm": 0.10804902017116547, + "learning_rate": 0.0010625566683793628, + "loss": 2.2518, + "step": 421970 + }, + { + "epoch": 1.6312566683675835, + "grad_norm": 0.12151705473661423, + "learning_rate": 0.0010623433464215166, + "loss": 2.2595, + "step": 421980 + }, + { + "epoch": 1.6312953255709668, + "grad_norm": 0.11235487461090088, + "learning_rate": 0.0010621300729845315, + "loss": 2.2606, + "step": 421990 + }, + { + "epoch": 1.63133398277435, + "grad_norm": 0.10612896829843521, + "learning_rate": 0.001061916848035314, + "loss": 2.2611, + "step": 422000 + }, + { + "epoch": 1.6313726399777333, + "grad_norm": 0.1074008122086525, + "learning_rate": 0.001061703671540808, + "loss": 2.2533, + "step": 422010 + }, + { + "epoch": 1.6314112971811168, + "grad_norm": 0.10118181258440018, + "learning_rate": 0.0010614905434679946, + "loss": 2.2655, + "step": 422020 + }, + { + "epoch": 1.6314499543845, + "grad_norm": 0.11081536114215851, + "learning_rate": 0.001061277463783893, + "loss": 2.2743, + "step": 422030 + }, + { + "epoch": 1.6314886115878833, + "grad_norm": 0.098418690264225, + "learning_rate": 0.0010610644324555599, + "loss": 2.2712, + "step": 422040 + }, + { + "epoch": 1.6315272687912665, + "grad_norm": 0.11335956305265427, + "learning_rate": 0.0010608514494500883, + "loss": 2.2658, + "step": 422050 + }, + { + "epoch": 1.63156592599465, + "grad_norm": 0.11804255843162537, + "learning_rate": 0.0010606385147346098, + "loss": 2.266, + "step": 422060 + }, + { + "epoch": 1.6316045831980333, + "grad_norm": 0.12196633964776993, + "learning_rate": 0.0010604256282762924, + "loss": 2.2583, + "step": 422070 + }, + { + "epoch": 1.6316432404014165, + "grad_norm": 0.10328137874603271, + "learning_rate": 0.0010602127900423417, + "loss": 2.2675, + "step": 422080 + }, + { + "epoch": 1.6316818976047998, + "grad_norm": 0.09585174173116684, + "learning_rate": 0.0010600000000000002, + "loss": 2.283, + "step": 422090 + }, + { + "epoch": 1.631720554808183, + "grad_norm": 0.10802491754293442, + "learning_rate": 0.0010597872581165472, + "loss": 2.2461, + "step": 422100 + }, + { + "epoch": 1.6317592120115663, + "grad_norm": 0.10576233267784119, + "learning_rate": 0.0010595745643593002, + "loss": 2.2696, + "step": 422110 + }, + { + "epoch": 1.6317978692149495, + "grad_norm": 0.10108242928981781, + "learning_rate": 0.0010593619186956122, + "loss": 2.2586, + "step": 422120 + }, + { + "epoch": 1.6318365264183328, + "grad_norm": 0.11135169863700867, + "learning_rate": 0.001059149321092874, + "loss": 2.2715, + "step": 422130 + }, + { + "epoch": 1.631875183621716, + "grad_norm": 0.10790157318115234, + "learning_rate": 0.0010589367715185128, + "loss": 2.26, + "step": 422140 + }, + { + "epoch": 1.6319138408250993, + "grad_norm": 0.11493754386901855, + "learning_rate": 0.0010587242699399927, + "loss": 2.2616, + "step": 422150 + }, + { + "epoch": 1.6319524980284825, + "grad_norm": 0.1047576367855072, + "learning_rate": 0.0010585118163248144, + "loss": 2.2642, + "step": 422160 + }, + { + "epoch": 1.6319911552318658, + "grad_norm": 0.123683400452137, + "learning_rate": 0.0010582994106405157, + "loss": 2.2612, + "step": 422170 + }, + { + "epoch": 1.632029812435249, + "grad_norm": 0.10044895112514496, + "learning_rate": 0.0010580870528546706, + "loss": 2.2581, + "step": 422180 + }, + { + "epoch": 1.6320684696386325, + "grad_norm": 0.10729312151670456, + "learning_rate": 0.00105787474293489, + "loss": 2.2666, + "step": 422190 + }, + { + "epoch": 1.6321071268420158, + "grad_norm": 0.09733797609806061, + "learning_rate": 0.0010576624808488204, + "loss": 2.2641, + "step": 422200 + }, + { + "epoch": 1.632145784045399, + "grad_norm": 0.10946942120790482, + "learning_rate": 0.0010574502665641457, + "loss": 2.2657, + "step": 422210 + }, + { + "epoch": 1.6321844412487823, + "grad_norm": 0.14660519361495972, + "learning_rate": 0.001057238100048586, + "loss": 2.2738, + "step": 422220 + }, + { + "epoch": 1.6322230984521657, + "grad_norm": 0.1073886975646019, + "learning_rate": 0.0010570259812698972, + "loss": 2.2757, + "step": 422230 + }, + { + "epoch": 1.632261755655549, + "grad_norm": 0.10828237980604172, + "learning_rate": 0.0010568139101958724, + "loss": 2.2506, + "step": 422240 + }, + { + "epoch": 1.6323004128589322, + "grad_norm": 0.11324787884950638, + "learning_rate": 0.0010566018867943397, + "loss": 2.2758, + "step": 422250 + }, + { + "epoch": 1.6323390700623155, + "grad_norm": 0.1002441793680191, + "learning_rate": 0.001056389911033164, + "loss": 2.2523, + "step": 422260 + }, + { + "epoch": 1.6323777272656987, + "grad_norm": 0.10068082809448242, + "learning_rate": 0.0010561779828802468, + "loss": 2.2522, + "step": 422270 + }, + { + "epoch": 1.632416384469082, + "grad_norm": 0.10656104981899261, + "learning_rate": 0.0010559661023035244, + "loss": 2.261, + "step": 422280 + }, + { + "epoch": 1.6324550416724652, + "grad_norm": 0.10906139016151428, + "learning_rate": 0.0010557542692709698, + "loss": 2.2323, + "step": 422290 + }, + { + "epoch": 1.6324936988758485, + "grad_norm": 0.10339491069316864, + "learning_rate": 0.0010555424837505926, + "loss": 2.2685, + "step": 422300 + }, + { + "epoch": 1.6325323560792318, + "grad_norm": 0.1109342947602272, + "learning_rate": 0.0010553307457104367, + "loss": 2.2719, + "step": 422310 + }, + { + "epoch": 1.632571013282615, + "grad_norm": 0.11206500977277756, + "learning_rate": 0.0010551190551185824, + "loss": 2.2628, + "step": 422320 + }, + { + "epoch": 1.6326096704859983, + "grad_norm": 0.13042597472667694, + "learning_rate": 0.0010549074119431472, + "loss": 2.2602, + "step": 422330 + }, + { + "epoch": 1.6326483276893815, + "grad_norm": 0.10283195972442627, + "learning_rate": 0.001054695816152282, + "loss": 2.2567, + "step": 422340 + }, + { + "epoch": 1.6326869848927648, + "grad_norm": 0.1011483371257782, + "learning_rate": 0.001054484267714175, + "loss": 2.2665, + "step": 422350 + }, + { + "epoch": 1.6327256420961482, + "grad_norm": 0.10999699681997299, + "learning_rate": 0.001054272766597049, + "loss": 2.2669, + "step": 422360 + }, + { + "epoch": 1.6327642992995315, + "grad_norm": 0.10112316906452179, + "learning_rate": 0.0010540613127691626, + "loss": 2.2715, + "step": 422370 + }, + { + "epoch": 1.6328029565029147, + "grad_norm": 0.10901953279972076, + "learning_rate": 0.0010538499061988104, + "loss": 2.2596, + "step": 422380 + }, + { + "epoch": 1.632841613706298, + "grad_norm": 0.11298448592424393, + "learning_rate": 0.0010536385468543218, + "loss": 2.2585, + "step": 422390 + }, + { + "epoch": 1.6328802709096815, + "grad_norm": 0.10417155176401138, + "learning_rate": 0.0010534272347040615, + "loss": 2.2661, + "step": 422400 + }, + { + "epoch": 1.6329189281130647, + "grad_norm": 0.10402239859104156, + "learning_rate": 0.0010532159697164301, + "loss": 2.2584, + "step": 422410 + }, + { + "epoch": 1.632957585316448, + "grad_norm": 0.10203685611486435, + "learning_rate": 0.001053004751859863, + "loss": 2.2659, + "step": 422420 + }, + { + "epoch": 1.6329962425198312, + "grad_norm": 0.10778295248746872, + "learning_rate": 0.0010527935811028306, + "loss": 2.2572, + "step": 422430 + }, + { + "epoch": 1.6330348997232145, + "grad_norm": 0.10750208795070648, + "learning_rate": 0.0010525824574138393, + "loss": 2.2648, + "step": 422440 + }, + { + "epoch": 1.6330735569265977, + "grad_norm": 0.12133093178272247, + "learning_rate": 0.0010523713807614294, + "loss": 2.2556, + "step": 422450 + }, + { + "epoch": 1.633112214129981, + "grad_norm": 0.12352251261472702, + "learning_rate": 0.0010521603511141771, + "loss": 2.2641, + "step": 422460 + }, + { + "epoch": 1.6331508713333642, + "grad_norm": 0.10133741050958633, + "learning_rate": 0.0010519493684406935, + "loss": 2.2601, + "step": 422470 + }, + { + "epoch": 1.6331895285367475, + "grad_norm": 0.10464505851268768, + "learning_rate": 0.0010517384327096242, + "loss": 2.2511, + "step": 422480 + }, + { + "epoch": 1.6332281857401307, + "grad_norm": 0.10249783098697662, + "learning_rate": 0.00105152754388965, + "loss": 2.2537, + "step": 422490 + }, + { + "epoch": 1.633266842943514, + "grad_norm": 0.10035975277423859, + "learning_rate": 0.0010513167019494862, + "loss": 2.2676, + "step": 422500 + }, + { + "epoch": 1.6333055001468972, + "grad_norm": 0.10722977668046951, + "learning_rate": 0.0010511059068578832, + "loss": 2.2558, + "step": 422510 + }, + { + "epoch": 1.6333441573502805, + "grad_norm": 0.10174937546253204, + "learning_rate": 0.0010508951585836263, + "loss": 2.2707, + "step": 422520 + }, + { + "epoch": 1.633382814553664, + "grad_norm": 0.10561282187700272, + "learning_rate": 0.0010506844570955345, + "loss": 2.251, + "step": 422530 + }, + { + "epoch": 1.6334214717570472, + "grad_norm": 0.1309254914522171, + "learning_rate": 0.0010504738023624624, + "loss": 2.2728, + "step": 422540 + }, + { + "epoch": 1.6334601289604305, + "grad_norm": 0.11312398314476013, + "learning_rate": 0.0010502631943532987, + "loss": 2.2879, + "step": 422550 + }, + { + "epoch": 1.6334987861638137, + "grad_norm": 0.10548081248998642, + "learning_rate": 0.0010500526330369667, + "loss": 2.2589, + "step": 422560 + }, + { + "epoch": 1.6335374433671972, + "grad_norm": 0.10877972841262817, + "learning_rate": 0.001049842118382424, + "loss": 2.2461, + "step": 422570 + }, + { + "epoch": 1.6335761005705804, + "grad_norm": 0.14530453085899353, + "learning_rate": 0.0010496316503586622, + "loss": 2.2595, + "step": 422580 + }, + { + "epoch": 1.6336147577739637, + "grad_norm": 0.10653182119131088, + "learning_rate": 0.0010494212289347085, + "loss": 2.265, + "step": 422590 + }, + { + "epoch": 1.633653414977347, + "grad_norm": 0.10831808298826218, + "learning_rate": 0.001049210854079623, + "loss": 2.2501, + "step": 422600 + }, + { + "epoch": 1.6336920721807302, + "grad_norm": 0.10424677282571793, + "learning_rate": 0.0010490005257625008, + "loss": 2.2619, + "step": 422610 + }, + { + "epoch": 1.6337307293841135, + "grad_norm": 0.1058325320482254, + "learning_rate": 0.0010487902439524708, + "loss": 2.2685, + "step": 422620 + }, + { + "epoch": 1.6337693865874967, + "grad_norm": 0.10791575163602829, + "learning_rate": 0.0010485800086186964, + "loss": 2.271, + "step": 422630 + }, + { + "epoch": 1.63380804379088, + "grad_norm": 0.10271866619586945, + "learning_rate": 0.0010483698197303745, + "loss": 2.2712, + "step": 422640 + }, + { + "epoch": 1.6338467009942632, + "grad_norm": 0.10433712601661682, + "learning_rate": 0.0010481596772567365, + "loss": 2.2559, + "step": 422650 + }, + { + "epoch": 1.6338853581976465, + "grad_norm": 0.1071852296590805, + "learning_rate": 0.0010479495811670477, + "loss": 2.2714, + "step": 422660 + }, + { + "epoch": 1.6339240154010297, + "grad_norm": 0.10642591118812561, + "learning_rate": 0.001047739531430607, + "loss": 2.2587, + "step": 422670 + }, + { + "epoch": 1.633962672604413, + "grad_norm": 0.10073532909154892, + "learning_rate": 0.0010475295280167475, + "loss": 2.2643, + "step": 422680 + }, + { + "epoch": 1.6340013298077962, + "grad_norm": 0.11454087495803833, + "learning_rate": 0.0010473195708948359, + "loss": 2.2568, + "step": 422690 + }, + { + "epoch": 1.6340399870111797, + "grad_norm": 0.11369120329618454, + "learning_rate": 0.0010471096600342725, + "loss": 2.2581, + "step": 422700 + }, + { + "epoch": 1.634078644214563, + "grad_norm": 0.11510149389505386, + "learning_rate": 0.0010468997954044916, + "loss": 2.2638, + "step": 422710 + }, + { + "epoch": 1.6341173014179462, + "grad_norm": 0.10187017917633057, + "learning_rate": 0.0010466899769749612, + "loss": 2.283, + "step": 422720 + }, + { + "epoch": 1.6341559586213295, + "grad_norm": 0.10251758247613907, + "learning_rate": 0.0010464802047151828, + "loss": 2.249, + "step": 422730 + }, + { + "epoch": 1.634194615824713, + "grad_norm": 0.10828348249197006, + "learning_rate": 0.0010462704785946908, + "loss": 2.2592, + "step": 422740 + }, + { + "epoch": 1.6342332730280962, + "grad_norm": 0.11212880164384842, + "learning_rate": 0.0010460607985830544, + "loss": 2.277, + "step": 422750 + }, + { + "epoch": 1.6342719302314794, + "grad_norm": 0.12863798439502716, + "learning_rate": 0.0010458511646498749, + "loss": 2.252, + "step": 422760 + }, + { + "epoch": 1.6343105874348627, + "grad_norm": 0.1167854368686676, + "learning_rate": 0.001045641576764788, + "loss": 2.2646, + "step": 422770 + }, + { + "epoch": 1.634349244638246, + "grad_norm": 0.10574250668287277, + "learning_rate": 0.001045432034897462, + "loss": 2.2737, + "step": 422780 + }, + { + "epoch": 1.6343879018416292, + "grad_norm": 0.09611816704273224, + "learning_rate": 0.0010452225390175994, + "loss": 2.2563, + "step": 422790 + }, + { + "epoch": 1.6344265590450124, + "grad_norm": 0.18568646907806396, + "learning_rate": 0.0010450130890949341, + "loss": 2.2791, + "step": 422800 + }, + { + "epoch": 1.6344652162483957, + "grad_norm": 0.09252162277698517, + "learning_rate": 0.0010448036850992358, + "loss": 2.2496, + "step": 422810 + }, + { + "epoch": 1.634503873451779, + "grad_norm": 0.1045931875705719, + "learning_rate": 0.0010445943270003051, + "loss": 2.2759, + "step": 422820 + }, + { + "epoch": 1.6345425306551622, + "grad_norm": 0.1014002189040184, + "learning_rate": 0.0010443850147679767, + "loss": 2.2751, + "step": 422830 + }, + { + "epoch": 1.6345811878585454, + "grad_norm": 0.11607425659894943, + "learning_rate": 0.0010441757483721184, + "loss": 2.2571, + "step": 422840 + }, + { + "epoch": 1.6346198450619287, + "grad_norm": 0.10826249420642853, + "learning_rate": 0.0010439665277826304, + "loss": 2.2559, + "step": 422850 + }, + { + "epoch": 1.634658502265312, + "grad_norm": 0.10467202216386795, + "learning_rate": 0.0010437573529694464, + "loss": 2.2675, + "step": 422860 + }, + { + "epoch": 1.6346971594686954, + "grad_norm": 0.11609058082103729, + "learning_rate": 0.0010435482239025324, + "loss": 2.2568, + "step": 422870 + }, + { + "epoch": 1.6347358166720787, + "grad_norm": 0.10561811178922653, + "learning_rate": 0.0010433391405518882, + "loss": 2.2669, + "step": 422880 + }, + { + "epoch": 1.634774473875462, + "grad_norm": 0.10006106644868851, + "learning_rate": 0.0010431301028875452, + "loss": 2.2414, + "step": 422890 + }, + { + "epoch": 1.6348131310788452, + "grad_norm": 0.12403450161218643, + "learning_rate": 0.001042921110879568, + "loss": 2.2507, + "step": 422900 + }, + { + "epoch": 1.6348517882822287, + "grad_norm": 0.09937925636768341, + "learning_rate": 0.0010427121644980546, + "loss": 2.2478, + "step": 422910 + }, + { + "epoch": 1.634890445485612, + "grad_norm": 0.09592811018228531, + "learning_rate": 0.0010425032637131342, + "loss": 2.2595, + "step": 422920 + }, + { + "epoch": 1.6349291026889952, + "grad_norm": 0.12300334125757217, + "learning_rate": 0.0010422944084949697, + "loss": 2.2618, + "step": 422930 + }, + { + "epoch": 1.6349677598923784, + "grad_norm": 0.11170656234025955, + "learning_rate": 0.0010420855988137563, + "loss": 2.2604, + "step": 422940 + }, + { + "epoch": 1.6350064170957617, + "grad_norm": 0.10168015211820602, + "learning_rate": 0.0010418768346397212, + "loss": 2.2627, + "step": 422950 + }, + { + "epoch": 1.635045074299145, + "grad_norm": 0.11053097248077393, + "learning_rate": 0.001041668115943125, + "loss": 2.2479, + "step": 422960 + }, + { + "epoch": 1.6350837315025282, + "grad_norm": 0.12322186678647995, + "learning_rate": 0.001041459442694259, + "loss": 2.2623, + "step": 422970 + }, + { + "epoch": 1.6351223887059114, + "grad_norm": 0.0941690057516098, + "learning_rate": 0.0010412508148634494, + "loss": 2.2497, + "step": 422980 + }, + { + "epoch": 1.6351610459092947, + "grad_norm": 0.10818459838628769, + "learning_rate": 0.0010410422324210519, + "loss": 2.2528, + "step": 422990 + }, + { + "epoch": 1.635199703112678, + "grad_norm": 0.09662491828203201, + "learning_rate": 0.0010408336953374561, + "loss": 2.2543, + "step": 423000 + }, + { + "epoch": 1.6352383603160612, + "grad_norm": 0.10936406254768372, + "learning_rate": 0.0010406252035830837, + "loss": 2.2542, + "step": 423010 + }, + { + "epoch": 1.6352770175194444, + "grad_norm": 0.10065805166959763, + "learning_rate": 0.0010404167571283875, + "loss": 2.2614, + "step": 423020 + }, + { + "epoch": 1.635315674722828, + "grad_norm": 0.10415294766426086, + "learning_rate": 0.0010402083559438537, + "loss": 2.257, + "step": 423030 + }, + { + "epoch": 1.6353543319262112, + "grad_norm": 0.11044905334711075, + "learning_rate": 0.0010400000000000001, + "loss": 2.2656, + "step": 423040 + }, + { + "epoch": 1.6353929891295944, + "grad_norm": 0.11244837939739227, + "learning_rate": 0.0010397916892673758, + "loss": 2.265, + "step": 423050 + }, + { + "epoch": 1.6354316463329777, + "grad_norm": 0.11220791190862656, + "learning_rate": 0.0010395834237165624, + "loss": 2.271, + "step": 423060 + }, + { + "epoch": 1.635470303536361, + "grad_norm": 0.1089065745472908, + "learning_rate": 0.0010393752033181739, + "loss": 2.2728, + "step": 423070 + }, + { + "epoch": 1.6355089607397444, + "grad_norm": 0.09747709333896637, + "learning_rate": 0.0010391670280428548, + "loss": 2.2708, + "step": 423080 + }, + { + "epoch": 1.6355476179431276, + "grad_norm": 0.1153540313243866, + "learning_rate": 0.001038958897861283, + "loss": 2.2625, + "step": 423090 + }, + { + "epoch": 1.635586275146511, + "grad_norm": 0.11073118448257446, + "learning_rate": 0.0010387508127441666, + "loss": 2.268, + "step": 423100 + }, + { + "epoch": 1.6356249323498941, + "grad_norm": 0.11345057934522629, + "learning_rate": 0.0010385427726622466, + "loss": 2.2607, + "step": 423110 + }, + { + "epoch": 1.6356635895532774, + "grad_norm": 0.11611675471067429, + "learning_rate": 0.0010383347775862954, + "loss": 2.2599, + "step": 423120 + }, + { + "epoch": 1.6357022467566606, + "grad_norm": 0.10179933160543442, + "learning_rate": 0.0010381268274871163, + "loss": 2.2498, + "step": 423130 + }, + { + "epoch": 1.635740903960044, + "grad_norm": 0.1067461371421814, + "learning_rate": 0.0010379189223355446, + "loss": 2.2581, + "step": 423140 + }, + { + "epoch": 1.6357795611634272, + "grad_norm": 0.10228580236434937, + "learning_rate": 0.0010377110621024475, + "loss": 2.2611, + "step": 423150 + }, + { + "epoch": 1.6358182183668104, + "grad_norm": 0.11855067312717438, + "learning_rate": 0.0010375032467587228, + "loss": 2.2478, + "step": 423160 + }, + { + "epoch": 1.6358568755701937, + "grad_norm": 0.10486794263124466, + "learning_rate": 0.0010372954762753008, + "loss": 2.2541, + "step": 423170 + }, + { + "epoch": 1.635895532773577, + "grad_norm": 0.1444479078054428, + "learning_rate": 0.001037087750623142, + "loss": 2.2658, + "step": 423180 + }, + { + "epoch": 1.6359341899769602, + "grad_norm": 0.10084392130374908, + "learning_rate": 0.0010368800697732397, + "loss": 2.2671, + "step": 423190 + }, + { + "epoch": 1.6359728471803436, + "grad_norm": 0.11010993272066116, + "learning_rate": 0.0010366724336966161, + "loss": 2.2501, + "step": 423200 + }, + { + "epoch": 1.6360115043837269, + "grad_norm": 0.10917601734399796, + "learning_rate": 0.0010364648423643278, + "loss": 2.2724, + "step": 423210 + }, + { + "epoch": 1.6360501615871101, + "grad_norm": 0.11198030412197113, + "learning_rate": 0.0010362572957474594, + "loss": 2.2782, + "step": 423220 + }, + { + "epoch": 1.6360888187904934, + "grad_norm": 0.11350420862436295, + "learning_rate": 0.0010360497938171286, + "loss": 2.2729, + "step": 423230 + }, + { + "epoch": 1.6361274759938766, + "grad_norm": 0.10055453330278397, + "learning_rate": 0.001035842336544484, + "loss": 2.2706, + "step": 423240 + }, + { + "epoch": 1.6361661331972601, + "grad_norm": 0.10071729868650436, + "learning_rate": 0.0010356349239007043, + "loss": 2.2718, + "step": 423250 + }, + { + "epoch": 1.6362047904006434, + "grad_norm": 0.1166156753897667, + "learning_rate": 0.0010354275558570004, + "loss": 2.2711, + "step": 423260 + }, + { + "epoch": 1.6362434476040266, + "grad_norm": 0.10396468639373779, + "learning_rate": 0.0010352202323846132, + "loss": 2.2673, + "step": 423270 + }, + { + "epoch": 1.6362821048074099, + "grad_norm": 0.11050568521022797, + "learning_rate": 0.0010350129534548145, + "loss": 2.2595, + "step": 423280 + }, + { + "epoch": 1.6363207620107931, + "grad_norm": 0.12434356659650803, + "learning_rate": 0.0010348057190389078, + "loss": 2.2524, + "step": 423290 + }, + { + "epoch": 1.6363594192141764, + "grad_norm": 0.09937973320484161, + "learning_rate": 0.0010345985291082265, + "loss": 2.2495, + "step": 423300 + }, + { + "epoch": 1.6363980764175596, + "grad_norm": 0.10541477799415588, + "learning_rate": 0.001034391383634135, + "loss": 2.2456, + "step": 423310 + }, + { + "epoch": 1.6364367336209429, + "grad_norm": 0.09955312311649323, + "learning_rate": 0.0010341842825880292, + "loss": 2.2529, + "step": 423320 + }, + { + "epoch": 1.6364753908243261, + "grad_norm": 0.11587560921907425, + "learning_rate": 0.0010339772259413342, + "loss": 2.2644, + "step": 423330 + }, + { + "epoch": 1.6365140480277094, + "grad_norm": 0.11401667445898056, + "learning_rate": 0.0010337702136655072, + "loss": 2.2581, + "step": 423340 + }, + { + "epoch": 1.6365527052310926, + "grad_norm": 0.10360361635684967, + "learning_rate": 0.0010335632457320343, + "loss": 2.2647, + "step": 423350 + }, + { + "epoch": 1.636591362434476, + "grad_norm": 0.10124827921390533, + "learning_rate": 0.0010333563221124342, + "loss": 2.2672, + "step": 423360 + }, + { + "epoch": 1.6366300196378594, + "grad_norm": 0.12644001841545105, + "learning_rate": 0.0010331494427782543, + "loss": 2.2465, + "step": 423370 + }, + { + "epoch": 1.6366686768412426, + "grad_norm": 0.11874937266111374, + "learning_rate": 0.0010329426077010733, + "loss": 2.2641, + "step": 423380 + }, + { + "epoch": 1.6367073340446259, + "grad_norm": 0.10666025429964066, + "learning_rate": 0.0010327358168525003, + "loss": 2.2589, + "step": 423390 + }, + { + "epoch": 1.6367459912480091, + "grad_norm": 0.10593769699335098, + "learning_rate": 0.001032529070204174, + "loss": 2.2528, + "step": 423400 + }, + { + "epoch": 1.6367846484513926, + "grad_norm": 0.13542486727237701, + "learning_rate": 0.0010323223677277645, + "loss": 2.2633, + "step": 423410 + }, + { + "epoch": 1.6368233056547759, + "grad_norm": 0.11956963688135147, + "learning_rate": 0.001032115709394971, + "loss": 2.2643, + "step": 423420 + }, + { + "epoch": 1.636861962858159, + "grad_norm": 0.1231904923915863, + "learning_rate": 0.0010319090951775242, + "loss": 2.2479, + "step": 423430 + }, + { + "epoch": 1.6369006200615424, + "grad_norm": 0.10609181225299835, + "learning_rate": 0.0010317025250471837, + "loss": 2.2597, + "step": 423440 + }, + { + "epoch": 1.6369392772649256, + "grad_norm": 0.1126595214009285, + "learning_rate": 0.00103149599897574, + "loss": 2.266, + "step": 423450 + }, + { + "epoch": 1.6369779344683089, + "grad_norm": 0.12165053188800812, + "learning_rate": 0.0010312895169350131, + "loss": 2.2647, + "step": 423460 + }, + { + "epoch": 1.6370165916716921, + "grad_norm": 0.10681026428937912, + "learning_rate": 0.0010310830788968542, + "loss": 2.254, + "step": 423470 + }, + { + "epoch": 1.6370552488750754, + "grad_norm": 0.1236894503235817, + "learning_rate": 0.0010308766848331425, + "loss": 2.2657, + "step": 423480 + }, + { + "epoch": 1.6370939060784586, + "grad_norm": 0.12302964925765991, + "learning_rate": 0.0010306703347157894, + "loss": 2.2761, + "step": 423490 + }, + { + "epoch": 1.6371325632818419, + "grad_norm": 0.1260789930820465, + "learning_rate": 0.0010304640285167341, + "loss": 2.2721, + "step": 423500 + }, + { + "epoch": 1.6371712204852251, + "grad_norm": 0.10569873452186584, + "learning_rate": 0.0010302577662079475, + "loss": 2.2352, + "step": 423510 + }, + { + "epoch": 1.6372098776886084, + "grad_norm": 0.10717001557350159, + "learning_rate": 0.0010300515477614288, + "loss": 2.2717, + "step": 423520 + }, + { + "epoch": 1.6372485348919916, + "grad_norm": 0.10829834640026093, + "learning_rate": 0.0010298453731492076, + "loss": 2.2524, + "step": 423530 + }, + { + "epoch": 1.637287192095375, + "grad_norm": 0.10441777855157852, + "learning_rate": 0.0010296392423433437, + "loss": 2.2603, + "step": 423540 + }, + { + "epoch": 1.6373258492987584, + "grad_norm": 0.11129403859376907, + "learning_rate": 0.0010294331553159257, + "loss": 2.2615, + "step": 423550 + }, + { + "epoch": 1.6373645065021416, + "grad_norm": 0.09813987463712692, + "learning_rate": 0.0010292271120390724, + "loss": 2.2669, + "step": 423560 + }, + { + "epoch": 1.6374031637055249, + "grad_norm": 0.10924495756626129, + "learning_rate": 0.0010290211124849315, + "loss": 2.2598, + "step": 423570 + }, + { + "epoch": 1.6374418209089083, + "grad_norm": 0.1084313839673996, + "learning_rate": 0.0010288151566256812, + "loss": 2.2558, + "step": 423580 + }, + { + "epoch": 1.6374804781122916, + "grad_norm": 0.1174166277050972, + "learning_rate": 0.001028609244433529, + "loss": 2.2696, + "step": 423590 + }, + { + "epoch": 1.6375191353156748, + "grad_norm": 0.11019670218229294, + "learning_rate": 0.0010284033758807105, + "loss": 2.2579, + "step": 423600 + }, + { + "epoch": 1.637557792519058, + "grad_norm": 0.10024767369031906, + "learning_rate": 0.0010281975509394928, + "loss": 2.2658, + "step": 423610 + }, + { + "epoch": 1.6375964497224413, + "grad_norm": 0.10447598993778229, + "learning_rate": 0.0010279917695821706, + "loss": 2.2696, + "step": 423620 + }, + { + "epoch": 1.6376351069258246, + "grad_norm": 0.10102497041225433, + "learning_rate": 0.0010277860317810692, + "loss": 2.2516, + "step": 423630 + }, + { + "epoch": 1.6376737641292078, + "grad_norm": 0.11171722412109375, + "learning_rate": 0.0010275803375085425, + "loss": 2.2462, + "step": 423640 + }, + { + "epoch": 1.637712421332591, + "grad_norm": 0.10647977143526077, + "learning_rate": 0.0010273746867369737, + "loss": 2.2583, + "step": 423650 + }, + { + "epoch": 1.6377510785359743, + "grad_norm": 0.10822014510631561, + "learning_rate": 0.0010271690794387753, + "loss": 2.2726, + "step": 423660 + }, + { + "epoch": 1.6377897357393576, + "grad_norm": 0.13143102824687958, + "learning_rate": 0.0010269635155863888, + "loss": 2.2565, + "step": 423670 + }, + { + "epoch": 1.6378283929427409, + "grad_norm": 0.10811565816402435, + "learning_rate": 0.001026757995152285, + "loss": 2.2624, + "step": 423680 + }, + { + "epoch": 1.637867050146124, + "grad_norm": 0.13415491580963135, + "learning_rate": 0.0010265525181089635, + "loss": 2.2618, + "step": 423690 + }, + { + "epoch": 1.6379057073495074, + "grad_norm": 0.10379686206579208, + "learning_rate": 0.0010263470844289531, + "loss": 2.2542, + "step": 423700 + }, + { + "epoch": 1.6379443645528908, + "grad_norm": 0.1132299154996872, + "learning_rate": 0.001026141694084812, + "loss": 2.2588, + "step": 423710 + }, + { + "epoch": 1.637983021756274, + "grad_norm": 0.11209876835346222, + "learning_rate": 0.0010259363470491265, + "loss": 2.2782, + "step": 423720 + }, + { + "epoch": 1.6380216789596573, + "grad_norm": 0.10772533714771271, + "learning_rate": 0.0010257310432945119, + "loss": 2.2668, + "step": 423730 + }, + { + "epoch": 1.6380603361630406, + "grad_norm": 0.09981733560562134, + "learning_rate": 0.0010255257827936136, + "loss": 2.245, + "step": 423740 + }, + { + "epoch": 1.638098993366424, + "grad_norm": 0.1126519963145256, + "learning_rate": 0.001025320565519104, + "loss": 2.2495, + "step": 423750 + }, + { + "epoch": 1.6381376505698073, + "grad_norm": 0.09858756512403488, + "learning_rate": 0.0010251153914436848, + "loss": 2.254, + "step": 423760 + }, + { + "epoch": 1.6381763077731906, + "grad_norm": 0.11347083002328873, + "learning_rate": 0.0010249102605400875, + "loss": 2.2546, + "step": 423770 + }, + { + "epoch": 1.6382149649765738, + "grad_norm": 0.11383738368749619, + "learning_rate": 0.0010247051727810712, + "loss": 2.2639, + "step": 423780 + }, + { + "epoch": 1.638253622179957, + "grad_norm": 0.10293114185333252, + "learning_rate": 0.0010245001281394242, + "loss": 2.2554, + "step": 423790 + }, + { + "epoch": 1.6382922793833403, + "grad_norm": 0.10788684338331223, + "learning_rate": 0.0010242951265879627, + "loss": 2.27, + "step": 423800 + }, + { + "epoch": 1.6383309365867236, + "grad_norm": 0.11021386086940765, + "learning_rate": 0.0010240901680995318, + "loss": 2.2539, + "step": 423810 + }, + { + "epoch": 1.6383695937901068, + "grad_norm": 0.09868684411048889, + "learning_rate": 0.0010238852526470057, + "loss": 2.2646, + "step": 423820 + }, + { + "epoch": 1.63840825099349, + "grad_norm": 0.09737741947174072, + "learning_rate": 0.001023680380203286, + "loss": 2.2497, + "step": 423830 + }, + { + "epoch": 1.6384469081968733, + "grad_norm": 0.0985494926571846, + "learning_rate": 0.0010234755507413037, + "loss": 2.2438, + "step": 423840 + }, + { + "epoch": 1.6384855654002566, + "grad_norm": 0.11383437365293503, + "learning_rate": 0.0010232707642340176, + "loss": 2.2494, + "step": 423850 + }, + { + "epoch": 1.6385242226036398, + "grad_norm": 0.1099669560790062, + "learning_rate": 0.001023066020654415, + "loss": 2.2718, + "step": 423860 + }, + { + "epoch": 1.638562879807023, + "grad_norm": 0.11040462553501129, + "learning_rate": 0.0010228613199755113, + "loss": 2.2524, + "step": 423870 + }, + { + "epoch": 1.6386015370104066, + "grad_norm": 0.13061681389808655, + "learning_rate": 0.0010226566621703505, + "loss": 2.2567, + "step": 423880 + }, + { + "epoch": 1.6386401942137898, + "grad_norm": 0.0982203334569931, + "learning_rate": 0.001022452047212005, + "loss": 2.2596, + "step": 423890 + }, + { + "epoch": 1.638678851417173, + "grad_norm": 0.10891470313072205, + "learning_rate": 0.0010222474750735749, + "loss": 2.272, + "step": 423900 + }, + { + "epoch": 1.6387175086205563, + "grad_norm": 0.11531899124383926, + "learning_rate": 0.001022042945728188, + "loss": 2.2588, + "step": 423910 + }, + { + "epoch": 1.6387561658239398, + "grad_norm": 0.11567877233028412, + "learning_rate": 0.0010218384591490013, + "loss": 2.2682, + "step": 423920 + }, + { + "epoch": 1.638794823027323, + "grad_norm": 0.1000378280878067, + "learning_rate": 0.0010216340153091994, + "loss": 2.2711, + "step": 423930 + }, + { + "epoch": 1.6388334802307063, + "grad_norm": 0.10007666051387787, + "learning_rate": 0.0010214296141819945, + "loss": 2.2551, + "step": 423940 + }, + { + "epoch": 1.6388721374340895, + "grad_norm": 0.11694041639566422, + "learning_rate": 0.0010212252557406276, + "loss": 2.2614, + "step": 423950 + }, + { + "epoch": 1.6389107946374728, + "grad_norm": 0.10663101822137833, + "learning_rate": 0.0010210209399583667, + "loss": 2.2754, + "step": 423960 + }, + { + "epoch": 1.638949451840856, + "grad_norm": 0.10280250757932663, + "learning_rate": 0.0010208166668085083, + "loss": 2.2705, + "step": 423970 + }, + { + "epoch": 1.6389881090442393, + "grad_norm": 0.11807078868150711, + "learning_rate": 0.0010206124362643766, + "loss": 2.2682, + "step": 423980 + }, + { + "epoch": 1.6390267662476226, + "grad_norm": 0.11073262244462967, + "learning_rate": 0.0010204082482993236, + "loss": 2.2518, + "step": 423990 + }, + { + "epoch": 1.6390654234510058, + "grad_norm": 0.12303231656551361, + "learning_rate": 0.001020204102886729, + "loss": 2.2481, + "step": 424000 + }, + { + "epoch": 1.639104080654389, + "grad_norm": 0.10889635235071182, + "learning_rate": 0.00102, + "loss": 2.2615, + "step": 424010 + }, + { + "epoch": 1.6391427378577723, + "grad_norm": 0.09901122748851776, + "learning_rate": 0.0010197959396125724, + "loss": 2.2585, + "step": 424020 + }, + { + "epoch": 1.6391813950611556, + "grad_norm": 0.10052678734064102, + "learning_rate": 0.0010195919216979083, + "loss": 2.2644, + "step": 424030 + }, + { + "epoch": 1.6392200522645388, + "grad_norm": 0.11836764216423035, + "learning_rate": 0.0010193879462294991, + "loss": 2.2634, + "step": 424040 + }, + { + "epoch": 1.6392587094679223, + "grad_norm": 0.10381746292114258, + "learning_rate": 0.0010191840131808618, + "loss": 2.2557, + "step": 424050 + }, + { + "epoch": 1.6392973666713055, + "grad_norm": 0.11419299244880676, + "learning_rate": 0.001018980122525542, + "loss": 2.2556, + "step": 424060 + }, + { + "epoch": 1.6393360238746888, + "grad_norm": 0.10854178667068481, + "learning_rate": 0.0010187762742371138, + "loss": 2.2699, + "step": 424070 + }, + { + "epoch": 1.639374681078072, + "grad_norm": 0.1035003736615181, + "learning_rate": 0.0010185724682891762, + "loss": 2.2581, + "step": 424080 + }, + { + "epoch": 1.6394133382814555, + "grad_norm": 0.1015508621931076, + "learning_rate": 0.001018368704655358, + "loss": 2.2584, + "step": 424090 + }, + { + "epoch": 1.6394519954848388, + "grad_norm": 0.10948213934898376, + "learning_rate": 0.0010181649833093138, + "loss": 2.2237, + "step": 424100 + }, + { + "epoch": 1.639490652688222, + "grad_norm": 0.1012764498591423, + "learning_rate": 0.0010179613042247265, + "loss": 2.2599, + "step": 424110 + }, + { + "epoch": 1.6395293098916053, + "grad_norm": 0.1120927631855011, + "learning_rate": 0.0010177576673753063, + "loss": 2.2643, + "step": 424120 + }, + { + "epoch": 1.6395679670949885, + "grad_norm": 0.1091916486620903, + "learning_rate": 0.0010175540727347893, + "loss": 2.2615, + "step": 424130 + }, + { + "epoch": 1.6396066242983718, + "grad_norm": 0.09791362285614014, + "learning_rate": 0.0010173505202769402, + "loss": 2.2714, + "step": 424140 + }, + { + "epoch": 1.639645281501755, + "grad_norm": 0.0960848480463028, + "learning_rate": 0.001017147009975551, + "loss": 2.25, + "step": 424150 + }, + { + "epoch": 1.6396839387051383, + "grad_norm": 0.10298629850149155, + "learning_rate": 0.0010169435418044394, + "loss": 2.2562, + "step": 424160 + }, + { + "epoch": 1.6397225959085215, + "grad_norm": 0.09647177904844284, + "learning_rate": 0.0010167401157374517, + "loss": 2.2334, + "step": 424170 + }, + { + "epoch": 1.6397612531119048, + "grad_norm": 0.09860710054636002, + "learning_rate": 0.0010165367317484604, + "loss": 2.2593, + "step": 424180 + }, + { + "epoch": 1.639799910315288, + "grad_norm": 0.10834896564483643, + "learning_rate": 0.001016333389811365, + "loss": 2.2657, + "step": 424190 + }, + { + "epoch": 1.6398385675186713, + "grad_norm": 0.11073605716228485, + "learning_rate": 0.0010161300899000926, + "loss": 2.2648, + "step": 424200 + }, + { + "epoch": 1.6398772247220545, + "grad_norm": 0.10061077773571014, + "learning_rate": 0.0010159268319885966, + "loss": 2.2712, + "step": 424210 + }, + { + "epoch": 1.639915881925438, + "grad_norm": 0.09821108728647232, + "learning_rate": 0.0010157236160508573, + "loss": 2.2819, + "step": 424220 + }, + { + "epoch": 1.6399545391288213, + "grad_norm": 0.1119464710354805, + "learning_rate": 0.0010155204420608825, + "loss": 2.2598, + "step": 424230 + }, + { + "epoch": 1.6399931963322045, + "grad_norm": 0.11146452277898788, + "learning_rate": 0.0010153173099927062, + "loss": 2.2582, + "step": 424240 + }, + { + "epoch": 1.6400318535355878, + "grad_norm": 0.12815554440021515, + "learning_rate": 0.0010151142198203894, + "loss": 2.2592, + "step": 424250 + }, + { + "epoch": 1.6400705107389713, + "grad_norm": 0.10622846335172653, + "learning_rate": 0.00101491117151802, + "loss": 2.2555, + "step": 424260 + }, + { + "epoch": 1.6401091679423545, + "grad_norm": 0.10454724729061127, + "learning_rate": 0.0010147081650597118, + "loss": 2.26, + "step": 424270 + }, + { + "epoch": 1.6401478251457378, + "grad_norm": 0.10535355657339096, + "learning_rate": 0.0010145052004196064, + "loss": 2.2573, + "step": 424280 + }, + { + "epoch": 1.640186482349121, + "grad_norm": 0.11934984475374222, + "learning_rate": 0.0010143022775718716, + "loss": 2.2567, + "step": 424290 + }, + { + "epoch": 1.6402251395525043, + "grad_norm": 0.12079385668039322, + "learning_rate": 0.001014099396490701, + "loss": 2.2664, + "step": 424300 + }, + { + "epoch": 1.6402637967558875, + "grad_norm": 0.13715972006320953, + "learning_rate": 0.0010138965571503161, + "loss": 2.2608, + "step": 424310 + }, + { + "epoch": 1.6403024539592708, + "grad_norm": 0.11399926245212555, + "learning_rate": 0.0010136937595249637, + "loss": 2.2532, + "step": 424320 + }, + { + "epoch": 1.640341111162654, + "grad_norm": 0.1082891896367073, + "learning_rate": 0.0010134910035889183, + "loss": 2.2544, + "step": 424330 + }, + { + "epoch": 1.6403797683660373, + "grad_norm": 0.10113243758678436, + "learning_rate": 0.0010132882893164792, + "loss": 2.2608, + "step": 424340 + }, + { + "epoch": 1.6404184255694205, + "grad_norm": 0.1076381579041481, + "learning_rate": 0.0010130856166819737, + "loss": 2.2438, + "step": 424350 + }, + { + "epoch": 1.6404570827728038, + "grad_norm": 0.1019374430179596, + "learning_rate": 0.0010128829856597547, + "loss": 2.2544, + "step": 424360 + }, + { + "epoch": 1.640495739976187, + "grad_norm": 0.10723643004894257, + "learning_rate": 0.0010126803962242014, + "loss": 2.2533, + "step": 424370 + }, + { + "epoch": 1.6405343971795703, + "grad_norm": 0.10868024080991745, + "learning_rate": 0.0010124778483497194, + "loss": 2.2547, + "step": 424380 + }, + { + "epoch": 1.6405730543829538, + "grad_norm": 0.10560424625873566, + "learning_rate": 0.0010122753420107405, + "loss": 2.2505, + "step": 424390 + }, + { + "epoch": 1.640611711586337, + "grad_norm": 0.12155601382255554, + "learning_rate": 0.0010120728771817224, + "loss": 2.2348, + "step": 424400 + }, + { + "epoch": 1.6406503687897203, + "grad_norm": 0.11004646122455597, + "learning_rate": 0.00101187045383715, + "loss": 2.2577, + "step": 424410 + }, + { + "epoch": 1.6406890259931035, + "grad_norm": 0.11234424263238907, + "learning_rate": 0.001011668071951533, + "loss": 2.2611, + "step": 424420 + }, + { + "epoch": 1.640727683196487, + "grad_norm": 0.10152650624513626, + "learning_rate": 0.0010114657314994084, + "loss": 2.2629, + "step": 424430 + }, + { + "epoch": 1.6407663403998702, + "grad_norm": 0.11180943250656128, + "learning_rate": 0.0010112634324553381, + "loss": 2.2624, + "step": 424440 + }, + { + "epoch": 1.6408049976032535, + "grad_norm": 0.10553466528654099, + "learning_rate": 0.0010110611747939106, + "loss": 2.2676, + "step": 424450 + }, + { + "epoch": 1.6408436548066367, + "grad_norm": 0.10074066370725632, + "learning_rate": 0.001010858958489741, + "loss": 2.2606, + "step": 424460 + }, + { + "epoch": 1.64088231201002, + "grad_norm": 0.11224688589572906, + "learning_rate": 0.0010106567835174692, + "loss": 2.2572, + "step": 424470 + }, + { + "epoch": 1.6409209692134032, + "grad_norm": 0.10370156168937683, + "learning_rate": 0.0010104546498517614, + "loss": 2.2662, + "step": 424480 + }, + { + "epoch": 1.6409596264167865, + "grad_norm": 0.12131761759519577, + "learning_rate": 0.0010102525574673103, + "loss": 2.2643, + "step": 424490 + }, + { + "epoch": 1.6409982836201698, + "grad_norm": 0.09501279145479202, + "learning_rate": 0.0010100505063388335, + "loss": 2.2581, + "step": 424500 + }, + { + "epoch": 1.641036940823553, + "grad_norm": 0.1053239107131958, + "learning_rate": 0.0010098484964410747, + "loss": 2.2399, + "step": 424510 + }, + { + "epoch": 1.6410755980269363, + "grad_norm": 0.11659132689237595, + "learning_rate": 0.0010096465277488042, + "loss": 2.2612, + "step": 424520 + }, + { + "epoch": 1.6411142552303195, + "grad_norm": 0.11104684323072433, + "learning_rate": 0.0010094446002368168, + "loss": 2.2475, + "step": 424530 + }, + { + "epoch": 1.6411529124337028, + "grad_norm": 0.12283774465322495, + "learning_rate": 0.001009242713879933, + "loss": 2.2616, + "step": 424540 + }, + { + "epoch": 1.641191569637086, + "grad_norm": 0.09906791895627975, + "learning_rate": 0.0010090408686530003, + "loss": 2.2639, + "step": 424550 + }, + { + "epoch": 1.6412302268404695, + "grad_norm": 0.09812948852777481, + "learning_rate": 0.0010088390645308905, + "loss": 2.2635, + "step": 424560 + }, + { + "epoch": 1.6412688840438527, + "grad_norm": 0.1091708242893219, + "learning_rate": 0.0010086373014885014, + "loss": 2.2476, + "step": 424570 + }, + { + "epoch": 1.641307541247236, + "grad_norm": 0.10793473571538925, + "learning_rate": 0.0010084355795007566, + "loss": 2.2551, + "step": 424580 + }, + { + "epoch": 1.6413461984506192, + "grad_norm": 0.11073902249336243, + "learning_rate": 0.001008233898542605, + "loss": 2.2469, + "step": 424590 + }, + { + "epoch": 1.6413848556540027, + "grad_norm": 0.11795094609260559, + "learning_rate": 0.0010080322585890205, + "loss": 2.2548, + "step": 424600 + }, + { + "epoch": 1.641423512857386, + "grad_norm": 0.11132269352674484, + "learning_rate": 0.001007830659615003, + "loss": 2.2419, + "step": 424610 + }, + { + "epoch": 1.6414621700607692, + "grad_norm": 0.11662492156028748, + "learning_rate": 0.0010076291015955778, + "loss": 2.2697, + "step": 424620 + }, + { + "epoch": 1.6415008272641525, + "grad_norm": 0.11011848598718643, + "learning_rate": 0.0010074275845057955, + "loss": 2.2672, + "step": 424630 + }, + { + "epoch": 1.6415394844675357, + "grad_norm": 0.11036280542612076, + "learning_rate": 0.0010072261083207315, + "loss": 2.2444, + "step": 424640 + }, + { + "epoch": 1.641578141670919, + "grad_norm": 0.10838011652231216, + "learning_rate": 0.0010070246730154873, + "loss": 2.2596, + "step": 424650 + }, + { + "epoch": 1.6416167988743022, + "grad_norm": 0.11376731097698212, + "learning_rate": 0.001006823278565189, + "loss": 2.2606, + "step": 424660 + }, + { + "epoch": 1.6416554560776855, + "grad_norm": 0.11246142536401749, + "learning_rate": 0.0010066219249449885, + "loss": 2.2576, + "step": 424670 + }, + { + "epoch": 1.6416941132810687, + "grad_norm": 0.10369978845119476, + "learning_rate": 0.0010064206121300626, + "loss": 2.2474, + "step": 424680 + }, + { + "epoch": 1.641732770484452, + "grad_norm": 0.10715661942958832, + "learning_rate": 0.0010062193400956123, + "loss": 2.245, + "step": 424690 + }, + { + "epoch": 1.6417714276878352, + "grad_norm": 0.11022845655679703, + "learning_rate": 0.001006018108816866, + "loss": 2.271, + "step": 424700 + }, + { + "epoch": 1.6418100848912185, + "grad_norm": 0.09892480075359344, + "learning_rate": 0.0010058169182690746, + "loss": 2.2606, + "step": 424710 + }, + { + "epoch": 1.6418487420946017, + "grad_norm": 0.1053096204996109, + "learning_rate": 0.0010056157684275157, + "loss": 2.2538, + "step": 424720 + }, + { + "epoch": 1.6418873992979852, + "grad_norm": 0.1074117124080658, + "learning_rate": 0.0010054146592674916, + "loss": 2.2492, + "step": 424730 + }, + { + "epoch": 1.6419260565013685, + "grad_norm": 0.11926567554473877, + "learning_rate": 0.001005213590764329, + "loss": 2.2562, + "step": 424740 + }, + { + "epoch": 1.6419647137047517, + "grad_norm": 0.11387331038713455, + "learning_rate": 0.0010050125628933801, + "loss": 2.2639, + "step": 424750 + }, + { + "epoch": 1.642003370908135, + "grad_norm": 0.10490171611309052, + "learning_rate": 0.0010048115756300217, + "loss": 2.2594, + "step": 424760 + }, + { + "epoch": 1.6420420281115184, + "grad_norm": 0.12902913987636566, + "learning_rate": 0.0010046106289496558, + "loss": 2.262, + "step": 424770 + }, + { + "epoch": 1.6420806853149017, + "grad_norm": 0.10744645446538925, + "learning_rate": 0.0010044097228277087, + "loss": 2.258, + "step": 424780 + }, + { + "epoch": 1.642119342518285, + "grad_norm": 0.11677946895360947, + "learning_rate": 0.0010042088572396319, + "loss": 2.2542, + "step": 424790 + }, + { + "epoch": 1.6421579997216682, + "grad_norm": 0.11043225228786469, + "learning_rate": 0.0010040080321609014, + "loss": 2.2571, + "step": 424800 + }, + { + "epoch": 1.6421966569250515, + "grad_norm": 0.10317932069301605, + "learning_rate": 0.0010038072475670183, + "loss": 2.266, + "step": 424810 + }, + { + "epoch": 1.6422353141284347, + "grad_norm": 0.10283423215150833, + "learning_rate": 0.001003606503433508, + "loss": 2.2703, + "step": 424820 + }, + { + "epoch": 1.642273971331818, + "grad_norm": 0.0987296849489212, + "learning_rate": 0.0010034057997359205, + "loss": 2.2538, + "step": 424830 + }, + { + "epoch": 1.6423126285352012, + "grad_norm": 0.10434217751026154, + "learning_rate": 0.0010032051364498309, + "loss": 2.2522, + "step": 424840 + }, + { + "epoch": 1.6423512857385845, + "grad_norm": 0.1010420098900795, + "learning_rate": 0.0010030045135508386, + "loss": 2.2399, + "step": 424850 + }, + { + "epoch": 1.6423899429419677, + "grad_norm": 0.10522980988025665, + "learning_rate": 0.0010028039310145673, + "loss": 2.2664, + "step": 424860 + }, + { + "epoch": 1.642428600145351, + "grad_norm": 0.10818637907505035, + "learning_rate": 0.0010026033888166652, + "loss": 2.255, + "step": 424870 + }, + { + "epoch": 1.6424672573487342, + "grad_norm": 0.1131870374083519, + "learning_rate": 0.001002402886932806, + "loss": 2.2624, + "step": 424880 + }, + { + "epoch": 1.6425059145521177, + "grad_norm": 0.11998511850833893, + "learning_rate": 0.0010022024253386861, + "loss": 2.2607, + "step": 424890 + }, + { + "epoch": 1.642544571755501, + "grad_norm": 0.11467783898115158, + "learning_rate": 0.0010020020040100279, + "loss": 2.2609, + "step": 424900 + }, + { + "epoch": 1.6425832289588842, + "grad_norm": 0.1055130586028099, + "learning_rate": 0.0010018016229225775, + "loss": 2.2366, + "step": 424910 + }, + { + "epoch": 1.6426218861622675, + "grad_norm": 0.10552117228507996, + "learning_rate": 0.0010016012820521052, + "loss": 2.2577, + "step": 424920 + }, + { + "epoch": 1.6426605433656507, + "grad_norm": 0.104463130235672, + "learning_rate": 0.0010014009813744055, + "loss": 2.2649, + "step": 424930 + }, + { + "epoch": 1.6426992005690342, + "grad_norm": 0.11977139860391617, + "learning_rate": 0.0010012007208652983, + "loss": 2.2532, + "step": 424940 + }, + { + "epoch": 1.6427378577724174, + "grad_norm": 0.0970083475112915, + "learning_rate": 0.0010010005005006257, + "loss": 2.2499, + "step": 424950 + }, + { + "epoch": 1.6427765149758007, + "grad_norm": 0.11830824613571167, + "learning_rate": 0.0010008003202562562, + "loss": 2.2518, + "step": 424960 + }, + { + "epoch": 1.642815172179184, + "grad_norm": 0.11555589735507965, + "learning_rate": 0.0010006001801080812, + "loss": 2.2423, + "step": 424970 + }, + { + "epoch": 1.6428538293825672, + "grad_norm": 0.13867305219173431, + "learning_rate": 0.0010004000800320162, + "loss": 2.2492, + "step": 424980 + }, + { + "epoch": 1.6428924865859504, + "grad_norm": 0.1125546395778656, + "learning_rate": 0.0010002000200040012, + "loss": 2.2543, + "step": 424990 + }, + { + "epoch": 1.6429311437893337, + "grad_norm": 0.10740401595830917, + "learning_rate": 0.001, + "loss": 2.2475, + "step": 425000 + }, + { + "epoch": 1.642969800992717, + "grad_norm": 0.10235648602247238, + "learning_rate": 0.000999800019996001, + "loss": 2.2456, + "step": 425010 + }, + { + "epoch": 1.6430084581961002, + "grad_norm": 0.10700272023677826, + "learning_rate": 0.000999600079968016, + "loss": 2.255, + "step": 425020 + }, + { + "epoch": 1.6430471153994834, + "grad_norm": 0.10306088626384735, + "learning_rate": 0.000999400179892081, + "loss": 2.2475, + "step": 425030 + }, + { + "epoch": 1.6430857726028667, + "grad_norm": 0.10797042399644852, + "learning_rate": 0.0009992003197442556, + "loss": 2.2633, + "step": 425040 + }, + { + "epoch": 1.64312442980625, + "grad_norm": 0.11185480654239655, + "learning_rate": 0.0009990004995006241, + "loss": 2.2482, + "step": 425050 + }, + { + "epoch": 1.6431630870096334, + "grad_norm": 0.09936090558767319, + "learning_rate": 0.0009988007191372938, + "loss": 2.2524, + "step": 425060 + }, + { + "epoch": 1.6432017442130167, + "grad_norm": 0.12256762385368347, + "learning_rate": 0.0009986009786303964, + "loss": 2.2479, + "step": 425070 + }, + { + "epoch": 1.6432404014164, + "grad_norm": 0.13450071215629578, + "learning_rate": 0.0009984012779560869, + "loss": 2.2452, + "step": 425080 + }, + { + "epoch": 1.6432790586197832, + "grad_norm": 0.0990997776389122, + "learning_rate": 0.0009982016170905447, + "loss": 2.2521, + "step": 425090 + }, + { + "epoch": 1.6433177158231664, + "grad_norm": 0.11909011751413345, + "learning_rate": 0.0009980019960099723, + "loss": 2.2547, + "step": 425100 + }, + { + "epoch": 1.64335637302655, + "grad_norm": 0.10571201890707016, + "learning_rate": 0.0009978024146905962, + "loss": 2.2402, + "step": 425110 + }, + { + "epoch": 1.6433950302299332, + "grad_norm": 0.10910934209823608, + "learning_rate": 0.0009976028731086665, + "loss": 2.2554, + "step": 425120 + }, + { + "epoch": 1.6434336874333164, + "grad_norm": 0.10239577293395996, + "learning_rate": 0.0009974033712404572, + "loss": 2.2534, + "step": 425130 + }, + { + "epoch": 1.6434723446366997, + "grad_norm": 0.10150449723005295, + "learning_rate": 0.000997203909062266, + "loss": 2.2713, + "step": 425140 + }, + { + "epoch": 1.643511001840083, + "grad_norm": 0.10944680124521255, + "learning_rate": 0.0009970044865504134, + "loss": 2.2461, + "step": 425150 + }, + { + "epoch": 1.6435496590434662, + "grad_norm": 0.09132153540849686, + "learning_rate": 0.0009968051036812438, + "loss": 2.2605, + "step": 425160 + }, + { + "epoch": 1.6435883162468494, + "grad_norm": 0.10329840332269669, + "learning_rate": 0.0009966057604311256, + "loss": 2.2648, + "step": 425170 + }, + { + "epoch": 1.6436269734502327, + "grad_norm": 0.10862980782985687, + "learning_rate": 0.0009964064567764496, + "loss": 2.2633, + "step": 425180 + }, + { + "epoch": 1.643665630653616, + "grad_norm": 0.11289060860872269, + "learning_rate": 0.0009962071926936315, + "loss": 2.252, + "step": 425190 + }, + { + "epoch": 1.6437042878569992, + "grad_norm": 0.09915979951620102, + "learning_rate": 0.0009960079681591094, + "loss": 2.2745, + "step": 425200 + }, + { + "epoch": 1.6437429450603824, + "grad_norm": 0.10634247958660126, + "learning_rate": 0.0009958087831493448, + "loss": 2.2621, + "step": 425210 + }, + { + "epoch": 1.6437816022637657, + "grad_norm": 0.1050662100315094, + "learning_rate": 0.0009956096376408225, + "loss": 2.2557, + "step": 425220 + }, + { + "epoch": 1.6438202594671492, + "grad_norm": 0.11785628646612167, + "learning_rate": 0.0009954105316100513, + "loss": 2.2499, + "step": 425230 + }, + { + "epoch": 1.6438589166705324, + "grad_norm": 0.12110677361488342, + "learning_rate": 0.0009952114650335624, + "loss": 2.2651, + "step": 425240 + }, + { + "epoch": 1.6438975738739157, + "grad_norm": 0.10299248993396759, + "learning_rate": 0.000995012437887911, + "loss": 2.2651, + "step": 425250 + }, + { + "epoch": 1.643936231077299, + "grad_norm": 0.10756676644086838, + "learning_rate": 0.0009948134501496751, + "loss": 2.2693, + "step": 425260 + }, + { + "epoch": 1.6439748882806822, + "grad_norm": 0.10314938426017761, + "learning_rate": 0.0009946145017954554, + "loss": 2.2577, + "step": 425270 + }, + { + "epoch": 1.6440135454840656, + "grad_norm": 0.11622797697782516, + "learning_rate": 0.0009944155928018773, + "loss": 2.2472, + "step": 425280 + }, + { + "epoch": 1.644052202687449, + "grad_norm": 0.11092694103717804, + "learning_rate": 0.0009942167231455875, + "loss": 2.2558, + "step": 425290 + }, + { + "epoch": 1.6440908598908321, + "grad_norm": 0.10614550113677979, + "learning_rate": 0.0009940178928032567, + "loss": 2.2537, + "step": 425300 + }, + { + "epoch": 1.6441295170942154, + "grad_norm": 0.11072509735822678, + "learning_rate": 0.0009938191017515787, + "loss": 2.2539, + "step": 425310 + }, + { + "epoch": 1.6441681742975987, + "grad_norm": 0.11297177523374557, + "learning_rate": 0.00099362034996727, + "loss": 2.2611, + "step": 425320 + }, + { + "epoch": 1.644206831500982, + "grad_norm": 0.11130084097385406, + "learning_rate": 0.0009934216374270703, + "loss": 2.2615, + "step": 425330 + }, + { + "epoch": 1.6442454887043652, + "grad_norm": 0.11282171308994293, + "learning_rate": 0.0009932229641077424, + "loss": 2.2504, + "step": 425340 + }, + { + "epoch": 1.6442841459077484, + "grad_norm": 0.10654444247484207, + "learning_rate": 0.0009930243299860718, + "loss": 2.2593, + "step": 425350 + }, + { + "epoch": 1.6443228031111317, + "grad_norm": 0.21849265694618225, + "learning_rate": 0.0009928257350388663, + "loss": 2.2636, + "step": 425360 + }, + { + "epoch": 1.644361460314515, + "grad_norm": 0.10909666121006012, + "learning_rate": 0.0009926271792429578, + "loss": 2.264, + "step": 425370 + }, + { + "epoch": 1.6444001175178982, + "grad_norm": 0.10071917623281479, + "learning_rate": 0.0009924286625752, + "loss": 2.2555, + "step": 425380 + }, + { + "epoch": 1.6444387747212814, + "grad_norm": 0.12604159116744995, + "learning_rate": 0.0009922301850124702, + "loss": 2.2665, + "step": 425390 + }, + { + "epoch": 1.644477431924665, + "grad_norm": 0.1069197803735733, + "learning_rate": 0.0009920317465316676, + "loss": 2.2636, + "step": 425400 + }, + { + "epoch": 1.6445160891280481, + "grad_norm": 0.12726236879825592, + "learning_rate": 0.0009918333471097152, + "loss": 2.2594, + "step": 425410 + }, + { + "epoch": 1.6445547463314314, + "grad_norm": 0.10156135261058807, + "learning_rate": 0.0009916349867235576, + "loss": 2.262, + "step": 425420 + }, + { + "epoch": 1.6445934035348146, + "grad_norm": 0.12336266785860062, + "learning_rate": 0.0009914366653501626, + "loss": 2.26, + "step": 425430 + }, + { + "epoch": 1.6446320607381981, + "grad_norm": 0.0935782939195633, + "learning_rate": 0.0009912383829665207, + "loss": 2.258, + "step": 425440 + }, + { + "epoch": 1.6446707179415814, + "grad_norm": 0.1038680374622345, + "learning_rate": 0.000991040139549645, + "loss": 2.2522, + "step": 425450 + }, + { + "epoch": 1.6447093751449646, + "grad_norm": 0.09510288387537003, + "learning_rate": 0.000990841935076571, + "loss": 2.2711, + "step": 425460 + }, + { + "epoch": 1.6447480323483479, + "grad_norm": 0.10547652095556259, + "learning_rate": 0.0009906437695243566, + "loss": 2.2335, + "step": 425470 + }, + { + "epoch": 1.6447866895517311, + "grad_norm": 0.10449301451444626, + "learning_rate": 0.000990445642870083, + "loss": 2.2724, + "step": 425480 + }, + { + "epoch": 1.6448253467551144, + "grad_norm": 0.12347263842821121, + "learning_rate": 0.000990247555090853, + "loss": 2.2578, + "step": 425490 + }, + { + "epoch": 1.6448640039584976, + "grad_norm": 0.10921213030815125, + "learning_rate": 0.000990049506163792, + "loss": 2.2582, + "step": 425500 + }, + { + "epoch": 1.6449026611618809, + "grad_norm": 0.09873101860284805, + "learning_rate": 0.0009898514960660488, + "loss": 2.2663, + "step": 425510 + }, + { + "epoch": 1.6449413183652641, + "grad_norm": 0.11116547882556915, + "learning_rate": 0.000989653524774793, + "loss": 2.2505, + "step": 425520 + }, + { + "epoch": 1.6449799755686474, + "grad_norm": 0.10780694335699081, + "learning_rate": 0.0009894555922672175, + "loss": 2.2628, + "step": 425530 + }, + { + "epoch": 1.6450186327720306, + "grad_norm": 0.11441248655319214, + "learning_rate": 0.0009892576985205377, + "loss": 2.2729, + "step": 425540 + }, + { + "epoch": 1.645057289975414, + "grad_norm": 0.11041711270809174, + "learning_rate": 0.000989059843511991, + "loss": 2.256, + "step": 425550 + }, + { + "epoch": 1.6450959471787971, + "grad_norm": 0.11487311869859695, + "learning_rate": 0.0009888620272188369, + "loss": 2.2597, + "step": 425560 + }, + { + "epoch": 1.6451346043821806, + "grad_norm": 0.10112126171588898, + "learning_rate": 0.0009886642496183574, + "loss": 2.2567, + "step": 425570 + }, + { + "epoch": 1.6451732615855639, + "grad_norm": 0.10316457599401474, + "learning_rate": 0.0009884665106878565, + "loss": 2.2661, + "step": 425580 + }, + { + "epoch": 1.6452119187889471, + "grad_norm": 0.10630446672439575, + "learning_rate": 0.000988268810404661, + "loss": 2.2652, + "step": 425590 + }, + { + "epoch": 1.6452505759923304, + "grad_norm": 0.1076459065079689, + "learning_rate": 0.0009880711487461186, + "loss": 2.2796, + "step": 425600 + }, + { + "epoch": 1.6452892331957139, + "grad_norm": 0.1187729686498642, + "learning_rate": 0.0009878735256896004, + "loss": 2.252, + "step": 425610 + }, + { + "epoch": 1.645327890399097, + "grad_norm": 0.10718236863613129, + "learning_rate": 0.000987675941212499, + "loss": 2.2586, + "step": 425620 + }, + { + "epoch": 1.6453665476024804, + "grad_norm": 0.10935017466545105, + "learning_rate": 0.0009874783952922288, + "loss": 2.2572, + "step": 425630 + }, + { + "epoch": 1.6454052048058636, + "grad_norm": 0.10489913076162338, + "learning_rate": 0.000987280887906227, + "loss": 2.2476, + "step": 425640 + }, + { + "epoch": 1.6454438620092469, + "grad_norm": 0.1212213784456253, + "learning_rate": 0.000987083419031952, + "loss": 2.2457, + "step": 425650 + }, + { + "epoch": 1.6454825192126301, + "grad_norm": 0.1128239780664444, + "learning_rate": 0.000986885988646885, + "loss": 2.2689, + "step": 425660 + }, + { + "epoch": 1.6455211764160134, + "grad_norm": 0.11003052443265915, + "learning_rate": 0.000986688596728528, + "loss": 2.2626, + "step": 425670 + }, + { + "epoch": 1.6455598336193966, + "grad_norm": 0.11292414367198944, + "learning_rate": 0.000986491243254406, + "loss": 2.2537, + "step": 425680 + }, + { + "epoch": 1.6455984908227799, + "grad_norm": 0.12339409440755844, + "learning_rate": 0.0009862939282020652, + "loss": 2.263, + "step": 425690 + }, + { + "epoch": 1.6456371480261631, + "grad_norm": 0.12118395417928696, + "learning_rate": 0.000986096651549074, + "loss": 2.2554, + "step": 425700 + }, + { + "epoch": 1.6456758052295464, + "grad_norm": 0.1183353140950203, + "learning_rate": 0.0009858994132730225, + "loss": 2.2703, + "step": 425710 + }, + { + "epoch": 1.6457144624329296, + "grad_norm": 0.10364186763763428, + "learning_rate": 0.0009857022133515228, + "loss": 2.2594, + "step": 425720 + }, + { + "epoch": 1.6457531196363129, + "grad_norm": 0.11221203953027725, + "learning_rate": 0.0009855050517622083, + "loss": 2.247, + "step": 425730 + }, + { + "epoch": 1.6457917768396964, + "grad_norm": 0.10890361666679382, + "learning_rate": 0.0009853079284827342, + "loss": 2.2598, + "step": 425740 + }, + { + "epoch": 1.6458304340430796, + "grad_norm": 0.10923411697149277, + "learning_rate": 0.000985110843490778, + "loss": 2.2519, + "step": 425750 + }, + { + "epoch": 1.6458690912464629, + "grad_norm": 0.10897491872310638, + "learning_rate": 0.0009849137967640384, + "loss": 2.2534, + "step": 425760 + }, + { + "epoch": 1.645907748449846, + "grad_norm": 0.10118908435106277, + "learning_rate": 0.0009847167882802356, + "loss": 2.2614, + "step": 425770 + }, + { + "epoch": 1.6459464056532296, + "grad_norm": 0.09627126157283783, + "learning_rate": 0.0009845198180171118, + "loss": 2.2502, + "step": 425780 + }, + { + "epoch": 1.6459850628566128, + "grad_norm": 0.11942839622497559, + "learning_rate": 0.0009843228859524303, + "loss": 2.258, + "step": 425790 + }, + { + "epoch": 1.646023720059996, + "grad_norm": 0.10836474597454071, + "learning_rate": 0.0009841259920639765, + "loss": 2.2394, + "step": 425800 + }, + { + "epoch": 1.6460623772633793, + "grad_norm": 0.13480186462402344, + "learning_rate": 0.0009839291363295572, + "loss": 2.2497, + "step": 425810 + }, + { + "epoch": 1.6461010344667626, + "grad_norm": 0.11613373458385468, + "learning_rate": 0.000983732318727, + "loss": 2.2545, + "step": 425820 + }, + { + "epoch": 1.6461396916701458, + "grad_norm": 0.111439548432827, + "learning_rate": 0.0009835355392341552, + "loss": 2.2602, + "step": 425830 + }, + { + "epoch": 1.646178348873529, + "grad_norm": 0.11013883352279663, + "learning_rate": 0.0009833387978288933, + "loss": 2.2707, + "step": 425840 + }, + { + "epoch": 1.6462170060769123, + "grad_norm": 0.11753743886947632, + "learning_rate": 0.0009831420944891073, + "loss": 2.2612, + "step": 425850 + }, + { + "epoch": 1.6462556632802956, + "grad_norm": 0.10817738622426987, + "learning_rate": 0.0009829454291927105, + "loss": 2.2462, + "step": 425860 + }, + { + "epoch": 1.6462943204836789, + "grad_norm": 0.10038787871599197, + "learning_rate": 0.0009827488019176385, + "loss": 2.2579, + "step": 425870 + }, + { + "epoch": 1.646332977687062, + "grad_norm": 0.11860653758049011, + "learning_rate": 0.0009825522126418477, + "loss": 2.2591, + "step": 425880 + }, + { + "epoch": 1.6463716348904454, + "grad_norm": 0.10090608894824982, + "learning_rate": 0.0009823556613433157, + "loss": 2.2707, + "step": 425890 + }, + { + "epoch": 1.6464102920938286, + "grad_norm": 0.1041901633143425, + "learning_rate": 0.0009821591480000422, + "loss": 2.247, + "step": 425900 + }, + { + "epoch": 1.646448949297212, + "grad_norm": 0.11358907073736191, + "learning_rate": 0.000981962672590047, + "loss": 2.2617, + "step": 425910 + }, + { + "epoch": 1.6464876065005953, + "grad_norm": 0.10183798521757126, + "learning_rate": 0.0009817662350913717, + "loss": 2.2519, + "step": 425920 + }, + { + "epoch": 1.6465262637039786, + "grad_norm": 0.11067637801170349, + "learning_rate": 0.0009815698354820788, + "loss": 2.2595, + "step": 425930 + }, + { + "epoch": 1.6465649209073618, + "grad_norm": 0.11494000256061554, + "learning_rate": 0.0009813734737402523, + "loss": 2.2702, + "step": 425940 + }, + { + "epoch": 1.6466035781107453, + "grad_norm": 0.10512025654315948, + "learning_rate": 0.0009811771498439976, + "loss": 2.2519, + "step": 425950 + }, + { + "epoch": 1.6466422353141286, + "grad_norm": 0.1056099459528923, + "learning_rate": 0.0009809808637714404, + "loss": 2.2437, + "step": 425960 + }, + { + "epoch": 1.6466808925175118, + "grad_norm": 0.6840528249740601, + "learning_rate": 0.0009807846155007274, + "loss": 2.2509, + "step": 425970 + }, + { + "epoch": 1.646719549720895, + "grad_norm": 0.1259155571460724, + "learning_rate": 0.0009805884050100274, + "loss": 2.2623, + "step": 425980 + }, + { + "epoch": 1.6467582069242783, + "grad_norm": 0.09720668196678162, + "learning_rate": 0.000980392232277529, + "loss": 2.2419, + "step": 425990 + }, + { + "epoch": 1.6467968641276616, + "grad_norm": 0.1126490980386734, + "learning_rate": 0.000980196097281443, + "loss": 2.2718, + "step": 426000 + }, + { + "epoch": 1.6468355213310448, + "grad_norm": 0.11232949048280716, + "learning_rate": 0.00098, + "loss": 2.2525, + "step": 426010 + }, + { + "epoch": 1.646874178534428, + "grad_norm": 0.10978632420301437, + "learning_rate": 0.000979803940411452, + "loss": 2.2546, + "step": 426020 + }, + { + "epoch": 1.6469128357378113, + "grad_norm": 0.12147060036659241, + "learning_rate": 0.0009796079184940722, + "loss": 2.2502, + "step": 426030 + }, + { + "epoch": 1.6469514929411946, + "grad_norm": 0.1027112528681755, + "learning_rate": 0.000979411934226154, + "loss": 2.2509, + "step": 426040 + }, + { + "epoch": 1.6469901501445778, + "grad_norm": 0.1303754299879074, + "learning_rate": 0.0009792159875860124, + "loss": 2.2437, + "step": 426050 + }, + { + "epoch": 1.647028807347961, + "grad_norm": 0.1287584751844406, + "learning_rate": 0.0009790200785519826, + "loss": 2.255, + "step": 426060 + }, + { + "epoch": 1.6470674645513443, + "grad_norm": 0.11077667027711868, + "learning_rate": 0.0009788242071024206, + "loss": 2.2641, + "step": 426070 + }, + { + "epoch": 1.6471061217547278, + "grad_norm": 0.11055146902799606, + "learning_rate": 0.0009786283732157038, + "loss": 2.2852, + "step": 426080 + }, + { + "epoch": 1.647144778958111, + "grad_norm": 0.11356287449598312, + "learning_rate": 0.0009784325768702293, + "loss": 2.2459, + "step": 426090 + }, + { + "epoch": 1.6471834361614943, + "grad_norm": 0.11561722308397293, + "learning_rate": 0.0009782368180444158, + "loss": 2.2573, + "step": 426100 + }, + { + "epoch": 1.6472220933648776, + "grad_norm": 0.11019952595233917, + "learning_rate": 0.0009780410967167026, + "loss": 2.26, + "step": 426110 + }, + { + "epoch": 1.647260750568261, + "grad_norm": 0.11386414617300034, + "learning_rate": 0.000977845412865549, + "loss": 2.2544, + "step": 426120 + }, + { + "epoch": 1.6472994077716443, + "grad_norm": 0.11020597070455551, + "learning_rate": 0.0009776497664694356, + "loss": 2.2622, + "step": 426130 + }, + { + "epoch": 1.6473380649750275, + "grad_norm": 0.10992365330457687, + "learning_rate": 0.0009774541575068628, + "loss": 2.2434, + "step": 426140 + }, + { + "epoch": 1.6473767221784108, + "grad_norm": 0.1001507043838501, + "learning_rate": 0.0009772585859563524, + "loss": 2.2524, + "step": 426150 + }, + { + "epoch": 1.647415379381794, + "grad_norm": 0.11525522172451019, + "learning_rate": 0.0009770630517964463, + "loss": 2.2604, + "step": 426160 + }, + { + "epoch": 1.6474540365851773, + "grad_norm": 0.12931174039840698, + "learning_rate": 0.0009768675550057072, + "loss": 2.2657, + "step": 426170 + }, + { + "epoch": 1.6474926937885606, + "grad_norm": 0.1104811578989029, + "learning_rate": 0.0009766720955627175, + "loss": 2.2442, + "step": 426180 + }, + { + "epoch": 1.6475313509919438, + "grad_norm": 0.11013077944517136, + "learning_rate": 0.000976476673446081, + "loss": 2.2659, + "step": 426190 + }, + { + "epoch": 1.647570008195327, + "grad_norm": 0.10370016098022461, + "learning_rate": 0.0009762812886344219, + "loss": 2.2669, + "step": 426200 + }, + { + "epoch": 1.6476086653987103, + "grad_norm": 0.1222548633813858, + "learning_rate": 0.0009760859411063836, + "loss": 2.2639, + "step": 426210 + }, + { + "epoch": 1.6476473226020936, + "grad_norm": 0.10938958823680878, + "learning_rate": 0.0009758906308406315, + "loss": 2.2544, + "step": 426220 + }, + { + "epoch": 1.6476859798054768, + "grad_norm": 0.11079689115285873, + "learning_rate": 0.0009756953578158498, + "loss": 2.2695, + "step": 426230 + }, + { + "epoch": 1.64772463700886, + "grad_norm": 0.1136854887008667, + "learning_rate": 0.0009755001220107442, + "loss": 2.275, + "step": 426240 + }, + { + "epoch": 1.6477632942122435, + "grad_norm": 0.10319668054580688, + "learning_rate": 0.00097530492340404, + "loss": 2.2559, + "step": 426250 + }, + { + "epoch": 1.6478019514156268, + "grad_norm": 0.10906700044870377, + "learning_rate": 0.0009751097619744836, + "loss": 2.2499, + "step": 426260 + }, + { + "epoch": 1.64784060861901, + "grad_norm": 0.10607217997312546, + "learning_rate": 0.00097491463770084, + "loss": 2.2514, + "step": 426270 + }, + { + "epoch": 1.6478792658223933, + "grad_norm": 0.100669264793396, + "learning_rate": 0.0009747195505618964, + "loss": 2.2576, + "step": 426280 + }, + { + "epoch": 1.6479179230257768, + "grad_norm": 0.10994735360145569, + "learning_rate": 0.0009745245005364585, + "loss": 2.2471, + "step": 426290 + }, + { + "epoch": 1.64795658022916, + "grad_norm": 0.10938809812068939, + "learning_rate": 0.0009743294876033532, + "loss": 2.2393, + "step": 426300 + }, + { + "epoch": 1.6479952374325433, + "grad_norm": 0.12271041423082352, + "learning_rate": 0.0009741345117414273, + "loss": 2.2595, + "step": 426310 + }, + { + "epoch": 1.6480338946359265, + "grad_norm": 0.1128772720694542, + "learning_rate": 0.0009739395729295473, + "loss": 2.2539, + "step": 426320 + }, + { + "epoch": 1.6480725518393098, + "grad_norm": 0.10963631421327591, + "learning_rate": 0.0009737446711466002, + "loss": 2.2432, + "step": 426330 + }, + { + "epoch": 1.648111209042693, + "grad_norm": 0.10848647356033325, + "learning_rate": 0.0009735498063714927, + "loss": 2.2454, + "step": 426340 + }, + { + "epoch": 1.6481498662460763, + "grad_norm": 0.10321880131959915, + "learning_rate": 0.0009733549785831522, + "loss": 2.2597, + "step": 426350 + }, + { + "epoch": 1.6481885234494595, + "grad_norm": 0.10751194506883621, + "learning_rate": 0.0009731601877605252, + "loss": 2.2644, + "step": 426360 + }, + { + "epoch": 1.6482271806528428, + "grad_norm": 0.10674343258142471, + "learning_rate": 0.0009729654338825786, + "loss": 2.2647, + "step": 426370 + }, + { + "epoch": 1.648265837856226, + "grad_norm": 0.10442473739385605, + "learning_rate": 0.0009727707169282996, + "loss": 2.274, + "step": 426380 + }, + { + "epoch": 1.6483044950596093, + "grad_norm": 0.10820572823286057, + "learning_rate": 0.0009725760368766943, + "loss": 2.2479, + "step": 426390 + }, + { + "epoch": 1.6483431522629926, + "grad_norm": 0.11544471234083176, + "learning_rate": 0.0009723813937067895, + "loss": 2.2568, + "step": 426400 + }, + { + "epoch": 1.6483818094663758, + "grad_norm": 0.1048877164721489, + "learning_rate": 0.0009721867873976322, + "loss": 2.2424, + "step": 426410 + }, + { + "epoch": 1.6484204666697593, + "grad_norm": 0.10477080196142197, + "learning_rate": 0.0009719922179282883, + "loss": 2.2582, + "step": 426420 + }, + { + "epoch": 1.6484591238731425, + "grad_norm": 0.12967807054519653, + "learning_rate": 0.0009717976852778438, + "loss": 2.2631, + "step": 426430 + }, + { + "epoch": 1.6484977810765258, + "grad_norm": 0.10460349172353745, + "learning_rate": 0.0009716031894254048, + "loss": 2.2536, + "step": 426440 + }, + { + "epoch": 1.648536438279909, + "grad_norm": 0.10953984409570694, + "learning_rate": 0.0009714087303500967, + "loss": 2.2624, + "step": 426450 + }, + { + "epoch": 1.6485750954832925, + "grad_norm": 0.10295036435127258, + "learning_rate": 0.0009712143080310651, + "loss": 2.2548, + "step": 426460 + }, + { + "epoch": 1.6486137526866758, + "grad_norm": 0.10450485348701477, + "learning_rate": 0.0009710199224474751, + "loss": 2.2464, + "step": 426470 + }, + { + "epoch": 1.648652409890059, + "grad_norm": 0.11599656194448471, + "learning_rate": 0.0009708255735785114, + "loss": 2.258, + "step": 426480 + }, + { + "epoch": 1.6486910670934423, + "grad_norm": 0.10033763945102692, + "learning_rate": 0.0009706312614033783, + "loss": 2.2594, + "step": 426490 + }, + { + "epoch": 1.6487297242968255, + "grad_norm": 0.10577484220266342, + "learning_rate": 0.0009704369859013, + "loss": 2.2442, + "step": 426500 + }, + { + "epoch": 1.6487683815002088, + "grad_norm": 0.11341305822134018, + "learning_rate": 0.0009702427470515198, + "loss": 2.2558, + "step": 426510 + }, + { + "epoch": 1.648807038703592, + "grad_norm": 0.1167931854724884, + "learning_rate": 0.0009700485448333014, + "loss": 2.2407, + "step": 426520 + }, + { + "epoch": 1.6488456959069753, + "grad_norm": 0.1225115954875946, + "learning_rate": 0.0009698543792259272, + "loss": 2.2495, + "step": 426530 + }, + { + "epoch": 1.6488843531103585, + "grad_norm": 0.10878846794366837, + "learning_rate": 0.0009696602502086993, + "loss": 2.2589, + "step": 426540 + }, + { + "epoch": 1.6489230103137418, + "grad_norm": 0.11994361132383347, + "learning_rate": 0.0009694661577609398, + "loss": 2.247, + "step": 426550 + }, + { + "epoch": 1.648961667517125, + "grad_norm": 0.11334957182407379, + "learning_rate": 0.0009692721018619898, + "loss": 2.243, + "step": 426560 + }, + { + "epoch": 1.6490003247205083, + "grad_norm": 0.13322052359580994, + "learning_rate": 0.0009690780824912102, + "loss": 2.266, + "step": 426570 + }, + { + "epoch": 1.6490389819238915, + "grad_norm": 0.10879038274288177, + "learning_rate": 0.0009688840996279808, + "loss": 2.242, + "step": 426580 + }, + { + "epoch": 1.649077639127275, + "grad_norm": 0.11077425628900528, + "learning_rate": 0.000968690153251701, + "loss": 2.2625, + "step": 426590 + }, + { + "epoch": 1.6491162963306583, + "grad_norm": 0.10870052129030228, + "learning_rate": 0.0009684962433417899, + "loss": 2.2531, + "step": 426600 + }, + { + "epoch": 1.6491549535340415, + "grad_norm": 0.11626043170690536, + "learning_rate": 0.0009683023698776856, + "loss": 2.2521, + "step": 426610 + }, + { + "epoch": 1.6491936107374248, + "grad_norm": 0.12165232002735138, + "learning_rate": 0.0009681085328388455, + "loss": 2.2427, + "step": 426620 + }, + { + "epoch": 1.6492322679408082, + "grad_norm": 0.11001374572515488, + "learning_rate": 0.0009679147322047465, + "loss": 2.2666, + "step": 426630 + }, + { + "epoch": 1.6492709251441915, + "grad_norm": 0.10634226351976395, + "learning_rate": 0.0009677209679548848, + "loss": 2.2462, + "step": 426640 + }, + { + "epoch": 1.6493095823475747, + "grad_norm": 0.11076271533966064, + "learning_rate": 0.0009675272400687755, + "loss": 2.263, + "step": 426650 + }, + { + "epoch": 1.649348239550958, + "grad_norm": 0.11283444613218307, + "learning_rate": 0.0009673335485259531, + "loss": 2.2408, + "step": 426660 + }, + { + "epoch": 1.6493868967543412, + "grad_norm": 0.1021152213215828, + "learning_rate": 0.0009671398933059714, + "loss": 2.2574, + "step": 426670 + }, + { + "epoch": 1.6494255539577245, + "grad_norm": 0.10668480396270752, + "learning_rate": 0.0009669462743884035, + "loss": 2.2569, + "step": 426680 + }, + { + "epoch": 1.6494642111611078, + "grad_norm": 0.12497959285974503, + "learning_rate": 0.0009667526917528409, + "loss": 2.2608, + "step": 426690 + }, + { + "epoch": 1.649502868364491, + "grad_norm": 0.11128103733062744, + "learning_rate": 0.0009665591453788949, + "loss": 2.2698, + "step": 426700 + }, + { + "epoch": 1.6495415255678743, + "grad_norm": 0.11678318679332733, + "learning_rate": 0.000966365635246196, + "loss": 2.2519, + "step": 426710 + }, + { + "epoch": 1.6495801827712575, + "grad_norm": 0.10596548020839691, + "learning_rate": 0.0009661721613343933, + "loss": 2.2611, + "step": 426720 + }, + { + "epoch": 1.6496188399746408, + "grad_norm": 0.10452347993850708, + "learning_rate": 0.0009659787236231549, + "loss": 2.2548, + "step": 426730 + }, + { + "epoch": 1.649657497178024, + "grad_norm": 0.10891494154930115, + "learning_rate": 0.0009657853220921684, + "loss": 2.2609, + "step": 426740 + }, + { + "epoch": 1.6496961543814075, + "grad_norm": 0.11339747905731201, + "learning_rate": 0.0009655919567211399, + "loss": 2.2433, + "step": 426750 + }, + { + "epoch": 1.6497348115847907, + "grad_norm": 0.10721655189990997, + "learning_rate": 0.0009653986274897951, + "loss": 2.2454, + "step": 426760 + }, + { + "epoch": 1.649773468788174, + "grad_norm": 0.10534560680389404, + "learning_rate": 0.0009652053343778777, + "loss": 2.2601, + "step": 426770 + }, + { + "epoch": 1.6498121259915572, + "grad_norm": 0.10680735856294632, + "learning_rate": 0.0009650120773651512, + "loss": 2.2511, + "step": 426780 + }, + { + "epoch": 1.6498507831949405, + "grad_norm": 0.11978033930063248, + "learning_rate": 0.0009648188564313974, + "loss": 2.2635, + "step": 426790 + }, + { + "epoch": 1.649889440398324, + "grad_norm": 0.12228116393089294, + "learning_rate": 0.0009646256715564172, + "loss": 2.2419, + "step": 426800 + }, + { + "epoch": 1.6499280976017072, + "grad_norm": 0.10044317692518234, + "learning_rate": 0.0009644325227200305, + "loss": 2.2565, + "step": 426810 + }, + { + "epoch": 1.6499667548050905, + "grad_norm": 0.10718821734189987, + "learning_rate": 0.0009642394099020759, + "loss": 2.2409, + "step": 426820 + }, + { + "epoch": 1.6500054120084737, + "grad_norm": 0.11532682180404663, + "learning_rate": 0.0009640463330824105, + "loss": 2.2531, + "step": 426830 + }, + { + "epoch": 1.650044069211857, + "grad_norm": 0.11697693169116974, + "learning_rate": 0.0009638532922409105, + "loss": 2.2531, + "step": 426840 + }, + { + "epoch": 1.6500827264152402, + "grad_norm": 0.10346714407205582, + "learning_rate": 0.0009636602873574707, + "loss": 2.2426, + "step": 426850 + }, + { + "epoch": 1.6501213836186235, + "grad_norm": 0.09724732488393784, + "learning_rate": 0.0009634673184120048, + "loss": 2.2553, + "step": 426860 + }, + { + "epoch": 1.6501600408220067, + "grad_norm": 0.10752703994512558, + "learning_rate": 0.0009632743853844452, + "loss": 2.2566, + "step": 426870 + }, + { + "epoch": 1.65019869802539, + "grad_norm": 0.10641635209321976, + "learning_rate": 0.0009630814882547425, + "loss": 2.2422, + "step": 426880 + }, + { + "epoch": 1.6502373552287732, + "grad_norm": 0.11156083643436432, + "learning_rate": 0.0009628886270028663, + "loss": 2.2586, + "step": 426890 + }, + { + "epoch": 1.6502760124321565, + "grad_norm": 0.12447494268417358, + "learning_rate": 0.0009626958016088048, + "loss": 2.2554, + "step": 426900 + }, + { + "epoch": 1.6503146696355397, + "grad_norm": 0.10488735139369965, + "learning_rate": 0.0009625030120525651, + "loss": 2.2754, + "step": 426910 + }, + { + "epoch": 1.6503533268389232, + "grad_norm": 0.10971377044916153, + "learning_rate": 0.0009623102583141723, + "loss": 2.2477, + "step": 426920 + }, + { + "epoch": 1.6503919840423065, + "grad_norm": 0.1068492978811264, + "learning_rate": 0.0009621175403736704, + "loss": 2.2591, + "step": 426930 + }, + { + "epoch": 1.6504306412456897, + "grad_norm": 0.1091977134346962, + "learning_rate": 0.0009619248582111217, + "loss": 2.2522, + "step": 426940 + }, + { + "epoch": 1.650469298449073, + "grad_norm": 0.10923954099416733, + "learning_rate": 0.0009617322118066072, + "loss": 2.2399, + "step": 426950 + }, + { + "epoch": 1.6505079556524562, + "grad_norm": 0.10942035168409348, + "learning_rate": 0.0009615396011402264, + "loss": 2.2533, + "step": 426960 + }, + { + "epoch": 1.6505466128558397, + "grad_norm": 0.10097722709178925, + "learning_rate": 0.0009613470261920971, + "loss": 2.253, + "step": 426970 + }, + { + "epoch": 1.650585270059223, + "grad_norm": 0.10516929626464844, + "learning_rate": 0.0009611544869423559, + "loss": 2.2447, + "step": 426980 + }, + { + "epoch": 1.6506239272626062, + "grad_norm": 0.11061711609363556, + "learning_rate": 0.0009609619833711569, + "loss": 2.2492, + "step": 426990 + }, + { + "epoch": 1.6506625844659895, + "grad_norm": 0.11997164040803909, + "learning_rate": 0.0009607695154586736, + "loss": 2.2495, + "step": 427000 + }, + { + "epoch": 1.6507012416693727, + "grad_norm": 0.11050672829151154, + "learning_rate": 0.0009605770831850973, + "loss": 2.2576, + "step": 427010 + }, + { + "epoch": 1.650739898872756, + "grad_norm": 0.10786289721727371, + "learning_rate": 0.000960384686530638, + "loss": 2.2417, + "step": 427020 + }, + { + "epoch": 1.6507785560761392, + "grad_norm": 0.11886247992515564, + "learning_rate": 0.0009601923254755234, + "loss": 2.2575, + "step": 427030 + }, + { + "epoch": 1.6508172132795225, + "grad_norm": 1.0953397750854492, + "learning_rate": 0.00096, + "loss": 2.2613, + "step": 427040 + }, + { + "epoch": 1.6508558704829057, + "grad_norm": 0.12538954615592957, + "learning_rate": 0.0009598077100843325, + "loss": 2.2515, + "step": 427050 + }, + { + "epoch": 1.650894527686289, + "grad_norm": 0.10170107334852219, + "learning_rate": 0.0009596154557088037, + "loss": 2.254, + "step": 427060 + }, + { + "epoch": 1.6509331848896722, + "grad_norm": 0.10495311766862869, + "learning_rate": 0.0009594232368537147, + "loss": 2.258, + "step": 427070 + }, + { + "epoch": 1.6509718420930555, + "grad_norm": 0.10858013480901718, + "learning_rate": 0.0009592310534993851, + "loss": 2.2545, + "step": 427080 + }, + { + "epoch": 1.651010499296439, + "grad_norm": 0.09755268692970276, + "learning_rate": 0.0009590389056261517, + "loss": 2.2653, + "step": 427090 + }, + { + "epoch": 1.6510491564998222, + "grad_norm": 0.1101205050945282, + "learning_rate": 0.0009588467932143703, + "loss": 2.2551, + "step": 427100 + }, + { + "epoch": 1.6510878137032055, + "grad_norm": 0.11821103096008301, + "learning_rate": 0.000958654716244415, + "loss": 2.2626, + "step": 427110 + }, + { + "epoch": 1.6511264709065887, + "grad_norm": 0.10163707286119461, + "learning_rate": 0.0009584626746966771, + "loss": 2.2663, + "step": 427120 + }, + { + "epoch": 1.651165128109972, + "grad_norm": 0.10656595975160599, + "learning_rate": 0.0009582706685515668, + "loss": 2.2503, + "step": 427130 + }, + { + "epoch": 1.6512037853133554, + "grad_norm": 0.10545650869607925, + "learning_rate": 0.0009580786977895115, + "loss": 2.2588, + "step": 427140 + }, + { + "epoch": 1.6512424425167387, + "grad_norm": 0.11352559924125671, + "learning_rate": 0.0009578867623909577, + "loss": 2.2588, + "step": 427150 + }, + { + "epoch": 1.651281099720122, + "grad_norm": 0.10905435681343079, + "learning_rate": 0.0009576948623363692, + "loss": 2.2396, + "step": 427160 + }, + { + "epoch": 1.6513197569235052, + "grad_norm": 0.11660381406545639, + "learning_rate": 0.0009575029976062281, + "loss": 2.261, + "step": 427170 + }, + { + "epoch": 1.6513584141268884, + "grad_norm": 0.10470879822969437, + "learning_rate": 0.0009573111681810341, + "loss": 2.2581, + "step": 427180 + }, + { + "epoch": 1.6513970713302717, + "grad_norm": 0.10995166003704071, + "learning_rate": 0.0009571193740413049, + "loss": 2.275, + "step": 427190 + }, + { + "epoch": 1.651435728533655, + "grad_norm": 0.10868128389120102, + "learning_rate": 0.000956927615167576, + "loss": 2.2598, + "step": 427200 + }, + { + "epoch": 1.6514743857370382, + "grad_norm": 0.12169184535741806, + "learning_rate": 0.0009567358915404018, + "loss": 2.2511, + "step": 427210 + }, + { + "epoch": 1.6515130429404214, + "grad_norm": 0.10812258720397949, + "learning_rate": 0.0009565442031403535, + "loss": 2.251, + "step": 427220 + }, + { + "epoch": 1.6515517001438047, + "grad_norm": 0.13253743946552277, + "learning_rate": 0.0009563525499480201, + "loss": 2.2602, + "step": 427230 + }, + { + "epoch": 1.651590357347188, + "grad_norm": 0.10694847255945206, + "learning_rate": 0.000956160931944009, + "loss": 2.2517, + "step": 427240 + }, + { + "epoch": 1.6516290145505712, + "grad_norm": 0.1060081422328949, + "learning_rate": 0.000955969349108945, + "loss": 2.2521, + "step": 427250 + }, + { + "epoch": 1.6516676717539547, + "grad_norm": 0.12398270517587662, + "learning_rate": 0.0009557778014234711, + "loss": 2.2524, + "step": 427260 + }, + { + "epoch": 1.651706328957338, + "grad_norm": 0.11210572719573975, + "learning_rate": 0.0009555862888682474, + "loss": 2.2627, + "step": 427270 + }, + { + "epoch": 1.6517449861607212, + "grad_norm": 0.10051599144935608, + "learning_rate": 0.0009553948114239524, + "loss": 2.2607, + "step": 427280 + }, + { + "epoch": 1.6517836433641044, + "grad_norm": 0.09925200045108795, + "learning_rate": 0.0009552033690712821, + "loss": 2.2416, + "step": 427290 + }, + { + "epoch": 1.651822300567488, + "grad_norm": 0.10382227599620819, + "learning_rate": 0.0009550119617909494, + "loss": 2.2518, + "step": 427300 + }, + { + "epoch": 1.6518609577708712, + "grad_norm": 0.11811905354261398, + "learning_rate": 0.0009548205895636864, + "loss": 2.2539, + "step": 427310 + }, + { + "epoch": 1.6518996149742544, + "grad_norm": 0.10599834471940994, + "learning_rate": 0.0009546292523702416, + "loss": 2.2412, + "step": 427320 + }, + { + "epoch": 1.6519382721776377, + "grad_norm": 0.09794063121080399, + "learning_rate": 0.0009544379501913816, + "loss": 2.2539, + "step": 427330 + }, + { + "epoch": 1.651976929381021, + "grad_norm": 0.10884908586740494, + "learning_rate": 0.0009542466830078904, + "loss": 2.2581, + "step": 427340 + }, + { + "epoch": 1.6520155865844042, + "grad_norm": 0.10881958901882172, + "learning_rate": 0.0009540554508005694, + "loss": 2.2551, + "step": 427350 + }, + { + "epoch": 1.6520542437877874, + "grad_norm": 0.10935207456350327, + "learning_rate": 0.0009538642535502384, + "loss": 2.2541, + "step": 427360 + }, + { + "epoch": 1.6520929009911707, + "grad_norm": 0.12142116576433182, + "learning_rate": 0.0009536730912377336, + "loss": 2.2406, + "step": 427370 + }, + { + "epoch": 1.652131558194554, + "grad_norm": 0.10374008864164352, + "learning_rate": 0.0009534819638439096, + "loss": 2.2327, + "step": 427380 + }, + { + "epoch": 1.6521702153979372, + "grad_norm": 0.1134655550122261, + "learning_rate": 0.000953290871349638, + "loss": 2.2278, + "step": 427390 + }, + { + "epoch": 1.6522088726013204, + "grad_norm": 0.11115321516990662, + "learning_rate": 0.000953099813735808, + "loss": 2.2584, + "step": 427400 + }, + { + "epoch": 1.6522475298047037, + "grad_norm": 0.10089276731014252, + "learning_rate": 0.0009529087909833261, + "loss": 2.2527, + "step": 427410 + }, + { + "epoch": 1.652286187008087, + "grad_norm": 0.10366006195545197, + "learning_rate": 0.0009527178030731163, + "loss": 2.2522, + "step": 427420 + }, + { + "epoch": 1.6523248442114704, + "grad_norm": 0.10919071733951569, + "learning_rate": 0.0009525268499861202, + "loss": 2.2589, + "step": 427430 + }, + { + "epoch": 1.6523635014148537, + "grad_norm": 0.10370808839797974, + "learning_rate": 0.0009523359317032964, + "loss": 2.246, + "step": 427440 + }, + { + "epoch": 1.652402158618237, + "grad_norm": 0.09934686124324799, + "learning_rate": 0.0009521450482056211, + "loss": 2.2522, + "step": 427450 + }, + { + "epoch": 1.6524408158216202, + "grad_norm": 0.11634613573551178, + "learning_rate": 0.0009519541994740879, + "loss": 2.2676, + "step": 427460 + }, + { + "epoch": 1.6524794730250036, + "grad_norm": 0.0998673290014267, + "learning_rate": 0.0009517633854897072, + "loss": 2.2456, + "step": 427470 + }, + { + "epoch": 1.652518130228387, + "grad_norm": 0.11801506578922272, + "learning_rate": 0.0009515726062335075, + "loss": 2.2584, + "step": 427480 + }, + { + "epoch": 1.6525567874317701, + "grad_norm": 0.12109418958425522, + "learning_rate": 0.0009513818616865337, + "loss": 2.2496, + "step": 427490 + }, + { + "epoch": 1.6525954446351534, + "grad_norm": 0.10361652821302414, + "learning_rate": 0.0009511911518298484, + "loss": 2.2555, + "step": 427500 + }, + { + "epoch": 1.6526341018385367, + "grad_norm": 0.0983557403087616, + "learning_rate": 0.0009510004766445315, + "loss": 2.2535, + "step": 427510 + }, + { + "epoch": 1.65267275904192, + "grad_norm": 0.12221435457468033, + "learning_rate": 0.0009508098361116799, + "loss": 2.2423, + "step": 427520 + }, + { + "epoch": 1.6527114162453032, + "grad_norm": 0.10871323198080063, + "learning_rate": 0.0009506192302124077, + "loss": 2.252, + "step": 427530 + }, + { + "epoch": 1.6527500734486864, + "grad_norm": 0.09926702082157135, + "learning_rate": 0.0009504286589278459, + "loss": 2.2522, + "step": 427540 + }, + { + "epoch": 1.6527887306520697, + "grad_norm": 0.1006474420428276, + "learning_rate": 0.0009502381222391432, + "loss": 2.2596, + "step": 427550 + }, + { + "epoch": 1.652827387855453, + "grad_norm": 0.1122966781258583, + "learning_rate": 0.0009500476201274651, + "loss": 2.2436, + "step": 427560 + }, + { + "epoch": 1.6528660450588362, + "grad_norm": 0.10408204793930054, + "learning_rate": 0.0009498571525739938, + "loss": 2.2635, + "step": 427570 + }, + { + "epoch": 1.6529047022622194, + "grad_norm": 0.09867561608552933, + "learning_rate": 0.0009496667195599294, + "loss": 2.2467, + "step": 427580 + }, + { + "epoch": 1.6529433594656027, + "grad_norm": 0.10931237787008286, + "learning_rate": 0.0009494763210664883, + "loss": 2.2519, + "step": 427590 + }, + { + "epoch": 1.6529820166689861, + "grad_norm": 0.11653488129377365, + "learning_rate": 0.0009492859570749042, + "loss": 2.2308, + "step": 427600 + }, + { + "epoch": 1.6530206738723694, + "grad_norm": 0.10837239027023315, + "learning_rate": 0.000949095627566428, + "loss": 2.2455, + "step": 427610 + }, + { + "epoch": 1.6530593310757526, + "grad_norm": 0.10967964679002762, + "learning_rate": 0.0009489053325223269, + "loss": 2.2432, + "step": 427620 + }, + { + "epoch": 1.653097988279136, + "grad_norm": 0.10297200828790665, + "learning_rate": 0.0009487150719238862, + "loss": 2.2671, + "step": 427630 + }, + { + "epoch": 1.6531366454825194, + "grad_norm": 0.10992071777582169, + "learning_rate": 0.0009485248457524068, + "loss": 2.2518, + "step": 427640 + }, + { + "epoch": 1.6531753026859026, + "grad_norm": 0.11099092662334442, + "learning_rate": 0.0009483346539892075, + "loss": 2.2612, + "step": 427650 + }, + { + "epoch": 1.6532139598892859, + "grad_norm": 0.11154554784297943, + "learning_rate": 0.0009481444966156237, + "loss": 2.2533, + "step": 427660 + }, + { + "epoch": 1.6532526170926691, + "grad_norm": 0.12147395312786102, + "learning_rate": 0.0009479543736130072, + "loss": 2.2529, + "step": 427670 + }, + { + "epoch": 1.6532912742960524, + "grad_norm": 0.11459475755691528, + "learning_rate": 0.0009477642849627277, + "loss": 2.248, + "step": 427680 + }, + { + "epoch": 1.6533299314994356, + "grad_norm": 0.1090463325381279, + "learning_rate": 0.0009475742306461705, + "loss": 2.2538, + "step": 427690 + }, + { + "epoch": 1.6533685887028189, + "grad_norm": 0.1133263036608696, + "learning_rate": 0.0009473842106447385, + "loss": 2.2485, + "step": 427700 + }, + { + "epoch": 1.6534072459062021, + "grad_norm": 0.10831756889820099, + "learning_rate": 0.0009471942249398515, + "loss": 2.2402, + "step": 427710 + }, + { + "epoch": 1.6534459031095854, + "grad_norm": 0.11087630689144135, + "learning_rate": 0.0009470042735129454, + "loss": 2.2558, + "step": 427720 + }, + { + "epoch": 1.6534845603129686, + "grad_norm": 0.12291962653398514, + "learning_rate": 0.0009468143563454732, + "loss": 2.2593, + "step": 427730 + }, + { + "epoch": 1.653523217516352, + "grad_norm": 0.11833938956260681, + "learning_rate": 0.0009466244734189046, + "loss": 2.2444, + "step": 427740 + }, + { + "epoch": 1.6535618747197351, + "grad_norm": 0.1083853617310524, + "learning_rate": 0.0009464346247147261, + "loss": 2.2469, + "step": 427750 + }, + { + "epoch": 1.6536005319231184, + "grad_norm": 0.10361374914646149, + "learning_rate": 0.0009462448102144407, + "loss": 2.249, + "step": 427760 + }, + { + "epoch": 1.6536391891265019, + "grad_norm": 0.10491149872541428, + "learning_rate": 0.0009460550298995683, + "loss": 2.2537, + "step": 427770 + }, + { + "epoch": 1.6536778463298851, + "grad_norm": 0.11759241670370102, + "learning_rate": 0.0009458652837516451, + "loss": 2.2442, + "step": 427780 + }, + { + "epoch": 1.6537165035332684, + "grad_norm": 0.13166950643062592, + "learning_rate": 0.0009456755717522239, + "loss": 2.2472, + "step": 427790 + }, + { + "epoch": 1.6537551607366516, + "grad_norm": 0.1135098785161972, + "learning_rate": 0.0009454858938828745, + "loss": 2.2581, + "step": 427800 + }, + { + "epoch": 1.653793817940035, + "grad_norm": 0.10833874344825745, + "learning_rate": 0.0009452962501251831, + "loss": 2.2566, + "step": 427810 + }, + { + "epoch": 1.6538324751434184, + "grad_norm": 0.10796923190355301, + "learning_rate": 0.0009451066404607525, + "loss": 2.2464, + "step": 427820 + }, + { + "epoch": 1.6538711323468016, + "grad_norm": 0.11246345937252045, + "learning_rate": 0.0009449170648712018, + "loss": 2.2557, + "step": 427830 + }, + { + "epoch": 1.6539097895501849, + "grad_norm": 0.11669411510229111, + "learning_rate": 0.0009447275233381664, + "loss": 2.2483, + "step": 427840 + }, + { + "epoch": 1.6539484467535681, + "grad_norm": 0.10845163464546204, + "learning_rate": 0.000944538015843299, + "loss": 2.2599, + "step": 427850 + }, + { + "epoch": 1.6539871039569514, + "grad_norm": 0.11250095069408417, + "learning_rate": 0.0009443485423682683, + "loss": 2.2588, + "step": 427860 + }, + { + "epoch": 1.6540257611603346, + "grad_norm": 0.10398449748754501, + "learning_rate": 0.0009441591028947592, + "loss": 2.2561, + "step": 427870 + }, + { + "epoch": 1.6540644183637179, + "grad_norm": 0.10691075772047043, + "learning_rate": 0.0009439696974044732, + "loss": 2.2514, + "step": 427880 + }, + { + "epoch": 1.6541030755671011, + "grad_norm": 0.1094600111246109, + "learning_rate": 0.0009437803258791287, + "loss": 2.2392, + "step": 427890 + }, + { + "epoch": 1.6541417327704844, + "grad_norm": 0.1066112220287323, + "learning_rate": 0.0009435909883004594, + "loss": 2.2663, + "step": 427900 + }, + { + "epoch": 1.6541803899738676, + "grad_norm": 0.11026117205619812, + "learning_rate": 0.0009434016846502167, + "loss": 2.2482, + "step": 427910 + }, + { + "epoch": 1.6542190471772509, + "grad_norm": 0.12330233305692673, + "learning_rate": 0.0009432124149101676, + "loss": 2.2624, + "step": 427920 + }, + { + "epoch": 1.6542577043806341, + "grad_norm": 0.10916101187467575, + "learning_rate": 0.0009430231790620951, + "loss": 2.2532, + "step": 427930 + }, + { + "epoch": 1.6542963615840176, + "grad_norm": 0.11618591845035553, + "learning_rate": 0.0009428339770877991, + "loss": 2.2408, + "step": 427940 + }, + { + "epoch": 1.6543350187874009, + "grad_norm": 0.10629193484783173, + "learning_rate": 0.0009426448089690958, + "loss": 2.256, + "step": 427950 + }, + { + "epoch": 1.654373675990784, + "grad_norm": 0.11059808731079102, + "learning_rate": 0.0009424556746878173, + "loss": 2.264, + "step": 427960 + }, + { + "epoch": 1.6544123331941674, + "grad_norm": 0.11043719202280045, + "learning_rate": 0.0009422665742258118, + "loss": 2.2513, + "step": 427970 + }, + { + "epoch": 1.6544509903975508, + "grad_norm": 0.1055753082036972, + "learning_rate": 0.0009420775075649445, + "loss": 2.2552, + "step": 427980 + }, + { + "epoch": 1.654489647600934, + "grad_norm": 0.1074535995721817, + "learning_rate": 0.0009418884746870962, + "loss": 2.2654, + "step": 427990 + }, + { + "epoch": 1.6545283048043173, + "grad_norm": 0.09793099015951157, + "learning_rate": 0.0009416994755741637, + "loss": 2.2577, + "step": 428000 + }, + { + "epoch": 1.6545669620077006, + "grad_norm": 0.09761539101600647, + "learning_rate": 0.0009415105102080608, + "loss": 2.2593, + "step": 428010 + }, + { + "epoch": 1.6546056192110838, + "grad_norm": 0.1087460145354271, + "learning_rate": 0.0009413215785707164, + "loss": 2.2625, + "step": 428020 + }, + { + "epoch": 1.654644276414467, + "grad_norm": 0.11112965643405914, + "learning_rate": 0.0009411326806440761, + "loss": 2.2731, + "step": 428030 + }, + { + "epoch": 1.6546829336178503, + "grad_norm": 0.10286249965429306, + "learning_rate": 0.0009409438164101018, + "loss": 2.2389, + "step": 428040 + }, + { + "epoch": 1.6547215908212336, + "grad_norm": 0.1073416993021965, + "learning_rate": 0.0009407549858507712, + "loss": 2.2416, + "step": 428050 + }, + { + "epoch": 1.6547602480246169, + "grad_norm": 0.09992457181215286, + "learning_rate": 0.0009405661889480779, + "loss": 2.2534, + "step": 428060 + }, + { + "epoch": 1.654798905228, + "grad_norm": 0.113508440554142, + "learning_rate": 0.0009403774256840316, + "loss": 2.2542, + "step": 428070 + }, + { + "epoch": 1.6548375624313834, + "grad_norm": 0.1302674412727356, + "learning_rate": 0.0009401886960406585, + "loss": 2.2504, + "step": 428080 + }, + { + "epoch": 1.6548762196347666, + "grad_norm": 0.11876347661018372, + "learning_rate": 0.00094, + "loss": 2.2503, + "step": 428090 + }, + { + "epoch": 1.6549148768381499, + "grad_norm": 0.12014816701412201, + "learning_rate": 0.0009398113375441142, + "loss": 2.2484, + "step": 428100 + }, + { + "epoch": 1.6549535340415333, + "grad_norm": 0.11447982490062714, + "learning_rate": 0.000939622708655075, + "loss": 2.2527, + "step": 428110 + }, + { + "epoch": 1.6549921912449166, + "grad_norm": 0.11171301454305649, + "learning_rate": 0.0009394341133149717, + "loss": 2.2517, + "step": 428120 + }, + { + "epoch": 1.6550308484482998, + "grad_norm": 0.11847539991140366, + "learning_rate": 0.0009392455515059106, + "loss": 2.2573, + "step": 428130 + }, + { + "epoch": 1.655069505651683, + "grad_norm": 0.12345657497644424, + "learning_rate": 0.0009390570232100127, + "loss": 2.2517, + "step": 428140 + }, + { + "epoch": 1.6551081628550666, + "grad_norm": 0.23822791874408722, + "learning_rate": 0.0009388685284094154, + "loss": 2.264, + "step": 428150 + }, + { + "epoch": 1.6551468200584498, + "grad_norm": 0.12832707166671753, + "learning_rate": 0.0009386800670862721, + "loss": 2.2367, + "step": 428160 + }, + { + "epoch": 1.655185477261833, + "grad_norm": 0.12387876212596893, + "learning_rate": 0.000938491639222752, + "loss": 2.2495, + "step": 428170 + }, + { + "epoch": 1.6552241344652163, + "grad_norm": 0.11859184503555298, + "learning_rate": 0.0009383032448010403, + "loss": 2.253, + "step": 428180 + }, + { + "epoch": 1.6552627916685996, + "grad_norm": 0.1142348125576973, + "learning_rate": 0.0009381148838033374, + "loss": 2.2502, + "step": 428190 + }, + { + "epoch": 1.6553014488719828, + "grad_norm": 0.10157260298728943, + "learning_rate": 0.0009379265562118597, + "loss": 2.2477, + "step": 428200 + }, + { + "epoch": 1.655340106075366, + "grad_norm": 0.10549125075340271, + "learning_rate": 0.0009377382620088399, + "loss": 2.254, + "step": 428210 + }, + { + "epoch": 1.6553787632787493, + "grad_norm": 0.1005273088812828, + "learning_rate": 0.0009375500011765259, + "loss": 2.2603, + "step": 428220 + }, + { + "epoch": 1.6554174204821326, + "grad_norm": 0.11275097727775574, + "learning_rate": 0.0009373617736971816, + "loss": 2.2494, + "step": 428230 + }, + { + "epoch": 1.6554560776855158, + "grad_norm": 0.11196176707744598, + "learning_rate": 0.0009371735795530862, + "loss": 2.2549, + "step": 428240 + }, + { + "epoch": 1.655494734888899, + "grad_norm": 0.11383403837680817, + "learning_rate": 0.0009369854187265352, + "loss": 2.2607, + "step": 428250 + }, + { + "epoch": 1.6555333920922823, + "grad_norm": 0.1119704395532608, + "learning_rate": 0.000936797291199839, + "loss": 2.2554, + "step": 428260 + }, + { + "epoch": 1.6555720492956656, + "grad_norm": 0.1140163317322731, + "learning_rate": 0.0009366091969553246, + "loss": 2.2582, + "step": 428270 + }, + { + "epoch": 1.655610706499049, + "grad_norm": 0.1339382827281952, + "learning_rate": 0.0009364211359753336, + "loss": 2.249, + "step": 428280 + }, + { + "epoch": 1.6556493637024323, + "grad_norm": 0.11727656424045563, + "learning_rate": 0.000936233108242224, + "loss": 2.2539, + "step": 428290 + }, + { + "epoch": 1.6556880209058156, + "grad_norm": 0.1095530167222023, + "learning_rate": 0.0009360451137383692, + "loss": 2.2535, + "step": 428300 + }, + { + "epoch": 1.6557266781091988, + "grad_norm": 0.11644624918699265, + "learning_rate": 0.0009358571524461577, + "loss": 2.2363, + "step": 428310 + }, + { + "epoch": 1.6557653353125823, + "grad_norm": 0.11836208403110504, + "learning_rate": 0.0009356692243479943, + "loss": 2.2377, + "step": 428320 + }, + { + "epoch": 1.6558039925159656, + "grad_norm": 0.11358748376369476, + "learning_rate": 0.0009354813294262989, + "loss": 2.2457, + "step": 428330 + }, + { + "epoch": 1.6558426497193488, + "grad_norm": 0.10452094674110413, + "learning_rate": 0.0009352934676635068, + "loss": 2.2444, + "step": 428340 + }, + { + "epoch": 1.655881306922732, + "grad_norm": 0.12100542336702347, + "learning_rate": 0.0009351056390420691, + "loss": 2.2365, + "step": 428350 + }, + { + "epoch": 1.6559199641261153, + "grad_norm": 0.12400267273187637, + "learning_rate": 0.0009349178435444521, + "loss": 2.265, + "step": 428360 + }, + { + "epoch": 1.6559586213294986, + "grad_norm": 0.10073690861463547, + "learning_rate": 0.0009347300811531381, + "loss": 2.2545, + "step": 428370 + }, + { + "epoch": 1.6559972785328818, + "grad_norm": 0.11887051910161972, + "learning_rate": 0.0009345423518506237, + "loss": 2.2383, + "step": 428380 + }, + { + "epoch": 1.656035935736265, + "grad_norm": 0.10939810425043106, + "learning_rate": 0.0009343546556194224, + "loss": 2.2501, + "step": 428390 + }, + { + "epoch": 1.6560745929396483, + "grad_norm": 0.11234070360660553, + "learning_rate": 0.000934166992442062, + "loss": 2.267, + "step": 428400 + }, + { + "epoch": 1.6561132501430316, + "grad_norm": 0.10380993783473969, + "learning_rate": 0.0009339793623010858, + "loss": 2.2573, + "step": 428410 + }, + { + "epoch": 1.6561519073464148, + "grad_norm": 0.11642767488956451, + "learning_rate": 0.0009337917651790528, + "loss": 2.2488, + "step": 428420 + }, + { + "epoch": 1.656190564549798, + "grad_norm": 0.1705406755208969, + "learning_rate": 0.0009336042010585375, + "loss": 2.262, + "step": 428430 + }, + { + "epoch": 1.6562292217531813, + "grad_norm": 0.10769027471542358, + "learning_rate": 0.0009334166699221293, + "loss": 2.2575, + "step": 428440 + }, + { + "epoch": 1.6562678789565648, + "grad_norm": 0.12309370189905167, + "learning_rate": 0.0009332291717524331, + "loss": 2.2636, + "step": 428450 + }, + { + "epoch": 1.656306536159948, + "grad_norm": 0.1017761379480362, + "learning_rate": 0.0009330417065320689, + "loss": 2.2526, + "step": 428460 + }, + { + "epoch": 1.6563451933633313, + "grad_norm": 0.10486218333244324, + "learning_rate": 0.000932854274243672, + "loss": 2.2651, + "step": 428470 + }, + { + "epoch": 1.6563838505667146, + "grad_norm": 0.10576413571834564, + "learning_rate": 0.0009326668748698934, + "loss": 2.2361, + "step": 428480 + }, + { + "epoch": 1.656422507770098, + "grad_norm": 0.11108098179101944, + "learning_rate": 0.0009324795083933986, + "loss": 2.2508, + "step": 428490 + }, + { + "epoch": 1.6564611649734813, + "grad_norm": 0.11978644132614136, + "learning_rate": 0.0009322921747968689, + "loss": 2.2496, + "step": 428500 + }, + { + "epoch": 1.6564998221768645, + "grad_norm": 0.1151486188173294, + "learning_rate": 0.0009321048740630005, + "loss": 2.2483, + "step": 428510 + }, + { + "epoch": 1.6565384793802478, + "grad_norm": 0.11211903393268585, + "learning_rate": 0.0009319176061745049, + "loss": 2.2325, + "step": 428520 + }, + { + "epoch": 1.656577136583631, + "grad_norm": 0.10804494470357895, + "learning_rate": 0.0009317303711141087, + "loss": 2.2513, + "step": 428530 + }, + { + "epoch": 1.6566157937870143, + "grad_norm": 0.10815191268920898, + "learning_rate": 0.0009315431688645535, + "loss": 2.2588, + "step": 428540 + }, + { + "epoch": 1.6566544509903975, + "grad_norm": 0.10301661491394043, + "learning_rate": 0.0009313559994085963, + "loss": 2.2488, + "step": 428550 + }, + { + "epoch": 1.6566931081937808, + "grad_norm": 0.1148388609290123, + "learning_rate": 0.0009311688627290089, + "loss": 2.2533, + "step": 428560 + }, + { + "epoch": 1.656731765397164, + "grad_norm": 0.11594461649656296, + "learning_rate": 0.0009309817588085787, + "loss": 2.244, + "step": 428570 + }, + { + "epoch": 1.6567704226005473, + "grad_norm": 0.11438317596912384, + "learning_rate": 0.0009307946876301072, + "loss": 2.2396, + "step": 428580 + }, + { + "epoch": 1.6568090798039306, + "grad_norm": 0.10656829923391342, + "learning_rate": 0.0009306076491764119, + "loss": 2.2418, + "step": 428590 + }, + { + "epoch": 1.6568477370073138, + "grad_norm": 0.10780458152294159, + "learning_rate": 0.000930420643430325, + "loss": 2.249, + "step": 428600 + }, + { + "epoch": 1.656886394210697, + "grad_norm": 0.10777982324361801, + "learning_rate": 0.0009302336703746934, + "loss": 2.2533, + "step": 428610 + }, + { + "epoch": 1.6569250514140805, + "grad_norm": 0.1133468970656395, + "learning_rate": 0.0009300467299923796, + "loss": 2.2359, + "step": 428620 + }, + { + "epoch": 1.6569637086174638, + "grad_norm": 0.1100376546382904, + "learning_rate": 0.0009298598222662604, + "loss": 2.2605, + "step": 428630 + }, + { + "epoch": 1.657002365820847, + "grad_norm": 0.10588014870882034, + "learning_rate": 0.0009296729471792278, + "loss": 2.2534, + "step": 428640 + }, + { + "epoch": 1.6570410230242303, + "grad_norm": 0.12535402178764343, + "learning_rate": 0.0009294861047141891, + "loss": 2.2533, + "step": 428650 + }, + { + "epoch": 1.6570796802276138, + "grad_norm": 0.1060071587562561, + "learning_rate": 0.0009292992948540662, + "loss": 2.2553, + "step": 428660 + }, + { + "epoch": 1.657118337430997, + "grad_norm": 0.11963797360658646, + "learning_rate": 0.0009291125175817956, + "loss": 2.248, + "step": 428670 + }, + { + "epoch": 1.6571569946343803, + "grad_norm": 0.11207103729248047, + "learning_rate": 0.0009289257728803293, + "loss": 2.2429, + "step": 428680 + }, + { + "epoch": 1.6571956518377635, + "grad_norm": 0.11880432069301605, + "learning_rate": 0.0009287390607326336, + "loss": 2.2482, + "step": 428690 + }, + { + "epoch": 1.6572343090411468, + "grad_norm": 0.11663631349802017, + "learning_rate": 0.0009285523811216902, + "loss": 2.251, + "step": 428700 + }, + { + "epoch": 1.65727296624453, + "grad_norm": 0.10760597884654999, + "learning_rate": 0.0009283657340304949, + "loss": 2.245, + "step": 428710 + }, + { + "epoch": 1.6573116234479133, + "grad_norm": 0.10502638667821884, + "learning_rate": 0.000928179119442059, + "loss": 2.2628, + "step": 428720 + }, + { + "epoch": 1.6573502806512965, + "grad_norm": 0.12468273192644119, + "learning_rate": 0.0009279925373394082, + "loss": 2.2573, + "step": 428730 + }, + { + "epoch": 1.6573889378546798, + "grad_norm": 0.1035030260682106, + "learning_rate": 0.0009278059877055833, + "loss": 2.2489, + "step": 428740 + }, + { + "epoch": 1.657427595058063, + "grad_norm": 0.12911434471607208, + "learning_rate": 0.0009276194705236391, + "loss": 2.2567, + "step": 428750 + }, + { + "epoch": 1.6574662522614463, + "grad_norm": 0.10658340156078339, + "learning_rate": 0.0009274329857766461, + "loss": 2.254, + "step": 428760 + }, + { + "epoch": 1.6575049094648295, + "grad_norm": 0.12113990634679794, + "learning_rate": 0.0009272465334476892, + "loss": 2.2473, + "step": 428770 + }, + { + "epoch": 1.657543566668213, + "grad_norm": 0.11048426479101181, + "learning_rate": 0.0009270601135198673, + "loss": 2.2467, + "step": 428780 + }, + { + "epoch": 1.6575822238715963, + "grad_norm": 0.11958407610654831, + "learning_rate": 0.000926873725976295, + "loss": 2.245, + "step": 428790 + }, + { + "epoch": 1.6576208810749795, + "grad_norm": 0.11110121756792068, + "learning_rate": 0.000926687370800101, + "loss": 2.2709, + "step": 428800 + }, + { + "epoch": 1.6576595382783628, + "grad_norm": 0.1004750207066536, + "learning_rate": 0.0009265010479744287, + "loss": 2.2589, + "step": 428810 + }, + { + "epoch": 1.657698195481746, + "grad_norm": 0.10071204602718353, + "learning_rate": 0.0009263147574824362, + "loss": 2.2381, + "step": 428820 + }, + { + "epoch": 1.6577368526851295, + "grad_norm": 0.10761163383722305, + "learning_rate": 0.000926128499307296, + "loss": 2.2352, + "step": 428830 + }, + { + "epoch": 1.6577755098885127, + "grad_norm": 0.1146104633808136, + "learning_rate": 0.0009259422734321959, + "loss": 2.2684, + "step": 428840 + }, + { + "epoch": 1.657814167091896, + "grad_norm": 0.11707509309053421, + "learning_rate": 0.0009257560798403373, + "loss": 2.2539, + "step": 428850 + }, + { + "epoch": 1.6578528242952792, + "grad_norm": 0.11978209018707275, + "learning_rate": 0.0009255699185149366, + "loss": 2.2507, + "step": 428860 + }, + { + "epoch": 1.6578914814986625, + "grad_norm": 0.11238358169794083, + "learning_rate": 0.0009253837894392249, + "loss": 2.2411, + "step": 428870 + }, + { + "epoch": 1.6579301387020458, + "grad_norm": 0.12410169094800949, + "learning_rate": 0.0009251976925964479, + "loss": 2.2599, + "step": 428880 + }, + { + "epoch": 1.657968795905429, + "grad_norm": 0.1125243753194809, + "learning_rate": 0.000925011627969865, + "loss": 2.2505, + "step": 428890 + }, + { + "epoch": 1.6580074531088123, + "grad_norm": 0.11119139194488525, + "learning_rate": 0.0009248255955427512, + "loss": 2.2575, + "step": 428900 + }, + { + "epoch": 1.6580461103121955, + "grad_norm": 0.11171048879623413, + "learning_rate": 0.0009246395952983948, + "loss": 2.2559, + "step": 428910 + }, + { + "epoch": 1.6580847675155788, + "grad_norm": 0.10550161451101303, + "learning_rate": 0.0009244536272200999, + "loss": 2.253, + "step": 428920 + }, + { + "epoch": 1.658123424718962, + "grad_norm": 0.12499670684337616, + "learning_rate": 0.0009242676912911838, + "loss": 2.2507, + "step": 428930 + }, + { + "epoch": 1.6581620819223453, + "grad_norm": 0.1260816603899002, + "learning_rate": 0.0009240817874949787, + "loss": 2.2499, + "step": 428940 + }, + { + "epoch": 1.6582007391257287, + "grad_norm": 0.11161303520202637, + "learning_rate": 0.0009238959158148317, + "loss": 2.2466, + "step": 428950 + }, + { + "epoch": 1.658239396329112, + "grad_norm": 0.10817040503025055, + "learning_rate": 0.0009237100762341031, + "loss": 2.2437, + "step": 428960 + }, + { + "epoch": 1.6582780535324952, + "grad_norm": 0.10055804997682571, + "learning_rate": 0.000923524268736169, + "loss": 2.2603, + "step": 428970 + }, + { + "epoch": 1.6583167107358785, + "grad_norm": 0.10730566084384918, + "learning_rate": 0.0009233384933044185, + "loss": 2.2334, + "step": 428980 + }, + { + "epoch": 1.6583553679392617, + "grad_norm": 0.11016399413347244, + "learning_rate": 0.0009231527499222556, + "loss": 2.2524, + "step": 428990 + }, + { + "epoch": 1.6583940251426452, + "grad_norm": 0.1141536608338356, + "learning_rate": 0.0009229670385730993, + "loss": 2.2545, + "step": 429000 + }, + { + "epoch": 1.6584326823460285, + "grad_norm": 0.10983778536319733, + "learning_rate": 0.0009227813592403815, + "loss": 2.2329, + "step": 429010 + }, + { + "epoch": 1.6584713395494117, + "grad_norm": 0.10804615169763565, + "learning_rate": 0.0009225957119075494, + "loss": 2.2365, + "step": 429020 + }, + { + "epoch": 1.658509996752795, + "grad_norm": 0.11451501399278641, + "learning_rate": 0.0009224100965580644, + "loss": 2.2429, + "step": 429030 + }, + { + "epoch": 1.6585486539561782, + "grad_norm": 0.11562760919332504, + "learning_rate": 0.0009222245131754017, + "loss": 2.2622, + "step": 429040 + }, + { + "epoch": 1.6585873111595615, + "grad_norm": 0.11589565873146057, + "learning_rate": 0.0009220389617430509, + "loss": 2.2514, + "step": 429050 + }, + { + "epoch": 1.6586259683629447, + "grad_norm": 0.11859513819217682, + "learning_rate": 0.0009218534422445155, + "loss": 2.2567, + "step": 429060 + }, + { + "epoch": 1.658664625566328, + "grad_norm": 0.11401848495006561, + "learning_rate": 0.0009216679546633143, + "loss": 2.254, + "step": 429070 + }, + { + "epoch": 1.6587032827697112, + "grad_norm": 0.10089422017335892, + "learning_rate": 0.0009214824989829789, + "loss": 2.2438, + "step": 429080 + }, + { + "epoch": 1.6587419399730945, + "grad_norm": 0.10089923441410065, + "learning_rate": 0.000921297075187056, + "loss": 2.255, + "step": 429090 + }, + { + "epoch": 1.6587805971764777, + "grad_norm": 0.11789452284574509, + "learning_rate": 0.0009211116832591057, + "loss": 2.2382, + "step": 429100 + }, + { + "epoch": 1.658819254379861, + "grad_norm": 0.10543384402990341, + "learning_rate": 0.0009209263231827032, + "loss": 2.2512, + "step": 429110 + }, + { + "epoch": 1.6588579115832445, + "grad_norm": 0.11147383600473404, + "learning_rate": 0.0009207409949414367, + "loss": 2.2502, + "step": 429120 + }, + { + "epoch": 1.6588965687866277, + "grad_norm": 0.12240874022245407, + "learning_rate": 0.0009205556985189091, + "loss": 2.2401, + "step": 429130 + }, + { + "epoch": 1.658935225990011, + "grad_norm": 0.10780297964811325, + "learning_rate": 0.0009203704338987378, + "loss": 2.2508, + "step": 429140 + }, + { + "epoch": 1.6589738831933942, + "grad_norm": 0.10800083726644516, + "learning_rate": 0.000920185201064553, + "loss": 2.2502, + "step": 429150 + }, + { + "epoch": 1.6590125403967777, + "grad_norm": 0.11921687424182892, + "learning_rate": 0.0009199999999999999, + "loss": 2.2589, + "step": 429160 + }, + { + "epoch": 1.659051197600161, + "grad_norm": 0.11528116464614868, + "learning_rate": 0.000919814830688738, + "loss": 2.261, + "step": 429170 + }, + { + "epoch": 1.6590898548035442, + "grad_norm": 0.10715014487504959, + "learning_rate": 0.0009196296931144397, + "loss": 2.2543, + "step": 429180 + }, + { + "epoch": 1.6591285120069275, + "grad_norm": 0.15646180510520935, + "learning_rate": 0.0009194445872607921, + "loss": 2.2577, + "step": 429190 + }, + { + "epoch": 1.6591671692103107, + "grad_norm": 0.12420929223299026, + "learning_rate": 0.0009192595131114965, + "loss": 2.2586, + "step": 429200 + }, + { + "epoch": 1.659205826413694, + "grad_norm": 0.11077667772769928, + "learning_rate": 0.0009190744706502672, + "loss": 2.2467, + "step": 429210 + }, + { + "epoch": 1.6592444836170772, + "grad_norm": 0.11203381419181824, + "learning_rate": 0.0009188894598608338, + "loss": 2.2385, + "step": 429220 + }, + { + "epoch": 1.6592831408204605, + "grad_norm": 0.12216757982969284, + "learning_rate": 0.0009187044807269385, + "loss": 2.2625, + "step": 429230 + }, + { + "epoch": 1.6593217980238437, + "grad_norm": 0.5819114446640015, + "learning_rate": 0.0009185195332323382, + "loss": 2.2358, + "step": 429240 + }, + { + "epoch": 1.659360455227227, + "grad_norm": 0.12224812805652618, + "learning_rate": 0.0009183346173608034, + "loss": 2.2408, + "step": 429250 + }, + { + "epoch": 1.6593991124306102, + "grad_norm": 0.112460657954216, + "learning_rate": 0.0009181497330961184, + "loss": 2.2388, + "step": 429260 + }, + { + "epoch": 1.6594377696339935, + "grad_norm": 0.10450280457735062, + "learning_rate": 0.0009179648804220817, + "loss": 2.2453, + "step": 429270 + }, + { + "epoch": 1.6594764268373767, + "grad_norm": 0.10354512929916382, + "learning_rate": 0.0009177800593225054, + "loss": 2.2328, + "step": 429280 + }, + { + "epoch": 1.6595150840407602, + "grad_norm": 0.10230404138565063, + "learning_rate": 0.0009175952697812154, + "loss": 2.2497, + "step": 429290 + }, + { + "epoch": 1.6595537412441435, + "grad_norm": 0.11147142946720123, + "learning_rate": 0.0009174105117820514, + "loss": 2.2462, + "step": 429300 + }, + { + "epoch": 1.6595923984475267, + "grad_norm": 0.10749982297420502, + "learning_rate": 0.0009172257853088668, + "loss": 2.2448, + "step": 429310 + }, + { + "epoch": 1.65963105565091, + "grad_norm": 0.11876123398542404, + "learning_rate": 0.0009170410903455293, + "loss": 2.2617, + "step": 429320 + }, + { + "epoch": 1.6596697128542934, + "grad_norm": 0.09572720527648926, + "learning_rate": 0.0009168564268759197, + "loss": 2.2545, + "step": 429330 + }, + { + "epoch": 1.6597083700576767, + "grad_norm": 0.11779268831014633, + "learning_rate": 0.0009166717948839327, + "loss": 2.2728, + "step": 429340 + }, + { + "epoch": 1.65974702726106, + "grad_norm": 0.10509568452835083, + "learning_rate": 0.0009164871943534769, + "loss": 2.2402, + "step": 429350 + }, + { + "epoch": 1.6597856844644432, + "grad_norm": 0.11478833109140396, + "learning_rate": 0.0009163026252684747, + "loss": 2.2635, + "step": 429360 + }, + { + "epoch": 1.6598243416678264, + "grad_norm": 0.11095292121171951, + "learning_rate": 0.0009161180876128619, + "loss": 2.2436, + "step": 429370 + }, + { + "epoch": 1.6598629988712097, + "grad_norm": 0.10812583565711975, + "learning_rate": 0.0009159335813705878, + "loss": 2.2479, + "step": 429380 + }, + { + "epoch": 1.659901656074593, + "grad_norm": 0.10396292060613632, + "learning_rate": 0.000915749106525616, + "loss": 2.2489, + "step": 429390 + }, + { + "epoch": 1.6599403132779762, + "grad_norm": 0.10780052095651627, + "learning_rate": 0.0009155646630619234, + "loss": 2.2573, + "step": 429400 + }, + { + "epoch": 1.6599789704813595, + "grad_norm": 0.10211651772260666, + "learning_rate": 0.0009153802509635001, + "loss": 2.249, + "step": 429410 + }, + { + "epoch": 1.6600176276847427, + "grad_norm": 0.11910141259431839, + "learning_rate": 0.0009151958702143506, + "loss": 2.2429, + "step": 429420 + }, + { + "epoch": 1.660056284888126, + "grad_norm": 0.12283129245042801, + "learning_rate": 0.0009150115207984926, + "loss": 2.2516, + "step": 429430 + }, + { + "epoch": 1.6600949420915092, + "grad_norm": 0.11943577229976654, + "learning_rate": 0.0009148272026999571, + "loss": 2.2532, + "step": 429440 + }, + { + "epoch": 1.6601335992948925, + "grad_norm": 0.10331227630376816, + "learning_rate": 0.0009146429159027892, + "loss": 2.2506, + "step": 429450 + }, + { + "epoch": 1.660172256498276, + "grad_norm": 0.10485690832138062, + "learning_rate": 0.0009144586603910472, + "loss": 2.2521, + "step": 429460 + }, + { + "epoch": 1.6602109137016592, + "grad_norm": 0.10611864179372787, + "learning_rate": 0.0009142744361488029, + "loss": 2.2539, + "step": 429470 + }, + { + "epoch": 1.6602495709050424, + "grad_norm": 0.10822539776563644, + "learning_rate": 0.0009140902431601419, + "loss": 2.2417, + "step": 429480 + }, + { + "epoch": 1.6602882281084257, + "grad_norm": 0.12675762176513672, + "learning_rate": 0.0009139060814091629, + "loss": 2.2652, + "step": 429490 + }, + { + "epoch": 1.6603268853118092, + "grad_norm": 0.1188879907131195, + "learning_rate": 0.0009137219508799785, + "loss": 2.247, + "step": 429500 + }, + { + "epoch": 1.6603655425151924, + "grad_norm": 0.10695726424455643, + "learning_rate": 0.0009135378515567145, + "loss": 2.2492, + "step": 429510 + }, + { + "epoch": 1.6604041997185757, + "grad_norm": 0.1097661629319191, + "learning_rate": 0.00091335378342351, + "loss": 2.2508, + "step": 429520 + }, + { + "epoch": 1.660442856921959, + "grad_norm": 0.11859133839607239, + "learning_rate": 0.0009131697464645182, + "loss": 2.2535, + "step": 429530 + }, + { + "epoch": 1.6604815141253422, + "grad_norm": 0.11683262139558792, + "learning_rate": 0.0009129857406639046, + "loss": 2.2557, + "step": 429540 + }, + { + "epoch": 1.6605201713287254, + "grad_norm": 0.10748874396085739, + "learning_rate": 0.0009128017660058494, + "loss": 2.2361, + "step": 429550 + }, + { + "epoch": 1.6605588285321087, + "grad_norm": 0.11847436428070068, + "learning_rate": 0.0009126178224745452, + "loss": 2.2459, + "step": 429560 + }, + { + "epoch": 1.660597485735492, + "grad_norm": 0.10861270874738693, + "learning_rate": 0.000912433910054198, + "loss": 2.2419, + "step": 429570 + }, + { + "epoch": 1.6606361429388752, + "grad_norm": 0.10589105635881424, + "learning_rate": 0.0009122500287290282, + "loss": 2.243, + "step": 429580 + }, + { + "epoch": 1.6606748001422584, + "grad_norm": 0.10588544607162476, + "learning_rate": 0.0009120661784832682, + "loss": 2.2527, + "step": 429590 + }, + { + "epoch": 1.6607134573456417, + "grad_norm": 0.10675523430109024, + "learning_rate": 0.0009118823593011645, + "loss": 2.2518, + "step": 429600 + }, + { + "epoch": 1.660752114549025, + "grad_norm": 0.12099213898181915, + "learning_rate": 0.0009116985711669767, + "loss": 2.2574, + "step": 429610 + }, + { + "epoch": 1.6607907717524082, + "grad_norm": 0.10642857104539871, + "learning_rate": 0.0009115148140649778, + "loss": 2.2414, + "step": 429620 + }, + { + "epoch": 1.6608294289557917, + "grad_norm": 0.11251429468393326, + "learning_rate": 0.0009113310879794536, + "loss": 2.2461, + "step": 429630 + }, + { + "epoch": 1.660868086159175, + "grad_norm": 0.12505125999450684, + "learning_rate": 0.000911147392894704, + "loss": 2.2353, + "step": 429640 + }, + { + "epoch": 1.6609067433625582, + "grad_norm": 0.11787581443786621, + "learning_rate": 0.0009109637287950414, + "loss": 2.2494, + "step": 429650 + }, + { + "epoch": 1.6609454005659414, + "grad_norm": 0.10979120433330536, + "learning_rate": 0.0009107800956647918, + "loss": 2.2496, + "step": 429660 + }, + { + "epoch": 1.660984057769325, + "grad_norm": 0.11420413851737976, + "learning_rate": 0.0009105964934882942, + "loss": 2.2467, + "step": 429670 + }, + { + "epoch": 1.6610227149727081, + "grad_norm": 0.12395305186510086, + "learning_rate": 0.0009104129222499011, + "loss": 2.2687, + "step": 429680 + }, + { + "epoch": 1.6610613721760914, + "grad_norm": 0.10555867850780487, + "learning_rate": 0.0009102293819339777, + "loss": 2.251, + "step": 429690 + }, + { + "epoch": 1.6611000293794747, + "grad_norm": 0.10683988779783249, + "learning_rate": 0.000910045872524903, + "loss": 2.2562, + "step": 429700 + }, + { + "epoch": 1.661138686582858, + "grad_norm": 0.10842365026473999, + "learning_rate": 0.0009098623940070685, + "loss": 2.2396, + "step": 429710 + }, + { + "epoch": 1.6611773437862412, + "grad_norm": 0.10488323122262955, + "learning_rate": 0.0009096789463648792, + "loss": 2.2285, + "step": 429720 + }, + { + "epoch": 1.6612160009896244, + "grad_norm": 0.11940070241689682, + "learning_rate": 0.0009094955295827532, + "loss": 2.242, + "step": 429730 + }, + { + "epoch": 1.6612546581930077, + "grad_norm": 0.10247818380594254, + "learning_rate": 0.0009093121436451215, + "loss": 2.2618, + "step": 429740 + }, + { + "epoch": 1.661293315396391, + "grad_norm": 0.11934773623943329, + "learning_rate": 0.0009091287885364286, + "loss": 2.2557, + "step": 429750 + }, + { + "epoch": 1.6613319725997742, + "grad_norm": 0.10337629169225693, + "learning_rate": 0.0009089454642411316, + "loss": 2.2742, + "step": 429760 + }, + { + "epoch": 1.6613706298031574, + "grad_norm": 0.10209142416715622, + "learning_rate": 0.0009087621707437008, + "loss": 2.2409, + "step": 429770 + }, + { + "epoch": 1.6614092870065407, + "grad_norm": 0.1119256392121315, + "learning_rate": 0.0009085789080286198, + "loss": 2.2386, + "step": 429780 + }, + { + "epoch": 1.661447944209924, + "grad_norm": 0.11805194616317749, + "learning_rate": 0.0009083956760803849, + "loss": 2.2608, + "step": 429790 + }, + { + "epoch": 1.6614866014133074, + "grad_norm": 0.10996301472187042, + "learning_rate": 0.0009082124748835057, + "loss": 2.2548, + "step": 429800 + }, + { + "epoch": 1.6615252586166906, + "grad_norm": 0.10933899879455566, + "learning_rate": 0.0009080293044225043, + "loss": 2.2535, + "step": 429810 + }, + { + "epoch": 1.661563915820074, + "grad_norm": 0.11633667349815369, + "learning_rate": 0.0009078461646819162, + "loss": 2.2526, + "step": 429820 + }, + { + "epoch": 1.6616025730234572, + "grad_norm": 0.11690562963485718, + "learning_rate": 0.0009076630556462902, + "loss": 2.2501, + "step": 429830 + }, + { + "epoch": 1.6616412302268406, + "grad_norm": 0.10380557924509048, + "learning_rate": 0.000907479977300187, + "loss": 2.2453, + "step": 429840 + }, + { + "epoch": 1.6616798874302239, + "grad_norm": 0.11537251621484756, + "learning_rate": 0.0009072969296281812, + "loss": 2.2411, + "step": 429850 + }, + { + "epoch": 1.6617185446336071, + "grad_norm": 0.10554520785808563, + "learning_rate": 0.00090711391261486, + "loss": 2.2456, + "step": 429860 + }, + { + "epoch": 1.6617572018369904, + "grad_norm": 0.11479167640209198, + "learning_rate": 0.0009069309262448233, + "loss": 2.249, + "step": 429870 + }, + { + "epoch": 1.6617958590403736, + "grad_norm": 0.12557867169380188, + "learning_rate": 0.0009067479705026842, + "loss": 2.2555, + "step": 429880 + }, + { + "epoch": 1.6618345162437569, + "grad_norm": 0.10398616641759872, + "learning_rate": 0.0009065650453730685, + "loss": 2.2422, + "step": 429890 + }, + { + "epoch": 1.6618731734471401, + "grad_norm": 0.10483341664075851, + "learning_rate": 0.0009063821508406147, + "loss": 2.2538, + "step": 429900 + }, + { + "epoch": 1.6619118306505234, + "grad_norm": 0.11726875603199005, + "learning_rate": 0.0009061992868899747, + "loss": 2.2467, + "step": 429910 + }, + { + "epoch": 1.6619504878539066, + "grad_norm": 0.11767543852329254, + "learning_rate": 0.0009060164535058126, + "loss": 2.2571, + "step": 429920 + }, + { + "epoch": 1.66198914505729, + "grad_norm": 0.14433105289936066, + "learning_rate": 0.0009058336506728056, + "loss": 2.246, + "step": 429930 + }, + { + "epoch": 1.6620278022606731, + "grad_norm": 0.11061185598373413, + "learning_rate": 0.0009056508783756438, + "loss": 2.2666, + "step": 429940 + }, + { + "epoch": 1.6620664594640564, + "grad_norm": 0.11711091548204422, + "learning_rate": 0.0009054681365990298, + "loss": 2.2388, + "step": 429950 + }, + { + "epoch": 1.6621051166674397, + "grad_norm": 0.11604579538106918, + "learning_rate": 0.0009052854253276795, + "loss": 2.2553, + "step": 429960 + }, + { + "epoch": 1.6621437738708231, + "grad_norm": 0.11772461980581284, + "learning_rate": 0.0009051027445463205, + "loss": 2.2394, + "step": 429970 + }, + { + "epoch": 1.6621824310742064, + "grad_norm": 0.12372942268848419, + "learning_rate": 0.0009049200942396942, + "loss": 2.2579, + "step": 429980 + }, + { + "epoch": 1.6622210882775896, + "grad_norm": 0.10607621818780899, + "learning_rate": 0.0009047374743925547, + "loss": 2.2435, + "step": 429990 + }, + { + "epoch": 1.6622597454809729, + "grad_norm": 0.09799272567033768, + "learning_rate": 0.0009045548849896678, + "loss": 2.2502, + "step": 430000 + }, + { + "epoch": 1.6622984026843564, + "grad_norm": 0.11451854556798935, + "learning_rate": 0.0009043723260158129, + "loss": 2.2488, + "step": 430010 + }, + { + "epoch": 1.6623370598877396, + "grad_norm": 0.10894210636615753, + "learning_rate": 0.000904189797455782, + "loss": 2.2416, + "step": 430020 + }, + { + "epoch": 1.6623757170911229, + "grad_norm": 0.11218996345996857, + "learning_rate": 0.0009040072992943795, + "loss": 2.2556, + "step": 430030 + }, + { + "epoch": 1.6624143742945061, + "grad_norm": 0.11067520827054977, + "learning_rate": 0.0009038248315164224, + "loss": 2.2386, + "step": 430040 + }, + { + "epoch": 1.6624530314978894, + "grad_norm": 0.11744607985019684, + "learning_rate": 0.0009036423941067405, + "loss": 2.2545, + "step": 430050 + }, + { + "epoch": 1.6624916887012726, + "grad_norm": 0.10939358174800873, + "learning_rate": 0.0009034599870501761, + "loss": 2.2637, + "step": 430060 + }, + { + "epoch": 1.6625303459046559, + "grad_norm": 0.11345665901899338, + "learning_rate": 0.0009032776103315843, + "loss": 2.246, + "step": 430070 + }, + { + "epoch": 1.6625690031080391, + "grad_norm": 0.13914474844932556, + "learning_rate": 0.000903095263935833, + "loss": 2.2506, + "step": 430080 + }, + { + "epoch": 1.6626076603114224, + "grad_norm": 0.11508025974035263, + "learning_rate": 0.0009029129478478017, + "loss": 2.2635, + "step": 430090 + }, + { + "epoch": 1.6626463175148056, + "grad_norm": 0.11344173550605774, + "learning_rate": 0.0009027306620523839, + "loss": 2.25, + "step": 430100 + }, + { + "epoch": 1.6626849747181889, + "grad_norm": 0.19048510491847992, + "learning_rate": 0.0009025484065344842, + "loss": 2.247, + "step": 430110 + }, + { + "epoch": 1.6627236319215721, + "grad_norm": 0.11867443472146988, + "learning_rate": 0.0009023661812790205, + "loss": 2.2456, + "step": 430120 + }, + { + "epoch": 1.6627622891249554, + "grad_norm": 0.12190329283475876, + "learning_rate": 0.0009021839862709233, + "loss": 2.2494, + "step": 430130 + }, + { + "epoch": 1.6628009463283389, + "grad_norm": 0.10324706137180328, + "learning_rate": 0.0009020018214951355, + "loss": 2.2587, + "step": 430140 + }, + { + "epoch": 1.662839603531722, + "grad_norm": 0.10841317474842072, + "learning_rate": 0.0009018196869366124, + "loss": 2.231, + "step": 430150 + }, + { + "epoch": 1.6628782607351054, + "grad_norm": 0.11271698772907257, + "learning_rate": 0.0009016375825803216, + "loss": 2.2357, + "step": 430160 + }, + { + "epoch": 1.6629169179384886, + "grad_norm": 0.09953344613313675, + "learning_rate": 0.0009014555084112432, + "loss": 2.2477, + "step": 430170 + }, + { + "epoch": 1.662955575141872, + "grad_norm": 0.11217759549617767, + "learning_rate": 0.0009012734644143703, + "loss": 2.2484, + "step": 430180 + }, + { + "epoch": 1.6629942323452553, + "grad_norm": 0.12850646674633026, + "learning_rate": 0.0009010914505747077, + "loss": 2.2343, + "step": 430190 + }, + { + "epoch": 1.6630328895486386, + "grad_norm": 0.1115790531039238, + "learning_rate": 0.000900909466877273, + "loss": 2.2501, + "step": 430200 + }, + { + "epoch": 1.6630715467520218, + "grad_norm": 0.1096728965640068, + "learning_rate": 0.0009007275133070964, + "loss": 2.2545, + "step": 430210 + }, + { + "epoch": 1.663110203955405, + "grad_norm": 0.11958139389753342, + "learning_rate": 0.0009005455898492198, + "loss": 2.2532, + "step": 430220 + }, + { + "epoch": 1.6631488611587883, + "grad_norm": 0.11160731315612793, + "learning_rate": 0.000900363696488698, + "loss": 2.2455, + "step": 430230 + }, + { + "epoch": 1.6631875183621716, + "grad_norm": 0.10113270580768585, + "learning_rate": 0.0009001818332105984, + "loss": 2.247, + "step": 430240 + }, + { + "epoch": 1.6632261755655549, + "grad_norm": 0.12471241503953934, + "learning_rate": 0.0009, + "loss": 2.2522, + "step": 430250 + }, + { + "epoch": 1.663264832768938, + "grad_norm": 0.11001195013523102, + "learning_rate": 0.0008998181968419948, + "loss": 2.2513, + "step": 430260 + }, + { + "epoch": 1.6633034899723214, + "grad_norm": 0.10488203167915344, + "learning_rate": 0.0008996364237216863, + "loss": 2.2361, + "step": 430270 + }, + { + "epoch": 1.6633421471757046, + "grad_norm": 0.10875547677278519, + "learning_rate": 0.0008994546806241917, + "loss": 2.2525, + "step": 430280 + }, + { + "epoch": 1.6633808043790879, + "grad_norm": 0.10580891370773315, + "learning_rate": 0.0008992729675346389, + "loss": 2.242, + "step": 430290 + }, + { + "epoch": 1.6634194615824711, + "grad_norm": 0.11021474748849869, + "learning_rate": 0.0008990912844381694, + "loss": 2.249, + "step": 430300 + }, + { + "epoch": 1.6634581187858546, + "grad_norm": 0.10796932131052017, + "learning_rate": 0.0008989096313199356, + "loss": 2.2521, + "step": 430310 + }, + { + "epoch": 1.6634967759892378, + "grad_norm": 0.10939501971006393, + "learning_rate": 0.0008987280081651037, + "loss": 2.2517, + "step": 430320 + }, + { + "epoch": 1.663535433192621, + "grad_norm": 0.11583207547664642, + "learning_rate": 0.0008985464149588509, + "loss": 2.2461, + "step": 430330 + }, + { + "epoch": 1.6635740903960043, + "grad_norm": 0.11901375651359558, + "learning_rate": 0.0008983648516863672, + "loss": 2.2551, + "step": 430340 + }, + { + "epoch": 1.6636127475993878, + "grad_norm": 0.10402972996234894, + "learning_rate": 0.0008981833183328545, + "loss": 2.2568, + "step": 430350 + }, + { + "epoch": 1.663651404802771, + "grad_norm": 0.11303649842739105, + "learning_rate": 0.0008980018148835271, + "loss": 2.253, + "step": 430360 + }, + { + "epoch": 1.6636900620061543, + "grad_norm": 0.13464075326919556, + "learning_rate": 0.0008978203413236116, + "loss": 2.265, + "step": 430370 + }, + { + "epoch": 1.6637287192095376, + "grad_norm": 0.10891130566596985, + "learning_rate": 0.0008976388976383465, + "loss": 2.2428, + "step": 430380 + }, + { + "epoch": 1.6637673764129208, + "grad_norm": 0.10768184065818787, + "learning_rate": 0.0008974574838129823, + "loss": 2.2448, + "step": 430390 + }, + { + "epoch": 1.663806033616304, + "grad_norm": 0.1049170047044754, + "learning_rate": 0.0008972760998327823, + "loss": 2.2468, + "step": 430400 + }, + { + "epoch": 1.6638446908196873, + "grad_norm": 0.12340379506349564, + "learning_rate": 0.0008970947456830212, + "loss": 2.2287, + "step": 430410 + }, + { + "epoch": 1.6638833480230706, + "grad_norm": 0.11856955289840698, + "learning_rate": 0.0008969134213489858, + "loss": 2.2431, + "step": 430420 + }, + { + "epoch": 1.6639220052264538, + "grad_norm": 0.11601000279188156, + "learning_rate": 0.0008967321268159758, + "loss": 2.2439, + "step": 430430 + }, + { + "epoch": 1.663960662429837, + "grad_norm": 0.10934384167194366, + "learning_rate": 0.0008965508620693024, + "loss": 2.2599, + "step": 430440 + }, + { + "epoch": 1.6639993196332203, + "grad_norm": 0.1057877317070961, + "learning_rate": 0.0008963696270942884, + "loss": 2.2452, + "step": 430450 + }, + { + "epoch": 1.6640379768366036, + "grad_norm": 0.10983796417713165, + "learning_rate": 0.0008961884218762697, + "loss": 2.2452, + "step": 430460 + }, + { + "epoch": 1.6640766340399868, + "grad_norm": 0.11676836758852005, + "learning_rate": 0.0008960072464005935, + "loss": 2.2562, + "step": 430470 + }, + { + "epoch": 1.6641152912433703, + "grad_norm": 0.11772457510232925, + "learning_rate": 0.0008958261006526191, + "loss": 2.2428, + "step": 430480 + }, + { + "epoch": 1.6641539484467536, + "grad_norm": 0.11872987449169159, + "learning_rate": 0.0008956449846177181, + "loss": 2.2394, + "step": 430490 + }, + { + "epoch": 1.6641926056501368, + "grad_norm": 0.10234441608190536, + "learning_rate": 0.0008954638982812739, + "loss": 2.2341, + "step": 430500 + }, + { + "epoch": 1.66423126285352, + "grad_norm": 0.10162276774644852, + "learning_rate": 0.000895282841628682, + "loss": 2.2467, + "step": 430510 + }, + { + "epoch": 1.6642699200569036, + "grad_norm": 0.11268150061368942, + "learning_rate": 0.0008951018146453494, + "loss": 2.2491, + "step": 430520 + }, + { + "epoch": 1.6643085772602868, + "grad_norm": 0.11581974476575851, + "learning_rate": 0.0008949208173166957, + "loss": 2.2468, + "step": 430530 + }, + { + "epoch": 1.66434723446367, + "grad_norm": 0.10957831144332886, + "learning_rate": 0.000894739849628152, + "loss": 2.2501, + "step": 430540 + }, + { + "epoch": 1.6643858916670533, + "grad_norm": 0.10688143968582153, + "learning_rate": 0.0008945589115651617, + "loss": 2.2277, + "step": 430550 + }, + { + "epoch": 1.6644245488704366, + "grad_norm": 0.09974631667137146, + "learning_rate": 0.0008943780031131799, + "loss": 2.2454, + "step": 430560 + }, + { + "epoch": 1.6644632060738198, + "grad_norm": 0.10290662199258804, + "learning_rate": 0.0008941971242576732, + "loss": 2.2437, + "step": 430570 + }, + { + "epoch": 1.664501863277203, + "grad_norm": 0.10654833167791367, + "learning_rate": 0.0008940162749841207, + "loss": 2.2453, + "step": 430580 + }, + { + "epoch": 1.6645405204805863, + "grad_norm": 0.10874156653881073, + "learning_rate": 0.0008938354552780135, + "loss": 2.2484, + "step": 430590 + }, + { + "epoch": 1.6645791776839696, + "grad_norm": 0.10365212708711624, + "learning_rate": 0.0008936546651248537, + "loss": 2.2587, + "step": 430600 + }, + { + "epoch": 1.6646178348873528, + "grad_norm": 0.10374290496110916, + "learning_rate": 0.0008934739045101559, + "loss": 2.2406, + "step": 430610 + }, + { + "epoch": 1.664656492090736, + "grad_norm": 0.11166556924581528, + "learning_rate": 0.0008932931734194463, + "loss": 2.2551, + "step": 430620 + }, + { + "epoch": 1.6646951492941193, + "grad_norm": 0.11560613662004471, + "learning_rate": 0.0008931124718382631, + "loss": 2.2479, + "step": 430630 + }, + { + "epoch": 1.6647338064975028, + "grad_norm": 0.12513025104999542, + "learning_rate": 0.0008929317997521562, + "loss": 2.2509, + "step": 430640 + }, + { + "epoch": 1.664772463700886, + "grad_norm": 0.10663405060768127, + "learning_rate": 0.0008927511571466873, + "loss": 2.2483, + "step": 430650 + }, + { + "epoch": 1.6648111209042693, + "grad_norm": 0.11393394321203232, + "learning_rate": 0.0008925705440074299, + "loss": 2.2414, + "step": 430660 + }, + { + "epoch": 1.6648497781076526, + "grad_norm": 0.1111840158700943, + "learning_rate": 0.000892389960319969, + "loss": 2.2541, + "step": 430670 + }, + { + "epoch": 1.6648884353110358, + "grad_norm": 0.11242654174566269, + "learning_rate": 0.0008922094060699017, + "loss": 2.2532, + "step": 430680 + }, + { + "epoch": 1.6649270925144193, + "grad_norm": 0.1310926228761673, + "learning_rate": 0.0008920288812428367, + "loss": 2.2469, + "step": 430690 + }, + { + "epoch": 1.6649657497178025, + "grad_norm": 0.12348271161317825, + "learning_rate": 0.0008918483858243947, + "loss": 2.2479, + "step": 430700 + }, + { + "epoch": 1.6650044069211858, + "grad_norm": 0.1277979016304016, + "learning_rate": 0.0008916679198002071, + "loss": 2.2559, + "step": 430710 + }, + { + "epoch": 1.665043064124569, + "grad_norm": 0.12305956333875656, + "learning_rate": 0.0008914874831559185, + "loss": 2.2578, + "step": 430720 + }, + { + "epoch": 1.6650817213279523, + "grad_norm": 0.1079287901520729, + "learning_rate": 0.0008913070758771841, + "loss": 2.2552, + "step": 430730 + }, + { + "epoch": 1.6651203785313355, + "grad_norm": 0.10787540674209595, + "learning_rate": 0.000891126697949671, + "loss": 2.2576, + "step": 430740 + }, + { + "epoch": 1.6651590357347188, + "grad_norm": 0.12472599744796753, + "learning_rate": 0.0008909463493590584, + "loss": 2.2463, + "step": 430750 + }, + { + "epoch": 1.665197692938102, + "grad_norm": 0.1202596053481102, + "learning_rate": 0.0008907660300910363, + "loss": 2.2422, + "step": 430760 + }, + { + "epoch": 1.6652363501414853, + "grad_norm": 0.109699547290802, + "learning_rate": 0.0008905857401313071, + "loss": 2.2489, + "step": 430770 + }, + { + "epoch": 1.6652750073448686, + "grad_norm": 0.11688680946826935, + "learning_rate": 0.0008904054794655842, + "loss": 2.2414, + "step": 430780 + }, + { + "epoch": 1.6653136645482518, + "grad_norm": 0.1130024716258049, + "learning_rate": 0.0008902252480795933, + "loss": 2.2508, + "step": 430790 + }, + { + "epoch": 1.665352321751635, + "grad_norm": 0.10916703939437866, + "learning_rate": 0.0008900450459590714, + "loss": 2.2454, + "step": 430800 + }, + { + "epoch": 1.6653909789550185, + "grad_norm": 0.1368051916360855, + "learning_rate": 0.0008898648730897667, + "loss": 2.2517, + "step": 430810 + }, + { + "epoch": 1.6654296361584018, + "grad_norm": 0.11163930594921112, + "learning_rate": 0.0008896847294574393, + "loss": 2.2516, + "step": 430820 + }, + { + "epoch": 1.665468293361785, + "grad_norm": 0.11241559684276581, + "learning_rate": 0.0008895046150478608, + "loss": 2.2304, + "step": 430830 + }, + { + "epoch": 1.6655069505651683, + "grad_norm": 0.0991520881652832, + "learning_rate": 0.0008893245298468145, + "loss": 2.2313, + "step": 430840 + }, + { + "epoch": 1.6655456077685515, + "grad_norm": 0.10892762243747711, + "learning_rate": 0.0008891444738400947, + "loss": 2.2454, + "step": 430850 + }, + { + "epoch": 1.665584264971935, + "grad_norm": 0.11580385267734528, + "learning_rate": 0.0008889644470135081, + "loss": 2.2417, + "step": 430860 + }, + { + "epoch": 1.6656229221753183, + "grad_norm": 0.11292415112257004, + "learning_rate": 0.0008887844493528721, + "loss": 2.2342, + "step": 430870 + }, + { + "epoch": 1.6656615793787015, + "grad_norm": 0.12061168998479843, + "learning_rate": 0.0008886044808440157, + "loss": 2.2514, + "step": 430880 + }, + { + "epoch": 1.6657002365820848, + "grad_norm": 0.12646254897117615, + "learning_rate": 0.0008884245414727797, + "loss": 2.2457, + "step": 430890 + }, + { + "epoch": 1.665738893785468, + "grad_norm": 0.10348045825958252, + "learning_rate": 0.0008882446312250163, + "loss": 2.2545, + "step": 430900 + }, + { + "epoch": 1.6657775509888513, + "grad_norm": 0.11797824501991272, + "learning_rate": 0.000888064750086589, + "loss": 2.2461, + "step": 430910 + }, + { + "epoch": 1.6658162081922345, + "grad_norm": 0.09973704814910889, + "learning_rate": 0.0008878848980433724, + "loss": 2.237, + "step": 430920 + }, + { + "epoch": 1.6658548653956178, + "grad_norm": 0.12584131956100464, + "learning_rate": 0.0008877050750812534, + "loss": 2.2363, + "step": 430930 + }, + { + "epoch": 1.665893522599001, + "grad_norm": 0.19706349074840546, + "learning_rate": 0.0008875252811861296, + "loss": 2.2357, + "step": 430940 + }, + { + "epoch": 1.6659321798023843, + "grad_norm": 0.11496458202600479, + "learning_rate": 0.0008873455163439101, + "loss": 2.2444, + "step": 430950 + }, + { + "epoch": 1.6659708370057675, + "grad_norm": 0.12585723400115967, + "learning_rate": 0.0008871657805405156, + "loss": 2.2509, + "step": 430960 + }, + { + "epoch": 1.6660094942091508, + "grad_norm": 0.11942458897829056, + "learning_rate": 0.000886986073761878, + "loss": 2.2366, + "step": 430970 + }, + { + "epoch": 1.6660481514125343, + "grad_norm": 0.10830435156822205, + "learning_rate": 0.0008868063959939403, + "loss": 2.2243, + "step": 430980 + }, + { + "epoch": 1.6660868086159175, + "grad_norm": 0.10963542014360428, + "learning_rate": 0.0008866267472226577, + "loss": 2.2427, + "step": 430990 + }, + { + "epoch": 1.6661254658193008, + "grad_norm": 0.11515636742115021, + "learning_rate": 0.0008864471274339956, + "loss": 2.254, + "step": 431000 + }, + { + "epoch": 1.666164123022684, + "grad_norm": 0.1160406842827797, + "learning_rate": 0.0008862675366139317, + "loss": 2.2439, + "step": 431010 + }, + { + "epoch": 1.6662027802260673, + "grad_norm": 0.11950808763504028, + "learning_rate": 0.0008860879747484545, + "loss": 2.253, + "step": 431020 + }, + { + "epoch": 1.6662414374294507, + "grad_norm": 0.11876123398542404, + "learning_rate": 0.0008859084418235637, + "loss": 2.2502, + "step": 431030 + }, + { + "epoch": 1.666280094632834, + "grad_norm": 0.11513090878725052, + "learning_rate": 0.0008857289378252705, + "loss": 2.2285, + "step": 431040 + }, + { + "epoch": 1.6663187518362172, + "grad_norm": 0.12304891645908356, + "learning_rate": 0.0008855494627395974, + "loss": 2.2394, + "step": 431050 + }, + { + "epoch": 1.6663574090396005, + "grad_norm": 0.10489034652709961, + "learning_rate": 0.0008853700165525782, + "loss": 2.2555, + "step": 431060 + }, + { + "epoch": 1.6663960662429838, + "grad_norm": 0.11117493361234665, + "learning_rate": 0.0008851905992502576, + "loss": 2.2338, + "step": 431070 + }, + { + "epoch": 1.666434723446367, + "grad_norm": 0.10987547039985657, + "learning_rate": 0.0008850112108186916, + "loss": 2.2481, + "step": 431080 + }, + { + "epoch": 1.6664733806497503, + "grad_norm": 0.118398517370224, + "learning_rate": 0.0008848318512439481, + "loss": 2.2264, + "step": 431090 + }, + { + "epoch": 1.6665120378531335, + "grad_norm": 0.1149090901017189, + "learning_rate": 0.0008846525205121052, + "loss": 2.2488, + "step": 431100 + }, + { + "epoch": 1.6665506950565168, + "grad_norm": 0.13283200562000275, + "learning_rate": 0.0008844732186092528, + "loss": 2.2594, + "step": 431110 + }, + { + "epoch": 1.6665893522599, + "grad_norm": 0.10119766741991043, + "learning_rate": 0.0008842939455214918, + "loss": 2.2459, + "step": 431120 + }, + { + "epoch": 1.6666280094632833, + "grad_norm": 0.10677947849035263, + "learning_rate": 0.0008841147012349342, + "loss": 2.2287, + "step": 431130 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.11360535025596619, + "learning_rate": 0.0008839354857357036, + "loss": 2.2408, + "step": 431140 + }, + { + "epoch": 1.66670532387005, + "grad_norm": 0.11793069541454315, + "learning_rate": 0.000883756299009934, + "loss": 2.2585, + "step": 431150 + }, + { + "epoch": 1.6667439810734332, + "grad_norm": 0.11654367297887802, + "learning_rate": 0.0008835771410437709, + "loss": 2.2534, + "step": 431160 + }, + { + "epoch": 1.6667826382768165, + "grad_norm": 0.11465385556221008, + "learning_rate": 0.0008833980118233714, + "loss": 2.2575, + "step": 431170 + }, + { + "epoch": 1.6668212954801997, + "grad_norm": 0.1096300333738327, + "learning_rate": 0.0008832189113349027, + "loss": 2.2482, + "step": 431180 + }, + { + "epoch": 1.6668599526835832, + "grad_norm": 0.11422011256217957, + "learning_rate": 0.0008830398395645438, + "loss": 2.2579, + "step": 431190 + }, + { + "epoch": 1.6668986098869665, + "grad_norm": 0.11084701120853424, + "learning_rate": 0.0008828607964984847, + "loss": 2.2696, + "step": 431200 + }, + { + "epoch": 1.6669372670903497, + "grad_norm": 0.10878776013851166, + "learning_rate": 0.0008826817821229263, + "loss": 2.2408, + "step": 431210 + }, + { + "epoch": 1.666975924293733, + "grad_norm": 0.1218942254781723, + "learning_rate": 0.0008825027964240806, + "loss": 2.2495, + "step": 431220 + }, + { + "epoch": 1.6670145814971162, + "grad_norm": 0.12331361323595047, + "learning_rate": 0.0008823238393881705, + "loss": 2.2684, + "step": 431230 + }, + { + "epoch": 1.6670532387004995, + "grad_norm": 0.10126879066228867, + "learning_rate": 0.0008821449110014304, + "loss": 2.2499, + "step": 431240 + }, + { + "epoch": 1.6670918959038827, + "grad_norm": 0.1190921738743782, + "learning_rate": 0.0008819660112501051, + "loss": 2.2495, + "step": 431250 + }, + { + "epoch": 1.667130553107266, + "grad_norm": 0.11584290117025375, + "learning_rate": 0.0008817871401204509, + "loss": 2.2431, + "step": 431260 + }, + { + "epoch": 1.6671692103106492, + "grad_norm": 0.11810228228569031, + "learning_rate": 0.0008816082975987349, + "loss": 2.2485, + "step": 431270 + }, + { + "epoch": 1.6672078675140325, + "grad_norm": 0.10765083879232407, + "learning_rate": 0.0008814294836712349, + "loss": 2.2312, + "step": 431280 + }, + { + "epoch": 1.6672465247174157, + "grad_norm": 0.10427460074424744, + "learning_rate": 0.0008812506983242403, + "loss": 2.2435, + "step": 431290 + }, + { + "epoch": 1.667285181920799, + "grad_norm": 0.11015084385871887, + "learning_rate": 0.0008810719415440509, + "loss": 2.2476, + "step": 431300 + }, + { + "epoch": 1.6673238391241822, + "grad_norm": 0.10388490557670593, + "learning_rate": 0.0008808932133169776, + "loss": 2.2322, + "step": 431310 + }, + { + "epoch": 1.6673624963275657, + "grad_norm": 0.12453647702932358, + "learning_rate": 0.0008807145136293422, + "loss": 2.2401, + "step": 431320 + }, + { + "epoch": 1.667401153530949, + "grad_norm": 0.10797908157110214, + "learning_rate": 0.0008805358424674776, + "loss": 2.2416, + "step": 431330 + }, + { + "epoch": 1.6674398107343322, + "grad_norm": 0.11077652126550674, + "learning_rate": 0.0008803571998177276, + "loss": 2.2297, + "step": 431340 + }, + { + "epoch": 1.6674784679377155, + "grad_norm": 0.10809510201215744, + "learning_rate": 0.0008801785856664466, + "loss": 2.2528, + "step": 431350 + }, + { + "epoch": 1.667517125141099, + "grad_norm": 0.12976865470409393, + "learning_rate": 0.0008800000000000001, + "loss": 2.2607, + "step": 431360 + }, + { + "epoch": 1.6675557823444822, + "grad_norm": 0.10788272321224213, + "learning_rate": 0.0008798214428047644, + "loss": 2.2536, + "step": 431370 + }, + { + "epoch": 1.6675944395478655, + "grad_norm": 0.1121840849518776, + "learning_rate": 0.0008796429140671265, + "loss": 2.2359, + "step": 431380 + }, + { + "epoch": 1.6676330967512487, + "grad_norm": 0.11245424300432205, + "learning_rate": 0.0008794644137734848, + "loss": 2.2442, + "step": 431390 + }, + { + "epoch": 1.667671753954632, + "grad_norm": 0.11103110015392303, + "learning_rate": 0.0008792859419102481, + "loss": 2.2438, + "step": 431400 + }, + { + "epoch": 1.6677104111580152, + "grad_norm": 0.10202562063932419, + "learning_rate": 0.0008791074984638358, + "loss": 2.2362, + "step": 431410 + }, + { + "epoch": 1.6677490683613985, + "grad_norm": 0.1100311353802681, + "learning_rate": 0.0008789290834206786, + "loss": 2.2495, + "step": 431420 + }, + { + "epoch": 1.6677877255647817, + "grad_norm": 0.11951546370983124, + "learning_rate": 0.0008787506967672177, + "loss": 2.2381, + "step": 431430 + }, + { + "epoch": 1.667826382768165, + "grad_norm": 0.1079200878739357, + "learning_rate": 0.0008785723384899051, + "loss": 2.2268, + "step": 431440 + }, + { + "epoch": 1.6678650399715482, + "grad_norm": 0.1171964779496193, + "learning_rate": 0.0008783940085752038, + "loss": 2.2546, + "step": 431450 + }, + { + "epoch": 1.6679036971749315, + "grad_norm": 0.10935444384813309, + "learning_rate": 0.0008782157070095873, + "loss": 2.235, + "step": 431460 + }, + { + "epoch": 1.6679423543783147, + "grad_norm": 0.11907418817281723, + "learning_rate": 0.0008780374337795401, + "loss": 2.2489, + "step": 431470 + }, + { + "epoch": 1.667981011581698, + "grad_norm": 0.1128414049744606, + "learning_rate": 0.000877859188871557, + "loss": 2.2393, + "step": 431480 + }, + { + "epoch": 1.6680196687850815, + "grad_norm": 0.11093578487634659, + "learning_rate": 0.0008776809722721439, + "loss": 2.2426, + "step": 431490 + }, + { + "epoch": 1.6680583259884647, + "grad_norm": 0.10630373656749725, + "learning_rate": 0.0008775027839678176, + "loss": 2.2528, + "step": 431500 + }, + { + "epoch": 1.668096983191848, + "grad_norm": 0.12331317365169525, + "learning_rate": 0.000877324623945105, + "loss": 2.2234, + "step": 431510 + }, + { + "epoch": 1.6681356403952312, + "grad_norm": 0.12409404665231705, + "learning_rate": 0.000877146492190544, + "loss": 2.2452, + "step": 431520 + }, + { + "epoch": 1.6681742975986147, + "grad_norm": 0.096986323595047, + "learning_rate": 0.0008769683886906834, + "loss": 2.2409, + "step": 431530 + }, + { + "epoch": 1.668212954801998, + "grad_norm": 0.11354723572731018, + "learning_rate": 0.0008767903134320823, + "loss": 2.2366, + "step": 431540 + }, + { + "epoch": 1.6682516120053812, + "grad_norm": 0.11607275903224945, + "learning_rate": 0.0008766122664013105, + "loss": 2.2302, + "step": 431550 + }, + { + "epoch": 1.6682902692087644, + "grad_norm": 0.12799812853336334, + "learning_rate": 0.0008764342475849489, + "loss": 2.2438, + "step": 431560 + }, + { + "epoch": 1.6683289264121477, + "grad_norm": 0.1058444157242775, + "learning_rate": 0.0008762562569695885, + "loss": 2.2385, + "step": 431570 + }, + { + "epoch": 1.668367583615531, + "grad_norm": 0.12132367491722107, + "learning_rate": 0.0008760782945418307, + "loss": 2.2529, + "step": 431580 + }, + { + "epoch": 1.6684062408189142, + "grad_norm": 0.11287418007850647, + "learning_rate": 0.0008759003602882882, + "loss": 2.2405, + "step": 431590 + }, + { + "epoch": 1.6684448980222975, + "grad_norm": 0.12102866917848587, + "learning_rate": 0.0008757224541955844, + "loss": 2.2565, + "step": 431600 + }, + { + "epoch": 1.6684835552256807, + "grad_norm": 0.12273454666137695, + "learning_rate": 0.0008755445762503522, + "loss": 2.2464, + "step": 431610 + }, + { + "epoch": 1.668522212429064, + "grad_norm": 0.11583053320646286, + "learning_rate": 0.0008753667264392362, + "loss": 2.2305, + "step": 431620 + }, + { + "epoch": 1.6685608696324472, + "grad_norm": 0.11362405866384506, + "learning_rate": 0.0008751889047488908, + "loss": 2.2496, + "step": 431630 + }, + { + "epoch": 1.6685995268358305, + "grad_norm": 0.13401475548744202, + "learning_rate": 0.0008750111111659813, + "loss": 2.2463, + "step": 431640 + }, + { + "epoch": 1.6686381840392137, + "grad_norm": 0.10942229628562927, + "learning_rate": 0.0008748333456771838, + "loss": 2.2409, + "step": 431650 + }, + { + "epoch": 1.6686768412425972, + "grad_norm": 0.11597096174955368, + "learning_rate": 0.0008746556082691842, + "loss": 2.2476, + "step": 431660 + }, + { + "epoch": 1.6687154984459804, + "grad_norm": 0.11072780191898346, + "learning_rate": 0.0008744778989286795, + "loss": 2.2465, + "step": 431670 + }, + { + "epoch": 1.6687541556493637, + "grad_norm": 0.12409374117851257, + "learning_rate": 0.0008743002176423768, + "loss": 2.2536, + "step": 431680 + }, + { + "epoch": 1.668792812852747, + "grad_norm": 0.11742675304412842, + "learning_rate": 0.0008741225643969944, + "loss": 2.2482, + "step": 431690 + }, + { + "epoch": 1.6688314700561304, + "grad_norm": 0.10112820565700531, + "learning_rate": 0.0008739449391792602, + "loss": 2.2443, + "step": 431700 + }, + { + "epoch": 1.6688701272595137, + "grad_norm": 0.11522382497787476, + "learning_rate": 0.0008737673419759131, + "loss": 2.2438, + "step": 431710 + }, + { + "epoch": 1.668908784462897, + "grad_norm": 0.11149576306343079, + "learning_rate": 0.0008735897727737022, + "loss": 2.2417, + "step": 431720 + }, + { + "epoch": 1.6689474416662802, + "grad_norm": 0.11229491233825684, + "learning_rate": 0.0008734122315593871, + "loss": 2.2558, + "step": 431730 + }, + { + "epoch": 1.6689860988696634, + "grad_norm": 0.12534652650356293, + "learning_rate": 0.000873234718319738, + "loss": 2.2479, + "step": 431740 + }, + { + "epoch": 1.6690247560730467, + "grad_norm": 0.11140571534633636, + "learning_rate": 0.0008730572330415356, + "loss": 2.2596, + "step": 431750 + }, + { + "epoch": 1.66906341327643, + "grad_norm": 0.12196607887744904, + "learning_rate": 0.0008728797757115703, + "loss": 2.2337, + "step": 431760 + }, + { + "epoch": 1.6691020704798132, + "grad_norm": 0.11696066707372665, + "learning_rate": 0.0008727023463166439, + "loss": 2.259, + "step": 431770 + }, + { + "epoch": 1.6691407276831964, + "grad_norm": 0.12137053906917572, + "learning_rate": 0.0008725249448435677, + "loss": 2.2402, + "step": 431780 + }, + { + "epoch": 1.6691793848865797, + "grad_norm": 0.1035657450556755, + "learning_rate": 0.0008723475712791639, + "loss": 2.2573, + "step": 431790 + }, + { + "epoch": 1.669218042089963, + "grad_norm": 0.11626846343278885, + "learning_rate": 0.0008721702256102653, + "loss": 2.2402, + "step": 431800 + }, + { + "epoch": 1.6692566992933462, + "grad_norm": 0.12115205824375153, + "learning_rate": 0.0008719929078237141, + "loss": 2.2604, + "step": 431810 + }, + { + "epoch": 1.6692953564967294, + "grad_norm": 0.11947688460350037, + "learning_rate": 0.0008718156179063636, + "loss": 2.2519, + "step": 431820 + }, + { + "epoch": 1.669334013700113, + "grad_norm": 0.11764021217823029, + "learning_rate": 0.0008716383558450776, + "loss": 2.2667, + "step": 431830 + }, + { + "epoch": 1.6693726709034962, + "grad_norm": 0.10446431487798691, + "learning_rate": 0.0008714611216267292, + "loss": 2.2413, + "step": 431840 + }, + { + "epoch": 1.6694113281068794, + "grad_norm": 0.10729694366455078, + "learning_rate": 0.0008712839152382031, + "loss": 2.2525, + "step": 431850 + }, + { + "epoch": 1.6694499853102627, + "grad_norm": 0.11575144529342651, + "learning_rate": 0.0008711067366663932, + "loss": 2.2576, + "step": 431860 + }, + { + "epoch": 1.6694886425136461, + "grad_norm": 0.11094532907009125, + "learning_rate": 0.0008709295858982045, + "loss": 2.2446, + "step": 431870 + }, + { + "epoch": 1.6695272997170294, + "grad_norm": 0.12855194509029388, + "learning_rate": 0.0008707524629205516, + "loss": 2.2415, + "step": 431880 + }, + { + "epoch": 1.6695659569204127, + "grad_norm": 0.12823374569416046, + "learning_rate": 0.0008705753677203601, + "loss": 2.2407, + "step": 431890 + }, + { + "epoch": 1.669604614123796, + "grad_norm": 0.11963459104299545, + "learning_rate": 0.000870398300284565, + "loss": 2.2368, + "step": 431900 + }, + { + "epoch": 1.6696432713271792, + "grad_norm": 0.11403647810220718, + "learning_rate": 0.000870221260600112, + "loss": 2.2371, + "step": 431910 + }, + { + "epoch": 1.6696819285305624, + "grad_norm": 0.10871980339288712, + "learning_rate": 0.0008700442486539573, + "loss": 2.2482, + "step": 431920 + }, + { + "epoch": 1.6697205857339457, + "grad_norm": 0.10171128064393997, + "learning_rate": 0.0008698672644330668, + "loss": 2.2281, + "step": 431930 + }, + { + "epoch": 1.669759242937329, + "grad_norm": 0.11919943988323212, + "learning_rate": 0.0008696903079244168, + "loss": 2.2469, + "step": 431940 + }, + { + "epoch": 1.6697979001407122, + "grad_norm": 0.11724691838026047, + "learning_rate": 0.0008695133791149937, + "loss": 2.2392, + "step": 431950 + }, + { + "epoch": 1.6698365573440954, + "grad_norm": 0.11415465176105499, + "learning_rate": 0.0008693364779917945, + "loss": 2.2448, + "step": 431960 + }, + { + "epoch": 1.6698752145474787, + "grad_norm": 0.11876343190670013, + "learning_rate": 0.0008691596045418258, + "loss": 2.2439, + "step": 431970 + }, + { + "epoch": 1.669913871750862, + "grad_norm": 0.10392026603221893, + "learning_rate": 0.0008689827587521047, + "loss": 2.2271, + "step": 431980 + }, + { + "epoch": 1.6699525289542452, + "grad_norm": 0.10942797362804413, + "learning_rate": 0.0008688059406096583, + "loss": 2.2455, + "step": 431990 + }, + { + "epoch": 1.6699911861576286, + "grad_norm": 0.10648475587368011, + "learning_rate": 0.000868629150101524, + "loss": 2.2466, + "step": 432000 + }, + { + "epoch": 1.670029843361012, + "grad_norm": 0.10488341748714447, + "learning_rate": 0.0008684523872147492, + "loss": 2.2435, + "step": 432010 + }, + { + "epoch": 1.6700685005643952, + "grad_norm": 0.10440246015787125, + "learning_rate": 0.0008682756519363914, + "loss": 2.2564, + "step": 432020 + }, + { + "epoch": 1.6701071577677784, + "grad_norm": 0.11767731606960297, + "learning_rate": 0.0008680989442535183, + "loss": 2.2363, + "step": 432030 + }, + { + "epoch": 1.6701458149711619, + "grad_norm": 0.13144199550151825, + "learning_rate": 0.0008679222641532076, + "loss": 2.2453, + "step": 432040 + }, + { + "epoch": 1.6701844721745451, + "grad_norm": 0.12658940255641937, + "learning_rate": 0.000867745611622547, + "loss": 2.2459, + "step": 432050 + }, + { + "epoch": 1.6702231293779284, + "grad_norm": 0.11060245335102081, + "learning_rate": 0.0008675689866486347, + "loss": 2.2388, + "step": 432060 + }, + { + "epoch": 1.6702617865813116, + "grad_norm": 0.09972178190946579, + "learning_rate": 0.0008673923892185784, + "loss": 2.2461, + "step": 432070 + }, + { + "epoch": 1.6703004437846949, + "grad_norm": 0.10841628164052963, + "learning_rate": 0.0008672158193194963, + "loss": 2.2379, + "step": 432080 + }, + { + "epoch": 1.6703391009880781, + "grad_norm": 0.12501399219036102, + "learning_rate": 0.0008670392769385163, + "loss": 2.237, + "step": 432090 + }, + { + "epoch": 1.6703777581914614, + "grad_norm": 0.1202363669872284, + "learning_rate": 0.0008668627620627765, + "loss": 2.2352, + "step": 432100 + }, + { + "epoch": 1.6704164153948446, + "grad_norm": 0.11828488856554031, + "learning_rate": 0.0008666862746794248, + "loss": 2.2343, + "step": 432110 + }, + { + "epoch": 1.670455072598228, + "grad_norm": 0.11299415677785873, + "learning_rate": 0.0008665098147756196, + "loss": 2.2432, + "step": 432120 + }, + { + "epoch": 1.6704937298016111, + "grad_norm": 0.11967718601226807, + "learning_rate": 0.0008663333823385291, + "loss": 2.2426, + "step": 432130 + }, + { + "epoch": 1.6705323870049944, + "grad_norm": 0.11156123131513596, + "learning_rate": 0.0008661569773553307, + "loss": 2.2385, + "step": 432140 + }, + { + "epoch": 1.6705710442083777, + "grad_norm": 0.11214423179626465, + "learning_rate": 0.0008659805998132131, + "loss": 2.2429, + "step": 432150 + }, + { + "epoch": 1.670609701411761, + "grad_norm": 0.11777114868164062, + "learning_rate": 0.0008658042496993741, + "loss": 2.2674, + "step": 432160 + }, + { + "epoch": 1.6706483586151444, + "grad_norm": 0.1242789775133133, + "learning_rate": 0.0008656279270010214, + "loss": 2.2411, + "step": 432170 + }, + { + "epoch": 1.6706870158185276, + "grad_norm": 0.11542826890945435, + "learning_rate": 0.0008654516317053734, + "loss": 2.266, + "step": 432180 + }, + { + "epoch": 1.6707256730219109, + "grad_norm": 0.12458527833223343, + "learning_rate": 0.0008652753637996574, + "loss": 2.2377, + "step": 432190 + }, + { + "epoch": 1.6707643302252941, + "grad_norm": 0.2273350954055786, + "learning_rate": 0.0008650991232711115, + "loss": 2.2562, + "step": 432200 + }, + { + "epoch": 1.6708029874286776, + "grad_norm": 0.12100516259670258, + "learning_rate": 0.0008649229101069831, + "loss": 2.2496, + "step": 432210 + }, + { + "epoch": 1.6708416446320609, + "grad_norm": 0.10887449234724045, + "learning_rate": 0.00086474672429453, + "loss": 2.2499, + "step": 432220 + }, + { + "epoch": 1.6708803018354441, + "grad_norm": 0.11747176945209503, + "learning_rate": 0.0008645705658210194, + "loss": 2.255, + "step": 432230 + }, + { + "epoch": 1.6709189590388274, + "grad_norm": 0.0999768003821373, + "learning_rate": 0.0008643944346737286, + "loss": 2.2439, + "step": 432240 + }, + { + "epoch": 1.6709576162422106, + "grad_norm": 0.11573263257741928, + "learning_rate": 0.0008642183308399454, + "loss": 2.2453, + "step": 432250 + }, + { + "epoch": 1.6709962734455939, + "grad_norm": 0.1162898987531662, + "learning_rate": 0.0008640422543069659, + "loss": 2.2726, + "step": 432260 + }, + { + "epoch": 1.6710349306489771, + "grad_norm": 0.09759674966335297, + "learning_rate": 0.0008638662050620975, + "loss": 2.249, + "step": 432270 + }, + { + "epoch": 1.6710735878523604, + "grad_norm": 0.11636500805616379, + "learning_rate": 0.0008636901830926567, + "loss": 2.2548, + "step": 432280 + }, + { + "epoch": 1.6711122450557436, + "grad_norm": 0.11740848422050476, + "learning_rate": 0.0008635141883859702, + "loss": 2.2332, + "step": 432290 + }, + { + "epoch": 1.6711509022591269, + "grad_norm": 0.1161784678697586, + "learning_rate": 0.0008633382209293744, + "loss": 2.2506, + "step": 432300 + }, + { + "epoch": 1.6711895594625101, + "grad_norm": 0.11073528230190277, + "learning_rate": 0.0008631622807102151, + "loss": 2.256, + "step": 432310 + }, + { + "epoch": 1.6712282166658934, + "grad_norm": 0.12388008832931519, + "learning_rate": 0.0008629863677158485, + "loss": 2.2599, + "step": 432320 + }, + { + "epoch": 1.6712668738692766, + "grad_norm": 0.11557994037866592, + "learning_rate": 0.0008628104819336402, + "loss": 2.2329, + "step": 432330 + }, + { + "epoch": 1.6713055310726601, + "grad_norm": 0.1085330918431282, + "learning_rate": 0.0008626346233509654, + "loss": 2.235, + "step": 432340 + }, + { + "epoch": 1.6713441882760434, + "grad_norm": 0.11548107117414474, + "learning_rate": 0.00086245879195521, + "loss": 2.2461, + "step": 432350 + }, + { + "epoch": 1.6713828454794266, + "grad_norm": 0.10183994472026825, + "learning_rate": 0.0008622829877337688, + "loss": 2.2411, + "step": 432360 + }, + { + "epoch": 1.6714215026828099, + "grad_norm": 0.12165366113185883, + "learning_rate": 0.000862107210674046, + "loss": 2.2321, + "step": 432370 + }, + { + "epoch": 1.6714601598861933, + "grad_norm": 0.13182103633880615, + "learning_rate": 0.0008619314607634565, + "loss": 2.2393, + "step": 432380 + }, + { + "epoch": 1.6714988170895766, + "grad_norm": 0.1082003116607666, + "learning_rate": 0.0008617557379894243, + "loss": 2.2366, + "step": 432390 + }, + { + "epoch": 1.6715374742929598, + "grad_norm": 0.10970823466777802, + "learning_rate": 0.0008615800423393833, + "loss": 2.2465, + "step": 432400 + }, + { + "epoch": 1.671576131496343, + "grad_norm": 0.11980883032083511, + "learning_rate": 0.0008614043738007773, + "loss": 2.236, + "step": 432410 + }, + { + "epoch": 1.6716147886997264, + "grad_norm": 0.10353901982307434, + "learning_rate": 0.0008612287323610593, + "loss": 2.2578, + "step": 432420 + }, + { + "epoch": 1.6716534459031096, + "grad_norm": 0.10307720303535461, + "learning_rate": 0.0008610531180076921, + "loss": 2.2465, + "step": 432430 + }, + { + "epoch": 1.6716921031064929, + "grad_norm": 0.2446184903383255, + "learning_rate": 0.0008608775307281486, + "loss": 2.2489, + "step": 432440 + }, + { + "epoch": 1.671730760309876, + "grad_norm": 0.1153741404414177, + "learning_rate": 0.0008607019705099108, + "loss": 2.2633, + "step": 432450 + }, + { + "epoch": 1.6717694175132594, + "grad_norm": 0.12404379993677139, + "learning_rate": 0.0008605264373404708, + "loss": 2.2432, + "step": 432460 + }, + { + "epoch": 1.6718080747166426, + "grad_norm": 0.11334104835987091, + "learning_rate": 0.0008603509312073298, + "loss": 2.239, + "step": 432470 + }, + { + "epoch": 1.6718467319200259, + "grad_norm": 0.11389446258544922, + "learning_rate": 0.0008601754520979994, + "loss": 2.2402, + "step": 432480 + }, + { + "epoch": 1.6718853891234091, + "grad_norm": 0.1087617427110672, + "learning_rate": 0.0008599999999999999, + "loss": 2.254, + "step": 432490 + }, + { + "epoch": 1.6719240463267926, + "grad_norm": 0.10388283431529999, + "learning_rate": 0.000859824574900862, + "loss": 2.2297, + "step": 432500 + }, + { + "epoch": 1.6719627035301758, + "grad_norm": 0.10273746401071548, + "learning_rate": 0.0008596491767881254, + "loss": 2.2353, + "step": 432510 + }, + { + "epoch": 1.672001360733559, + "grad_norm": 0.10638001561164856, + "learning_rate": 0.0008594738056493398, + "loss": 2.2482, + "step": 432520 + }, + { + "epoch": 1.6720400179369423, + "grad_norm": 0.11977694183588028, + "learning_rate": 0.0008592984614720642, + "loss": 2.2543, + "step": 432530 + }, + { + "epoch": 1.6720786751403256, + "grad_norm": 0.11505132168531418, + "learning_rate": 0.0008591231442438672, + "loss": 2.2266, + "step": 432540 + }, + { + "epoch": 1.672117332343709, + "grad_norm": 0.10981100797653198, + "learning_rate": 0.000858947853952327, + "loss": 2.2501, + "step": 432550 + }, + { + "epoch": 1.6721559895470923, + "grad_norm": 0.11681405454874039, + "learning_rate": 0.0008587725905850317, + "loss": 2.2366, + "step": 432560 + }, + { + "epoch": 1.6721946467504756, + "grad_norm": 0.10692845284938812, + "learning_rate": 0.000858597354129578, + "loss": 2.2335, + "step": 432570 + }, + { + "epoch": 1.6722333039538588, + "grad_norm": 0.11772792786359787, + "learning_rate": 0.0008584221445735731, + "loss": 2.2127, + "step": 432580 + }, + { + "epoch": 1.672271961157242, + "grad_norm": 0.1118854507803917, + "learning_rate": 0.0008582469619046331, + "loss": 2.2323, + "step": 432590 + }, + { + "epoch": 1.6723106183606253, + "grad_norm": 0.13144323229789734, + "learning_rate": 0.0008580718061103842, + "loss": 2.2465, + "step": 432600 + }, + { + "epoch": 1.6723492755640086, + "grad_norm": 0.12407439202070236, + "learning_rate": 0.0008578966771784613, + "loss": 2.2601, + "step": 432610 + }, + { + "epoch": 1.6723879327673918, + "grad_norm": 0.10566231608390808, + "learning_rate": 0.0008577215750965092, + "loss": 2.2387, + "step": 432620 + }, + { + "epoch": 1.672426589970775, + "grad_norm": 0.10437638312578201, + "learning_rate": 0.0008575464998521823, + "loss": 2.2387, + "step": 432630 + }, + { + "epoch": 1.6724652471741583, + "grad_norm": 0.10871285945177078, + "learning_rate": 0.0008573714514331439, + "loss": 2.2408, + "step": 432640 + }, + { + "epoch": 1.6725039043775416, + "grad_norm": 0.11669822037220001, + "learning_rate": 0.0008571964298270678, + "loss": 2.2493, + "step": 432650 + }, + { + "epoch": 1.6725425615809248, + "grad_norm": 0.10993584990501404, + "learning_rate": 0.0008570214350216361, + "loss": 2.242, + "step": 432660 + }, + { + "epoch": 1.6725812187843083, + "grad_norm": 0.11364573985338211, + "learning_rate": 0.000856846467004541, + "loss": 2.2404, + "step": 432670 + }, + { + "epoch": 1.6726198759876916, + "grad_norm": 0.2189556360244751, + "learning_rate": 0.0008566715257634838, + "loss": 2.2508, + "step": 432680 + }, + { + "epoch": 1.6726585331910748, + "grad_norm": 0.11758209019899368, + "learning_rate": 0.0008564966112861754, + "loss": 2.2624, + "step": 432690 + }, + { + "epoch": 1.672697190394458, + "grad_norm": 0.12056602537631989, + "learning_rate": 0.0008563217235603363, + "loss": 2.252, + "step": 432700 + }, + { + "epoch": 1.6727358475978413, + "grad_norm": 0.11321151256561279, + "learning_rate": 0.0008561468625736956, + "loss": 2.2469, + "step": 432710 + }, + { + "epoch": 1.6727745048012248, + "grad_norm": 0.10741432011127472, + "learning_rate": 0.0008559720283139926, + "loss": 2.2467, + "step": 432720 + }, + { + "epoch": 1.672813162004608, + "grad_norm": 0.10732309520244598, + "learning_rate": 0.0008557972207689757, + "loss": 2.2452, + "step": 432730 + }, + { + "epoch": 1.6728518192079913, + "grad_norm": 0.11348134279251099, + "learning_rate": 0.0008556224399264025, + "loss": 2.2465, + "step": 432740 + }, + { + "epoch": 1.6728904764113746, + "grad_norm": 0.10850790143013, + "learning_rate": 0.0008554476857740403, + "loss": 2.2373, + "step": 432750 + }, + { + "epoch": 1.6729291336147578, + "grad_norm": 0.106198750436306, + "learning_rate": 0.0008552729582996652, + "loss": 2.2509, + "step": 432760 + }, + { + "epoch": 1.672967790818141, + "grad_norm": 0.12779173254966736, + "learning_rate": 0.0008550982574910631, + "loss": 2.2461, + "step": 432770 + }, + { + "epoch": 1.6730064480215243, + "grad_norm": 0.11383172869682312, + "learning_rate": 0.0008549235833360292, + "loss": 2.247, + "step": 432780 + }, + { + "epoch": 1.6730451052249076, + "grad_norm": 0.10072283446788788, + "learning_rate": 0.0008547489358223674, + "loss": 2.2412, + "step": 432790 + }, + { + "epoch": 1.6730837624282908, + "grad_norm": 0.10738955438137054, + "learning_rate": 0.0008545743149378917, + "loss": 2.2481, + "step": 432800 + }, + { + "epoch": 1.673122419631674, + "grad_norm": 0.11644309014081955, + "learning_rate": 0.0008543997206704251, + "loss": 2.2521, + "step": 432810 + }, + { + "epoch": 1.6731610768350573, + "grad_norm": 0.10279635339975357, + "learning_rate": 0.0008542251530077998, + "loss": 2.2456, + "step": 432820 + }, + { + "epoch": 1.6731997340384406, + "grad_norm": 0.1066320389509201, + "learning_rate": 0.000854050611937857, + "loss": 2.2543, + "step": 432830 + }, + { + "epoch": 1.673238391241824, + "grad_norm": 0.11159369349479675, + "learning_rate": 0.0008538760974484476, + "loss": 2.241, + "step": 432840 + }, + { + "epoch": 1.6732770484452073, + "grad_norm": 0.13942484557628632, + "learning_rate": 0.0008537016095274319, + "loss": 2.2506, + "step": 432850 + }, + { + "epoch": 1.6733157056485906, + "grad_norm": 0.11398211121559143, + "learning_rate": 0.0008535271481626789, + "loss": 2.244, + "step": 432860 + }, + { + "epoch": 1.6733543628519738, + "grad_norm": 0.11198543012142181, + "learning_rate": 0.0008533527133420671, + "loss": 2.2465, + "step": 432870 + }, + { + "epoch": 1.673393020055357, + "grad_norm": 0.10565642267465591, + "learning_rate": 0.000853178305053484, + "loss": 2.2346, + "step": 432880 + }, + { + "epoch": 1.6734316772587405, + "grad_norm": 0.11750909686088562, + "learning_rate": 0.0008530039232848266, + "loss": 2.2336, + "step": 432890 + }, + { + "epoch": 1.6734703344621238, + "grad_norm": 0.13077422976493835, + "learning_rate": 0.000852829568024001, + "loss": 2.2583, + "step": 432900 + }, + { + "epoch": 1.673508991665507, + "grad_norm": 0.11255418509244919, + "learning_rate": 0.0008526552392589226, + "loss": 2.2267, + "step": 432910 + }, + { + "epoch": 1.6735476488688903, + "grad_norm": 0.11538899689912796, + "learning_rate": 0.0008524809369775159, + "loss": 2.2303, + "step": 432920 + }, + { + "epoch": 1.6735863060722735, + "grad_norm": 0.12092319875955582, + "learning_rate": 0.0008523066611677142, + "loss": 2.2376, + "step": 432930 + }, + { + "epoch": 1.6736249632756568, + "grad_norm": 0.1103396788239479, + "learning_rate": 0.0008521324118174605, + "loss": 2.2324, + "step": 432940 + }, + { + "epoch": 1.67366362047904, + "grad_norm": 0.11736710369586945, + "learning_rate": 0.0008519581889147068, + "loss": 2.2286, + "step": 432950 + }, + { + "epoch": 1.6737022776824233, + "grad_norm": 0.10646773129701614, + "learning_rate": 0.000851783992447414, + "loss": 2.2438, + "step": 432960 + }, + { + "epoch": 1.6737409348858066, + "grad_norm": 0.15531231462955475, + "learning_rate": 0.0008516098224035526, + "loss": 2.2338, + "step": 432970 + }, + { + "epoch": 1.6737795920891898, + "grad_norm": 0.11417527496814728, + "learning_rate": 0.0008514356787711017, + "loss": 2.2416, + "step": 432980 + }, + { + "epoch": 1.673818249292573, + "grad_norm": 0.11312133818864822, + "learning_rate": 0.0008512615615380496, + "loss": 2.2368, + "step": 432990 + }, + { + "epoch": 1.6738569064959563, + "grad_norm": 0.10843250900506973, + "learning_rate": 0.0008510874706923943, + "loss": 2.2359, + "step": 433000 + }, + { + "epoch": 1.6738955636993398, + "grad_norm": 0.12344299256801605, + "learning_rate": 0.000850913406222142, + "loss": 2.2437, + "step": 433010 + }, + { + "epoch": 1.673934220902723, + "grad_norm": 0.11518251150846481, + "learning_rate": 0.0008507393681153086, + "loss": 2.2436, + "step": 433020 + }, + { + "epoch": 1.6739728781061063, + "grad_norm": 0.10837063193321228, + "learning_rate": 0.000850565356359919, + "loss": 2.2399, + "step": 433030 + }, + { + "epoch": 1.6740115353094895, + "grad_norm": 0.10487712919712067, + "learning_rate": 0.0008503913709440068, + "loss": 2.2361, + "step": 433040 + }, + { + "epoch": 1.674050192512873, + "grad_norm": 0.11273378878831863, + "learning_rate": 0.0008502174118556152, + "loss": 2.2508, + "step": 433050 + }, + { + "epoch": 1.6740888497162563, + "grad_norm": 0.1132865771651268, + "learning_rate": 0.000850043479082796, + "loss": 2.2494, + "step": 433060 + }, + { + "epoch": 1.6741275069196395, + "grad_norm": 0.11141993850469589, + "learning_rate": 0.0008498695726136101, + "loss": 2.2422, + "step": 433070 + }, + { + "epoch": 1.6741661641230228, + "grad_norm": 0.11210117489099503, + "learning_rate": 0.0008496956924361277, + "loss": 2.2295, + "step": 433080 + }, + { + "epoch": 1.674204821326406, + "grad_norm": 0.11931067705154419, + "learning_rate": 0.0008495218385384276, + "loss": 2.2409, + "step": 433090 + }, + { + "epoch": 1.6742434785297893, + "grad_norm": 0.10923109203577042, + "learning_rate": 0.000849348010908598, + "loss": 2.2518, + "step": 433100 + }, + { + "epoch": 1.6742821357331725, + "grad_norm": 0.11988096684217453, + "learning_rate": 0.0008491742095347359, + "loss": 2.2395, + "step": 433110 + }, + { + "epoch": 1.6743207929365558, + "grad_norm": 0.10944250971078873, + "learning_rate": 0.0008490004344049474, + "loss": 2.251, + "step": 433120 + }, + { + "epoch": 1.674359450139939, + "grad_norm": 0.11326149851083755, + "learning_rate": 0.0008488266855073472, + "loss": 2.2478, + "step": 433130 + }, + { + "epoch": 1.6743981073433223, + "grad_norm": 0.10903026908636093, + "learning_rate": 0.0008486529628300597, + "loss": 2.2318, + "step": 433140 + }, + { + "epoch": 1.6744367645467055, + "grad_norm": 0.11459324508905411, + "learning_rate": 0.0008484792663612176, + "loss": 2.2409, + "step": 433150 + }, + { + "epoch": 1.6744754217500888, + "grad_norm": 0.1096375361084938, + "learning_rate": 0.0008483055960889625, + "loss": 2.2473, + "step": 433160 + }, + { + "epoch": 1.674514078953472, + "grad_norm": 0.6692434549331665, + "learning_rate": 0.0008481319520014457, + "loss": 2.2463, + "step": 433170 + }, + { + "epoch": 1.6745527361568555, + "grad_norm": 0.10968945920467377, + "learning_rate": 0.0008479583340868267, + "loss": 2.2401, + "step": 433180 + }, + { + "epoch": 1.6745913933602388, + "grad_norm": 0.1067478135228157, + "learning_rate": 0.000847784742333274, + "loss": 2.244, + "step": 433190 + }, + { + "epoch": 1.674630050563622, + "grad_norm": 0.11387550830841064, + "learning_rate": 0.0008476111767289654, + "loss": 2.2493, + "step": 433200 + }, + { + "epoch": 1.6746687077670053, + "grad_norm": 0.1319890320301056, + "learning_rate": 0.0008474376372620873, + "loss": 2.2588, + "step": 433210 + }, + { + "epoch": 1.6747073649703887, + "grad_norm": 0.1147097498178482, + "learning_rate": 0.000847264123920835, + "loss": 2.2387, + "step": 433220 + }, + { + "epoch": 1.674746022173772, + "grad_norm": 0.09701906889677048, + "learning_rate": 0.000847090636693413, + "loss": 2.2458, + "step": 433230 + }, + { + "epoch": 1.6747846793771552, + "grad_norm": 0.11148305237293243, + "learning_rate": 0.000846917175568034, + "loss": 2.2552, + "step": 433240 + }, + { + "epoch": 1.6748233365805385, + "grad_norm": 0.11302665621042252, + "learning_rate": 0.0008467437405329203, + "loss": 2.253, + "step": 433250 + }, + { + "epoch": 1.6748619937839218, + "grad_norm": 0.1179070770740509, + "learning_rate": 0.0008465703315763029, + "loss": 2.2427, + "step": 433260 + }, + { + "epoch": 1.674900650987305, + "grad_norm": 0.10579831898212433, + "learning_rate": 0.0008463969486864209, + "loss": 2.2545, + "step": 433270 + }, + { + "epoch": 1.6749393081906883, + "grad_norm": 0.11859673261642456, + "learning_rate": 0.0008462235918515235, + "loss": 2.2473, + "step": 433280 + }, + { + "epoch": 1.6749779653940715, + "grad_norm": 0.11856315284967422, + "learning_rate": 0.0008460502610598675, + "loss": 2.2396, + "step": 433290 + }, + { + "epoch": 1.6750166225974548, + "grad_norm": 0.10576929897069931, + "learning_rate": 0.0008458769562997193, + "loss": 2.2248, + "step": 433300 + }, + { + "epoch": 1.675055279800838, + "grad_norm": 0.1199721097946167, + "learning_rate": 0.0008457036775593538, + "loss": 2.2395, + "step": 433310 + }, + { + "epoch": 1.6750939370042213, + "grad_norm": 0.1224532350897789, + "learning_rate": 0.000845530424827055, + "loss": 2.2324, + "step": 433320 + }, + { + "epoch": 1.6751325942076045, + "grad_norm": 0.11451449990272522, + "learning_rate": 0.0008453571980911153, + "loss": 2.2386, + "step": 433330 + }, + { + "epoch": 1.6751712514109878, + "grad_norm": 0.11305505037307739, + "learning_rate": 0.000845183997339836, + "loss": 2.2481, + "step": 433340 + }, + { + "epoch": 1.6752099086143712, + "grad_norm": 0.11151070147752762, + "learning_rate": 0.0008450108225615271, + "loss": 2.2443, + "step": 433350 + }, + { + "epoch": 1.6752485658177545, + "grad_norm": 0.11432652175426483, + "learning_rate": 0.0008448376737445079, + "loss": 2.249, + "step": 433360 + }, + { + "epoch": 1.6752872230211378, + "grad_norm": 0.11757257580757141, + "learning_rate": 0.0008446645508771056, + "loss": 2.2494, + "step": 433370 + }, + { + "epoch": 1.675325880224521, + "grad_norm": 0.11913491040468216, + "learning_rate": 0.0008444914539476569, + "loss": 2.2326, + "step": 433380 + }, + { + "epoch": 1.6753645374279045, + "grad_norm": 0.11336777359247208, + "learning_rate": 0.0008443183829445067, + "loss": 2.2449, + "step": 433390 + }, + { + "epoch": 1.6754031946312877, + "grad_norm": 0.11028832942247391, + "learning_rate": 0.0008441453378560089, + "loss": 2.2437, + "step": 433400 + }, + { + "epoch": 1.675441851834671, + "grad_norm": 0.11487679183483124, + "learning_rate": 0.000843972318670526, + "loss": 2.2354, + "step": 433410 + }, + { + "epoch": 1.6754805090380542, + "grad_norm": 0.11123806983232498, + "learning_rate": 0.0008437993253764294, + "loss": 2.2586, + "step": 433420 + }, + { + "epoch": 1.6755191662414375, + "grad_norm": 0.11151211708784103, + "learning_rate": 0.0008436263579620989, + "loss": 2.2376, + "step": 433430 + }, + { + "epoch": 1.6755578234448207, + "grad_norm": 0.11394939571619034, + "learning_rate": 0.0008434534164159233, + "loss": 2.2531, + "step": 433440 + }, + { + "epoch": 1.675596480648204, + "grad_norm": 0.11680968105792999, + "learning_rate": 0.0008432805007262996, + "loss": 2.2414, + "step": 433450 + }, + { + "epoch": 1.6756351378515872, + "grad_norm": 0.11009000986814499, + "learning_rate": 0.0008431076108816342, + "loss": 2.2546, + "step": 433460 + }, + { + "epoch": 1.6756737950549705, + "grad_norm": 0.12866313755512238, + "learning_rate": 0.0008429347468703419, + "loss": 2.2518, + "step": 433470 + }, + { + "epoch": 1.6757124522583537, + "grad_norm": 0.10869014263153076, + "learning_rate": 0.0008427619086808453, + "loss": 2.2603, + "step": 433480 + }, + { + "epoch": 1.675751109461737, + "grad_norm": 0.11136332899332047, + "learning_rate": 0.0008425890963015772, + "loss": 2.2243, + "step": 433490 + }, + { + "epoch": 1.6757897666651203, + "grad_norm": 0.10977095365524292, + "learning_rate": 0.0008424163097209774, + "loss": 2.2328, + "step": 433500 + }, + { + "epoch": 1.6758284238685035, + "grad_norm": 0.11136292666196823, + "learning_rate": 0.0008422435489274958, + "loss": 2.2468, + "step": 433510 + }, + { + "epoch": 1.675867081071887, + "grad_norm": 0.10493572056293488, + "learning_rate": 0.0008420708139095899, + "loss": 2.2378, + "step": 433520 + }, + { + "epoch": 1.6759057382752702, + "grad_norm": 0.1025480106472969, + "learning_rate": 0.0008418981046557259, + "loss": 2.243, + "step": 433530 + }, + { + "epoch": 1.6759443954786535, + "grad_norm": 0.10371068120002747, + "learning_rate": 0.0008417254211543795, + "loss": 2.2481, + "step": 433540 + }, + { + "epoch": 1.6759830526820367, + "grad_norm": 0.1214233785867691, + "learning_rate": 0.0008415527633940336, + "loss": 2.2477, + "step": 433550 + }, + { + "epoch": 1.6760217098854202, + "grad_norm": 0.11474094539880753, + "learning_rate": 0.0008413801313631808, + "loss": 2.2529, + "step": 433560 + }, + { + "epoch": 1.6760603670888035, + "grad_norm": 0.11168044805526733, + "learning_rate": 0.0008412075250503222, + "loss": 2.2294, + "step": 433570 + }, + { + "epoch": 1.6760990242921867, + "grad_norm": 0.11817635595798492, + "learning_rate": 0.0008410349444439664, + "loss": 2.2516, + "step": 433580 + }, + { + "epoch": 1.67613768149557, + "grad_norm": 0.12926152348518372, + "learning_rate": 0.0008408623895326319, + "loss": 2.2297, + "step": 433590 + }, + { + "epoch": 1.6761763386989532, + "grad_norm": 0.1127639040350914, + "learning_rate": 0.000840689860304845, + "loss": 2.2331, + "step": 433600 + }, + { + "epoch": 1.6762149959023365, + "grad_norm": 0.10601234436035156, + "learning_rate": 0.0008405173567491405, + "loss": 2.2555, + "step": 433610 + }, + { + "epoch": 1.6762536531057197, + "grad_norm": 0.10784386098384857, + "learning_rate": 0.0008403448788540621, + "loss": 2.2521, + "step": 433620 + }, + { + "epoch": 1.676292310309103, + "grad_norm": 0.1180407926440239, + "learning_rate": 0.0008401724266081618, + "loss": 2.2318, + "step": 433630 + }, + { + "epoch": 1.6763309675124862, + "grad_norm": 0.12449698150157928, + "learning_rate": 0.0008400000000000001, + "loss": 2.2353, + "step": 433640 + }, + { + "epoch": 1.6763696247158695, + "grad_norm": 0.11709899455308914, + "learning_rate": 0.0008398275990181459, + "loss": 2.2449, + "step": 433650 + }, + { + "epoch": 1.6764082819192527, + "grad_norm": 0.1138455793261528, + "learning_rate": 0.0008396552236511769, + "loss": 2.2635, + "step": 433660 + }, + { + "epoch": 1.676446939122636, + "grad_norm": 0.10678897798061371, + "learning_rate": 0.0008394828738876795, + "loss": 2.223, + "step": 433670 + }, + { + "epoch": 1.6764855963260192, + "grad_norm": 0.10805068910121918, + "learning_rate": 0.0008393105497162475, + "loss": 2.2387, + "step": 433680 + }, + { + "epoch": 1.6765242535294027, + "grad_norm": 0.10988160222768784, + "learning_rate": 0.0008391382511254839, + "loss": 2.2353, + "step": 433690 + }, + { + "epoch": 1.676562910732786, + "grad_norm": 0.13759155571460724, + "learning_rate": 0.0008389659781040007, + "loss": 2.2423, + "step": 433700 + }, + { + "epoch": 1.6766015679361692, + "grad_norm": 0.10952576994895935, + "learning_rate": 0.0008387937306404172, + "loss": 2.2527, + "step": 433710 + }, + { + "epoch": 1.6766402251395525, + "grad_norm": 0.11295206844806671, + "learning_rate": 0.0008386215087233619, + "loss": 2.2396, + "step": 433720 + }, + { + "epoch": 1.676678882342936, + "grad_norm": 0.10747696459293365, + "learning_rate": 0.0008384493123414718, + "loss": 2.2501, + "step": 433730 + }, + { + "epoch": 1.6767175395463192, + "grad_norm": 0.5068920254707336, + "learning_rate": 0.0008382771414833916, + "loss": 2.2422, + "step": 433740 + }, + { + "epoch": 1.6767561967497024, + "grad_norm": 0.11631965637207031, + "learning_rate": 0.0008381049961377749, + "loss": 2.2445, + "step": 433750 + }, + { + "epoch": 1.6767948539530857, + "grad_norm": 0.11662419885396957, + "learning_rate": 0.000837932876293284, + "loss": 2.2426, + "step": 433760 + }, + { + "epoch": 1.676833511156469, + "grad_norm": 0.11134974658489227, + "learning_rate": 0.0008377607819385891, + "loss": 2.2489, + "step": 433770 + }, + { + "epoch": 1.6768721683598522, + "grad_norm": 0.10038750618696213, + "learning_rate": 0.0008375887130623687, + "loss": 2.243, + "step": 433780 + }, + { + "epoch": 1.6769108255632355, + "grad_norm": 0.1259811818599701, + "learning_rate": 0.0008374166696533106, + "loss": 2.2413, + "step": 433790 + }, + { + "epoch": 1.6769494827666187, + "grad_norm": 0.11071491986513138, + "learning_rate": 0.0008372446517001094, + "loss": 2.2426, + "step": 433800 + }, + { + "epoch": 1.676988139970002, + "grad_norm": 0.11462465673685074, + "learning_rate": 0.0008370726591914694, + "loss": 2.2504, + "step": 433810 + }, + { + "epoch": 1.6770267971733852, + "grad_norm": 0.11320709437131882, + "learning_rate": 0.000836900692116103, + "loss": 2.2483, + "step": 433820 + }, + { + "epoch": 1.6770654543767685, + "grad_norm": 0.11926060914993286, + "learning_rate": 0.0008367287504627308, + "loss": 2.2477, + "step": 433830 + }, + { + "epoch": 1.6771041115801517, + "grad_norm": 0.11070722341537476, + "learning_rate": 0.0008365568342200812, + "loss": 2.2517, + "step": 433840 + }, + { + "epoch": 1.677142768783535, + "grad_norm": 0.1156579926609993, + "learning_rate": 0.0008363849433768915, + "loss": 2.2366, + "step": 433850 + }, + { + "epoch": 1.6771814259869184, + "grad_norm": 0.7100057005882263, + "learning_rate": 0.0008362130779219075, + "loss": 2.2295, + "step": 433860 + }, + { + "epoch": 1.6772200831903017, + "grad_norm": 0.115716353058815, + "learning_rate": 0.0008360412378438831, + "loss": 2.2474, + "step": 433870 + }, + { + "epoch": 1.677258740393685, + "grad_norm": 0.11384305357933044, + "learning_rate": 0.0008358694231315802, + "loss": 2.2434, + "step": 433880 + }, + { + "epoch": 1.6772973975970682, + "grad_norm": 0.12071774899959564, + "learning_rate": 0.0008356976337737692, + "loss": 2.2407, + "step": 433890 + }, + { + "epoch": 1.6773360548004517, + "grad_norm": 0.12149536609649658, + "learning_rate": 0.000835525869759229, + "loss": 2.2504, + "step": 433900 + }, + { + "epoch": 1.677374712003835, + "grad_norm": 0.12283730506896973, + "learning_rate": 0.0008353541310767465, + "loss": 2.2382, + "step": 433910 + }, + { + "epoch": 1.6774133692072182, + "grad_norm": 0.11751092225313187, + "learning_rate": 0.000835182417715117, + "loss": 2.2325, + "step": 433920 + }, + { + "epoch": 1.6774520264106014, + "grad_norm": 0.11363212764263153, + "learning_rate": 0.0008350107296631442, + "loss": 2.2431, + "step": 433930 + }, + { + "epoch": 1.6774906836139847, + "grad_norm": 0.10969270765781403, + "learning_rate": 0.0008348390669096393, + "loss": 2.2233, + "step": 433940 + }, + { + "epoch": 1.677529340817368, + "grad_norm": 0.10469599813222885, + "learning_rate": 0.0008346674294434226, + "loss": 2.2186, + "step": 433950 + }, + { + "epoch": 1.6775679980207512, + "grad_norm": 0.11149562895298004, + "learning_rate": 0.0008344958172533228, + "loss": 2.2441, + "step": 433960 + }, + { + "epoch": 1.6776066552241344, + "grad_norm": 0.11854884773492813, + "learning_rate": 0.0008343242303281757, + "loss": 2.2403, + "step": 433970 + }, + { + "epoch": 1.6776453124275177, + "grad_norm": 0.11190243810415268, + "learning_rate": 0.0008341526686568264, + "loss": 2.2354, + "step": 433980 + }, + { + "epoch": 1.677683969630901, + "grad_norm": 0.10802937299013138, + "learning_rate": 0.0008339811322281275, + "loss": 2.2483, + "step": 433990 + }, + { + "epoch": 1.6777226268342842, + "grad_norm": 0.10827367752790451, + "learning_rate": 0.0008338096210309398, + "loss": 2.2282, + "step": 434000 + }, + { + "epoch": 1.6777612840376674, + "grad_norm": 0.12005242705345154, + "learning_rate": 0.0008336381350541333, + "loss": 2.2401, + "step": 434010 + }, + { + "epoch": 1.6777999412410507, + "grad_norm": 0.11597011983394623, + "learning_rate": 0.0008334666742865851, + "loss": 2.2423, + "step": 434020 + }, + { + "epoch": 1.6778385984444342, + "grad_norm": 0.10232312232255936, + "learning_rate": 0.0008332952387171809, + "loss": 2.2447, + "step": 434030 + }, + { + "epoch": 1.6778772556478174, + "grad_norm": 0.12289629131555557, + "learning_rate": 0.0008331238283348143, + "loss": 2.2457, + "step": 434040 + }, + { + "epoch": 1.6779159128512007, + "grad_norm": 0.11531048268079758, + "learning_rate": 0.0008329524431283873, + "loss": 2.2458, + "step": 434050 + }, + { + "epoch": 1.677954570054584, + "grad_norm": 0.10182934254407883, + "learning_rate": 0.0008327810830868101, + "loss": 2.2387, + "step": 434060 + }, + { + "epoch": 1.6779932272579674, + "grad_norm": 0.11879070103168488, + "learning_rate": 0.0008326097481990009, + "loss": 2.2588, + "step": 434070 + }, + { + "epoch": 1.6780318844613507, + "grad_norm": 0.1200638934969902, + "learning_rate": 0.0008324384384538862, + "loss": 2.2382, + "step": 434080 + }, + { + "epoch": 1.678070541664734, + "grad_norm": 0.12407013773918152, + "learning_rate": 0.0008322671538404001, + "loss": 2.2371, + "step": 434090 + }, + { + "epoch": 1.6781091988681172, + "grad_norm": 0.1167188510298729, + "learning_rate": 0.0008320958943474854, + "loss": 2.2397, + "step": 434100 + }, + { + "epoch": 1.6781478560715004, + "grad_norm": 0.11031246930360794, + "learning_rate": 0.0008319246599640929, + "loss": 2.2392, + "step": 434110 + }, + { + "epoch": 1.6781865132748837, + "grad_norm": 0.11474674940109253, + "learning_rate": 0.0008317534506791813, + "loss": 2.2337, + "step": 434120 + }, + { + "epoch": 1.678225170478267, + "grad_norm": 0.11170880496501923, + "learning_rate": 0.0008315822664817176, + "loss": 2.2453, + "step": 434130 + }, + { + "epoch": 1.6782638276816502, + "grad_norm": 0.10359736531972885, + "learning_rate": 0.0008314111073606767, + "loss": 2.24, + "step": 434140 + }, + { + "epoch": 1.6783024848850334, + "grad_norm": 0.1339753270149231, + "learning_rate": 0.0008312399733050415, + "loss": 2.2454, + "step": 434150 + }, + { + "epoch": 1.6783411420884167, + "grad_norm": 0.11264664679765701, + "learning_rate": 0.0008310688643038034, + "loss": 2.2393, + "step": 434160 + }, + { + "epoch": 1.6783797992918, + "grad_norm": 0.11087250709533691, + "learning_rate": 0.0008308977803459614, + "loss": 2.2491, + "step": 434170 + }, + { + "epoch": 1.6784184564951832, + "grad_norm": 0.1208563819527626, + "learning_rate": 0.0008307267214205228, + "loss": 2.2409, + "step": 434180 + }, + { + "epoch": 1.6784571136985664, + "grad_norm": 0.10672164708375931, + "learning_rate": 0.0008305556875165026, + "loss": 2.2378, + "step": 434190 + }, + { + "epoch": 1.67849577090195, + "grad_norm": 0.1270405799150467, + "learning_rate": 0.0008303846786229243, + "loss": 2.2528, + "step": 434200 + }, + { + "epoch": 1.6785344281053332, + "grad_norm": 0.12053734809160233, + "learning_rate": 0.0008302136947288193, + "loss": 2.2447, + "step": 434210 + }, + { + "epoch": 1.6785730853087164, + "grad_norm": 0.11692651361227036, + "learning_rate": 0.0008300427358232268, + "loss": 2.248, + "step": 434220 + }, + { + "epoch": 1.6786117425120997, + "grad_norm": 0.11162137240171432, + "learning_rate": 0.0008298718018951941, + "loss": 2.2481, + "step": 434230 + }, + { + "epoch": 1.6786503997154831, + "grad_norm": 0.11267007142305374, + "learning_rate": 0.0008297008929337766, + "loss": 2.2286, + "step": 434240 + }, + { + "epoch": 1.6786890569188664, + "grad_norm": 0.10077975690364838, + "learning_rate": 0.0008295300089280375, + "loss": 2.2344, + "step": 434250 + }, + { + "epoch": 1.6787277141222496, + "grad_norm": 0.10727986693382263, + "learning_rate": 0.0008293591498670483, + "loss": 2.2429, + "step": 434260 + }, + { + "epoch": 1.678766371325633, + "grad_norm": 0.1061238944530487, + "learning_rate": 0.000829188315739888, + "loss": 2.2419, + "step": 434270 + }, + { + "epoch": 1.6788050285290161, + "grad_norm": 0.10600124299526215, + "learning_rate": 0.0008290175065356442, + "loss": 2.2373, + "step": 434280 + }, + { + "epoch": 1.6788436857323994, + "grad_norm": 0.10449835658073425, + "learning_rate": 0.0008288467222434119, + "loss": 2.232, + "step": 434290 + }, + { + "epoch": 1.6788823429357826, + "grad_norm": 0.11127332597970963, + "learning_rate": 0.0008286759628522942, + "loss": 2.2503, + "step": 434300 + }, + { + "epoch": 1.678921000139166, + "grad_norm": 0.11499613523483276, + "learning_rate": 0.0008285052283514023, + "loss": 2.2404, + "step": 434310 + }, + { + "epoch": 1.6789596573425491, + "grad_norm": 0.12473177164793015, + "learning_rate": 0.0008283345187298552, + "loss": 2.2386, + "step": 434320 + }, + { + "epoch": 1.6789983145459324, + "grad_norm": 0.11327109485864639, + "learning_rate": 0.0008281638339767798, + "loss": 2.2629, + "step": 434330 + }, + { + "epoch": 1.6790369717493157, + "grad_norm": 0.11517111957073212, + "learning_rate": 0.0008279931740813112, + "loss": 2.2466, + "step": 434340 + }, + { + "epoch": 1.679075628952699, + "grad_norm": 0.11408182233572006, + "learning_rate": 0.0008278225390325918, + "loss": 2.2413, + "step": 434350 + }, + { + "epoch": 1.6791142861560822, + "grad_norm": 0.12840931117534637, + "learning_rate": 0.0008276519288197724, + "loss": 2.2428, + "step": 434360 + }, + { + "epoch": 1.6791529433594656, + "grad_norm": 0.11731917411088943, + "learning_rate": 0.0008274813434320117, + "loss": 2.2335, + "step": 434370 + }, + { + "epoch": 1.6791916005628489, + "grad_norm": 0.12040312588214874, + "learning_rate": 0.0008273107828584762, + "loss": 2.2504, + "step": 434380 + }, + { + "epoch": 1.6792302577662321, + "grad_norm": 0.11635112762451172, + "learning_rate": 0.00082714024708834, + "loss": 2.2268, + "step": 434390 + }, + { + "epoch": 1.6792689149696154, + "grad_norm": 0.11051749438047409, + "learning_rate": 0.0008269697361107857, + "loss": 2.2221, + "step": 434400 + }, + { + "epoch": 1.6793075721729989, + "grad_norm": 0.10762675106525421, + "learning_rate": 0.0008267992499150028, + "loss": 2.2511, + "step": 434410 + }, + { + "epoch": 1.6793462293763821, + "grad_norm": 0.10734720528125763, + "learning_rate": 0.0008266287884901897, + "loss": 2.2316, + "step": 434420 + }, + { + "epoch": 1.6793848865797654, + "grad_norm": 0.1141144186258316, + "learning_rate": 0.000826458351825552, + "loss": 2.2486, + "step": 434430 + }, + { + "epoch": 1.6794235437831486, + "grad_norm": 0.12552301585674286, + "learning_rate": 0.0008262879399103031, + "loss": 2.2438, + "step": 434440 + }, + { + "epoch": 1.6794622009865319, + "grad_norm": 0.11154180765151978, + "learning_rate": 0.0008261175527336651, + "loss": 2.2512, + "step": 434450 + }, + { + "epoch": 1.6795008581899151, + "grad_norm": 0.115838922560215, + "learning_rate": 0.0008259471902848663, + "loss": 2.2562, + "step": 434460 + }, + { + "epoch": 1.6795395153932984, + "grad_norm": 0.11728062480688095, + "learning_rate": 0.0008257768525531444, + "loss": 2.2508, + "step": 434470 + }, + { + "epoch": 1.6795781725966816, + "grad_norm": 0.11453123390674591, + "learning_rate": 0.0008256065395277441, + "loss": 2.2384, + "step": 434480 + }, + { + "epoch": 1.6796168298000649, + "grad_norm": 0.11977676302194595, + "learning_rate": 0.0008254362511979181, + "loss": 2.2453, + "step": 434490 + }, + { + "epoch": 1.6796554870034481, + "grad_norm": 0.10919841378927231, + "learning_rate": 0.000825265987552927, + "loss": 2.241, + "step": 434500 + }, + { + "epoch": 1.6796941442068314, + "grad_norm": 0.1256256252527237, + "learning_rate": 0.0008250957485820388, + "loss": 2.2508, + "step": 434510 + }, + { + "epoch": 1.6797328014102146, + "grad_norm": 0.11276651918888092, + "learning_rate": 0.0008249255342745295, + "loss": 2.227, + "step": 434520 + }, + { + "epoch": 1.6797714586135981, + "grad_norm": 0.14407971501350403, + "learning_rate": 0.0008247553446196832, + "loss": 2.2511, + "step": 434530 + }, + { + "epoch": 1.6798101158169814, + "grad_norm": 0.11039101332426071, + "learning_rate": 0.0008245851796067909, + "loss": 2.2354, + "step": 434540 + }, + { + "epoch": 1.6798487730203646, + "grad_norm": 0.12132669985294342, + "learning_rate": 0.0008244150392251524, + "loss": 2.2445, + "step": 434550 + }, + { + "epoch": 1.6798874302237479, + "grad_norm": 0.10520847141742706, + "learning_rate": 0.0008242449234640745, + "loss": 2.2486, + "step": 434560 + }, + { + "epoch": 1.6799260874271311, + "grad_norm": 0.10257355123758316, + "learning_rate": 0.000824074832312872, + "loss": 2.2381, + "step": 434570 + }, + { + "epoch": 1.6799647446305146, + "grad_norm": 0.11039949953556061, + "learning_rate": 0.0008239047657608676, + "loss": 2.2327, + "step": 434580 + }, + { + "epoch": 1.6800034018338978, + "grad_norm": 0.11216014623641968, + "learning_rate": 0.0008237347237973911, + "loss": 2.2398, + "step": 434590 + }, + { + "epoch": 1.680042059037281, + "grad_norm": 0.11249549686908722, + "learning_rate": 0.0008235647064117806, + "loss": 2.2475, + "step": 434600 + }, + { + "epoch": 1.6800807162406644, + "grad_norm": 0.12227798253297806, + "learning_rate": 0.0008233947135933817, + "loss": 2.2414, + "step": 434610 + }, + { + "epoch": 1.6801193734440476, + "grad_norm": 0.1259644478559494, + "learning_rate": 0.000823224745331548, + "loss": 2.2347, + "step": 434620 + }, + { + "epoch": 1.6801580306474309, + "grad_norm": 0.10499641299247742, + "learning_rate": 0.0008230548016156403, + "loss": 2.238, + "step": 434630 + }, + { + "epoch": 1.680196687850814, + "grad_norm": 0.10853856056928635, + "learning_rate": 0.0008228848824350272, + "loss": 2.2311, + "step": 434640 + }, + { + "epoch": 1.6802353450541974, + "grad_norm": 0.11641088128089905, + "learning_rate": 0.0008227149877790852, + "loss": 2.2381, + "step": 434650 + }, + { + "epoch": 1.6802740022575806, + "grad_norm": 0.1016199067234993, + "learning_rate": 0.0008225451176371979, + "loss": 2.2369, + "step": 434660 + }, + { + "epoch": 1.6803126594609639, + "grad_norm": 0.10874945670366287, + "learning_rate": 0.0008223752719987576, + "loss": 2.2379, + "step": 434670 + }, + { + "epoch": 1.6803513166643471, + "grad_norm": 0.10710814595222473, + "learning_rate": 0.0008222054508531635, + "loss": 2.2216, + "step": 434680 + }, + { + "epoch": 1.6803899738677304, + "grad_norm": 0.11364852637052536, + "learning_rate": 0.0008220356541898224, + "loss": 2.225, + "step": 434690 + }, + { + "epoch": 1.6804286310711138, + "grad_norm": 0.1084376871585846, + "learning_rate": 0.0008218658819981489, + "loss": 2.2278, + "step": 434700 + }, + { + "epoch": 1.680467288274497, + "grad_norm": 0.10847639292478561, + "learning_rate": 0.0008216961342675651, + "loss": 2.2327, + "step": 434710 + }, + { + "epoch": 1.6805059454778803, + "grad_norm": 0.11690942943096161, + "learning_rate": 0.0008215264109875011, + "loss": 2.2303, + "step": 434720 + }, + { + "epoch": 1.6805446026812636, + "grad_norm": 0.13263235986232758, + "learning_rate": 0.0008213567121473944, + "loss": 2.2391, + "step": 434730 + }, + { + "epoch": 1.6805832598846469, + "grad_norm": 0.11565765738487244, + "learning_rate": 0.0008211870377366899, + "loss": 2.2387, + "step": 434740 + }, + { + "epoch": 1.6806219170880303, + "grad_norm": 0.10726010799407959, + "learning_rate": 0.0008210173877448403, + "loss": 2.2389, + "step": 434750 + }, + { + "epoch": 1.6806605742914136, + "grad_norm": 0.12307916581630707, + "learning_rate": 0.000820847762161306, + "loss": 2.2329, + "step": 434760 + }, + { + "epoch": 1.6806992314947968, + "grad_norm": 0.12959054112434387, + "learning_rate": 0.0008206781609755544, + "loss": 2.2452, + "step": 434770 + }, + { + "epoch": 1.68073788869818, + "grad_norm": 0.10963719338178635, + "learning_rate": 0.0008205085841770614, + "loss": 2.2464, + "step": 434780 + }, + { + "epoch": 1.6807765459015633, + "grad_norm": 0.11189696192741394, + "learning_rate": 0.0008203390317553099, + "loss": 2.2463, + "step": 434790 + }, + { + "epoch": 1.6808152031049466, + "grad_norm": 0.12894082069396973, + "learning_rate": 0.00082016950369979, + "loss": 2.2488, + "step": 434800 + }, + { + "epoch": 1.6808538603083298, + "grad_norm": 0.11304847151041031, + "learning_rate": 0.0008200000000000001, + "loss": 2.2495, + "step": 434810 + }, + { + "epoch": 1.680892517511713, + "grad_norm": 0.09899675101041794, + "learning_rate": 0.0008198305206454455, + "loss": 2.2271, + "step": 434820 + }, + { + "epoch": 1.6809311747150963, + "grad_norm": 0.11843240261077881, + "learning_rate": 0.0008196610656256398, + "loss": 2.2536, + "step": 434830 + }, + { + "epoch": 1.6809698319184796, + "grad_norm": 0.11657701432704926, + "learning_rate": 0.0008194916349301036, + "loss": 2.2409, + "step": 434840 + }, + { + "epoch": 1.6810084891218628, + "grad_norm": 0.11496693640947342, + "learning_rate": 0.0008193222285483648, + "loss": 2.2426, + "step": 434850 + }, + { + "epoch": 1.681047146325246, + "grad_norm": 0.10746250301599503, + "learning_rate": 0.0008191528464699591, + "loss": 2.2362, + "step": 434860 + }, + { + "epoch": 1.6810858035286296, + "grad_norm": 0.10326814651489258, + "learning_rate": 0.0008189834886844299, + "loss": 2.2427, + "step": 434870 + }, + { + "epoch": 1.6811244607320128, + "grad_norm": 0.11777054518461227, + "learning_rate": 0.0008188141551813279, + "loss": 2.2526, + "step": 434880 + }, + { + "epoch": 1.681163117935396, + "grad_norm": 0.11334793269634247, + "learning_rate": 0.0008186448459502113, + "loss": 2.2404, + "step": 434890 + }, + { + "epoch": 1.6812017751387793, + "grad_norm": 0.11864189058542252, + "learning_rate": 0.0008184755609806458, + "loss": 2.2406, + "step": 434900 + }, + { + "epoch": 1.6812404323421628, + "grad_norm": 0.10265098512172699, + "learning_rate": 0.0008183063002622042, + "loss": 2.2526, + "step": 434910 + }, + { + "epoch": 1.681279089545546, + "grad_norm": 0.1151779443025589, + "learning_rate": 0.0008181370637844674, + "loss": 2.2438, + "step": 434920 + }, + { + "epoch": 1.6813177467489293, + "grad_norm": 0.11113797873258591, + "learning_rate": 0.0008179678515370235, + "loss": 2.2429, + "step": 434930 + }, + { + "epoch": 1.6813564039523126, + "grad_norm": 0.1169753447175026, + "learning_rate": 0.0008177986635094681, + "loss": 2.2402, + "step": 434940 + }, + { + "epoch": 1.6813950611556958, + "grad_norm": 0.11297095566987991, + "learning_rate": 0.000817629499691404, + "loss": 2.2376, + "step": 434950 + }, + { + "epoch": 1.681433718359079, + "grad_norm": 0.11857719719409943, + "learning_rate": 0.0008174603600724413, + "loss": 2.2458, + "step": 434960 + }, + { + "epoch": 1.6814723755624623, + "grad_norm": 0.11382398009300232, + "learning_rate": 0.0008172912446421985, + "loss": 2.2401, + "step": 434970 + }, + { + "epoch": 1.6815110327658456, + "grad_norm": 0.11434874683618546, + "learning_rate": 0.0008171221533903003, + "loss": 2.2599, + "step": 434980 + }, + { + "epoch": 1.6815496899692288, + "grad_norm": 0.111310675740242, + "learning_rate": 0.0008169530863063798, + "loss": 2.2327, + "step": 434990 + }, + { + "epoch": 1.681588347172612, + "grad_norm": 0.1281977891921997, + "learning_rate": 0.0008167840433800769, + "loss": 2.2315, + "step": 435000 + }, + { + "epoch": 1.6816270043759953, + "grad_norm": 0.11103682219982147, + "learning_rate": 0.0008166150246010389, + "loss": 2.2362, + "step": 435010 + }, + { + "epoch": 1.6816656615793786, + "grad_norm": 0.10804154723882675, + "learning_rate": 0.0008164460299589207, + "loss": 2.2377, + "step": 435020 + }, + { + "epoch": 1.6817043187827618, + "grad_norm": 0.11247711628675461, + "learning_rate": 0.0008162770594433848, + "loss": 2.2278, + "step": 435030 + }, + { + "epoch": 1.6817429759861453, + "grad_norm": 0.104110486805439, + "learning_rate": 0.0008161081130441006, + "loss": 2.2496, + "step": 435040 + }, + { + "epoch": 1.6817816331895286, + "grad_norm": 0.12114261090755463, + "learning_rate": 0.0008159391907507454, + "loss": 2.2437, + "step": 435050 + }, + { + "epoch": 1.6818202903929118, + "grad_norm": 0.10757270455360413, + "learning_rate": 0.000815770292553003, + "loss": 2.2504, + "step": 435060 + }, + { + "epoch": 1.681858947596295, + "grad_norm": 0.10915961861610413, + "learning_rate": 0.0008156014184405656, + "loss": 2.2338, + "step": 435070 + }, + { + "epoch": 1.6818976047996785, + "grad_norm": 0.11000694334506989, + "learning_rate": 0.0008154325684031323, + "loss": 2.2493, + "step": 435080 + }, + { + "epoch": 1.6819362620030618, + "grad_norm": 0.11791567504405975, + "learning_rate": 0.0008152637424304092, + "loss": 2.238, + "step": 435090 + }, + { + "epoch": 1.681974919206445, + "grad_norm": 0.1171281635761261, + "learning_rate": 0.0008150949405121102, + "loss": 2.2355, + "step": 435100 + }, + { + "epoch": 1.6820135764098283, + "grad_norm": 0.1159917414188385, + "learning_rate": 0.000814926162637956, + "loss": 2.2312, + "step": 435110 + }, + { + "epoch": 1.6820522336132115, + "grad_norm": 0.11876057088375092, + "learning_rate": 0.0008147574087976757, + "loss": 2.2319, + "step": 435120 + }, + { + "epoch": 1.6820908908165948, + "grad_norm": 0.24806475639343262, + "learning_rate": 0.0008145886789810044, + "loss": 2.2469, + "step": 435130 + }, + { + "epoch": 1.682129548019978, + "grad_norm": 0.11031191796064377, + "learning_rate": 0.0008144199731776856, + "loss": 2.2502, + "step": 435140 + }, + { + "epoch": 1.6821682052233613, + "grad_norm": 0.13095290958881378, + "learning_rate": 0.0008142512913774691, + "loss": 2.2459, + "step": 435150 + }, + { + "epoch": 1.6822068624267446, + "grad_norm": 0.11507758498191833, + "learning_rate": 0.0008140826335701125, + "loss": 2.2297, + "step": 435160 + }, + { + "epoch": 1.6822455196301278, + "grad_norm": 0.11004200577735901, + "learning_rate": 0.0008139139997453811, + "loss": 2.2435, + "step": 435170 + }, + { + "epoch": 1.682284176833511, + "grad_norm": 0.10870189964771271, + "learning_rate": 0.0008137453898930467, + "loss": 2.2386, + "step": 435180 + }, + { + "epoch": 1.6823228340368943, + "grad_norm": 0.1069776639342308, + "learning_rate": 0.0008135768040028888, + "loss": 2.2409, + "step": 435190 + }, + { + "epoch": 1.6823614912402776, + "grad_norm": 0.11982312798500061, + "learning_rate": 0.000813408242064694, + "loss": 2.2342, + "step": 435200 + }, + { + "epoch": 1.682400148443661, + "grad_norm": 0.1095123365521431, + "learning_rate": 0.0008132397040682562, + "loss": 2.2532, + "step": 435210 + }, + { + "epoch": 1.6824388056470443, + "grad_norm": 0.12205439805984497, + "learning_rate": 0.0008130711900033769, + "loss": 2.2439, + "step": 435220 + }, + { + "epoch": 1.6824774628504275, + "grad_norm": 0.11363526433706284, + "learning_rate": 0.0008129026998598641, + "loss": 2.2415, + "step": 435230 + }, + { + "epoch": 1.6825161200538108, + "grad_norm": 0.1271890550851822, + "learning_rate": 0.0008127342336275336, + "loss": 2.2278, + "step": 435240 + }, + { + "epoch": 1.6825547772571943, + "grad_norm": 0.11817679554224014, + "learning_rate": 0.0008125657912962084, + "loss": 2.2376, + "step": 435250 + }, + { + "epoch": 1.6825934344605775, + "grad_norm": 0.15573549270629883, + "learning_rate": 0.0008123973728557182, + "loss": 2.2288, + "step": 435260 + }, + { + "epoch": 1.6826320916639608, + "grad_norm": 0.11569619923830032, + "learning_rate": 0.0008122289782959007, + "loss": 2.2432, + "step": 435270 + }, + { + "epoch": 1.682670748867344, + "grad_norm": 0.12004324793815613, + "learning_rate": 0.0008120606076066002, + "loss": 2.2458, + "step": 435280 + }, + { + "epoch": 1.6827094060707273, + "grad_norm": 0.12503516674041748, + "learning_rate": 0.0008118922607776684, + "loss": 2.2414, + "step": 435290 + }, + { + "epoch": 1.6827480632741105, + "grad_norm": 0.12091667205095291, + "learning_rate": 0.0008117239377989642, + "loss": 2.2288, + "step": 435300 + }, + { + "epoch": 1.6827867204774938, + "grad_norm": 0.10575675219297409, + "learning_rate": 0.0008115556386603535, + "loss": 2.2406, + "step": 435310 + }, + { + "epoch": 1.682825377680877, + "grad_norm": 0.10925006121397018, + "learning_rate": 0.0008113873633517099, + "loss": 2.2628, + "step": 435320 + }, + { + "epoch": 1.6828640348842603, + "grad_norm": 0.10843341797590256, + "learning_rate": 0.0008112191118629135, + "loss": 2.2336, + "step": 435330 + }, + { + "epoch": 1.6829026920876435, + "grad_norm": 0.12656168639659882, + "learning_rate": 0.000811050884183852, + "loss": 2.2303, + "step": 435340 + }, + { + "epoch": 1.6829413492910268, + "grad_norm": 0.12437848746776581, + "learning_rate": 0.0008108826803044201, + "loss": 2.2249, + "step": 435350 + }, + { + "epoch": 1.68298000649441, + "grad_norm": 0.12143638730049133, + "learning_rate": 0.0008107145002145196, + "loss": 2.2448, + "step": 435360 + }, + { + "epoch": 1.6830186636977933, + "grad_norm": 0.10871739685535431, + "learning_rate": 0.0008105463439040594, + "loss": 2.2366, + "step": 435370 + }, + { + "epoch": 1.6830573209011768, + "grad_norm": 0.10439775884151459, + "learning_rate": 0.0008103782113629558, + "loss": 2.2327, + "step": 435380 + }, + { + "epoch": 1.68309597810456, + "grad_norm": 0.12392468750476837, + "learning_rate": 0.0008102101025811322, + "loss": 2.2342, + "step": 435390 + }, + { + "epoch": 1.6831346353079433, + "grad_norm": 0.11335171014070511, + "learning_rate": 0.0008100420175485188, + "loss": 2.2235, + "step": 435400 + }, + { + "epoch": 1.6831732925113265, + "grad_norm": 0.11640524119138718, + "learning_rate": 0.0008098739562550528, + "loss": 2.2414, + "step": 435410 + }, + { + "epoch": 1.68321194971471, + "grad_norm": 0.11219155043363571, + "learning_rate": 0.0008097059186906792, + "loss": 2.2484, + "step": 435420 + }, + { + "epoch": 1.6832506069180933, + "grad_norm": 0.11859866231679916, + "learning_rate": 0.0008095379048453495, + "loss": 2.2264, + "step": 435430 + }, + { + "epoch": 1.6832892641214765, + "grad_norm": 0.10446832329034805, + "learning_rate": 0.0008093699147090228, + "loss": 2.2355, + "step": 435440 + }, + { + "epoch": 1.6833279213248598, + "grad_norm": 0.1112954244017601, + "learning_rate": 0.0008092019482716644, + "loss": 2.2348, + "step": 435450 + }, + { + "epoch": 1.683366578528243, + "grad_norm": 0.11098206788301468, + "learning_rate": 0.0008090340055232475, + "loss": 2.2291, + "step": 435460 + }, + { + "epoch": 1.6834052357316263, + "grad_norm": 0.11901814490556717, + "learning_rate": 0.0008088660864537522, + "loss": 2.2318, + "step": 435470 + }, + { + "epoch": 1.6834438929350095, + "grad_norm": 0.11065753549337387, + "learning_rate": 0.0008086981910531656, + "loss": 2.2411, + "step": 435480 + }, + { + "epoch": 1.6834825501383928, + "grad_norm": 0.11770408600568771, + "learning_rate": 0.0008085303193114817, + "loss": 2.2352, + "step": 435490 + }, + { + "epoch": 1.683521207341776, + "grad_norm": 0.11291767656803131, + "learning_rate": 0.0008083624712187016, + "loss": 2.2282, + "step": 435500 + }, + { + "epoch": 1.6835598645451593, + "grad_norm": 0.10530194640159607, + "learning_rate": 0.0008081946467648336, + "loss": 2.2432, + "step": 435510 + }, + { + "epoch": 1.6835985217485425, + "grad_norm": 0.11069527268409729, + "learning_rate": 0.0008080268459398928, + "loss": 2.246, + "step": 435520 + }, + { + "epoch": 1.6836371789519258, + "grad_norm": 0.10665157437324524, + "learning_rate": 0.0008078590687339018, + "loss": 2.2393, + "step": 435530 + }, + { + "epoch": 1.683675836155309, + "grad_norm": 0.10939627885818481, + "learning_rate": 0.0008076913151368896, + "loss": 2.2327, + "step": 435540 + }, + { + "epoch": 1.6837144933586925, + "grad_norm": 0.12476851791143417, + "learning_rate": 0.0008075235851388927, + "loss": 2.2323, + "step": 435550 + }, + { + "epoch": 1.6837531505620758, + "grad_norm": 0.1157846450805664, + "learning_rate": 0.000807355878729954, + "loss": 2.2307, + "step": 435560 + }, + { + "epoch": 1.683791807765459, + "grad_norm": 0.1271163523197174, + "learning_rate": 0.0008071881959001244, + "loss": 2.2507, + "step": 435570 + }, + { + "epoch": 1.6838304649688423, + "grad_norm": 0.11389724910259247, + "learning_rate": 0.0008070205366394609, + "loss": 2.2252, + "step": 435580 + }, + { + "epoch": 1.6838691221722257, + "grad_norm": 0.11247779428958893, + "learning_rate": 0.0008068529009380277, + "loss": 2.2317, + "step": 435590 + }, + { + "epoch": 1.683907779375609, + "grad_norm": 0.1254976987838745, + "learning_rate": 0.0008066852887858961, + "loss": 2.2501, + "step": 435600 + }, + { + "epoch": 1.6839464365789922, + "grad_norm": 0.12125872820615768, + "learning_rate": 0.0008065177001731445, + "loss": 2.2396, + "step": 435610 + }, + { + "epoch": 1.6839850937823755, + "grad_norm": 0.11988002061843872, + "learning_rate": 0.0008063501350898581, + "loss": 2.235, + "step": 435620 + }, + { + "epoch": 1.6840237509857587, + "grad_norm": 0.1185842901468277, + "learning_rate": 0.0008061825935261289, + "loss": 2.253, + "step": 435630 + }, + { + "epoch": 1.684062408189142, + "grad_norm": 0.09899508953094482, + "learning_rate": 0.0008060150754720561, + "loss": 2.2286, + "step": 435640 + }, + { + "epoch": 1.6841010653925252, + "grad_norm": 0.11395104229450226, + "learning_rate": 0.0008058475809177457, + "loss": 2.2287, + "step": 435650 + }, + { + "epoch": 1.6841397225959085, + "grad_norm": 0.11625102162361145, + "learning_rate": 0.0008056801098533108, + "loss": 2.2282, + "step": 435660 + }, + { + "epoch": 1.6841783797992917, + "grad_norm": 0.10915978997945786, + "learning_rate": 0.0008055126622688711, + "loss": 2.2437, + "step": 435670 + }, + { + "epoch": 1.684217037002675, + "grad_norm": 0.11652518808841705, + "learning_rate": 0.0008053452381545536, + "loss": 2.2294, + "step": 435680 + }, + { + "epoch": 1.6842556942060583, + "grad_norm": 0.12197697907686234, + "learning_rate": 0.0008051778375004924, + "loss": 2.2312, + "step": 435690 + }, + { + "epoch": 1.6842943514094415, + "grad_norm": 0.11725616455078125, + "learning_rate": 0.0008050104602968276, + "loss": 2.236, + "step": 435700 + }, + { + "epoch": 1.6843330086128248, + "grad_norm": 0.10901392996311188, + "learning_rate": 0.0008048431065337072, + "loss": 2.238, + "step": 435710 + }, + { + "epoch": 1.6843716658162082, + "grad_norm": 0.13330137729644775, + "learning_rate": 0.0008046757762012852, + "loss": 2.2334, + "step": 435720 + }, + { + "epoch": 1.6844103230195915, + "grad_norm": 0.11101020872592926, + "learning_rate": 0.0008045084692897235, + "loss": 2.2301, + "step": 435730 + }, + { + "epoch": 1.6844489802229747, + "grad_norm": 0.1197330430150032, + "learning_rate": 0.0008043411857891902, + "loss": 2.2456, + "step": 435740 + }, + { + "epoch": 1.684487637426358, + "grad_norm": 0.1282912790775299, + "learning_rate": 0.0008041739256898601, + "loss": 2.2396, + "step": 435750 + }, + { + "epoch": 1.6845262946297415, + "grad_norm": 0.14463169872760773, + "learning_rate": 0.0008040066889819158, + "loss": 2.2448, + "step": 435760 + }, + { + "epoch": 1.6845649518331247, + "grad_norm": 0.2296954095363617, + "learning_rate": 0.0008038394756555456, + "loss": 2.2475, + "step": 435770 + }, + { + "epoch": 1.684603609036508, + "grad_norm": 0.12037109583616257, + "learning_rate": 0.0008036722857009455, + "loss": 2.2443, + "step": 435780 + }, + { + "epoch": 1.6846422662398912, + "grad_norm": 0.11113885790109634, + "learning_rate": 0.0008035051191083183, + "loss": 2.2346, + "step": 435790 + }, + { + "epoch": 1.6846809234432745, + "grad_norm": 0.11480911821126938, + "learning_rate": 0.0008033379758678727, + "loss": 2.2301, + "step": 435800 + }, + { + "epoch": 1.6847195806466577, + "grad_norm": 0.11218812316656113, + "learning_rate": 0.0008031708559698257, + "loss": 2.2337, + "step": 435810 + }, + { + "epoch": 1.684758237850041, + "grad_norm": 0.10466307401657104, + "learning_rate": 0.0008030037594043998, + "loss": 2.253, + "step": 435820 + }, + { + "epoch": 1.6847968950534242, + "grad_norm": 0.1097770482301712, + "learning_rate": 0.0008028366861618253, + "loss": 2.2468, + "step": 435830 + }, + { + "epoch": 1.6848355522568075, + "grad_norm": 0.1140095442533493, + "learning_rate": 0.0008026696362323387, + "loss": 2.2301, + "step": 435840 + }, + { + "epoch": 1.6848742094601907, + "grad_norm": 0.11084846407175064, + "learning_rate": 0.0008025026096061839, + "loss": 2.2466, + "step": 435850 + }, + { + "epoch": 1.684912866663574, + "grad_norm": 0.11485522985458374, + "learning_rate": 0.0008023356062736107, + "loss": 2.2199, + "step": 435860 + }, + { + "epoch": 1.6849515238669572, + "grad_norm": 0.12026914954185486, + "learning_rate": 0.0008021686262248764, + "loss": 2.2324, + "step": 435870 + }, + { + "epoch": 1.6849901810703405, + "grad_norm": 0.1091637834906578, + "learning_rate": 0.000802001669450245, + "loss": 2.2447, + "step": 435880 + }, + { + "epoch": 1.685028838273724, + "grad_norm": 0.10844144970178604, + "learning_rate": 0.0008018347359399873, + "loss": 2.224, + "step": 435890 + }, + { + "epoch": 1.6850674954771072, + "grad_norm": 0.1304885745048523, + "learning_rate": 0.0008016678256843806, + "loss": 2.2458, + "step": 435900 + }, + { + "epoch": 1.6851061526804905, + "grad_norm": 0.11517968028783798, + "learning_rate": 0.0008015009386737093, + "loss": 2.242, + "step": 435910 + }, + { + "epoch": 1.6851448098838737, + "grad_norm": 0.11388219147920609, + "learning_rate": 0.0008013340748982642, + "loss": 2.2413, + "step": 435920 + }, + { + "epoch": 1.6851834670872572, + "grad_norm": 0.11688657850027084, + "learning_rate": 0.0008011672343483433, + "loss": 2.2418, + "step": 435930 + }, + { + "epoch": 1.6852221242906404, + "grad_norm": 0.11711782217025757, + "learning_rate": 0.0008010004170142511, + "loss": 2.2142, + "step": 435940 + }, + { + "epoch": 1.6852607814940237, + "grad_norm": 0.11181101948022842, + "learning_rate": 0.0008008336228862987, + "loss": 2.2418, + "step": 435950 + }, + { + "epoch": 1.685299438697407, + "grad_norm": 0.11422090977430344, + "learning_rate": 0.0008006668519548041, + "loss": 2.2268, + "step": 435960 + }, + { + "epoch": 1.6853380959007902, + "grad_norm": 0.1257568597793579, + "learning_rate": 0.0008005001042100922, + "loss": 2.2397, + "step": 435970 + }, + { + "epoch": 1.6853767531041735, + "grad_norm": 0.13817913830280304, + "learning_rate": 0.0008003333796424941, + "loss": 2.2585, + "step": 435980 + }, + { + "epoch": 1.6854154103075567, + "grad_norm": 0.12498199194669724, + "learning_rate": 0.0008001666782423484, + "loss": 2.2443, + "step": 435990 + }, + { + "epoch": 1.68545406751094, + "grad_norm": 0.1228543221950531, + "learning_rate": 0.0008, + "loss": 2.2499, + "step": 436000 + }, + { + "epoch": 1.6854927247143232, + "grad_norm": 0.10874808579683304, + "learning_rate": 0.0007998333449058002, + "loss": 2.2391, + "step": 436010 + }, + { + "epoch": 1.6855313819177065, + "grad_norm": 0.17795898020267487, + "learning_rate": 0.0007996667129501074, + "loss": 2.2406, + "step": 436020 + }, + { + "epoch": 1.6855700391210897, + "grad_norm": 0.11963041871786118, + "learning_rate": 0.0007995001041232866, + "loss": 2.244, + "step": 436030 + }, + { + "epoch": 1.685608696324473, + "grad_norm": 0.11893066018819809, + "learning_rate": 0.0007993335184157093, + "loss": 2.2372, + "step": 436040 + }, + { + "epoch": 1.6856473535278562, + "grad_norm": 0.11747374385595322, + "learning_rate": 0.0007991669558177541, + "loss": 2.2453, + "step": 436050 + }, + { + "epoch": 1.6856860107312397, + "grad_norm": 0.1093200072646141, + "learning_rate": 0.0007990004163198057, + "loss": 2.2456, + "step": 436060 + }, + { + "epoch": 1.685724667934623, + "grad_norm": 0.11985526978969574, + "learning_rate": 0.0007988338999122561, + "loss": 2.2312, + "step": 436070 + }, + { + "epoch": 1.6857633251380062, + "grad_norm": 0.11669023334980011, + "learning_rate": 0.0007986674065855035, + "loss": 2.2357, + "step": 436080 + }, + { + "epoch": 1.6858019823413894, + "grad_norm": 0.10922613739967346, + "learning_rate": 0.0007985009363299529, + "loss": 2.2336, + "step": 436090 + }, + { + "epoch": 1.685840639544773, + "grad_norm": 0.11648507416248322, + "learning_rate": 0.0007983344891360158, + "loss": 2.2257, + "step": 436100 + }, + { + "epoch": 1.6858792967481562, + "grad_norm": 0.12221698462963104, + "learning_rate": 0.0007981680649941107, + "loss": 2.2485, + "step": 436110 + }, + { + "epoch": 1.6859179539515394, + "grad_norm": 0.13133299350738525, + "learning_rate": 0.0007980016638946623, + "loss": 2.2318, + "step": 436120 + }, + { + "epoch": 1.6859566111549227, + "grad_norm": 0.11439281702041626, + "learning_rate": 0.0007978352858281026, + "loss": 2.2357, + "step": 436130 + }, + { + "epoch": 1.685995268358306, + "grad_norm": 0.11321944743394852, + "learning_rate": 0.0007976689307848691, + "loss": 2.2493, + "step": 436140 + }, + { + "epoch": 1.6860339255616892, + "grad_norm": 0.11675392836332321, + "learning_rate": 0.000797502598755407, + "loss": 2.2432, + "step": 436150 + }, + { + "epoch": 1.6860725827650724, + "grad_norm": 0.10947708785533905, + "learning_rate": 0.0007973362897301674, + "loss": 2.2416, + "step": 436160 + }, + { + "epoch": 1.6861112399684557, + "grad_norm": 0.10901670902967453, + "learning_rate": 0.0007971700036996084, + "loss": 2.2424, + "step": 436170 + }, + { + "epoch": 1.686149897171839, + "grad_norm": 0.11359596997499466, + "learning_rate": 0.0007970037406541948, + "loss": 2.2417, + "step": 436180 + }, + { + "epoch": 1.6861885543752222, + "grad_norm": 0.12114213407039642, + "learning_rate": 0.0007968375005843973, + "loss": 2.2356, + "step": 436190 + }, + { + "epoch": 1.6862272115786054, + "grad_norm": 0.10651782155036926, + "learning_rate": 0.0007966712834806942, + "loss": 2.2537, + "step": 436200 + }, + { + "epoch": 1.6862658687819887, + "grad_norm": 0.11422091722488403, + "learning_rate": 0.0007965050893335695, + "loss": 2.2345, + "step": 436210 + }, + { + "epoch": 1.686304525985372, + "grad_norm": 0.11653254926204681, + "learning_rate": 0.0007963389181335137, + "loss": 2.241, + "step": 436220 + }, + { + "epoch": 1.6863431831887554, + "grad_norm": 0.1081620454788208, + "learning_rate": 0.000796172769871025, + "loss": 2.2503, + "step": 436230 + }, + { + "epoch": 1.6863818403921387, + "grad_norm": 0.10417952388525009, + "learning_rate": 0.0007960066445366072, + "loss": 2.2409, + "step": 436240 + }, + { + "epoch": 1.686420497595522, + "grad_norm": 0.12107262015342712, + "learning_rate": 0.0007958405421207704, + "loss": 2.2275, + "step": 436250 + }, + { + "epoch": 1.6864591547989052, + "grad_norm": 0.1189051941037178, + "learning_rate": 0.0007956744626140322, + "loss": 2.2281, + "step": 436260 + }, + { + "epoch": 1.6864978120022887, + "grad_norm": 0.15268446505069733, + "learning_rate": 0.0007955084060069162, + "loss": 2.2323, + "step": 436270 + }, + { + "epoch": 1.686536469205672, + "grad_norm": 0.10644841194152832, + "learning_rate": 0.0007953423722899522, + "loss": 2.2357, + "step": 436280 + }, + { + "epoch": 1.6865751264090552, + "grad_norm": 0.10994943231344223, + "learning_rate": 0.0007951763614536773, + "loss": 2.2354, + "step": 436290 + }, + { + "epoch": 1.6866137836124384, + "grad_norm": 0.12903369963169098, + "learning_rate": 0.0007950103734886345, + "loss": 2.2514, + "step": 436300 + }, + { + "epoch": 1.6866524408158217, + "grad_norm": 0.1179652139544487, + "learning_rate": 0.0007948444083853736, + "loss": 2.2267, + "step": 436310 + }, + { + "epoch": 1.686691098019205, + "grad_norm": 0.12354667484760284, + "learning_rate": 0.0007946784661344509, + "loss": 2.2425, + "step": 436320 + }, + { + "epoch": 1.6867297552225882, + "grad_norm": 0.10569813847541809, + "learning_rate": 0.0007945125467264289, + "loss": 2.2445, + "step": 436330 + }, + { + "epoch": 1.6867684124259714, + "grad_norm": 0.11753513664007187, + "learning_rate": 0.0007943466501518772, + "loss": 2.2344, + "step": 436340 + }, + { + "epoch": 1.6868070696293547, + "grad_norm": 0.11324722319841385, + "learning_rate": 0.0007941807764013713, + "loss": 2.2285, + "step": 436350 + }, + { + "epoch": 1.686845726832738, + "grad_norm": 0.11834903061389923, + "learning_rate": 0.0007940149254654931, + "loss": 2.2293, + "step": 436360 + }, + { + "epoch": 1.6868843840361212, + "grad_norm": 0.12163624912500381, + "learning_rate": 0.0007938490973348318, + "loss": 2.241, + "step": 436370 + }, + { + "epoch": 1.6869230412395044, + "grad_norm": 0.1194707453250885, + "learning_rate": 0.0007936832919999824, + "loss": 2.2524, + "step": 436380 + }, + { + "epoch": 1.686961698442888, + "grad_norm": 0.1264716535806656, + "learning_rate": 0.0007935175094515461, + "loss": 2.241, + "step": 436390 + }, + { + "epoch": 1.6870003556462712, + "grad_norm": 0.11369436234235764, + "learning_rate": 0.0007933517496801314, + "loss": 2.2272, + "step": 436400 + }, + { + "epoch": 1.6870390128496544, + "grad_norm": 0.12565520405769348, + "learning_rate": 0.0007931860126763528, + "loss": 2.2445, + "step": 436410 + }, + { + "epoch": 1.6870776700530377, + "grad_norm": 0.11966858804225922, + "learning_rate": 0.000793020298430831, + "loss": 2.238, + "step": 436420 + }, + { + "epoch": 1.687116327256421, + "grad_norm": 0.11708260327577591, + "learning_rate": 0.0007928546069341937, + "loss": 2.2359, + "step": 436430 + }, + { + "epoch": 1.6871549844598044, + "grad_norm": 0.13556437194347382, + "learning_rate": 0.0007926889381770746, + "loss": 2.2413, + "step": 436440 + }, + { + "epoch": 1.6871936416631876, + "grad_norm": 0.11219292134046555, + "learning_rate": 0.0007925232921501135, + "loss": 2.2345, + "step": 436450 + }, + { + "epoch": 1.687232298866571, + "grad_norm": 0.13132259249687195, + "learning_rate": 0.0007923576688439578, + "loss": 2.2199, + "step": 436460 + }, + { + "epoch": 1.6872709560699541, + "grad_norm": 0.20593777298927307, + "learning_rate": 0.00079219206824926, + "loss": 2.241, + "step": 436470 + }, + { + "epoch": 1.6873096132733374, + "grad_norm": 0.12723521888256073, + "learning_rate": 0.0007920264903566801, + "loss": 2.2332, + "step": 436480 + }, + { + "epoch": 1.6873482704767206, + "grad_norm": 0.10929694771766663, + "learning_rate": 0.0007918609351568835, + "loss": 2.2362, + "step": 436490 + }, + { + "epoch": 1.687386927680104, + "grad_norm": 0.128523588180542, + "learning_rate": 0.0007916954026405428, + "loss": 2.2349, + "step": 436500 + }, + { + "epoch": 1.6874255848834872, + "grad_norm": 0.11908918619155884, + "learning_rate": 0.0007915298927983366, + "loss": 2.223, + "step": 436510 + }, + { + "epoch": 1.6874642420868704, + "grad_norm": 0.11230950802564621, + "learning_rate": 0.0007913644056209497, + "loss": 2.2304, + "step": 436520 + }, + { + "epoch": 1.6875028992902537, + "grad_norm": 0.10958006232976913, + "learning_rate": 0.0007911989410990741, + "loss": 2.2408, + "step": 436530 + }, + { + "epoch": 1.687541556493637, + "grad_norm": 0.1269649863243103, + "learning_rate": 0.000791033499223407, + "loss": 2.2347, + "step": 436540 + }, + { + "epoch": 1.6875802136970202, + "grad_norm": 0.12071368098258972, + "learning_rate": 0.0007908680799846528, + "loss": 2.2387, + "step": 436550 + }, + { + "epoch": 1.6876188709004036, + "grad_norm": 0.11131490767002106, + "learning_rate": 0.0007907026833735221, + "loss": 2.2226, + "step": 436560 + }, + { + "epoch": 1.6876575281037869, + "grad_norm": 0.13059940934181213, + "learning_rate": 0.0007905373093807315, + "loss": 2.2337, + "step": 436570 + }, + { + "epoch": 1.6876961853071701, + "grad_norm": 0.11803121864795685, + "learning_rate": 0.0007903719579970047, + "loss": 2.2357, + "step": 436580 + }, + { + "epoch": 1.6877348425105534, + "grad_norm": 0.11218202114105225, + "learning_rate": 0.000790206629213071, + "loss": 2.2211, + "step": 436590 + }, + { + "epoch": 1.6877734997139366, + "grad_norm": 0.1089283898472786, + "learning_rate": 0.0007900413230196661, + "loss": 2.2399, + "step": 436600 + }, + { + "epoch": 1.6878121569173201, + "grad_norm": 0.11095636337995529, + "learning_rate": 0.0007898760394075324, + "loss": 2.2284, + "step": 436610 + }, + { + "epoch": 1.6878508141207034, + "grad_norm": 0.11514124274253845, + "learning_rate": 0.0007897107783674185, + "loss": 2.2505, + "step": 436620 + }, + { + "epoch": 1.6878894713240866, + "grad_norm": 0.11509499698877335, + "learning_rate": 0.000789545539890079, + "loss": 2.2474, + "step": 436630 + }, + { + "epoch": 1.6879281285274699, + "grad_norm": 0.10745836794376373, + "learning_rate": 0.0007893803239662756, + "loss": 2.2213, + "step": 436640 + }, + { + "epoch": 1.6879667857308531, + "grad_norm": 0.12291122227907181, + "learning_rate": 0.0007892151305867752, + "loss": 2.2538, + "step": 436650 + }, + { + "epoch": 1.6880054429342364, + "grad_norm": 0.10713893920183182, + "learning_rate": 0.0007890499597423517, + "loss": 2.2462, + "step": 436660 + }, + { + "epoch": 1.6880441001376196, + "grad_norm": 0.11409778892993927, + "learning_rate": 0.0007888848114237852, + "loss": 2.2395, + "step": 436670 + }, + { + "epoch": 1.6880827573410029, + "grad_norm": 0.11547887325286865, + "learning_rate": 0.0007887196856218624, + "loss": 2.2307, + "step": 436680 + }, + { + "epoch": 1.6881214145443861, + "grad_norm": 0.11228621006011963, + "learning_rate": 0.0007885545823273755, + "loss": 2.2463, + "step": 436690 + }, + { + "epoch": 1.6881600717477694, + "grad_norm": 0.11682931333780289, + "learning_rate": 0.0007883895015311233, + "loss": 2.2332, + "step": 436700 + }, + { + "epoch": 1.6881987289511526, + "grad_norm": 0.11516440659761429, + "learning_rate": 0.0007882244432239112, + "loss": 2.2388, + "step": 436710 + }, + { + "epoch": 1.688237386154536, + "grad_norm": 0.10949617624282837, + "learning_rate": 0.0007880594073965505, + "loss": 2.2354, + "step": 436720 + }, + { + "epoch": 1.6882760433579194, + "grad_norm": 0.1049916222691536, + "learning_rate": 0.0007878943940398593, + "loss": 2.2306, + "step": 436730 + }, + { + "epoch": 1.6883147005613026, + "grad_norm": 0.11417264491319656, + "learning_rate": 0.0007877294031446609, + "loss": 2.2326, + "step": 436740 + }, + { + "epoch": 1.6883533577646859, + "grad_norm": 0.12302906811237335, + "learning_rate": 0.0007875644347017859, + "loss": 2.2302, + "step": 436750 + }, + { + "epoch": 1.6883920149680691, + "grad_norm": 0.11686044186353683, + "learning_rate": 0.0007873994887020706, + "loss": 2.233, + "step": 436760 + }, + { + "epoch": 1.6884306721714524, + "grad_norm": 0.11717838048934937, + "learning_rate": 0.0007872345651363575, + "loss": 2.2468, + "step": 436770 + }, + { + "epoch": 1.6884693293748358, + "grad_norm": 0.11655833572149277, + "learning_rate": 0.0007870696639954955, + "loss": 2.2305, + "step": 436780 + }, + { + "epoch": 1.688507986578219, + "grad_norm": 0.12989924848079681, + "learning_rate": 0.0007869047852703399, + "loss": 2.2287, + "step": 436790 + }, + { + "epoch": 1.6885466437816024, + "grad_norm": 0.10573769360780716, + "learning_rate": 0.0007867399289517518, + "loss": 2.2215, + "step": 436800 + }, + { + "epoch": 1.6885853009849856, + "grad_norm": 0.11621452867984772, + "learning_rate": 0.000786575095030599, + "loss": 2.2406, + "step": 436810 + }, + { + "epoch": 1.6886239581883689, + "grad_norm": 0.11387400329113007, + "learning_rate": 0.0007864102834977547, + "loss": 2.2355, + "step": 436820 + }, + { + "epoch": 1.688662615391752, + "grad_norm": 0.11899558454751968, + "learning_rate": 0.0007862454943440993, + "loss": 2.2448, + "step": 436830 + }, + { + "epoch": 1.6887012725951354, + "grad_norm": 0.10371288657188416, + "learning_rate": 0.0007860807275605186, + "loss": 2.2206, + "step": 436840 + }, + { + "epoch": 1.6887399297985186, + "grad_norm": 0.10565133392810822, + "learning_rate": 0.0007859159831379051, + "loss": 2.2243, + "step": 436850 + }, + { + "epoch": 1.6887785870019019, + "grad_norm": 0.11194362491369247, + "learning_rate": 0.0007857512610671569, + "loss": 2.2368, + "step": 436860 + }, + { + "epoch": 1.6888172442052851, + "grad_norm": 0.11670593172311783, + "learning_rate": 0.0007855865613391788, + "loss": 2.2415, + "step": 436870 + }, + { + "epoch": 1.6888559014086684, + "grad_norm": 0.12032661586999893, + "learning_rate": 0.0007854218839448819, + "loss": 2.252, + "step": 436880 + }, + { + "epoch": 1.6888945586120516, + "grad_norm": 0.48441967368125916, + "learning_rate": 0.000785257228875183, + "loss": 2.2309, + "step": 436890 + }, + { + "epoch": 1.688933215815435, + "grad_norm": 0.12135181576013565, + "learning_rate": 0.0007850925961210047, + "loss": 2.2247, + "step": 436900 + }, + { + "epoch": 1.6889718730188183, + "grad_norm": 0.11387261003255844, + "learning_rate": 0.0007849279856732772, + "loss": 2.2446, + "step": 436910 + }, + { + "epoch": 1.6890105302222016, + "grad_norm": 0.10993428528308868, + "learning_rate": 0.0007847633975229351, + "loss": 2.2272, + "step": 436920 + }, + { + "epoch": 1.6890491874255849, + "grad_norm": 0.11586098372936249, + "learning_rate": 0.0007845988316609203, + "loss": 2.2399, + "step": 436930 + }, + { + "epoch": 1.6890878446289683, + "grad_norm": 0.11747764050960541, + "learning_rate": 0.0007844342880781805, + "loss": 2.2379, + "step": 436940 + }, + { + "epoch": 1.6891265018323516, + "grad_norm": 0.1304158717393875, + "learning_rate": 0.0007842697667656693, + "loss": 2.2475, + "step": 436950 + }, + { + "epoch": 1.6891651590357348, + "grad_norm": 1.0930097103118896, + "learning_rate": 0.000784105267714347, + "loss": 2.2354, + "step": 436960 + }, + { + "epoch": 1.689203816239118, + "grad_norm": 0.11207807809114456, + "learning_rate": 0.0007839407909151793, + "loss": 2.2389, + "step": 436970 + }, + { + "epoch": 1.6892424734425013, + "grad_norm": 0.12363407760858536, + "learning_rate": 0.0007837763363591382, + "loss": 2.2415, + "step": 436980 + }, + { + "epoch": 1.6892811306458846, + "grad_norm": 0.12563955783843994, + "learning_rate": 0.0007836119040372025, + "loss": 2.2347, + "step": 436990 + }, + { + "epoch": 1.6893197878492678, + "grad_norm": 0.11330549418926239, + "learning_rate": 0.0007834474939403562, + "loss": 2.2352, + "step": 437000 + }, + { + "epoch": 1.689358445052651, + "grad_norm": 0.11254936456680298, + "learning_rate": 0.0007832831060595896, + "loss": 2.24, + "step": 437010 + }, + { + "epoch": 1.6893971022560343, + "grad_norm": 0.10804528743028641, + "learning_rate": 0.0007831187403858995, + "loss": 2.2298, + "step": 437020 + }, + { + "epoch": 1.6894357594594176, + "grad_norm": 0.10461485385894775, + "learning_rate": 0.0007829543969102884, + "loss": 2.2459, + "step": 437030 + }, + { + "epoch": 1.6894744166628008, + "grad_norm": 0.12711456418037415, + "learning_rate": 0.000782790075623765, + "loss": 2.2228, + "step": 437040 + }, + { + "epoch": 1.689513073866184, + "grad_norm": 0.1143861711025238, + "learning_rate": 0.0007826257765173439, + "loss": 2.2317, + "step": 437050 + }, + { + "epoch": 1.6895517310695674, + "grad_norm": 0.13040152192115784, + "learning_rate": 0.000782461499582046, + "loss": 2.2323, + "step": 437060 + }, + { + "epoch": 1.6895903882729508, + "grad_norm": 0.11680342257022858, + "learning_rate": 0.0007822972448088986, + "loss": 2.2316, + "step": 437070 + }, + { + "epoch": 1.689629045476334, + "grad_norm": 0.12657737731933594, + "learning_rate": 0.0007821330121889336, + "loss": 2.2422, + "step": 437080 + }, + { + "epoch": 1.6896677026797173, + "grad_norm": 0.1121034175157547, + "learning_rate": 0.0007819688017131909, + "loss": 2.2449, + "step": 437090 + }, + { + "epoch": 1.6897063598831006, + "grad_norm": 0.1272219717502594, + "learning_rate": 0.0007818046133727152, + "loss": 2.2453, + "step": 437100 + }, + { + "epoch": 1.689745017086484, + "grad_norm": 0.11899504065513611, + "learning_rate": 0.0007816404471585575, + "loss": 2.2234, + "step": 437110 + }, + { + "epoch": 1.6897836742898673, + "grad_norm": 0.11514095216989517, + "learning_rate": 0.0007814763030617746, + "loss": 2.2334, + "step": 437120 + }, + { + "epoch": 1.6898223314932506, + "grad_norm": 0.12913601100444794, + "learning_rate": 0.0007813121810734301, + "loss": 2.2434, + "step": 437130 + }, + { + "epoch": 1.6898609886966338, + "grad_norm": 0.10719872266054153, + "learning_rate": 0.0007811480811845927, + "loss": 2.2474, + "step": 437140 + }, + { + "epoch": 1.689899645900017, + "grad_norm": 0.12911660969257355, + "learning_rate": 0.0007809840033863378, + "loss": 2.2343, + "step": 437150 + }, + { + "epoch": 1.6899383031034003, + "grad_norm": 0.12792739272117615, + "learning_rate": 0.0007808199476697464, + "loss": 2.2358, + "step": 437160 + }, + { + "epoch": 1.6899769603067836, + "grad_norm": 0.148227721452713, + "learning_rate": 0.0007806559140259055, + "loss": 2.2331, + "step": 437170 + }, + { + "epoch": 1.6900156175101668, + "grad_norm": 0.14317235350608826, + "learning_rate": 0.0007804919024459083, + "loss": 2.2307, + "step": 437180 + }, + { + "epoch": 1.69005427471355, + "grad_norm": 0.11549597978591919, + "learning_rate": 0.0007803279129208541, + "loss": 2.2356, + "step": 437190 + }, + { + "epoch": 1.6900929319169333, + "grad_norm": 0.13664087653160095, + "learning_rate": 0.0007801639454418475, + "loss": 2.2359, + "step": 437200 + }, + { + "epoch": 1.6901315891203166, + "grad_norm": 0.11562107503414154, + "learning_rate": 0.0007800000000000001, + "loss": 2.2495, + "step": 437210 + }, + { + "epoch": 1.6901702463236998, + "grad_norm": 0.1262626051902771, + "learning_rate": 0.0007798360765864285, + "loss": 2.2357, + "step": 437220 + }, + { + "epoch": 1.690208903527083, + "grad_norm": 0.10728706419467926, + "learning_rate": 0.000779672175192256, + "loss": 2.2461, + "step": 437230 + }, + { + "epoch": 1.6902475607304666, + "grad_norm": 0.12150957435369492, + "learning_rate": 0.0007795082958086115, + "loss": 2.2385, + "step": 437240 + }, + { + "epoch": 1.6902862179338498, + "grad_norm": 0.11588594317436218, + "learning_rate": 0.0007793444384266297, + "loss": 2.2286, + "step": 437250 + }, + { + "epoch": 1.690324875137233, + "grad_norm": 0.12389259040355682, + "learning_rate": 0.0007791806030374518, + "loss": 2.2318, + "step": 437260 + }, + { + "epoch": 1.6903635323406163, + "grad_norm": 0.12836752831935883, + "learning_rate": 0.0007790167896322244, + "loss": 2.2452, + "step": 437270 + }, + { + "epoch": 1.6904021895439998, + "grad_norm": 0.13800232112407684, + "learning_rate": 0.0007788529982021002, + "loss": 2.2299, + "step": 437280 + }, + { + "epoch": 1.690440846747383, + "grad_norm": 0.13133391737937927, + "learning_rate": 0.0007786892287382379, + "loss": 2.23, + "step": 437290 + }, + { + "epoch": 1.6904795039507663, + "grad_norm": 0.11495410650968552, + "learning_rate": 0.0007785254812318023, + "loss": 2.2349, + "step": 437300 + }, + { + "epoch": 1.6905181611541495, + "grad_norm": 0.11895423382520676, + "learning_rate": 0.0007783617556739639, + "loss": 2.2475, + "step": 437310 + }, + { + "epoch": 1.6905568183575328, + "grad_norm": 0.10464183241128922, + "learning_rate": 0.0007781980520558989, + "loss": 2.225, + "step": 437320 + }, + { + "epoch": 1.690595475560916, + "grad_norm": 0.11086255311965942, + "learning_rate": 0.0007780343703687898, + "loss": 2.2374, + "step": 437330 + }, + { + "epoch": 1.6906341327642993, + "grad_norm": 0.13207247853279114, + "learning_rate": 0.000777870710603825, + "loss": 2.2261, + "step": 437340 + }, + { + "epoch": 1.6906727899676826, + "grad_norm": 0.10976863652467728, + "learning_rate": 0.0007777070727521982, + "loss": 2.2425, + "step": 437350 + }, + { + "epoch": 1.6907114471710658, + "grad_norm": 0.1116211786866188, + "learning_rate": 0.00077754345680511, + "loss": 2.2315, + "step": 437360 + }, + { + "epoch": 1.690750104374449, + "grad_norm": 0.1117439717054367, + "learning_rate": 0.000777379862753766, + "loss": 2.2355, + "step": 437370 + }, + { + "epoch": 1.6907887615778323, + "grad_norm": 0.12214884161949158, + "learning_rate": 0.0007772162905893783, + "loss": 2.2325, + "step": 437380 + }, + { + "epoch": 1.6908274187812156, + "grad_norm": 0.1459735631942749, + "learning_rate": 0.0007770527403031642, + "loss": 2.2192, + "step": 437390 + }, + { + "epoch": 1.6908660759845988, + "grad_norm": 0.1475234031677246, + "learning_rate": 0.0007768892118863476, + "loss": 2.2378, + "step": 437400 + }, + { + "epoch": 1.6909047331879823, + "grad_norm": 0.1206105649471283, + "learning_rate": 0.0007767257053301577, + "loss": 2.2379, + "step": 437410 + }, + { + "epoch": 1.6909433903913655, + "grad_norm": 0.11356187611818314, + "learning_rate": 0.0007765622206258303, + "loss": 2.2263, + "step": 437420 + }, + { + "epoch": 1.6909820475947488, + "grad_norm": 0.11754736304283142, + "learning_rate": 0.0007763987577646057, + "loss": 2.2509, + "step": 437430 + }, + { + "epoch": 1.691020704798132, + "grad_norm": 0.10678063333034515, + "learning_rate": 0.0007762353167377316, + "loss": 2.2204, + "step": 437440 + }, + { + "epoch": 1.6910593620015155, + "grad_norm": 0.11095842719078064, + "learning_rate": 0.0007760718975364607, + "loss": 2.231, + "step": 437450 + }, + { + "epoch": 1.6910980192048988, + "grad_norm": 0.11462483555078506, + "learning_rate": 0.0007759085001520516, + "loss": 2.2279, + "step": 437460 + }, + { + "epoch": 1.691136676408282, + "grad_norm": 0.11364367604255676, + "learning_rate": 0.0007757451245757687, + "loss": 2.2319, + "step": 437470 + }, + { + "epoch": 1.6911753336116653, + "grad_norm": 0.10768604278564453, + "learning_rate": 0.0007755817707988826, + "loss": 2.2347, + "step": 437480 + }, + { + "epoch": 1.6912139908150485, + "grad_norm": 0.1304168403148651, + "learning_rate": 0.0007754184388126693, + "loss": 2.2462, + "step": 437490 + }, + { + "epoch": 1.6912526480184318, + "grad_norm": 0.11165142059326172, + "learning_rate": 0.0007752551286084111, + "loss": 2.2348, + "step": 437500 + }, + { + "epoch": 1.691291305221815, + "grad_norm": 0.1312442570924759, + "learning_rate": 0.0007750918401773952, + "loss": 2.2279, + "step": 437510 + }, + { + "epoch": 1.6913299624251983, + "grad_norm": 0.1164218857884407, + "learning_rate": 0.0007749285735109158, + "loss": 2.2144, + "step": 437520 + }, + { + "epoch": 1.6913686196285815, + "grad_norm": 0.10903254896402359, + "learning_rate": 0.0007747653286002718, + "loss": 2.2336, + "step": 437530 + }, + { + "epoch": 1.6914072768319648, + "grad_norm": 0.11387763917446136, + "learning_rate": 0.0007746021054367687, + "loss": 2.2441, + "step": 437540 + }, + { + "epoch": 1.691445934035348, + "grad_norm": 0.1062643975019455, + "learning_rate": 0.0007744389040117177, + "loss": 2.2491, + "step": 437550 + }, + { + "epoch": 1.6914845912387313, + "grad_norm": 0.11953529715538025, + "learning_rate": 0.0007742757243164351, + "loss": 2.241, + "step": 437560 + }, + { + "epoch": 1.6915232484421145, + "grad_norm": 0.1143541932106018, + "learning_rate": 0.0007741125663422437, + "loss": 2.2266, + "step": 437570 + }, + { + "epoch": 1.691561905645498, + "grad_norm": 0.11018265038728714, + "learning_rate": 0.0007739494300804717, + "loss": 2.2375, + "step": 437580 + }, + { + "epoch": 1.6916005628488813, + "grad_norm": 0.10305047035217285, + "learning_rate": 0.0007737863155224534, + "loss": 2.2377, + "step": 437590 + }, + { + "epoch": 1.6916392200522645, + "grad_norm": 0.11621309071779251, + "learning_rate": 0.0007736232226595288, + "loss": 2.2271, + "step": 437600 + }, + { + "epoch": 1.6916778772556478, + "grad_norm": 0.11574336886405945, + "learning_rate": 0.000773460151483043, + "loss": 2.2305, + "step": 437610 + }, + { + "epoch": 1.6917165344590313, + "grad_norm": 0.1166810542345047, + "learning_rate": 0.0007732971019843478, + "loss": 2.2445, + "step": 437620 + }, + { + "epoch": 1.6917551916624145, + "grad_norm": 0.12316501885652542, + "learning_rate": 0.0007731340741547999, + "loss": 2.2501, + "step": 437630 + }, + { + "epoch": 1.6917938488657978, + "grad_norm": 0.11434192955493927, + "learning_rate": 0.0007729710679857626, + "loss": 2.2479, + "step": 437640 + }, + { + "epoch": 1.691832506069181, + "grad_norm": 0.11277862638235092, + "learning_rate": 0.0007728080834686045, + "loss": 2.235, + "step": 437650 + }, + { + "epoch": 1.6918711632725643, + "grad_norm": 0.12093667685985565, + "learning_rate": 0.0007726451205946993, + "loss": 2.2313, + "step": 437660 + }, + { + "epoch": 1.6919098204759475, + "grad_norm": 0.11708587408065796, + "learning_rate": 0.000772482179355428, + "loss": 2.2413, + "step": 437670 + }, + { + "epoch": 1.6919484776793308, + "grad_norm": 0.11986162513494492, + "learning_rate": 0.0007723192597421755, + "loss": 2.2377, + "step": 437680 + }, + { + "epoch": 1.691987134882714, + "grad_norm": 0.12617745995521545, + "learning_rate": 0.0007721563617463338, + "loss": 2.247, + "step": 437690 + }, + { + "epoch": 1.6920257920860973, + "grad_norm": 0.11964748054742813, + "learning_rate": 0.0007719934853592998, + "loss": 2.2231, + "step": 437700 + }, + { + "epoch": 1.6920644492894805, + "grad_norm": 0.11791455000638962, + "learning_rate": 0.0007718306305724768, + "loss": 2.2276, + "step": 437710 + }, + { + "epoch": 1.6921031064928638, + "grad_norm": 0.12315932661294937, + "learning_rate": 0.000771667797377273, + "loss": 2.2186, + "step": 437720 + }, + { + "epoch": 1.692141763696247, + "grad_norm": 0.11123772710561752, + "learning_rate": 0.0007715049857651029, + "loss": 2.2471, + "step": 437730 + }, + { + "epoch": 1.6921804208996303, + "grad_norm": 0.11152539402246475, + "learning_rate": 0.0007713421957273865, + "loss": 2.2474, + "step": 437740 + }, + { + "epoch": 1.6922190781030138, + "grad_norm": 0.1093481257557869, + "learning_rate": 0.0007711794272555493, + "loss": 2.2447, + "step": 437750 + }, + { + "epoch": 1.692257735306397, + "grad_norm": 0.11701294034719467, + "learning_rate": 0.0007710166803410228, + "loss": 2.2304, + "step": 437760 + }, + { + "epoch": 1.6922963925097803, + "grad_norm": 0.1318010836839676, + "learning_rate": 0.0007708539549752438, + "loss": 2.2436, + "step": 437770 + }, + { + "epoch": 1.6923350497131635, + "grad_norm": 0.1183793693780899, + "learning_rate": 0.0007706912511496553, + "loss": 2.2321, + "step": 437780 + }, + { + "epoch": 1.692373706916547, + "grad_norm": 0.1210879236459732, + "learning_rate": 0.0007705285688557053, + "loss": 2.2202, + "step": 437790 + }, + { + "epoch": 1.6924123641199302, + "grad_norm": 0.10407286137342453, + "learning_rate": 0.0007703659080848482, + "loss": 2.2356, + "step": 437800 + }, + { + "epoch": 1.6924510213233135, + "grad_norm": 0.13408103585243225, + "learning_rate": 0.0007702032688285434, + "loss": 2.2296, + "step": 437810 + }, + { + "epoch": 1.6924896785266967, + "grad_norm": 0.11854076385498047, + "learning_rate": 0.0007700406510782562, + "loss": 2.2367, + "step": 437820 + }, + { + "epoch": 1.69252833573008, + "grad_norm": 0.12286543101072311, + "learning_rate": 0.0007698780548254575, + "loss": 2.2357, + "step": 437830 + }, + { + "epoch": 1.6925669929334632, + "grad_norm": 0.11990126222372055, + "learning_rate": 0.0007697154800616242, + "loss": 2.2409, + "step": 437840 + }, + { + "epoch": 1.6926056501368465, + "grad_norm": 0.11182884126901627, + "learning_rate": 0.000769552926778238, + "loss": 2.247, + "step": 437850 + }, + { + "epoch": 1.6926443073402297, + "grad_norm": 0.10320017486810684, + "learning_rate": 0.0007693903949667874, + "loss": 2.2336, + "step": 437860 + }, + { + "epoch": 1.692682964543613, + "grad_norm": 0.1216001883149147, + "learning_rate": 0.000769227884618765, + "loss": 2.2217, + "step": 437870 + }, + { + "epoch": 1.6927216217469963, + "grad_norm": 0.11251985281705856, + "learning_rate": 0.0007690653957256705, + "loss": 2.2181, + "step": 437880 + }, + { + "epoch": 1.6927602789503795, + "grad_norm": 0.11518998444080353, + "learning_rate": 0.0007689029282790086, + "loss": 2.2333, + "step": 437890 + }, + { + "epoch": 1.6927989361537628, + "grad_norm": 0.10871678590774536, + "learning_rate": 0.0007687404822702892, + "loss": 2.2404, + "step": 437900 + }, + { + "epoch": 1.692837593357146, + "grad_norm": 0.11232757568359375, + "learning_rate": 0.0007685780576910284, + "loss": 2.2283, + "step": 437910 + }, + { + "epoch": 1.6928762505605295, + "grad_norm": 0.11824655532836914, + "learning_rate": 0.0007684156545327478, + "loss": 2.2392, + "step": 437920 + }, + { + "epoch": 1.6929149077639127, + "grad_norm": 0.12078500539064407, + "learning_rate": 0.0007682532727869742, + "loss": 2.2307, + "step": 437930 + }, + { + "epoch": 1.692953564967296, + "grad_norm": 0.11573991179466248, + "learning_rate": 0.0007680909124452404, + "loss": 2.2287, + "step": 437940 + }, + { + "epoch": 1.6929922221706792, + "grad_norm": 0.1250699907541275, + "learning_rate": 0.0007679285734990849, + "loss": 2.2252, + "step": 437950 + }, + { + "epoch": 1.6930308793740627, + "grad_norm": 0.12099843472242355, + "learning_rate": 0.0007677662559400509, + "loss": 2.2479, + "step": 437960 + }, + { + "epoch": 1.693069536577446, + "grad_norm": 0.11248032003641129, + "learning_rate": 0.0007676039597596884, + "loss": 2.2282, + "step": 437970 + }, + { + "epoch": 1.6931081937808292, + "grad_norm": 0.12657272815704346, + "learning_rate": 0.0007674416849495518, + "loss": 2.2384, + "step": 437980 + }, + { + "epoch": 1.6931468509842125, + "grad_norm": 0.15892231464385986, + "learning_rate": 0.0007672794315012019, + "loss": 2.2418, + "step": 437990 + }, + { + "epoch": 1.6931855081875957, + "grad_norm": 0.11984408646821976, + "learning_rate": 0.0007671171994062048, + "loss": 2.2419, + "step": 438000 + }, + { + "epoch": 1.693224165390979, + "grad_norm": 0.11554060876369476, + "learning_rate": 0.0007669549886561318, + "loss": 2.246, + "step": 438010 + }, + { + "epoch": 1.6932628225943622, + "grad_norm": 0.11163786053657532, + "learning_rate": 0.0007667927992425606, + "loss": 2.2258, + "step": 438020 + }, + { + "epoch": 1.6933014797977455, + "grad_norm": 0.1620757281780243, + "learning_rate": 0.000766630631157073, + "loss": 2.237, + "step": 438030 + }, + { + "epoch": 1.6933401370011287, + "grad_norm": 0.11534059047698975, + "learning_rate": 0.000766468484391258, + "loss": 2.2374, + "step": 438040 + }, + { + "epoch": 1.693378794204512, + "grad_norm": 0.10129819810390472, + "learning_rate": 0.0007663063589367091, + "loss": 2.2325, + "step": 438050 + }, + { + "epoch": 1.6934174514078952, + "grad_norm": 0.10799936205148697, + "learning_rate": 0.0007661442547850255, + "loss": 2.2235, + "step": 438060 + }, + { + "epoch": 1.6934561086112785, + "grad_norm": 0.12132129818201065, + "learning_rate": 0.0007659821719278122, + "loss": 2.2319, + "step": 438070 + }, + { + "epoch": 1.6934947658146617, + "grad_norm": 0.11375699192285538, + "learning_rate": 0.0007658201103566788, + "loss": 2.2386, + "step": 438080 + }, + { + "epoch": 1.6935334230180452, + "grad_norm": 0.1231660544872284, + "learning_rate": 0.0007656580700632421, + "loss": 2.2262, + "step": 438090 + }, + { + "epoch": 1.6935720802214285, + "grad_norm": 0.11016866564750671, + "learning_rate": 0.0007654960510391229, + "loss": 2.2417, + "step": 438100 + }, + { + "epoch": 1.6936107374248117, + "grad_norm": 0.11380646377801895, + "learning_rate": 0.0007653340532759479, + "loss": 2.2404, + "step": 438110 + }, + { + "epoch": 1.693649394628195, + "grad_norm": 0.11561800539493561, + "learning_rate": 0.0007651720767653497, + "loss": 2.2405, + "step": 438120 + }, + { + "epoch": 1.6936880518315784, + "grad_norm": 0.11656072735786438, + "learning_rate": 0.0007650101214989655, + "loss": 2.2353, + "step": 438130 + }, + { + "epoch": 1.6937267090349617, + "grad_norm": 0.12446340918540955, + "learning_rate": 0.0007648481874684393, + "loss": 2.2307, + "step": 438140 + }, + { + "epoch": 1.693765366238345, + "grad_norm": 0.11343683302402496, + "learning_rate": 0.0007646862746654193, + "loss": 2.2288, + "step": 438150 + }, + { + "epoch": 1.6938040234417282, + "grad_norm": 0.12251932173967361, + "learning_rate": 0.00076452438308156, + "loss": 2.2151, + "step": 438160 + }, + { + "epoch": 1.6938426806451115, + "grad_norm": 0.11948093771934509, + "learning_rate": 0.000764362512708521, + "loss": 2.2239, + "step": 438170 + }, + { + "epoch": 1.6938813378484947, + "grad_norm": 0.11201823502779007, + "learning_rate": 0.0007642006635379674, + "loss": 2.2578, + "step": 438180 + }, + { + "epoch": 1.693919995051878, + "grad_norm": 0.12762024998664856, + "learning_rate": 0.0007640388355615699, + "loss": 2.2318, + "step": 438190 + }, + { + "epoch": 1.6939586522552612, + "grad_norm": 0.10953720659017563, + "learning_rate": 0.0007638770287710046, + "loss": 2.225, + "step": 438200 + }, + { + "epoch": 1.6939973094586445, + "grad_norm": 0.10989776998758316, + "learning_rate": 0.0007637152431579529, + "loss": 2.2204, + "step": 438210 + }, + { + "epoch": 1.6940359666620277, + "grad_norm": 0.11345095187425613, + "learning_rate": 0.0007635534787141014, + "loss": 2.2301, + "step": 438220 + }, + { + "epoch": 1.694074623865411, + "grad_norm": 0.1317012459039688, + "learning_rate": 0.000763391735431143, + "loss": 2.2261, + "step": 438230 + }, + { + "epoch": 1.6941132810687942, + "grad_norm": 0.11273641139268875, + "learning_rate": 0.0007632300133007755, + "loss": 2.2358, + "step": 438240 + }, + { + "epoch": 1.6941519382721775, + "grad_norm": 0.11396468430757523, + "learning_rate": 0.0007630683123147018, + "loss": 2.2261, + "step": 438250 + }, + { + "epoch": 1.694190595475561, + "grad_norm": 0.12092260271310806, + "learning_rate": 0.0007629066324646307, + "loss": 2.2145, + "step": 438260 + }, + { + "epoch": 1.6942292526789442, + "grad_norm": 0.13460946083068848, + "learning_rate": 0.0007627449737422765, + "loss": 2.2435, + "step": 438270 + }, + { + "epoch": 1.6942679098823274, + "grad_norm": 0.11652474105358124, + "learning_rate": 0.0007625833361393586, + "loss": 2.2193, + "step": 438280 + }, + { + "epoch": 1.6943065670857107, + "grad_norm": 0.13675649464130402, + "learning_rate": 0.0007624217196476013, + "loss": 2.2361, + "step": 438290 + }, + { + "epoch": 1.6943452242890942, + "grad_norm": 0.11373266577720642, + "learning_rate": 0.000762260124258736, + "loss": 2.2246, + "step": 438300 + }, + { + "epoch": 1.6943838814924774, + "grad_norm": 0.1127764880657196, + "learning_rate": 0.0007620985499644975, + "loss": 2.232, + "step": 438310 + }, + { + "epoch": 1.6944225386958607, + "grad_norm": 0.12021084874868393, + "learning_rate": 0.0007619369967566272, + "loss": 2.2371, + "step": 438320 + }, + { + "epoch": 1.694461195899244, + "grad_norm": 0.11365853250026703, + "learning_rate": 0.0007617754646268715, + "loss": 2.2264, + "step": 438330 + }, + { + "epoch": 1.6944998531026272, + "grad_norm": 0.11727002263069153, + "learning_rate": 0.0007616139535669824, + "loss": 2.2274, + "step": 438340 + }, + { + "epoch": 1.6945385103060104, + "grad_norm": 0.11850646883249283, + "learning_rate": 0.0007614524635687172, + "loss": 2.2445, + "step": 438350 + }, + { + "epoch": 1.6945771675093937, + "grad_norm": 0.11157086491584778, + "learning_rate": 0.0007612909946238382, + "loss": 2.235, + "step": 438360 + }, + { + "epoch": 1.694615824712777, + "grad_norm": 0.11376459151506424, + "learning_rate": 0.0007611295467241137, + "loss": 2.2364, + "step": 438370 + }, + { + "epoch": 1.6946544819161602, + "grad_norm": 0.12699368596076965, + "learning_rate": 0.0007609681198613169, + "loss": 2.2349, + "step": 438380 + }, + { + "epoch": 1.6946931391195434, + "grad_norm": 0.1290704905986786, + "learning_rate": 0.0007608067140272266, + "loss": 2.2258, + "step": 438390 + }, + { + "epoch": 1.6947317963229267, + "grad_norm": 0.11632368713617325, + "learning_rate": 0.0007606453292136266, + "loss": 2.2269, + "step": 438400 + }, + { + "epoch": 1.69477045352631, + "grad_norm": 0.12675462663173676, + "learning_rate": 0.0007604839654123066, + "loss": 2.2516, + "step": 438410 + }, + { + "epoch": 1.6948091107296934, + "grad_norm": 0.13376300036907196, + "learning_rate": 0.0007603226226150612, + "loss": 2.2244, + "step": 438420 + }, + { + "epoch": 1.6948477679330767, + "grad_norm": 0.10991213470697403, + "learning_rate": 0.0007601613008136906, + "loss": 2.2433, + "step": 438430 + }, + { + "epoch": 1.69488642513646, + "grad_norm": 0.11957073211669922, + "learning_rate": 0.00076, + "loss": 2.2275, + "step": 438440 + }, + { + "epoch": 1.6949250823398432, + "grad_norm": 0.12245447188615799, + "learning_rate": 0.0007598387201658004, + "loss": 2.2361, + "step": 438450 + }, + { + "epoch": 1.6949637395432264, + "grad_norm": 0.12726320326328278, + "learning_rate": 0.0007596774613029078, + "loss": 2.2432, + "step": 438460 + }, + { + "epoch": 1.69500239674661, + "grad_norm": 0.11078235507011414, + "learning_rate": 0.0007595162234031434, + "loss": 2.2305, + "step": 438470 + }, + { + "epoch": 1.6950410539499932, + "grad_norm": 0.1352330446243286, + "learning_rate": 0.0007593550064583344, + "loss": 2.2238, + "step": 438480 + }, + { + "epoch": 1.6950797111533764, + "grad_norm": 0.1313025951385498, + "learning_rate": 0.000759193810460312, + "loss": 2.2367, + "step": 438490 + }, + { + "epoch": 1.6951183683567597, + "grad_norm": 0.13372932374477386, + "learning_rate": 0.0007590326354009143, + "loss": 2.2524, + "step": 438500 + }, + { + "epoch": 1.695157025560143, + "grad_norm": 0.11021479964256287, + "learning_rate": 0.0007588714812719837, + "loss": 2.2203, + "step": 438510 + }, + { + "epoch": 1.6951956827635262, + "grad_norm": 0.12476097792387009, + "learning_rate": 0.0007587103480653679, + "loss": 2.2314, + "step": 438520 + }, + { + "epoch": 1.6952343399669094, + "grad_norm": 0.12518496811389923, + "learning_rate": 0.0007585492357729205, + "loss": 2.2402, + "step": 438530 + }, + { + "epoch": 1.6952729971702927, + "grad_norm": 0.11186251789331436, + "learning_rate": 0.0007583881443864995, + "loss": 2.2293, + "step": 438540 + }, + { + "epoch": 1.695311654373676, + "grad_norm": 0.13721270859241486, + "learning_rate": 0.0007582270738979691, + "loss": 2.2125, + "step": 438550 + }, + { + "epoch": 1.6953503115770592, + "grad_norm": 0.11518587917089462, + "learning_rate": 0.0007580660242991981, + "loss": 2.2245, + "step": 438560 + }, + { + "epoch": 1.6953889687804424, + "grad_norm": 0.12563805282115936, + "learning_rate": 0.0007579049955820609, + "loss": 2.2328, + "step": 438570 + }, + { + "epoch": 1.6954276259838257, + "grad_norm": 0.10423476248979568, + "learning_rate": 0.0007577439877384373, + "loss": 2.2422, + "step": 438580 + }, + { + "epoch": 1.6954662831872092, + "grad_norm": 0.11267977207899094, + "learning_rate": 0.0007575830007602118, + "loss": 2.238, + "step": 438590 + }, + { + "epoch": 1.6955049403905924, + "grad_norm": 0.11336838454008102, + "learning_rate": 0.0007574220346392746, + "loss": 2.2182, + "step": 438600 + }, + { + "epoch": 1.6955435975939757, + "grad_norm": 0.11938924342393875, + "learning_rate": 0.0007572610893675214, + "loss": 2.2387, + "step": 438610 + }, + { + "epoch": 1.695582254797359, + "grad_norm": 0.10956771671772003, + "learning_rate": 0.0007571001649368523, + "loss": 2.2408, + "step": 438620 + }, + { + "epoch": 1.6956209120007422, + "grad_norm": 0.1219608262181282, + "learning_rate": 0.0007569392613391735, + "loss": 2.2466, + "step": 438630 + }, + { + "epoch": 1.6956595692041256, + "grad_norm": 0.10723282396793365, + "learning_rate": 0.0007567783785663957, + "loss": 2.2179, + "step": 438640 + }, + { + "epoch": 1.695698226407509, + "grad_norm": 0.11632703244686127, + "learning_rate": 0.0007566175166104356, + "loss": 2.22, + "step": 438650 + }, + { + "epoch": 1.6957368836108921, + "grad_norm": 0.10991436243057251, + "learning_rate": 0.000756456675463215, + "loss": 2.2231, + "step": 438660 + }, + { + "epoch": 1.6957755408142754, + "grad_norm": 0.10563716292381287, + "learning_rate": 0.0007562958551166601, + "loss": 2.2178, + "step": 438670 + }, + { + "epoch": 1.6958141980176586, + "grad_norm": 0.11046472191810608, + "learning_rate": 0.0007561350555627031, + "loss": 2.2455, + "step": 438680 + }, + { + "epoch": 1.695852855221042, + "grad_norm": 0.11357564479112625, + "learning_rate": 0.0007559742767932811, + "loss": 2.2423, + "step": 438690 + }, + { + "epoch": 1.6958915124244252, + "grad_norm": 0.10833998769521713, + "learning_rate": 0.0007558135188003368, + "loss": 2.2279, + "step": 438700 + }, + { + "epoch": 1.6959301696278084, + "grad_norm": 0.11807499825954437, + "learning_rate": 0.0007556527815758176, + "loss": 2.2377, + "step": 438710 + }, + { + "epoch": 1.6959688268311917, + "grad_norm": 0.1214083805680275, + "learning_rate": 0.0007554920651116763, + "loss": 2.2414, + "step": 438720 + }, + { + "epoch": 1.696007484034575, + "grad_norm": 0.11282476037740707, + "learning_rate": 0.0007553313693998711, + "loss": 2.2357, + "step": 438730 + }, + { + "epoch": 1.6960461412379582, + "grad_norm": 0.1160525232553482, + "learning_rate": 0.0007551706944323651, + "loss": 2.2273, + "step": 438740 + }, + { + "epoch": 1.6960847984413414, + "grad_norm": 0.10337352007627487, + "learning_rate": 0.0007550100402011268, + "loss": 2.2308, + "step": 438750 + }, + { + "epoch": 1.6961234556447249, + "grad_norm": 0.11672735959291458, + "learning_rate": 0.0007548494066981295, + "loss": 2.2276, + "step": 438760 + }, + { + "epoch": 1.6961621128481081, + "grad_norm": 0.10167520493268967, + "learning_rate": 0.0007546887939153524, + "loss": 2.2383, + "step": 438770 + }, + { + "epoch": 1.6962007700514914, + "grad_norm": 0.1253589689731598, + "learning_rate": 0.0007545282018447788, + "loss": 2.2296, + "step": 438780 + }, + { + "epoch": 1.6962394272548746, + "grad_norm": 0.1127503514289856, + "learning_rate": 0.0007543676304783984, + "loss": 2.2529, + "step": 438790 + }, + { + "epoch": 1.6962780844582581, + "grad_norm": 0.11058793216943741, + "learning_rate": 0.000754207079808205, + "loss": 2.2298, + "step": 438800 + }, + { + "epoch": 1.6963167416616414, + "grad_norm": 0.13061091303825378, + "learning_rate": 0.0007540465498261983, + "loss": 2.2353, + "step": 438810 + }, + { + "epoch": 1.6963553988650246, + "grad_norm": 0.15342095494270325, + "learning_rate": 0.0007538860405243828, + "loss": 2.2369, + "step": 438820 + }, + { + "epoch": 1.6963940560684079, + "grad_norm": 0.11850441992282867, + "learning_rate": 0.0007537255518947683, + "loss": 2.2259, + "step": 438830 + }, + { + "epoch": 1.6964327132717911, + "grad_norm": 0.12837547063827515, + "learning_rate": 0.0007535650839293693, + "loss": 2.2321, + "step": 438840 + }, + { + "epoch": 1.6964713704751744, + "grad_norm": 0.11992836743593216, + "learning_rate": 0.0007534046366202063, + "loss": 2.2297, + "step": 438850 + }, + { + "epoch": 1.6965100276785576, + "grad_norm": 0.11032480746507645, + "learning_rate": 0.000753244209959304, + "loss": 2.2297, + "step": 438860 + }, + { + "epoch": 1.6965486848819409, + "grad_norm": 0.10852088034152985, + "learning_rate": 0.000753083803938693, + "loss": 2.2426, + "step": 438870 + }, + { + "epoch": 1.6965873420853241, + "grad_norm": 0.12265793234109879, + "learning_rate": 0.0007529234185504084, + "loss": 2.2353, + "step": 438880 + }, + { + "epoch": 1.6966259992887074, + "grad_norm": 0.12141349166631699, + "learning_rate": 0.0007527630537864909, + "loss": 2.2293, + "step": 438890 + }, + { + "epoch": 1.6966646564920906, + "grad_norm": 0.12046696990728378, + "learning_rate": 0.000752602709638986, + "loss": 2.2295, + "step": 438900 + }, + { + "epoch": 1.696703313695474, + "grad_norm": 0.11540514975786209, + "learning_rate": 0.0007524423860999445, + "loss": 2.2345, + "step": 438910 + }, + { + "epoch": 1.6967419708988571, + "grad_norm": 0.11729709804058075, + "learning_rate": 0.0007522820831614224, + "loss": 2.2502, + "step": 438920 + }, + { + "epoch": 1.6967806281022406, + "grad_norm": 0.12267860025167465, + "learning_rate": 0.0007521218008154802, + "loss": 2.2374, + "step": 438930 + }, + { + "epoch": 1.6968192853056239, + "grad_norm": 0.12478138506412506, + "learning_rate": 0.0007519615390541845, + "loss": 2.2198, + "step": 438940 + }, + { + "epoch": 1.6968579425090071, + "grad_norm": 0.11276324093341827, + "learning_rate": 0.000751801297869606, + "loss": 2.2365, + "step": 438950 + }, + { + "epoch": 1.6968965997123904, + "grad_norm": 0.11225543171167374, + "learning_rate": 0.0007516410772538211, + "loss": 2.2429, + "step": 438960 + }, + { + "epoch": 1.6969352569157738, + "grad_norm": 0.1132887527346611, + "learning_rate": 0.0007514808771989113, + "loss": 2.2426, + "step": 438970 + }, + { + "epoch": 1.696973914119157, + "grad_norm": 0.12351055443286896, + "learning_rate": 0.0007513206976969627, + "loss": 2.242, + "step": 438980 + }, + { + "epoch": 1.6970125713225404, + "grad_norm": 0.27838459610939026, + "learning_rate": 0.0007511605387400668, + "loss": 2.2265, + "step": 438990 + }, + { + "epoch": 1.6970512285259236, + "grad_norm": 0.12767164409160614, + "learning_rate": 0.0007510004003203203, + "loss": 2.2389, + "step": 439000 + }, + { + "epoch": 1.6970898857293069, + "grad_norm": 0.125470831990242, + "learning_rate": 0.0007508402824298249, + "loss": 2.245, + "step": 439010 + }, + { + "epoch": 1.69712854293269, + "grad_norm": 0.10499616712331772, + "learning_rate": 0.0007506801850606868, + "loss": 2.2235, + "step": 439020 + }, + { + "epoch": 1.6971672001360734, + "grad_norm": 0.12618540227413177, + "learning_rate": 0.0007505201082050179, + "loss": 2.2373, + "step": 439030 + }, + { + "epoch": 1.6972058573394566, + "grad_norm": 0.1158444806933403, + "learning_rate": 0.0007503600518549352, + "loss": 2.2312, + "step": 439040 + }, + { + "epoch": 1.6972445145428399, + "grad_norm": 0.11925296485424042, + "learning_rate": 0.0007502000160025606, + "loss": 2.229, + "step": 439050 + }, + { + "epoch": 1.6972831717462231, + "grad_norm": 0.10973548889160156, + "learning_rate": 0.0007500400006400206, + "loss": 2.2363, + "step": 439060 + }, + { + "epoch": 1.6973218289496064, + "grad_norm": 0.12064634263515472, + "learning_rate": 0.000749880005759447, + "loss": 2.2325, + "step": 439070 + }, + { + "epoch": 1.6973604861529896, + "grad_norm": 0.11418332904577255, + "learning_rate": 0.0007497200313529772, + "loss": 2.2384, + "step": 439080 + }, + { + "epoch": 1.6973991433563729, + "grad_norm": 0.1222028136253357, + "learning_rate": 0.0007495600774127531, + "loss": 2.242, + "step": 439090 + }, + { + "epoch": 1.6974378005597563, + "grad_norm": 0.13102611899375916, + "learning_rate": 0.0007494001439309214, + "loss": 2.2403, + "step": 439100 + }, + { + "epoch": 1.6974764577631396, + "grad_norm": 0.12919282913208008, + "learning_rate": 0.0007492402308996345, + "loss": 2.2329, + "step": 439110 + }, + { + "epoch": 1.6975151149665229, + "grad_norm": 0.11866680532693863, + "learning_rate": 0.0007490803383110489, + "loss": 2.2272, + "step": 439120 + }, + { + "epoch": 1.697553772169906, + "grad_norm": 0.11926010996103287, + "learning_rate": 0.000748920466157327, + "loss": 2.2397, + "step": 439130 + }, + { + "epoch": 1.6975924293732896, + "grad_norm": 0.12326746433973312, + "learning_rate": 0.0007487606144306358, + "loss": 2.2397, + "step": 439140 + }, + { + "epoch": 1.6976310865766728, + "grad_norm": 0.11257810145616531, + "learning_rate": 0.0007486007831231474, + "loss": 2.2453, + "step": 439150 + }, + { + "epoch": 1.697669743780056, + "grad_norm": 0.11413078755140305, + "learning_rate": 0.0007484409722270386, + "loss": 2.2264, + "step": 439160 + }, + { + "epoch": 1.6977084009834393, + "grad_norm": 0.11529600620269775, + "learning_rate": 0.0007482811817344919, + "loss": 2.2171, + "step": 439170 + }, + { + "epoch": 1.6977470581868226, + "grad_norm": 0.11455454677343369, + "learning_rate": 0.000748121411637694, + "loss": 2.2401, + "step": 439180 + }, + { + "epoch": 1.6977857153902058, + "grad_norm": 0.10799404233694077, + "learning_rate": 0.0007479616619288369, + "loss": 2.2538, + "step": 439190 + }, + { + "epoch": 1.697824372593589, + "grad_norm": 0.12358862161636353, + "learning_rate": 0.0007478019326001178, + "loss": 2.2365, + "step": 439200 + }, + { + "epoch": 1.6978630297969723, + "grad_norm": 0.11603404581546783, + "learning_rate": 0.0007476422236437386, + "loss": 2.2275, + "step": 439210 + }, + { + "epoch": 1.6979016870003556, + "grad_norm": 0.12009704113006592, + "learning_rate": 0.0007474825350519059, + "loss": 2.2362, + "step": 439220 + }, + { + "epoch": 1.6979403442037388, + "grad_norm": 0.40134766697883606, + "learning_rate": 0.0007473228668168321, + "loss": 2.2192, + "step": 439230 + }, + { + "epoch": 1.697979001407122, + "grad_norm": 0.12011298537254333, + "learning_rate": 0.000747163218930734, + "loss": 2.2333, + "step": 439240 + }, + { + "epoch": 1.6980176586105054, + "grad_norm": 0.11388115584850311, + "learning_rate": 0.0007470035913858333, + "loss": 2.2308, + "step": 439250 + }, + { + "epoch": 1.6980563158138886, + "grad_norm": 0.11122046411037445, + "learning_rate": 0.0007468439841743567, + "loss": 2.2393, + "step": 439260 + }, + { + "epoch": 1.698094973017272, + "grad_norm": 0.12878040969371796, + "learning_rate": 0.0007466843972885362, + "loss": 2.225, + "step": 439270 + }, + { + "epoch": 1.6981336302206553, + "grad_norm": 0.12966978549957275, + "learning_rate": 0.0007465248307206083, + "loss": 2.2251, + "step": 439280 + }, + { + "epoch": 1.6981722874240386, + "grad_norm": 0.12152548879384995, + "learning_rate": 0.0007463652844628143, + "loss": 2.2385, + "step": 439290 + }, + { + "epoch": 1.6982109446274218, + "grad_norm": 0.11005580425262451, + "learning_rate": 0.0007462057585074015, + "loss": 2.2247, + "step": 439300 + }, + { + "epoch": 1.6982496018308053, + "grad_norm": 0.11342678219079971, + "learning_rate": 0.0007460462528466212, + "loss": 2.2347, + "step": 439310 + }, + { + "epoch": 1.6982882590341886, + "grad_norm": 0.1343384087085724, + "learning_rate": 0.0007458867674727294, + "loss": 2.2522, + "step": 439320 + }, + { + "epoch": 1.6983269162375718, + "grad_norm": 0.12136346101760864, + "learning_rate": 0.0007457273023779878, + "loss": 2.2129, + "step": 439330 + }, + { + "epoch": 1.698365573440955, + "grad_norm": 0.1206677258014679, + "learning_rate": 0.0007455678575546623, + "loss": 2.2405, + "step": 439340 + }, + { + "epoch": 1.6984042306443383, + "grad_norm": 0.12405674904584885, + "learning_rate": 0.0007454084329950244, + "loss": 2.226, + "step": 439350 + }, + { + "epoch": 1.6984428878477216, + "grad_norm": 0.11063183844089508, + "learning_rate": 0.0007452490286913502, + "loss": 2.2321, + "step": 439360 + }, + { + "epoch": 1.6984815450511048, + "grad_norm": 0.11682227998971939, + "learning_rate": 0.0007450896446359206, + "loss": 2.2382, + "step": 439370 + }, + { + "epoch": 1.698520202254488, + "grad_norm": 0.23449300229549408, + "learning_rate": 0.0007449302808210215, + "loss": 2.2405, + "step": 439380 + }, + { + "epoch": 1.6985588594578713, + "grad_norm": 0.10931873321533203, + "learning_rate": 0.0007447709372389437, + "loss": 2.2459, + "step": 439390 + }, + { + "epoch": 1.6985975166612546, + "grad_norm": 0.11637139320373535, + "learning_rate": 0.0007446116138819827, + "loss": 2.2302, + "step": 439400 + }, + { + "epoch": 1.6986361738646378, + "grad_norm": 0.12564273178577423, + "learning_rate": 0.0007444523107424394, + "loss": 2.2344, + "step": 439410 + }, + { + "epoch": 1.698674831068021, + "grad_norm": 0.12010937184095383, + "learning_rate": 0.000744293027812619, + "loss": 2.235, + "step": 439420 + }, + { + "epoch": 1.6987134882714043, + "grad_norm": 0.11572326719760895, + "learning_rate": 0.0007441337650848321, + "loss": 2.2332, + "step": 439430 + }, + { + "epoch": 1.6987521454747878, + "grad_norm": 0.10662706196308136, + "learning_rate": 0.0007439745225513934, + "loss": 2.2284, + "step": 439440 + }, + { + "epoch": 1.698790802678171, + "grad_norm": 0.11132440716028214, + "learning_rate": 0.0007438153002046235, + "loss": 2.2324, + "step": 439450 + }, + { + "epoch": 1.6988294598815543, + "grad_norm": 0.13395658135414124, + "learning_rate": 0.0007436560980368472, + "loss": 2.2202, + "step": 439460 + }, + { + "epoch": 1.6988681170849376, + "grad_norm": 0.12403550744056702, + "learning_rate": 0.0007434969160403944, + "loss": 2.221, + "step": 439470 + }, + { + "epoch": 1.698906774288321, + "grad_norm": 0.12234672158956528, + "learning_rate": 0.0007433377542075994, + "loss": 2.2176, + "step": 439480 + }, + { + "epoch": 1.6989454314917043, + "grad_norm": 0.11924166977405548, + "learning_rate": 0.000743178612530802, + "loss": 2.2254, + "step": 439490 + }, + { + "epoch": 1.6989840886950875, + "grad_norm": 0.11710778623819351, + "learning_rate": 0.0007430194910023465, + "loss": 2.2276, + "step": 439500 + }, + { + "epoch": 1.6990227458984708, + "grad_norm": 0.1245732381939888, + "learning_rate": 0.0007428603896145823, + "loss": 2.2331, + "step": 439510 + }, + { + "epoch": 1.699061403101854, + "grad_norm": 0.11419229954481125, + "learning_rate": 0.000742701308359863, + "loss": 2.2187, + "step": 439520 + }, + { + "epoch": 1.6991000603052373, + "grad_norm": 0.11181352287530899, + "learning_rate": 0.0007425422472305481, + "loss": 2.2307, + "step": 439530 + }, + { + "epoch": 1.6991387175086206, + "grad_norm": 0.10994283109903336, + "learning_rate": 0.000742383206219001, + "loss": 2.2313, + "step": 439540 + }, + { + "epoch": 1.6991773747120038, + "grad_norm": 0.1081324964761734, + "learning_rate": 0.0007422241853175899, + "loss": 2.2315, + "step": 439550 + }, + { + "epoch": 1.699216031915387, + "grad_norm": 0.1146218553185463, + "learning_rate": 0.0007420651845186889, + "loss": 2.2244, + "step": 439560 + }, + { + "epoch": 1.6992546891187703, + "grad_norm": 0.10938213020563126, + "learning_rate": 0.0007419062038146758, + "loss": 2.2324, + "step": 439570 + }, + { + "epoch": 1.6992933463221536, + "grad_norm": 0.12179584056138992, + "learning_rate": 0.0007417472431979338, + "loss": 2.2156, + "step": 439580 + }, + { + "epoch": 1.6993320035255368, + "grad_norm": 0.14004698395729065, + "learning_rate": 0.0007415883026608503, + "loss": 2.2273, + "step": 439590 + }, + { + "epoch": 1.69937066072892, + "grad_norm": 0.10983067005872726, + "learning_rate": 0.0007414293821958182, + "loss": 2.2346, + "step": 439600 + }, + { + "epoch": 1.6994093179323035, + "grad_norm": 0.11252199113368988, + "learning_rate": 0.0007412704817952349, + "loss": 2.2324, + "step": 439610 + }, + { + "epoch": 1.6994479751356868, + "grad_norm": 0.11938013881444931, + "learning_rate": 0.0007411116014515027, + "loss": 2.2355, + "step": 439620 + }, + { + "epoch": 1.69948663233907, + "grad_norm": 0.100449338555336, + "learning_rate": 0.0007409527411570287, + "loss": 2.2268, + "step": 439630 + }, + { + "epoch": 1.6995252895424533, + "grad_norm": 0.11269817501306534, + "learning_rate": 0.0007407939009042246, + "loss": 2.2361, + "step": 439640 + }, + { + "epoch": 1.6995639467458368, + "grad_norm": 0.12172277271747589, + "learning_rate": 0.0007406350806855067, + "loss": 2.2153, + "step": 439650 + }, + { + "epoch": 1.69960260394922, + "grad_norm": 0.1266396939754486, + "learning_rate": 0.000740476280493297, + "loss": 2.2371, + "step": 439660 + }, + { + "epoch": 1.6996412611526033, + "grad_norm": 0.1194564625620842, + "learning_rate": 0.0007403175003200211, + "loss": 2.2284, + "step": 439670 + }, + { + "epoch": 1.6996799183559865, + "grad_norm": 0.11998004466295242, + "learning_rate": 0.0007401587401581102, + "loss": 2.2247, + "step": 439680 + }, + { + "epoch": 1.6997185755593698, + "grad_norm": 0.11383472383022308, + "learning_rate": 0.00074, + "loss": 2.2396, + "step": 439690 + }, + { + "epoch": 1.699757232762753, + "grad_norm": 0.11866569519042969, + "learning_rate": 0.0007398412798381308, + "loss": 2.2296, + "step": 439700 + }, + { + "epoch": 1.6997958899661363, + "grad_norm": 0.1243966594338417, + "learning_rate": 0.000739682579664948, + "loss": 2.2135, + "step": 439710 + }, + { + "epoch": 1.6998345471695195, + "grad_norm": 0.11566292494535446, + "learning_rate": 0.0007395238994729016, + "loss": 2.2231, + "step": 439720 + }, + { + "epoch": 1.6998732043729028, + "grad_norm": 0.10893825441598892, + "learning_rate": 0.0007393652392544461, + "loss": 2.2209, + "step": 439730 + }, + { + "epoch": 1.699911861576286, + "grad_norm": 0.1245025172829628, + "learning_rate": 0.0007392065990020412, + "loss": 2.2291, + "step": 439740 + }, + { + "epoch": 1.6999505187796693, + "grad_norm": 0.11219801008701324, + "learning_rate": 0.0007390479787081508, + "loss": 2.2277, + "step": 439750 + }, + { + "epoch": 1.6999891759830525, + "grad_norm": 0.11668980866670609, + "learning_rate": 0.0007388893783652444, + "loss": 2.2231, + "step": 439760 + }, + { + "epoch": 1.7000278331864358, + "grad_norm": 0.1189895048737526, + "learning_rate": 0.0007387307979657952, + "loss": 2.2323, + "step": 439770 + }, + { + "epoch": 1.7000664903898193, + "grad_norm": 0.11900747567415237, + "learning_rate": 0.0007385722375022817, + "loss": 2.2281, + "step": 439780 + }, + { + "epoch": 1.7001051475932025, + "grad_norm": 0.11199655383825302, + "learning_rate": 0.0007384136969671874, + "loss": 2.2401, + "step": 439790 + }, + { + "epoch": 1.7001438047965858, + "grad_norm": 0.12800821661949158, + "learning_rate": 0.0007382551763529996, + "loss": 2.2334, + "step": 439800 + }, + { + "epoch": 1.700182461999969, + "grad_norm": 0.12204767763614655, + "learning_rate": 0.0007380966756522114, + "loss": 2.2353, + "step": 439810 + }, + { + "epoch": 1.7002211192033525, + "grad_norm": 0.11441326886415482, + "learning_rate": 0.0007379381948573201, + "loss": 2.2297, + "step": 439820 + }, + { + "epoch": 1.7002597764067358, + "grad_norm": 0.11331064999103546, + "learning_rate": 0.0007377797339608272, + "loss": 2.2188, + "step": 439830 + }, + { + "epoch": 1.700298433610119, + "grad_norm": 0.11376959830522537, + "learning_rate": 0.00073762129295524, + "loss": 2.2265, + "step": 439840 + }, + { + "epoch": 1.7003370908135023, + "grad_norm": 0.5333940386772156, + "learning_rate": 0.0007374628718330696, + "loss": 2.2425, + "step": 439850 + }, + { + "epoch": 1.7003757480168855, + "grad_norm": 0.11315008997917175, + "learning_rate": 0.0007373044705868321, + "loss": 2.2499, + "step": 439860 + }, + { + "epoch": 1.7004144052202688, + "grad_norm": 0.11870913207530975, + "learning_rate": 0.0007371460892090487, + "loss": 2.2377, + "step": 439870 + }, + { + "epoch": 1.700453062423652, + "grad_norm": 0.11550546437501907, + "learning_rate": 0.0007369877276922445, + "loss": 2.2279, + "step": 439880 + }, + { + "epoch": 1.7004917196270353, + "grad_norm": 0.11039859801530838, + "learning_rate": 0.0007368293860289498, + "loss": 2.2327, + "step": 439890 + }, + { + "epoch": 1.7005303768304185, + "grad_norm": 0.12192100286483765, + "learning_rate": 0.0007366710642116994, + "loss": 2.2262, + "step": 439900 + }, + { + "epoch": 1.7005690340338018, + "grad_norm": 0.11512763053178787, + "learning_rate": 0.000736512762233033, + "loss": 2.2384, + "step": 439910 + }, + { + "epoch": 1.700607691237185, + "grad_norm": 0.1126255914568901, + "learning_rate": 0.0007363544800854948, + "loss": 2.2286, + "step": 439920 + }, + { + "epoch": 1.7006463484405683, + "grad_norm": 0.12926539778709412, + "learning_rate": 0.0007361962177616338, + "loss": 2.2213, + "step": 439930 + }, + { + "epoch": 1.7006850056439515, + "grad_norm": 0.13191105425357819, + "learning_rate": 0.0007360379752540031, + "loss": 2.2367, + "step": 439940 + }, + { + "epoch": 1.700723662847335, + "grad_norm": 0.12585218250751495, + "learning_rate": 0.0007358797525551612, + "loss": 2.2326, + "step": 439950 + }, + { + "epoch": 1.7007623200507183, + "grad_norm": 0.12355031073093414, + "learning_rate": 0.0007357215496576712, + "loss": 2.2416, + "step": 439960 + }, + { + "epoch": 1.7008009772541015, + "grad_norm": 0.11259719729423523, + "learning_rate": 0.0007355633665541006, + "loss": 2.2416, + "step": 439970 + }, + { + "epoch": 1.7008396344574848, + "grad_norm": 0.11513018608093262, + "learning_rate": 0.0007354052032370211, + "loss": 2.2359, + "step": 439980 + }, + { + "epoch": 1.7008782916608682, + "grad_norm": 0.12916196882724762, + "learning_rate": 0.0007352470596990099, + "loss": 2.2247, + "step": 439990 + }, + { + "epoch": 1.7009169488642515, + "grad_norm": 0.13871927559375763, + "learning_rate": 0.0007350889359326483, + "loss": 2.2284, + "step": 440000 + }, + { + "epoch": 1.7009556060676347, + "grad_norm": 0.11416509002447128, + "learning_rate": 0.0007349308319305226, + "loss": 2.2375, + "step": 440010 + }, + { + "epoch": 1.700994263271018, + "grad_norm": 0.13041462004184723, + "learning_rate": 0.000734772747685223, + "loss": 2.2287, + "step": 440020 + }, + { + "epoch": 1.7010329204744012, + "grad_norm": 0.12060035020112991, + "learning_rate": 0.0007346146831893457, + "loss": 2.2406, + "step": 440030 + }, + { + "epoch": 1.7010715776777845, + "grad_norm": 0.11347172409296036, + "learning_rate": 0.00073445663843549, + "loss": 2.2122, + "step": 440040 + }, + { + "epoch": 1.7011102348811677, + "grad_norm": 0.1186690703034401, + "learning_rate": 0.0007342986134162608, + "loss": 2.2252, + "step": 440050 + }, + { + "epoch": 1.701148892084551, + "grad_norm": 0.12383898347616196, + "learning_rate": 0.0007341406081242672, + "loss": 2.2199, + "step": 440060 + }, + { + "epoch": 1.7011875492879343, + "grad_norm": 0.10738757997751236, + "learning_rate": 0.0007339826225521231, + "loss": 2.2372, + "step": 440070 + }, + { + "epoch": 1.7012262064913175, + "grad_norm": 0.11477794498205185, + "learning_rate": 0.000733824656692447, + "loss": 2.2221, + "step": 440080 + }, + { + "epoch": 1.7012648636947008, + "grad_norm": 0.1131504625082016, + "learning_rate": 0.0007336667105378616, + "loss": 2.2349, + "step": 440090 + }, + { + "epoch": 1.701303520898084, + "grad_norm": 0.11446657031774521, + "learning_rate": 0.0007335087840809949, + "loss": 2.2232, + "step": 440100 + }, + { + "epoch": 1.7013421781014673, + "grad_norm": 0.11273016035556793, + "learning_rate": 0.000733350877314479, + "loss": 2.2224, + "step": 440110 + }, + { + "epoch": 1.7013808353048507, + "grad_norm": 0.12036816030740738, + "learning_rate": 0.0007331929902309509, + "loss": 2.2417, + "step": 440120 + }, + { + "epoch": 1.701419492508234, + "grad_norm": 0.10991771519184113, + "learning_rate": 0.0007330351228230516, + "loss": 2.2201, + "step": 440130 + }, + { + "epoch": 1.7014581497116172, + "grad_norm": 0.12521474063396454, + "learning_rate": 0.0007328772750834273, + "loss": 2.2289, + "step": 440140 + }, + { + "epoch": 1.7014968069150005, + "grad_norm": 0.11640115827322006, + "learning_rate": 0.000732719447004729, + "loss": 2.231, + "step": 440150 + }, + { + "epoch": 1.701535464118384, + "grad_norm": 0.10815521329641342, + "learning_rate": 0.0007325616385796112, + "loss": 2.2207, + "step": 440160 + }, + { + "epoch": 1.7015741213217672, + "grad_norm": 0.11253977566957474, + "learning_rate": 0.0007324038498007342, + "loss": 2.2248, + "step": 440170 + }, + { + "epoch": 1.7016127785251505, + "grad_norm": 0.14925464987754822, + "learning_rate": 0.0007322460806607616, + "loss": 2.241, + "step": 440180 + }, + { + "epoch": 1.7016514357285337, + "grad_norm": 0.11597122997045517, + "learning_rate": 0.000732088331152363, + "loss": 2.2306, + "step": 440190 + }, + { + "epoch": 1.701690092931917, + "grad_norm": 0.10843616724014282, + "learning_rate": 0.0007319306012682114, + "loss": 2.2004, + "step": 440200 + }, + { + "epoch": 1.7017287501353002, + "grad_norm": 0.12545384466648102, + "learning_rate": 0.0007317728910009849, + "loss": 2.2162, + "step": 440210 + }, + { + "epoch": 1.7017674073386835, + "grad_norm": 0.12916113436222076, + "learning_rate": 0.0007316152003433658, + "loss": 2.2277, + "step": 440220 + }, + { + "epoch": 1.7018060645420667, + "grad_norm": 0.11176025122404099, + "learning_rate": 0.0007314575292880417, + "loss": 2.2222, + "step": 440230 + }, + { + "epoch": 1.70184472174545, + "grad_norm": 0.10779713094234467, + "learning_rate": 0.0007312998778277036, + "loss": 2.2345, + "step": 440240 + }, + { + "epoch": 1.7018833789488332, + "grad_norm": 0.1066531166434288, + "learning_rate": 0.0007311422459550478, + "loss": 2.2192, + "step": 440250 + }, + { + "epoch": 1.7019220361522165, + "grad_norm": 0.10790924727916718, + "learning_rate": 0.0007309846336627754, + "loss": 2.2154, + "step": 440260 + }, + { + "epoch": 1.7019606933555997, + "grad_norm": 0.11977184563875198, + "learning_rate": 0.0007308270409435916, + "loss": 2.2333, + "step": 440270 + }, + { + "epoch": 1.7019993505589832, + "grad_norm": 0.11540991812944412, + "learning_rate": 0.0007306694677902055, + "loss": 2.2302, + "step": 440280 + }, + { + "epoch": 1.7020380077623665, + "grad_norm": 0.11778786778450012, + "learning_rate": 0.000730511914195332, + "loss": 2.2422, + "step": 440290 + }, + { + "epoch": 1.7020766649657497, + "grad_norm": 0.11950423568487167, + "learning_rate": 0.0007303543801516896, + "loss": 2.2264, + "step": 440300 + }, + { + "epoch": 1.702115322169133, + "grad_norm": 0.11625012010335922, + "learning_rate": 0.0007301968656520019, + "loss": 2.2313, + "step": 440310 + }, + { + "epoch": 1.7021539793725162, + "grad_norm": 0.11900748312473297, + "learning_rate": 0.0007300393706889965, + "loss": 2.2341, + "step": 440320 + }, + { + "epoch": 1.7021926365758997, + "grad_norm": 0.1217237040400505, + "learning_rate": 0.0007298818952554059, + "loss": 2.2381, + "step": 440330 + }, + { + "epoch": 1.702231293779283, + "grad_norm": 0.1119767427444458, + "learning_rate": 0.0007297244393439665, + "loss": 2.2379, + "step": 440340 + }, + { + "epoch": 1.7022699509826662, + "grad_norm": 0.11901693046092987, + "learning_rate": 0.0007295670029474202, + "loss": 2.2327, + "step": 440350 + }, + { + "epoch": 1.7023086081860495, + "grad_norm": 0.12310083210468292, + "learning_rate": 0.0007294095860585128, + "loss": 2.2318, + "step": 440360 + }, + { + "epoch": 1.7023472653894327, + "grad_norm": 0.11015886813402176, + "learning_rate": 0.0007292521886699943, + "loss": 2.2377, + "step": 440370 + }, + { + "epoch": 1.702385922592816, + "grad_norm": 0.11281030625104904, + "learning_rate": 0.0007290948107746195, + "loss": 2.2326, + "step": 440380 + }, + { + "epoch": 1.7024245797961992, + "grad_norm": 0.11704161763191223, + "learning_rate": 0.0007289374523651482, + "loss": 2.2232, + "step": 440390 + }, + { + "epoch": 1.7024632369995825, + "grad_norm": 0.13201874494552612, + "learning_rate": 0.0007287801134343437, + "loss": 2.2255, + "step": 440400 + }, + { + "epoch": 1.7025018942029657, + "grad_norm": 0.13003620505332947, + "learning_rate": 0.0007286227939749746, + "loss": 2.2309, + "step": 440410 + }, + { + "epoch": 1.702540551406349, + "grad_norm": 0.11561232060194016, + "learning_rate": 0.0007284654939798134, + "loss": 2.2376, + "step": 440420 + }, + { + "epoch": 1.7025792086097322, + "grad_norm": 0.13177193701267242, + "learning_rate": 0.0007283082134416374, + "loss": 2.2421, + "step": 440430 + }, + { + "epoch": 1.7026178658131155, + "grad_norm": 0.11514715850353241, + "learning_rate": 0.0007281509523532284, + "loss": 2.2364, + "step": 440440 + }, + { + "epoch": 1.702656523016499, + "grad_norm": 0.1128527820110321, + "learning_rate": 0.0007279937107073722, + "loss": 2.2416, + "step": 440450 + }, + { + "epoch": 1.7026951802198822, + "grad_norm": 0.10707538574934006, + "learning_rate": 0.00072783648849686, + "loss": 2.235, + "step": 440460 + }, + { + "epoch": 1.7027338374232655, + "grad_norm": 0.12871518731117249, + "learning_rate": 0.0007276792857144863, + "loss": 2.2221, + "step": 440470 + }, + { + "epoch": 1.7027724946266487, + "grad_norm": 0.1377457082271576, + "learning_rate": 0.0007275221023530507, + "loss": 2.216, + "step": 440480 + }, + { + "epoch": 1.702811151830032, + "grad_norm": 0.11359865963459015, + "learning_rate": 0.0007273649384053573, + "loss": 2.2297, + "step": 440490 + }, + { + "epoch": 1.7028498090334154, + "grad_norm": 0.12144174426794052, + "learning_rate": 0.0007272077938642145, + "loss": 2.2326, + "step": 440500 + }, + { + "epoch": 1.7028884662367987, + "grad_norm": 0.10958414524793625, + "learning_rate": 0.000727050668722435, + "loss": 2.2301, + "step": 440510 + }, + { + "epoch": 1.702927123440182, + "grad_norm": 0.12128295749425888, + "learning_rate": 0.0007268935629728361, + "loss": 2.2342, + "step": 440520 + }, + { + "epoch": 1.7029657806435652, + "grad_norm": 0.12747161090373993, + "learning_rate": 0.0007267364766082396, + "loss": 2.2195, + "step": 440530 + }, + { + "epoch": 1.7030044378469484, + "grad_norm": 0.10787870734930038, + "learning_rate": 0.0007265794096214715, + "loss": 2.234, + "step": 440540 + }, + { + "epoch": 1.7030430950503317, + "grad_norm": 0.10992727428674698, + "learning_rate": 0.0007264223620053625, + "loss": 2.2427, + "step": 440550 + }, + { + "epoch": 1.703081752253715, + "grad_norm": 0.2564323842525482, + "learning_rate": 0.0007262653337527474, + "loss": 2.2179, + "step": 440560 + }, + { + "epoch": 1.7031204094570982, + "grad_norm": 0.13141652941703796, + "learning_rate": 0.0007261083248564657, + "loss": 2.2201, + "step": 440570 + }, + { + "epoch": 1.7031590666604814, + "grad_norm": 0.11201170086860657, + "learning_rate": 0.0007259513353093612, + "loss": 2.2353, + "step": 440580 + }, + { + "epoch": 1.7031977238638647, + "grad_norm": 0.11595015227794647, + "learning_rate": 0.000725794365104282, + "loss": 2.2409, + "step": 440590 + }, + { + "epoch": 1.703236381067248, + "grad_norm": 0.127515509724617, + "learning_rate": 0.0007256374142340807, + "loss": 2.2457, + "step": 440600 + }, + { + "epoch": 1.7032750382706312, + "grad_norm": 0.11486254632472992, + "learning_rate": 0.0007254804826916144, + "loss": 2.2223, + "step": 440610 + }, + { + "epoch": 1.7033136954740147, + "grad_norm": 0.10647368431091309, + "learning_rate": 0.0007253235704697447, + "loss": 2.2236, + "step": 440620 + }, + { + "epoch": 1.703352352677398, + "grad_norm": 0.12386307865381241, + "learning_rate": 0.0007251666775613371, + "loss": 2.2227, + "step": 440630 + }, + { + "epoch": 1.7033910098807812, + "grad_norm": 0.11835476756095886, + "learning_rate": 0.0007250098039592619, + "loss": 2.2424, + "step": 440640 + }, + { + "epoch": 1.7034296670841644, + "grad_norm": 0.12118931114673615, + "learning_rate": 0.0007248529496563936, + "loss": 2.2326, + "step": 440650 + }, + { + "epoch": 1.703468324287548, + "grad_norm": 0.11157720535993576, + "learning_rate": 0.0007246961146456113, + "loss": 2.2188, + "step": 440660 + }, + { + "epoch": 1.7035069814909312, + "grad_norm": 0.11851327121257782, + "learning_rate": 0.000724539298919798, + "loss": 2.2229, + "step": 440670 + }, + { + "epoch": 1.7035456386943144, + "grad_norm": 0.11927254498004913, + "learning_rate": 0.000724382502471842, + "loss": 2.2241, + "step": 440680 + }, + { + "epoch": 1.7035842958976977, + "grad_norm": 0.11081274598836899, + "learning_rate": 0.0007242257252946351, + "loss": 2.2228, + "step": 440690 + }, + { + "epoch": 1.703622953101081, + "grad_norm": 0.11584481596946716, + "learning_rate": 0.0007240689673810736, + "loss": 2.2289, + "step": 440700 + }, + { + "epoch": 1.7036616103044642, + "grad_norm": 0.11202621459960938, + "learning_rate": 0.0007239122287240583, + "loss": 2.2206, + "step": 440710 + }, + { + "epoch": 1.7037002675078474, + "grad_norm": 0.14524437487125397, + "learning_rate": 0.0007237555093164947, + "loss": 2.2355, + "step": 440720 + }, + { + "epoch": 1.7037389247112307, + "grad_norm": 0.12101338058710098, + "learning_rate": 0.000723598809151292, + "loss": 2.2313, + "step": 440730 + }, + { + "epoch": 1.703777581914614, + "grad_norm": 0.11293715983629227, + "learning_rate": 0.0007234421282213641, + "loss": 2.2358, + "step": 440740 + }, + { + "epoch": 1.7038162391179972, + "grad_norm": 0.11404402554035187, + "learning_rate": 0.0007232854665196295, + "loss": 2.223, + "step": 440750 + }, + { + "epoch": 1.7038548963213804, + "grad_norm": 0.12247447669506073, + "learning_rate": 0.0007231288240390106, + "loss": 2.227, + "step": 440760 + }, + { + "epoch": 1.7038935535247637, + "grad_norm": 0.11357226222753525, + "learning_rate": 0.0007229722007724342, + "loss": 2.2241, + "step": 440770 + }, + { + "epoch": 1.703932210728147, + "grad_norm": 0.125160813331604, + "learning_rate": 0.0007228155967128318, + "loss": 2.2468, + "step": 440780 + }, + { + "epoch": 1.7039708679315304, + "grad_norm": 0.14034807682037354, + "learning_rate": 0.0007226590118531389, + "loss": 2.2248, + "step": 440790 + }, + { + "epoch": 1.7040095251349137, + "grad_norm": 0.12015635520219803, + "learning_rate": 0.000722502446186295, + "loss": 2.2198, + "step": 440800 + }, + { + "epoch": 1.704048182338297, + "grad_norm": 0.11873217672109604, + "learning_rate": 0.000722345899705245, + "loss": 2.2231, + "step": 440810 + }, + { + "epoch": 1.7040868395416802, + "grad_norm": 0.12021807581186295, + "learning_rate": 0.0007221893724029369, + "loss": 2.2297, + "step": 440820 + }, + { + "epoch": 1.7041254967450636, + "grad_norm": 0.12242366373538971, + "learning_rate": 0.0007220328642723242, + "loss": 2.2212, + "step": 440830 + }, + { + "epoch": 1.704164153948447, + "grad_norm": 0.11865365505218506, + "learning_rate": 0.0007218763753063636, + "loss": 2.2154, + "step": 440840 + }, + { + "epoch": 1.7042028111518301, + "grad_norm": 0.11381933093070984, + "learning_rate": 0.0007217199054980166, + "loss": 2.2479, + "step": 440850 + }, + { + "epoch": 1.7042414683552134, + "grad_norm": 0.1155766099691391, + "learning_rate": 0.000721563454840249, + "loss": 2.2197, + "step": 440860 + }, + { + "epoch": 1.7042801255585966, + "grad_norm": 0.11473380029201508, + "learning_rate": 0.0007214070233260313, + "loss": 2.2314, + "step": 440870 + }, + { + "epoch": 1.70431878276198, + "grad_norm": 0.11671894788742065, + "learning_rate": 0.0007212506109483377, + "loss": 2.2294, + "step": 440880 + }, + { + "epoch": 1.7043574399653632, + "grad_norm": 0.1232934519648552, + "learning_rate": 0.0007210942177001467, + "loss": 2.2311, + "step": 440890 + }, + { + "epoch": 1.7043960971687464, + "grad_norm": 0.125223308801651, + "learning_rate": 0.0007209378435744416, + "loss": 2.2277, + "step": 440900 + }, + { + "epoch": 1.7044347543721297, + "grad_norm": 0.11535106599330902, + "learning_rate": 0.0007207814885642095, + "loss": 2.2408, + "step": 440910 + }, + { + "epoch": 1.704473411575513, + "grad_norm": 0.1292526125907898, + "learning_rate": 0.0007206251526624418, + "loss": 2.2395, + "step": 440920 + }, + { + "epoch": 1.7045120687788962, + "grad_norm": 0.1217828020453453, + "learning_rate": 0.000720468835862135, + "loss": 2.2428, + "step": 440930 + }, + { + "epoch": 1.7045507259822794, + "grad_norm": 0.13024525344371796, + "learning_rate": 0.0007203125381562887, + "loss": 2.2379, + "step": 440940 + }, + { + "epoch": 1.7045893831856627, + "grad_norm": 0.11056360602378845, + "learning_rate": 0.0007201562595379077, + "loss": 2.2234, + "step": 440950 + }, + { + "epoch": 1.7046280403890461, + "grad_norm": 0.11324920505285263, + "learning_rate": 0.0007199999999999999, + "loss": 2.241, + "step": 440960 + }, + { + "epoch": 1.7046666975924294, + "grad_norm": 0.1242329403758049, + "learning_rate": 0.0007198437595355791, + "loss": 2.2321, + "step": 440970 + }, + { + "epoch": 1.7047053547958126, + "grad_norm": 0.1274898648262024, + "learning_rate": 0.0007196875381376621, + "loss": 2.2192, + "step": 440980 + }, + { + "epoch": 1.704744011999196, + "grad_norm": 0.12029425799846649, + "learning_rate": 0.0007195313357992708, + "loss": 2.2226, + "step": 440990 + }, + { + "epoch": 1.7047826692025794, + "grad_norm": 0.12936897575855255, + "learning_rate": 0.0007193751525134302, + "loss": 2.2388, + "step": 441000 + }, + { + "epoch": 1.7048213264059626, + "grad_norm": 0.10859957337379456, + "learning_rate": 0.0007192189882731707, + "loss": 2.2357, + "step": 441010 + }, + { + "epoch": 1.7048599836093459, + "grad_norm": 0.11123181134462357, + "learning_rate": 0.0007190628430715267, + "loss": 2.2324, + "step": 441020 + }, + { + "epoch": 1.7048986408127291, + "grad_norm": 0.13620242476463318, + "learning_rate": 0.0007189067169015365, + "loss": 2.2336, + "step": 441030 + }, + { + "epoch": 1.7049372980161124, + "grad_norm": 0.1191759929060936, + "learning_rate": 0.0007187506097562427, + "loss": 2.2249, + "step": 441040 + }, + { + "epoch": 1.7049759552194956, + "grad_norm": 0.11479087173938751, + "learning_rate": 0.0007185945216286924, + "loss": 2.2355, + "step": 441050 + }, + { + "epoch": 1.7050146124228789, + "grad_norm": 0.11262121051549911, + "learning_rate": 0.0007184384525119365, + "loss": 2.2327, + "step": 441060 + }, + { + "epoch": 1.7050532696262621, + "grad_norm": 0.11017955839633942, + "learning_rate": 0.0007182824023990309, + "loss": 2.2182, + "step": 441070 + }, + { + "epoch": 1.7050919268296454, + "grad_norm": 0.12410023808479309, + "learning_rate": 0.0007181263712830348, + "loss": 2.217, + "step": 441080 + }, + { + "epoch": 1.7051305840330286, + "grad_norm": 0.12589412927627563, + "learning_rate": 0.0007179703591570124, + "loss": 2.2384, + "step": 441090 + }, + { + "epoch": 1.705169241236412, + "grad_norm": 0.12104815989732742, + "learning_rate": 0.0007178143660140315, + "loss": 2.2323, + "step": 441100 + }, + { + "epoch": 1.7052078984397951, + "grad_norm": 0.11863759160041809, + "learning_rate": 0.0007176583918471647, + "loss": 2.2349, + "step": 441110 + }, + { + "epoch": 1.7052465556431784, + "grad_norm": 0.12504743039608002, + "learning_rate": 0.0007175024366494882, + "loss": 2.2385, + "step": 441120 + }, + { + "epoch": 1.7052852128465619, + "grad_norm": 0.1270468831062317, + "learning_rate": 0.0007173465004140831, + "loss": 2.2271, + "step": 441130 + }, + { + "epoch": 1.7053238700499451, + "grad_norm": 0.10368954390287399, + "learning_rate": 0.0007171905831340339, + "loss": 2.2132, + "step": 441140 + }, + { + "epoch": 1.7053625272533284, + "grad_norm": 0.11982696503400803, + "learning_rate": 0.00071703468480243, + "loss": 2.2359, + "step": 441150 + }, + { + "epoch": 1.7054011844567116, + "grad_norm": 0.11196932196617126, + "learning_rate": 0.0007168788054123647, + "loss": 2.2196, + "step": 441160 + }, + { + "epoch": 1.705439841660095, + "grad_norm": 0.10839185863733292, + "learning_rate": 0.0007167229449569357, + "loss": 2.2373, + "step": 441170 + }, + { + "epoch": 1.7054784988634784, + "grad_norm": 0.11195364594459534, + "learning_rate": 0.0007165671034292442, + "loss": 2.2293, + "step": 441180 + }, + { + "epoch": 1.7055171560668616, + "grad_norm": 0.13077500462532043, + "learning_rate": 0.0007164112808223968, + "loss": 2.2397, + "step": 441190 + }, + { + "epoch": 1.7055558132702449, + "grad_norm": 0.12081755697727203, + "learning_rate": 0.0007162554771295031, + "loss": 2.2298, + "step": 441200 + }, + { + "epoch": 1.705594470473628, + "grad_norm": 0.11815162748098373, + "learning_rate": 0.0007160996923436774, + "loss": 2.2314, + "step": 441210 + }, + { + "epoch": 1.7056331276770114, + "grad_norm": 0.10875197499990463, + "learning_rate": 0.0007159439264580384, + "loss": 2.2288, + "step": 441220 + }, + { + "epoch": 1.7056717848803946, + "grad_norm": 0.11378846317529678, + "learning_rate": 0.0007157881794657082, + "loss": 2.239, + "step": 441230 + }, + { + "epoch": 1.7057104420837779, + "grad_norm": 0.11096439510583878, + "learning_rate": 0.0007156324513598142, + "loss": 2.2092, + "step": 441240 + }, + { + "epoch": 1.7057490992871611, + "grad_norm": 0.12965725362300873, + "learning_rate": 0.0007154767421334871, + "loss": 2.2304, + "step": 441250 + }, + { + "epoch": 1.7057877564905444, + "grad_norm": 0.10906452685594559, + "learning_rate": 0.0007153210517798621, + "loss": 2.2307, + "step": 441260 + }, + { + "epoch": 1.7058264136939276, + "grad_norm": 0.12163577973842621, + "learning_rate": 0.0007151653802920781, + "loss": 2.228, + "step": 441270 + }, + { + "epoch": 1.7058650708973109, + "grad_norm": 0.13851909339427948, + "learning_rate": 0.0007150097276632792, + "loss": 2.233, + "step": 441280 + }, + { + "epoch": 1.7059037281006941, + "grad_norm": 0.10406989604234695, + "learning_rate": 0.0007148540938866124, + "loss": 2.2227, + "step": 441290 + }, + { + "epoch": 1.7059423853040776, + "grad_norm": 0.10548216104507446, + "learning_rate": 0.0007146984789552298, + "loss": 2.2178, + "step": 441300 + }, + { + "epoch": 1.7059810425074609, + "grad_norm": 0.12286142259836197, + "learning_rate": 0.000714542882862287, + "loss": 2.2449, + "step": 441310 + }, + { + "epoch": 1.706019699710844, + "grad_norm": 0.12756648659706116, + "learning_rate": 0.0007143873056009442, + "loss": 2.2351, + "step": 441320 + }, + { + "epoch": 1.7060583569142274, + "grad_norm": 0.12511231005191803, + "learning_rate": 0.0007142317471643656, + "loss": 2.2333, + "step": 441330 + }, + { + "epoch": 1.7060970141176108, + "grad_norm": 0.11963135004043579, + "learning_rate": 0.0007140762075457192, + "loss": 2.2267, + "step": 441340 + }, + { + "epoch": 1.706135671320994, + "grad_norm": 0.11961111426353455, + "learning_rate": 0.000713920686738178, + "loss": 2.2079, + "step": 441350 + }, + { + "epoch": 1.7061743285243773, + "grad_norm": 0.12484431266784668, + "learning_rate": 0.0007137651847349178, + "loss": 2.2246, + "step": 441360 + }, + { + "epoch": 1.7062129857277606, + "grad_norm": 0.12263071537017822, + "learning_rate": 0.0007136097015291199, + "loss": 2.2212, + "step": 441370 + }, + { + "epoch": 1.7062516429311438, + "grad_norm": 0.1359153389930725, + "learning_rate": 0.0007134542371139687, + "loss": 2.2241, + "step": 441380 + }, + { + "epoch": 1.706290300134527, + "grad_norm": 0.1229487881064415, + "learning_rate": 0.0007132987914826534, + "loss": 2.2287, + "step": 441390 + }, + { + "epoch": 1.7063289573379103, + "grad_norm": 0.13365302979946136, + "learning_rate": 0.0007131433646283672, + "loss": 2.2353, + "step": 441400 + }, + { + "epoch": 1.7063676145412936, + "grad_norm": 0.10661803930997849, + "learning_rate": 0.0007129879565443064, + "loss": 2.2285, + "step": 441410 + }, + { + "epoch": 1.7064062717446769, + "grad_norm": 0.11683390289545059, + "learning_rate": 0.000712832567223673, + "loss": 2.2084, + "step": 441420 + }, + { + "epoch": 1.70644492894806, + "grad_norm": 0.11777091771364212, + "learning_rate": 0.0007126771966596724, + "loss": 2.2091, + "step": 441430 + }, + { + "epoch": 1.7064835861514434, + "grad_norm": 0.12691377103328705, + "learning_rate": 0.0007125218448455136, + "loss": 2.2209, + "step": 441440 + }, + { + "epoch": 1.7065222433548266, + "grad_norm": 0.11685075610876083, + "learning_rate": 0.0007123665117744103, + "loss": 2.2329, + "step": 441450 + }, + { + "epoch": 1.7065609005582099, + "grad_norm": 0.11418899893760681, + "learning_rate": 0.0007122111974395802, + "loss": 2.2261, + "step": 441460 + }, + { + "epoch": 1.7065995577615933, + "grad_norm": 0.11462923884391785, + "learning_rate": 0.0007120559018342449, + "loss": 2.2272, + "step": 441470 + }, + { + "epoch": 1.7066382149649766, + "grad_norm": 0.1235089898109436, + "learning_rate": 0.0007119006249516304, + "loss": 2.2162, + "step": 441480 + }, + { + "epoch": 1.7066768721683598, + "grad_norm": 0.12144121527671814, + "learning_rate": 0.0007117453667849667, + "loss": 2.2373, + "step": 441490 + }, + { + "epoch": 1.706715529371743, + "grad_norm": 0.12170761823654175, + "learning_rate": 0.0007115901273274874, + "loss": 2.2289, + "step": 441500 + }, + { + "epoch": 1.7067541865751266, + "grad_norm": 0.1287725567817688, + "learning_rate": 0.0007114349065724308, + "loss": 2.2374, + "step": 441510 + }, + { + "epoch": 1.7067928437785098, + "grad_norm": 0.1142718568444252, + "learning_rate": 0.000711279704513039, + "loss": 2.225, + "step": 441520 + }, + { + "epoch": 1.706831500981893, + "grad_norm": 0.1222819834947586, + "learning_rate": 0.0007111245211425581, + "loss": 2.2287, + "step": 441530 + }, + { + "epoch": 1.7068701581852763, + "grad_norm": 0.12381406128406525, + "learning_rate": 0.0007109693564542385, + "loss": 2.2443, + "step": 441540 + }, + { + "epoch": 1.7069088153886596, + "grad_norm": 0.11265474557876587, + "learning_rate": 0.0007108142104413344, + "loss": 2.232, + "step": 441550 + }, + { + "epoch": 1.7069474725920428, + "grad_norm": 0.2676023840904236, + "learning_rate": 0.0007106590830971043, + "loss": 2.2231, + "step": 441560 + }, + { + "epoch": 1.706986129795426, + "grad_norm": 0.10900603234767914, + "learning_rate": 0.0007105039744148103, + "loss": 2.2242, + "step": 441570 + }, + { + "epoch": 1.7070247869988093, + "grad_norm": 0.13303890824317932, + "learning_rate": 0.0007103488843877195, + "loss": 2.228, + "step": 441580 + }, + { + "epoch": 1.7070634442021926, + "grad_norm": 0.11852958798408508, + "learning_rate": 0.0007101938130091017, + "loss": 2.2438, + "step": 441590 + }, + { + "epoch": 1.7071021014055758, + "grad_norm": 0.13023656606674194, + "learning_rate": 0.0007100387602722321, + "loss": 2.2012, + "step": 441600 + }, + { + "epoch": 1.707140758608959, + "grad_norm": 0.12023354321718216, + "learning_rate": 0.000709883726170389, + "loss": 2.2342, + "step": 441610 + }, + { + "epoch": 1.7071794158123423, + "grad_norm": 0.10630054026842117, + "learning_rate": 0.0007097287106968551, + "loss": 2.215, + "step": 441620 + }, + { + "epoch": 1.7072180730157256, + "grad_norm": 0.11944133043289185, + "learning_rate": 0.000709573713844917, + "loss": 2.2422, + "step": 441630 + }, + { + "epoch": 1.707256730219109, + "grad_norm": 0.1279240846633911, + "learning_rate": 0.0007094187356078658, + "loss": 2.222, + "step": 441640 + }, + { + "epoch": 1.7072953874224923, + "grad_norm": 0.10906361043453217, + "learning_rate": 0.0007092637759789957, + "loss": 2.2284, + "step": 441650 + }, + { + "epoch": 1.7073340446258756, + "grad_norm": 0.12351000308990479, + "learning_rate": 0.000709108834951606, + "loss": 2.2395, + "step": 441660 + }, + { + "epoch": 1.7073727018292588, + "grad_norm": 0.13086514174938202, + "learning_rate": 0.0007089539125189992, + "loss": 2.2114, + "step": 441670 + }, + { + "epoch": 1.7074113590326423, + "grad_norm": 0.11564716696739197, + "learning_rate": 0.0007087990086744822, + "loss": 2.2449, + "step": 441680 + }, + { + "epoch": 1.7074500162360255, + "grad_norm": 0.1139092743396759, + "learning_rate": 0.0007086441234113659, + "loss": 2.2346, + "step": 441690 + }, + { + "epoch": 1.7074886734394088, + "grad_norm": 0.10772178322076797, + "learning_rate": 0.0007084892567229648, + "loss": 2.2295, + "step": 441700 + }, + { + "epoch": 1.707527330642792, + "grad_norm": 0.11219760775566101, + "learning_rate": 0.000708334408602598, + "loss": 2.2395, + "step": 441710 + }, + { + "epoch": 1.7075659878461753, + "grad_norm": 0.12415587902069092, + "learning_rate": 0.0007081795790435885, + "loss": 2.2292, + "step": 441720 + }, + { + "epoch": 1.7076046450495586, + "grad_norm": 0.3202018439769745, + "learning_rate": 0.0007080247680392631, + "loss": 2.2219, + "step": 441730 + }, + { + "epoch": 1.7076433022529418, + "grad_norm": 0.1251060515642166, + "learning_rate": 0.0007078699755829525, + "loss": 2.2395, + "step": 441740 + }, + { + "epoch": 1.707681959456325, + "grad_norm": 0.11131077259778976, + "learning_rate": 0.0007077152016679915, + "loss": 2.2115, + "step": 441750 + }, + { + "epoch": 1.7077206166597083, + "grad_norm": 0.11476995795965195, + "learning_rate": 0.000707560446287719, + "loss": 2.2454, + "step": 441760 + }, + { + "epoch": 1.7077592738630916, + "grad_norm": 0.12419164180755615, + "learning_rate": 0.000707405709435478, + "loss": 2.2283, + "step": 441770 + }, + { + "epoch": 1.7077979310664748, + "grad_norm": 0.11394430696964264, + "learning_rate": 0.000707250991104615, + "loss": 2.2277, + "step": 441780 + }, + { + "epoch": 1.707836588269858, + "grad_norm": 0.11585311591625214, + "learning_rate": 0.0007070962912884811, + "loss": 2.2278, + "step": 441790 + }, + { + "epoch": 1.7078752454732413, + "grad_norm": 0.11358223110437393, + "learning_rate": 0.0007069416099804309, + "loss": 2.2072, + "step": 441800 + }, + { + "epoch": 1.7079139026766248, + "grad_norm": 0.12552201747894287, + "learning_rate": 0.000706786947173823, + "loss": 2.23, + "step": 441810 + }, + { + "epoch": 1.707952559880008, + "grad_norm": 0.12313517183065414, + "learning_rate": 0.0007066323028620205, + "loss": 2.2224, + "step": 441820 + }, + { + "epoch": 1.7079912170833913, + "grad_norm": 0.11810357868671417, + "learning_rate": 0.0007064776770383898, + "loss": 2.2391, + "step": 441830 + }, + { + "epoch": 1.7080298742867746, + "grad_norm": 0.12771636247634888, + "learning_rate": 0.0007063230696963017, + "loss": 2.2318, + "step": 441840 + }, + { + "epoch": 1.708068531490158, + "grad_norm": 0.12263831496238708, + "learning_rate": 0.0007061684808291307, + "loss": 2.2251, + "step": 441850 + }, + { + "epoch": 1.7081071886935413, + "grad_norm": 0.11750944703817368, + "learning_rate": 0.000706013910430255, + "loss": 2.2202, + "step": 441860 + }, + { + "epoch": 1.7081458458969245, + "grad_norm": 0.11194665729999542, + "learning_rate": 0.0007058593584930577, + "loss": 2.2213, + "step": 441870 + }, + { + "epoch": 1.7081845031003078, + "grad_norm": 0.11499247699975967, + "learning_rate": 0.0007057048250109251, + "loss": 2.2294, + "step": 441880 + }, + { + "epoch": 1.708223160303691, + "grad_norm": 0.12411858141422272, + "learning_rate": 0.0007055503099772475, + "loss": 2.2097, + "step": 441890 + }, + { + "epoch": 1.7082618175070743, + "grad_norm": 0.11611645668745041, + "learning_rate": 0.0007053958133854195, + "loss": 2.236, + "step": 441900 + }, + { + "epoch": 1.7083004747104575, + "grad_norm": 0.11352533102035522, + "learning_rate": 0.0007052413352288389, + "loss": 2.221, + "step": 441910 + }, + { + "epoch": 1.7083391319138408, + "grad_norm": 0.12064797431230545, + "learning_rate": 0.0007050868755009084, + "loss": 2.2303, + "step": 441920 + }, + { + "epoch": 1.708377789117224, + "grad_norm": 0.1243981420993805, + "learning_rate": 0.000704932434195034, + "loss": 2.2332, + "step": 441930 + }, + { + "epoch": 1.7084164463206073, + "grad_norm": 0.12540730834007263, + "learning_rate": 0.0007047780113046258, + "loss": 2.224, + "step": 441940 + }, + { + "epoch": 1.7084551035239905, + "grad_norm": 0.1284138411283493, + "learning_rate": 0.000704623606823098, + "loss": 2.2289, + "step": 441950 + }, + { + "epoch": 1.7084937607273738, + "grad_norm": 0.12039217352867126, + "learning_rate": 0.0007044692207438683, + "loss": 2.2247, + "step": 441960 + }, + { + "epoch": 1.708532417930757, + "grad_norm": 0.11266115307807922, + "learning_rate": 0.0007043148530603587, + "loss": 2.2306, + "step": 441970 + }, + { + "epoch": 1.7085710751341405, + "grad_norm": 0.12143003940582275, + "learning_rate": 0.0007041605037659951, + "loss": 2.2297, + "step": 441980 + }, + { + "epoch": 1.7086097323375238, + "grad_norm": 0.12860099971294403, + "learning_rate": 0.0007040061728542068, + "loss": 2.2166, + "step": 441990 + }, + { + "epoch": 1.708648389540907, + "grad_norm": 0.13188594579696655, + "learning_rate": 0.0007038518603184281, + "loss": 2.2359, + "step": 442000 + }, + { + "epoch": 1.7086870467442903, + "grad_norm": 0.11238020658493042, + "learning_rate": 0.0007036975661520959, + "loss": 2.2043, + "step": 442010 + }, + { + "epoch": 1.7087257039476738, + "grad_norm": 0.11865348368883133, + "learning_rate": 0.0007035432903486519, + "loss": 2.2362, + "step": 442020 + }, + { + "epoch": 1.708764361151057, + "grad_norm": 0.11815221607685089, + "learning_rate": 0.0007033890329015415, + "loss": 2.2237, + "step": 442030 + }, + { + "epoch": 1.7088030183544403, + "grad_norm": 0.11716797202825546, + "learning_rate": 0.0007032347938042138, + "loss": 2.2311, + "step": 442040 + }, + { + "epoch": 1.7088416755578235, + "grad_norm": 0.1247939020395279, + "learning_rate": 0.0007030805730501219, + "loss": 2.2316, + "step": 442050 + }, + { + "epoch": 1.7088803327612068, + "grad_norm": 0.1377456784248352, + "learning_rate": 0.0007029263706327232, + "loss": 2.2231, + "step": 442060 + }, + { + "epoch": 1.70891898996459, + "grad_norm": 0.11857406049966812, + "learning_rate": 0.0007027721865454781, + "loss": 2.2285, + "step": 442070 + }, + { + "epoch": 1.7089576471679733, + "grad_norm": 0.1162823960185051, + "learning_rate": 0.0007026180207818515, + "loss": 2.2282, + "step": 442080 + }, + { + "epoch": 1.7089963043713565, + "grad_norm": 0.12219282239675522, + "learning_rate": 0.0007024638733353126, + "loss": 2.2222, + "step": 442090 + }, + { + "epoch": 1.7090349615747398, + "grad_norm": 0.12192238122224808, + "learning_rate": 0.0007023097441993334, + "loss": 2.22, + "step": 442100 + }, + { + "epoch": 1.709073618778123, + "grad_norm": 0.12040101736783981, + "learning_rate": 0.0007021556333673902, + "loss": 2.2399, + "step": 442110 + }, + { + "epoch": 1.7091122759815063, + "grad_norm": 0.11780326068401337, + "learning_rate": 0.0007020015408329639, + "loss": 2.2216, + "step": 442120 + }, + { + "epoch": 1.7091509331848895, + "grad_norm": 0.11659206449985504, + "learning_rate": 0.0007018474665895383, + "loss": 2.245, + "step": 442130 + }, + { + "epoch": 1.709189590388273, + "grad_norm": 0.121012844145298, + "learning_rate": 0.0007016934106306014, + "loss": 2.2198, + "step": 442140 + }, + { + "epoch": 1.7092282475916563, + "grad_norm": 0.1228698194026947, + "learning_rate": 0.0007015393729496455, + "loss": 2.2225, + "step": 442150 + }, + { + "epoch": 1.7092669047950395, + "grad_norm": 0.11647068709135056, + "learning_rate": 0.000701385353540166, + "loss": 2.2216, + "step": 442160 + }, + { + "epoch": 1.7093055619984228, + "grad_norm": 0.11902830004692078, + "learning_rate": 0.0007012313523956624, + "loss": 2.2339, + "step": 442170 + }, + { + "epoch": 1.709344219201806, + "grad_norm": 0.11790589243173599, + "learning_rate": 0.0007010773695096386, + "loss": 2.2266, + "step": 442180 + }, + { + "epoch": 1.7093828764051895, + "grad_norm": 0.11872167140245438, + "learning_rate": 0.0007009234048756017, + "loss": 2.2206, + "step": 442190 + }, + { + "epoch": 1.7094215336085727, + "grad_norm": 0.11382316797971725, + "learning_rate": 0.0007007694584870627, + "loss": 2.2259, + "step": 442200 + }, + { + "epoch": 1.709460190811956, + "grad_norm": 0.12118104845285416, + "learning_rate": 0.0007006155303375373, + "loss": 2.2368, + "step": 442210 + }, + { + "epoch": 1.7094988480153392, + "grad_norm": 0.12145468592643738, + "learning_rate": 0.0007004616204205434, + "loss": 2.2353, + "step": 442220 + }, + { + "epoch": 1.7095375052187225, + "grad_norm": 0.11479642242193222, + "learning_rate": 0.0007003077287296043, + "loss": 2.2225, + "step": 442230 + }, + { + "epoch": 1.7095761624221057, + "grad_norm": 0.12208642810583115, + "learning_rate": 0.0007001538552582463, + "loss": 2.2327, + "step": 442240 + }, + { + "epoch": 1.709614819625489, + "grad_norm": 0.11871983855962753, + "learning_rate": 0.0007, + "loss": 2.2391, + "step": 442250 + }, + { + "epoch": 1.7096534768288723, + "grad_norm": 0.12440042197704315, + "learning_rate": 0.0006998461629483994, + "loss": 2.2219, + "step": 442260 + }, + { + "epoch": 1.7096921340322555, + "grad_norm": 0.11856591701507568, + "learning_rate": 0.0006996923440969825, + "loss": 2.219, + "step": 442270 + }, + { + "epoch": 1.7097307912356388, + "grad_norm": 0.12889623641967773, + "learning_rate": 0.0006995385434392914, + "loss": 2.2218, + "step": 442280 + }, + { + "epoch": 1.709769448439022, + "grad_norm": 0.11815499514341354, + "learning_rate": 0.0006993847609688713, + "loss": 2.2189, + "step": 442290 + }, + { + "epoch": 1.7098081056424053, + "grad_norm": 0.11574645340442657, + "learning_rate": 0.000699230996679272, + "loss": 2.2393, + "step": 442300 + }, + { + "epoch": 1.7098467628457887, + "grad_norm": 0.11625123023986816, + "learning_rate": 0.0006990772505640468, + "loss": 2.2252, + "step": 442310 + }, + { + "epoch": 1.709885420049172, + "grad_norm": 0.11689998209476471, + "learning_rate": 0.0006989235226167526, + "loss": 2.2395, + "step": 442320 + }, + { + "epoch": 1.7099240772525552, + "grad_norm": 0.1170109286904335, + "learning_rate": 0.0006987698128309504, + "loss": 2.2008, + "step": 442330 + }, + { + "epoch": 1.7099627344559385, + "grad_norm": 0.12176986038684845, + "learning_rate": 0.0006986161212002049, + "loss": 2.2235, + "step": 442340 + }, + { + "epoch": 1.7100013916593217, + "grad_norm": 0.12022269517183304, + "learning_rate": 0.0006984624477180846, + "loss": 2.2106, + "step": 442350 + }, + { + "epoch": 1.7100400488627052, + "grad_norm": 0.11816758662462234, + "learning_rate": 0.0006983087923781617, + "loss": 2.2358, + "step": 442360 + }, + { + "epoch": 1.7100787060660885, + "grad_norm": 0.11895138025283813, + "learning_rate": 0.0006981551551740122, + "loss": 2.2133, + "step": 442370 + }, + { + "epoch": 1.7101173632694717, + "grad_norm": 0.12274456769227982, + "learning_rate": 0.0006980015360992164, + "loss": 2.2237, + "step": 442380 + }, + { + "epoch": 1.710156020472855, + "grad_norm": 0.11600464582443237, + "learning_rate": 0.0006978479351473577, + "loss": 2.2238, + "step": 442390 + }, + { + "epoch": 1.7101946776762382, + "grad_norm": 0.12388424575328827, + "learning_rate": 0.0006976943523120236, + "loss": 2.2129, + "step": 442400 + }, + { + "epoch": 1.7102333348796215, + "grad_norm": 0.12280719727277756, + "learning_rate": 0.0006975407875868052, + "loss": 2.2335, + "step": 442410 + }, + { + "epoch": 1.7102719920830047, + "grad_norm": 0.11842313408851624, + "learning_rate": 0.0006973872409652975, + "loss": 2.2475, + "step": 442420 + }, + { + "epoch": 1.710310649286388, + "grad_norm": 0.12219894677400589, + "learning_rate": 0.0006972337124410995, + "loss": 2.2275, + "step": 442430 + }, + { + "epoch": 1.7103493064897712, + "grad_norm": 0.12560917437076569, + "learning_rate": 0.0006970802020078136, + "loss": 2.2431, + "step": 442440 + }, + { + "epoch": 1.7103879636931545, + "grad_norm": 0.11803045868873596, + "learning_rate": 0.0006969267096590461, + "loss": 2.2286, + "step": 442450 + }, + { + "epoch": 1.7104266208965377, + "grad_norm": 0.12114856392145157, + "learning_rate": 0.0006967732353884074, + "loss": 2.2382, + "step": 442460 + }, + { + "epoch": 1.710465278099921, + "grad_norm": 0.10938682407140732, + "learning_rate": 0.0006966197791895105, + "loss": 2.2211, + "step": 442470 + }, + { + "epoch": 1.7105039353033045, + "grad_norm": 0.12028611451387405, + "learning_rate": 0.0006964663410559741, + "loss": 2.2159, + "step": 442480 + }, + { + "epoch": 1.7105425925066877, + "grad_norm": 0.1148734763264656, + "learning_rate": 0.000696312920981419, + "loss": 2.2237, + "step": 442490 + }, + { + "epoch": 1.710581249710071, + "grad_norm": 0.10638713091611862, + "learning_rate": 0.0006961595189594702, + "loss": 2.2443, + "step": 442500 + }, + { + "epoch": 1.7106199069134542, + "grad_norm": 0.12149275839328766, + "learning_rate": 0.000696006134983757, + "loss": 2.2041, + "step": 442510 + }, + { + "epoch": 1.7106585641168375, + "grad_norm": 0.1199207454919815, + "learning_rate": 0.0006958527690479115, + "loss": 2.2283, + "step": 442520 + }, + { + "epoch": 1.710697221320221, + "grad_norm": 0.12160733342170715, + "learning_rate": 0.0006956994211455705, + "loss": 2.2289, + "step": 442530 + }, + { + "epoch": 1.7107358785236042, + "grad_norm": 0.11109825223684311, + "learning_rate": 0.0006955460912703738, + "loss": 2.2329, + "step": 442540 + }, + { + "epoch": 1.7107745357269875, + "grad_norm": 0.1456461101770401, + "learning_rate": 0.0006953927794159653, + "loss": 2.237, + "step": 442550 + }, + { + "epoch": 1.7108131929303707, + "grad_norm": 0.1287081092596054, + "learning_rate": 0.0006952394855759927, + "loss": 2.2036, + "step": 442560 + }, + { + "epoch": 1.710851850133754, + "grad_norm": 0.11479346454143524, + "learning_rate": 0.0006950862097441073, + "loss": 2.234, + "step": 442570 + }, + { + "epoch": 1.7108905073371372, + "grad_norm": 0.11606258153915405, + "learning_rate": 0.000694932951913964, + "loss": 2.2269, + "step": 442580 + }, + { + "epoch": 1.7109291645405205, + "grad_norm": 0.1183663159608841, + "learning_rate": 0.0006947797120792214, + "loss": 2.2307, + "step": 442590 + }, + { + "epoch": 1.7109678217439037, + "grad_norm": 0.11088277399539948, + "learning_rate": 0.0006946264902335424, + "loss": 2.2407, + "step": 442600 + }, + { + "epoch": 1.711006478947287, + "grad_norm": 0.12547467648983002, + "learning_rate": 0.000694473286370593, + "loss": 2.2271, + "step": 442610 + }, + { + "epoch": 1.7110451361506702, + "grad_norm": 0.11433514952659607, + "learning_rate": 0.0006943201004840428, + "loss": 2.2148, + "step": 442620 + }, + { + "epoch": 1.7110837933540535, + "grad_norm": 0.10790496319532394, + "learning_rate": 0.0006941669325675659, + "loss": 2.2212, + "step": 442630 + }, + { + "epoch": 1.7111224505574367, + "grad_norm": 0.12909001111984253, + "learning_rate": 0.0006940137826148394, + "loss": 2.2184, + "step": 442640 + }, + { + "epoch": 1.7111611077608202, + "grad_norm": 0.12274988740682602, + "learning_rate": 0.0006938606506195444, + "loss": 2.2007, + "step": 442650 + }, + { + "epoch": 1.7111997649642035, + "grad_norm": 0.11600089818239212, + "learning_rate": 0.0006937075365753655, + "loss": 2.2283, + "step": 442660 + }, + { + "epoch": 1.7112384221675867, + "grad_norm": 0.13487693667411804, + "learning_rate": 0.0006935544404759913, + "loss": 2.2205, + "step": 442670 + }, + { + "epoch": 1.71127707937097, + "grad_norm": 0.11452995240688324, + "learning_rate": 0.0006934013623151141, + "loss": 2.2286, + "step": 442680 + }, + { + "epoch": 1.7113157365743534, + "grad_norm": 0.11441690474748611, + "learning_rate": 0.0006932483020864293, + "loss": 2.221, + "step": 442690 + }, + { + "epoch": 1.7113543937777367, + "grad_norm": 0.23739032447338104, + "learning_rate": 0.0006930952597836369, + "loss": 2.2291, + "step": 442700 + }, + { + "epoch": 1.71139305098112, + "grad_norm": 0.12318938225507736, + "learning_rate": 0.0006929422354004396, + "loss": 2.2293, + "step": 442710 + }, + { + "epoch": 1.7114317081845032, + "grad_norm": 0.11755301803350449, + "learning_rate": 0.0006927892289305446, + "loss": 2.2111, + "step": 442720 + }, + { + "epoch": 1.7114703653878864, + "grad_norm": 0.11043576151132584, + "learning_rate": 0.0006926362403676627, + "loss": 2.2184, + "step": 442730 + }, + { + "epoch": 1.7115090225912697, + "grad_norm": 0.12790408730506897, + "learning_rate": 0.0006924832697055076, + "loss": 2.2348, + "step": 442740 + }, + { + "epoch": 1.711547679794653, + "grad_norm": 0.12552985548973083, + "learning_rate": 0.000692330316937798, + "loss": 2.233, + "step": 442750 + }, + { + "epoch": 1.7115863369980362, + "grad_norm": 0.11527630686759949, + "learning_rate": 0.0006921773820582549, + "loss": 2.2267, + "step": 442760 + }, + { + "epoch": 1.7116249942014194, + "grad_norm": 0.14568762481212616, + "learning_rate": 0.0006920244650606036, + "loss": 2.2405, + "step": 442770 + }, + { + "epoch": 1.7116636514048027, + "grad_norm": 0.11509274691343307, + "learning_rate": 0.0006918715659385735, + "loss": 2.2165, + "step": 442780 + }, + { + "epoch": 1.711702308608186, + "grad_norm": 0.12123371660709381, + "learning_rate": 0.0006917186846858968, + "loss": 2.2079, + "step": 442790 + }, + { + "epoch": 1.7117409658115692, + "grad_norm": 0.12158721685409546, + "learning_rate": 0.0006915658212963098, + "loss": 2.2149, + "step": 442800 + }, + { + "epoch": 1.7117796230149525, + "grad_norm": 0.14907032251358032, + "learning_rate": 0.0006914129757635528, + "loss": 2.2102, + "step": 442810 + }, + { + "epoch": 1.711818280218336, + "grad_norm": 0.11518526077270508, + "learning_rate": 0.000691260148081369, + "loss": 2.2294, + "step": 442820 + }, + { + "epoch": 1.7118569374217192, + "grad_norm": 0.10713840276002884, + "learning_rate": 0.0006911073382435061, + "loss": 2.2254, + "step": 442830 + }, + { + "epoch": 1.7118955946251024, + "grad_norm": 0.1185065507888794, + "learning_rate": 0.0006909545462437143, + "loss": 2.224, + "step": 442840 + }, + { + "epoch": 1.7119342518284857, + "grad_norm": 0.11165358871221542, + "learning_rate": 0.0006908017720757487, + "loss": 2.2305, + "step": 442850 + }, + { + "epoch": 1.7119729090318692, + "grad_norm": 0.12423279881477356, + "learning_rate": 0.0006906490157333674, + "loss": 2.2175, + "step": 442860 + }, + { + "epoch": 1.7120115662352524, + "grad_norm": 0.11754172295331955, + "learning_rate": 0.0006904962772103318, + "loss": 2.2327, + "step": 442870 + }, + { + "epoch": 1.7120502234386357, + "grad_norm": 0.10995957255363464, + "learning_rate": 0.0006903435565004079, + "loss": 2.2268, + "step": 442880 + }, + { + "epoch": 1.712088880642019, + "grad_norm": 0.11185383051633835, + "learning_rate": 0.0006901908535973647, + "loss": 2.2169, + "step": 442890 + }, + { + "epoch": 1.7121275378454022, + "grad_norm": 0.12045712769031525, + "learning_rate": 0.0006900381684949748, + "loss": 2.2206, + "step": 442900 + }, + { + "epoch": 1.7121661950487854, + "grad_norm": 0.1269853264093399, + "learning_rate": 0.0006898855011870146, + "loss": 2.227, + "step": 442910 + }, + { + "epoch": 1.7122048522521687, + "grad_norm": 0.12694627046585083, + "learning_rate": 0.000689732851667264, + "loss": 2.2273, + "step": 442920 + }, + { + "epoch": 1.712243509455552, + "grad_norm": 0.13340239226818085, + "learning_rate": 0.0006895802199295068, + "loss": 2.2214, + "step": 442930 + }, + { + "epoch": 1.7122821666589352, + "grad_norm": 0.1058509573340416, + "learning_rate": 0.00068942760596753, + "loss": 2.2176, + "step": 442940 + }, + { + "epoch": 1.7123208238623184, + "grad_norm": 0.11555317789316177, + "learning_rate": 0.0006892750097751245, + "loss": 2.2183, + "step": 442950 + }, + { + "epoch": 1.7123594810657017, + "grad_norm": 0.12041497975587845, + "learning_rate": 0.0006891224313460848, + "loss": 2.2211, + "step": 442960 + }, + { + "epoch": 1.712398138269085, + "grad_norm": 0.13038748502731323, + "learning_rate": 0.0006889698706742092, + "loss": 2.2241, + "step": 442970 + }, + { + "epoch": 1.7124367954724682, + "grad_norm": 0.109809011220932, + "learning_rate": 0.000688817327753299, + "loss": 2.2233, + "step": 442980 + }, + { + "epoch": 1.7124754526758517, + "grad_norm": 0.12648747861385345, + "learning_rate": 0.0006886648025771595, + "loss": 2.2257, + "step": 442990 + }, + { + "epoch": 1.712514109879235, + "grad_norm": 0.12341243773698807, + "learning_rate": 0.0006885122951395999, + "loss": 2.2346, + "step": 443000 + }, + { + "epoch": 1.7125527670826182, + "grad_norm": 0.10717613250017166, + "learning_rate": 0.0006883598054344324, + "loss": 2.2232, + "step": 443010 + }, + { + "epoch": 1.7125914242860014, + "grad_norm": 0.12053383141756058, + "learning_rate": 0.0006882073334554732, + "loss": 2.2433, + "step": 443020 + }, + { + "epoch": 1.712630081489385, + "grad_norm": 0.11753083020448685, + "learning_rate": 0.000688054879196542, + "loss": 2.2348, + "step": 443030 + }, + { + "epoch": 1.7126687386927681, + "grad_norm": 0.13164658844470978, + "learning_rate": 0.0006879024426514619, + "loss": 2.2298, + "step": 443040 + }, + { + "epoch": 1.7127073958961514, + "grad_norm": 0.12515607476234436, + "learning_rate": 0.0006877500238140602, + "loss": 2.2234, + "step": 443050 + }, + { + "epoch": 1.7127460530995346, + "grad_norm": 0.12741313874721527, + "learning_rate": 0.0006875976226781666, + "loss": 2.2117, + "step": 443060 + }, + { + "epoch": 1.712784710302918, + "grad_norm": 0.12356637418270111, + "learning_rate": 0.0006874452392376156, + "loss": 2.2141, + "step": 443070 + }, + { + "epoch": 1.7128233675063012, + "grad_norm": 0.11521610617637634, + "learning_rate": 0.000687292873486245, + "loss": 2.2277, + "step": 443080 + }, + { + "epoch": 1.7128620247096844, + "grad_norm": 0.11198043078184128, + "learning_rate": 0.0006871405254178953, + "loss": 2.2267, + "step": 443090 + }, + { + "epoch": 1.7129006819130677, + "grad_norm": 0.12801900506019592, + "learning_rate": 0.0006869881950264118, + "loss": 2.2302, + "step": 443100 + }, + { + "epoch": 1.712939339116451, + "grad_norm": 0.11122292280197144, + "learning_rate": 0.0006868358823056426, + "loss": 2.2143, + "step": 443110 + }, + { + "epoch": 1.7129779963198342, + "grad_norm": 0.13782921433448792, + "learning_rate": 0.0006866835872494397, + "loss": 2.2395, + "step": 443120 + }, + { + "epoch": 1.7130166535232174, + "grad_norm": 0.11965320259332657, + "learning_rate": 0.0006865313098516585, + "loss": 2.2362, + "step": 443130 + }, + { + "epoch": 1.7130553107266007, + "grad_norm": 0.11909079551696777, + "learning_rate": 0.000686379050106158, + "loss": 2.229, + "step": 443140 + }, + { + "epoch": 1.713093967929984, + "grad_norm": 0.11601749807596207, + "learning_rate": 0.0006862268080068006, + "loss": 2.2231, + "step": 443150 + }, + { + "epoch": 1.7131326251333674, + "grad_norm": 0.12354492396116257, + "learning_rate": 0.0006860745835474527, + "loss": 2.2202, + "step": 443160 + }, + { + "epoch": 1.7131712823367506, + "grad_norm": 0.12105528265237808, + "learning_rate": 0.0006859223767219837, + "loss": 2.2322, + "step": 443170 + }, + { + "epoch": 1.713209939540134, + "grad_norm": 0.1187669187784195, + "learning_rate": 0.000685770187524267, + "loss": 2.2257, + "step": 443180 + }, + { + "epoch": 1.7132485967435171, + "grad_norm": 0.11206571757793427, + "learning_rate": 0.0006856180159481796, + "loss": 2.2207, + "step": 443190 + }, + { + "epoch": 1.7132872539469006, + "grad_norm": 0.10545578598976135, + "learning_rate": 0.0006854658619876013, + "loss": 2.2282, + "step": 443200 + }, + { + "epoch": 1.7133259111502839, + "grad_norm": 0.12910322844982147, + "learning_rate": 0.0006853137256364164, + "loss": 2.2446, + "step": 443210 + }, + { + "epoch": 1.7133645683536671, + "grad_norm": 0.1320909857749939, + "learning_rate": 0.000685161606888512, + "loss": 2.2198, + "step": 443220 + }, + { + "epoch": 1.7134032255570504, + "grad_norm": 0.120067298412323, + "learning_rate": 0.0006850095057377792, + "loss": 2.2227, + "step": 443230 + }, + { + "epoch": 1.7134418827604336, + "grad_norm": 0.11289399117231369, + "learning_rate": 0.0006848574221781123, + "loss": 2.2408, + "step": 443240 + }, + { + "epoch": 1.7134805399638169, + "grad_norm": 0.12850508093833923, + "learning_rate": 0.0006847053562034096, + "loss": 2.2264, + "step": 443250 + }, + { + "epoch": 1.7135191971672001, + "grad_norm": 0.11475897580385208, + "learning_rate": 0.0006845533078075723, + "loss": 2.2214, + "step": 443260 + }, + { + "epoch": 1.7135578543705834, + "grad_norm": 0.12015476077795029, + "learning_rate": 0.0006844012769845054, + "loss": 2.2107, + "step": 443270 + }, + { + "epoch": 1.7135965115739666, + "grad_norm": 0.13094781339168549, + "learning_rate": 0.000684249263728118, + "loss": 2.2236, + "step": 443280 + }, + { + "epoch": 1.71363516877735, + "grad_norm": 0.11818785220384598, + "learning_rate": 0.0006840972680323214, + "loss": 2.2141, + "step": 443290 + }, + { + "epoch": 1.7136738259807331, + "grad_norm": 0.11630553752183914, + "learning_rate": 0.0006839452898910319, + "loss": 2.2402, + "step": 443300 + }, + { + "epoch": 1.7137124831841164, + "grad_norm": 0.11747671663761139, + "learning_rate": 0.0006837933292981684, + "loss": 2.2243, + "step": 443310 + }, + { + "epoch": 1.7137511403874996, + "grad_norm": 0.11672230064868927, + "learning_rate": 0.0006836413862476534, + "loss": 2.243, + "step": 443320 + }, + { + "epoch": 1.7137897975908831, + "grad_norm": 0.11931329220533371, + "learning_rate": 0.0006834894607334128, + "loss": 2.2089, + "step": 443330 + }, + { + "epoch": 1.7138284547942664, + "grad_norm": 0.11820429563522339, + "learning_rate": 0.0006833375527493768, + "loss": 2.2415, + "step": 443340 + }, + { + "epoch": 1.7138671119976496, + "grad_norm": 0.11610442399978638, + "learning_rate": 0.0006831856622894784, + "loss": 2.218, + "step": 443350 + }, + { + "epoch": 1.7139057692010329, + "grad_norm": 0.1124664694070816, + "learning_rate": 0.0006830337893476537, + "loss": 2.2184, + "step": 443360 + }, + { + "epoch": 1.7139444264044164, + "grad_norm": 0.11392097175121307, + "learning_rate": 0.0006828819339178435, + "loss": 2.2293, + "step": 443370 + }, + { + "epoch": 1.7139830836077996, + "grad_norm": 0.11781250685453415, + "learning_rate": 0.0006827300959939911, + "loss": 2.2433, + "step": 443380 + }, + { + "epoch": 1.7140217408111829, + "grad_norm": 0.11644378304481506, + "learning_rate": 0.0006825782755700435, + "loss": 2.2392, + "step": 443390 + }, + { + "epoch": 1.7140603980145661, + "grad_norm": 0.12304277718067169, + "learning_rate": 0.0006824264726399516, + "loss": 2.2292, + "step": 443400 + }, + { + "epoch": 1.7140990552179494, + "grad_norm": 0.12490229308605194, + "learning_rate": 0.0006822746871976695, + "loss": 2.2272, + "step": 443410 + }, + { + "epoch": 1.7141377124213326, + "grad_norm": 0.1096290871500969, + "learning_rate": 0.0006821229192371543, + "loss": 2.2061, + "step": 443420 + }, + { + "epoch": 1.7141763696247159, + "grad_norm": 0.1179693341255188, + "learning_rate": 0.0006819711687523675, + "loss": 2.2079, + "step": 443430 + }, + { + "epoch": 1.7142150268280991, + "grad_norm": 0.1262316107749939, + "learning_rate": 0.0006818194357372736, + "loss": 2.2354, + "step": 443440 + }, + { + "epoch": 1.7142536840314824, + "grad_norm": 0.12826380133628845, + "learning_rate": 0.0006816677201858404, + "loss": 2.2022, + "step": 443450 + }, + { + "epoch": 1.7142923412348656, + "grad_norm": 0.10918699949979782, + "learning_rate": 0.0006815160220920393, + "loss": 2.2211, + "step": 443460 + }, + { + "epoch": 1.7143309984382489, + "grad_norm": 0.11608961224555969, + "learning_rate": 0.0006813643414498453, + "loss": 2.2409, + "step": 443470 + }, + { + "epoch": 1.7143696556416321, + "grad_norm": 0.12766702473163605, + "learning_rate": 0.000681212678253237, + "loss": 2.2255, + "step": 443480 + }, + { + "epoch": 1.7144083128450154, + "grad_norm": 0.11442360281944275, + "learning_rate": 0.0006810610324961961, + "loss": 2.2186, + "step": 443490 + }, + { + "epoch": 1.7144469700483989, + "grad_norm": 0.11635372787714005, + "learning_rate": 0.0006809094041727082, + "loss": 2.2253, + "step": 443500 + }, + { + "epoch": 1.714485627251782, + "grad_norm": 0.12290455400943756, + "learning_rate": 0.0006807577932767615, + "loss": 2.2331, + "step": 443510 + }, + { + "epoch": 1.7145242844551654, + "grad_norm": 0.12026640772819519, + "learning_rate": 0.0006806061998023488, + "loss": 2.2263, + "step": 443520 + }, + { + "epoch": 1.7145629416585486, + "grad_norm": 0.11638140678405762, + "learning_rate": 0.0006804546237434652, + "loss": 2.2382, + "step": 443530 + }, + { + "epoch": 1.714601598861932, + "grad_norm": 0.11954324692487717, + "learning_rate": 0.0006803030650941102, + "loss": 2.2281, + "step": 443540 + }, + { + "epoch": 1.7146402560653153, + "grad_norm": 0.12044857442378998, + "learning_rate": 0.0006801515238482867, + "loss": 2.2327, + "step": 443550 + }, + { + "epoch": 1.7146789132686986, + "grad_norm": 0.11535559594631195, + "learning_rate": 0.0006799999999999999, + "loss": 2.224, + "step": 443560 + }, + { + "epoch": 1.7147175704720818, + "grad_norm": 0.1135011836886406, + "learning_rate": 0.00067984849354326, + "loss": 2.2207, + "step": 443570 + }, + { + "epoch": 1.714756227675465, + "grad_norm": 0.11144589632749557, + "learning_rate": 0.0006796970044720797, + "loss": 2.2177, + "step": 443580 + }, + { + "epoch": 1.7147948848788483, + "grad_norm": 0.12376515567302704, + "learning_rate": 0.0006795455327804749, + "loss": 2.2236, + "step": 443590 + }, + { + "epoch": 1.7148335420822316, + "grad_norm": 0.11902966350317001, + "learning_rate": 0.0006793940784624657, + "loss": 2.2315, + "step": 443600 + }, + { + "epoch": 1.7148721992856149, + "grad_norm": 0.1146560087800026, + "learning_rate": 0.0006792426415120756, + "loss": 2.2475, + "step": 443610 + }, + { + "epoch": 1.714910856488998, + "grad_norm": 0.1380588263273239, + "learning_rate": 0.0006790912219233305, + "loss": 2.2383, + "step": 443620 + }, + { + "epoch": 1.7149495136923814, + "grad_norm": 0.11876972764730453, + "learning_rate": 0.000678939819690261, + "loss": 2.2289, + "step": 443630 + }, + { + "epoch": 1.7149881708957646, + "grad_norm": 0.11715719103813171, + "learning_rate": 0.0006787884348069005, + "loss": 2.2285, + "step": 443640 + }, + { + "epoch": 1.7150268280991479, + "grad_norm": 0.1241312250494957, + "learning_rate": 0.0006786370672672857, + "loss": 2.2041, + "step": 443650 + }, + { + "epoch": 1.7150654853025311, + "grad_norm": 0.13172374665737152, + "learning_rate": 0.0006784857170654568, + "loss": 2.2239, + "step": 443660 + }, + { + "epoch": 1.7151041425059146, + "grad_norm": 0.13301695883274078, + "learning_rate": 0.0006783343841954577, + "loss": 2.2153, + "step": 443670 + }, + { + "epoch": 1.7151427997092978, + "grad_norm": 0.12190935015678406, + "learning_rate": 0.0006781830686513355, + "loss": 2.2298, + "step": 443680 + }, + { + "epoch": 1.715181456912681, + "grad_norm": 0.11668330430984497, + "learning_rate": 0.0006780317704271409, + "loss": 2.22, + "step": 443690 + }, + { + "epoch": 1.7152201141160643, + "grad_norm": 0.11997809261083603, + "learning_rate": 0.0006778804895169273, + "loss": 2.2273, + "step": 443700 + }, + { + "epoch": 1.7152587713194478, + "grad_norm": 0.11524324119091034, + "learning_rate": 0.0006777292259147525, + "loss": 2.2307, + "step": 443710 + }, + { + "epoch": 1.715297428522831, + "grad_norm": 0.12497083842754364, + "learning_rate": 0.0006775779796146769, + "loss": 2.2292, + "step": 443720 + }, + { + "epoch": 1.7153360857262143, + "grad_norm": 0.1098659485578537, + "learning_rate": 0.0006774267506107648, + "loss": 2.2332, + "step": 443730 + }, + { + "epoch": 1.7153747429295976, + "grad_norm": 1.0404858589172363, + "learning_rate": 0.0006772755388970839, + "loss": 2.2316, + "step": 443740 + }, + { + "epoch": 1.7154134001329808, + "grad_norm": 0.11820700019598007, + "learning_rate": 0.0006771243444677047, + "loss": 2.2289, + "step": 443750 + }, + { + "epoch": 1.715452057336364, + "grad_norm": 0.11342157423496246, + "learning_rate": 0.0006769731673167019, + "loss": 2.2338, + "step": 443760 + }, + { + "epoch": 1.7154907145397473, + "grad_norm": 0.1244228407740593, + "learning_rate": 0.0006768220074381528, + "loss": 2.2127, + "step": 443770 + }, + { + "epoch": 1.7155293717431306, + "grad_norm": 0.13897007703781128, + "learning_rate": 0.0006766708648261386, + "loss": 2.2198, + "step": 443780 + }, + { + "epoch": 1.7155680289465138, + "grad_norm": 0.11900126934051514, + "learning_rate": 0.0006765197394747438, + "loss": 2.2161, + "step": 443790 + }, + { + "epoch": 1.715606686149897, + "grad_norm": 0.12850025296211243, + "learning_rate": 0.0006763686313780562, + "loss": 2.2388, + "step": 443800 + }, + { + "epoch": 1.7156453433532803, + "grad_norm": 0.11627134680747986, + "learning_rate": 0.0006762175405301672, + "loss": 2.217, + "step": 443810 + }, + { + "epoch": 1.7156840005566636, + "grad_norm": 0.11488353461027145, + "learning_rate": 0.0006760664669251709, + "loss": 2.2271, + "step": 443820 + }, + { + "epoch": 1.7157226577600468, + "grad_norm": 0.12036576122045517, + "learning_rate": 0.0006759154105571654, + "loss": 2.2228, + "step": 443830 + }, + { + "epoch": 1.7157613149634303, + "grad_norm": 0.1195918470621109, + "learning_rate": 0.0006757643714202521, + "loss": 2.2124, + "step": 443840 + }, + { + "epoch": 1.7157999721668136, + "grad_norm": 0.12076837569475174, + "learning_rate": 0.0006756133495085356, + "loss": 2.224, + "step": 443850 + }, + { + "epoch": 1.7158386293701968, + "grad_norm": 0.1215423047542572, + "learning_rate": 0.0006754623448161242, + "loss": 2.2251, + "step": 443860 + }, + { + "epoch": 1.71587728657358, + "grad_norm": 0.14442507922649384, + "learning_rate": 0.0006753113573371288, + "loss": 2.2188, + "step": 443870 + }, + { + "epoch": 1.7159159437769635, + "grad_norm": 0.1276344209909439, + "learning_rate": 0.0006751603870656644, + "loss": 2.2229, + "step": 443880 + }, + { + "epoch": 1.7159546009803468, + "grad_norm": 0.1226276233792305, + "learning_rate": 0.0006750094339958492, + "loss": 2.2298, + "step": 443890 + }, + { + "epoch": 1.71599325818373, + "grad_norm": 0.12590661644935608, + "learning_rate": 0.0006748584981218044, + "loss": 2.2336, + "step": 443900 + }, + { + "epoch": 1.7160319153871133, + "grad_norm": 0.10643420368432999, + "learning_rate": 0.000674707579437655, + "loss": 2.2199, + "step": 443910 + }, + { + "epoch": 1.7160705725904966, + "grad_norm": 0.12374252825975418, + "learning_rate": 0.0006745566779375287, + "loss": 2.2253, + "step": 443920 + }, + { + "epoch": 1.7161092297938798, + "grad_norm": 0.13089606165885925, + "learning_rate": 0.0006744057936155574, + "loss": 2.229, + "step": 443930 + }, + { + "epoch": 1.716147886997263, + "grad_norm": 0.12076699733734131, + "learning_rate": 0.0006742549264658759, + "loss": 2.2205, + "step": 443940 + }, + { + "epoch": 1.7161865442006463, + "grad_norm": 0.11502376198768616, + "learning_rate": 0.0006741040764826222, + "loss": 2.2202, + "step": 443950 + }, + { + "epoch": 1.7162252014040296, + "grad_norm": 0.11473656445741653, + "learning_rate": 0.0006739532436599381, + "loss": 2.2108, + "step": 443960 + }, + { + "epoch": 1.7162638586074128, + "grad_norm": 0.1203451007604599, + "learning_rate": 0.0006738024279919676, + "loss": 2.2314, + "step": 443970 + }, + { + "epoch": 1.716302515810796, + "grad_norm": 0.12273450940847397, + "learning_rate": 0.0006736516294728598, + "loss": 2.2244, + "step": 443980 + }, + { + "epoch": 1.7163411730141793, + "grad_norm": 0.12457942962646484, + "learning_rate": 0.0006735008480967657, + "loss": 2.2277, + "step": 443990 + }, + { + "epoch": 1.7163798302175626, + "grad_norm": 0.15763746201992035, + "learning_rate": 0.0006733500838578401, + "loss": 2.2214, + "step": 444000 + }, + { + "epoch": 1.716418487420946, + "grad_norm": 0.16036611795425415, + "learning_rate": 0.0006731993367502412, + "loss": 2.2192, + "step": 444010 + }, + { + "epoch": 1.7164571446243293, + "grad_norm": 0.12345317006111145, + "learning_rate": 0.0006730486067681303, + "loss": 2.231, + "step": 444020 + }, + { + "epoch": 1.7164958018277126, + "grad_norm": 0.11703797429800034, + "learning_rate": 0.0006728978939056725, + "loss": 2.2239, + "step": 444030 + }, + { + "epoch": 1.7165344590310958, + "grad_norm": 0.12373912334442139, + "learning_rate": 0.0006727471981570354, + "loss": 2.2086, + "step": 444040 + }, + { + "epoch": 1.7165731162344793, + "grad_norm": 0.11341448873281479, + "learning_rate": 0.0006725965195163905, + "loss": 2.2288, + "step": 444050 + }, + { + "epoch": 1.7166117734378625, + "grad_norm": 0.12175527960062027, + "learning_rate": 0.0006724458579779129, + "loss": 2.2337, + "step": 444060 + }, + { + "epoch": 1.7166504306412458, + "grad_norm": 0.11818042397499084, + "learning_rate": 0.00067229521353578, + "loss": 2.2296, + "step": 444070 + }, + { + "epoch": 1.716689087844629, + "grad_norm": 0.10906556248664856, + "learning_rate": 0.0006721445861841735, + "loss": 2.2157, + "step": 444080 + }, + { + "epoch": 1.7167277450480123, + "grad_norm": 0.11369844526052475, + "learning_rate": 0.0006719939759172777, + "loss": 2.2194, + "step": 444090 + }, + { + "epoch": 1.7167664022513955, + "grad_norm": 0.10923272371292114, + "learning_rate": 0.0006718433827292807, + "loss": 2.2263, + "step": 444100 + }, + { + "epoch": 1.7168050594547788, + "grad_norm": 0.1153886690735817, + "learning_rate": 0.0006716928066143735, + "loss": 2.2018, + "step": 444110 + }, + { + "epoch": 1.716843716658162, + "grad_norm": 0.11441920697689056, + "learning_rate": 0.0006715422475667508, + "loss": 2.211, + "step": 444120 + }, + { + "epoch": 1.7168823738615453, + "grad_norm": 0.14035643637180328, + "learning_rate": 0.0006713917055806102, + "loss": 2.2223, + "step": 444130 + }, + { + "epoch": 1.7169210310649285, + "grad_norm": 0.11534959822893143, + "learning_rate": 0.0006712411806501529, + "loss": 2.2296, + "step": 444140 + }, + { + "epoch": 1.7169596882683118, + "grad_norm": 0.13395413756370544, + "learning_rate": 0.0006710906727695829, + "loss": 2.2234, + "step": 444150 + }, + { + "epoch": 1.716998345471695, + "grad_norm": 0.12049634009599686, + "learning_rate": 0.000670940181933108, + "loss": 2.2113, + "step": 444160 + }, + { + "epoch": 1.7170370026750785, + "grad_norm": 0.11423641443252563, + "learning_rate": 0.0006707897081349392, + "loss": 2.2294, + "step": 444170 + }, + { + "epoch": 1.7170756598784618, + "grad_norm": 0.11800158768892288, + "learning_rate": 0.0006706392513692907, + "loss": 2.2239, + "step": 444180 + }, + { + "epoch": 1.717114317081845, + "grad_norm": 0.11680900305509567, + "learning_rate": 0.0006704888116303798, + "loss": 2.2212, + "step": 444190 + }, + { + "epoch": 1.7171529742852283, + "grad_norm": 0.1276027113199234, + "learning_rate": 0.000670338388912427, + "loss": 2.2155, + "step": 444200 + }, + { + "epoch": 1.7171916314886115, + "grad_norm": 0.12111736088991165, + "learning_rate": 0.0006701879832096569, + "loss": 2.2268, + "step": 444210 + }, + { + "epoch": 1.717230288691995, + "grad_norm": 0.12418892979621887, + "learning_rate": 0.0006700375945162962, + "loss": 2.227, + "step": 444220 + }, + { + "epoch": 1.7172689458953783, + "grad_norm": 0.13062898814678192, + "learning_rate": 0.0006698872228265755, + "loss": 2.2251, + "step": 444230 + }, + { + "epoch": 1.7173076030987615, + "grad_norm": 0.14824402332305908, + "learning_rate": 0.0006697368681347287, + "loss": 2.2186, + "step": 444240 + }, + { + "epoch": 1.7173462603021448, + "grad_norm": 0.1204281598329544, + "learning_rate": 0.0006695865304349928, + "loss": 2.2185, + "step": 444250 + }, + { + "epoch": 1.717384917505528, + "grad_norm": 0.12832044064998627, + "learning_rate": 0.0006694362097216083, + "loss": 2.2203, + "step": 444260 + }, + { + "epoch": 1.7174235747089113, + "grad_norm": 0.1236930713057518, + "learning_rate": 0.0006692859059888183, + "loss": 2.2224, + "step": 444270 + }, + { + "epoch": 1.7174622319122945, + "grad_norm": 0.12458539754152298, + "learning_rate": 0.00066913561923087, + "loss": 2.2163, + "step": 444280 + }, + { + "epoch": 1.7175008891156778, + "grad_norm": 0.11675764620304108, + "learning_rate": 0.0006689853494420131, + "loss": 2.2251, + "step": 444290 + }, + { + "epoch": 1.717539546319061, + "grad_norm": 0.12367656826972961, + "learning_rate": 0.0006688350966165012, + "loss": 2.241, + "step": 444300 + }, + { + "epoch": 1.7175782035224443, + "grad_norm": 0.11717633903026581, + "learning_rate": 0.0006686848607485905, + "loss": 2.2313, + "step": 444310 + }, + { + "epoch": 1.7176168607258275, + "grad_norm": 0.12098561972379684, + "learning_rate": 0.0006685346418325411, + "loss": 2.2126, + "step": 444320 + }, + { + "epoch": 1.7176555179292108, + "grad_norm": 0.11992432177066803, + "learning_rate": 0.0006683844398626156, + "loss": 2.2176, + "step": 444330 + }, + { + "epoch": 1.7176941751325943, + "grad_norm": 0.11694101244211197, + "learning_rate": 0.0006682342548330806, + "loss": 2.213, + "step": 444340 + }, + { + "epoch": 1.7177328323359775, + "grad_norm": 0.12024518102407455, + "learning_rate": 0.0006680840867382056, + "loss": 2.2342, + "step": 444350 + }, + { + "epoch": 1.7177714895393608, + "grad_norm": 0.11152735352516174, + "learning_rate": 0.0006679339355722631, + "loss": 2.2141, + "step": 444360 + }, + { + "epoch": 1.717810146742744, + "grad_norm": 0.12339860945940018, + "learning_rate": 0.000667783801329529, + "loss": 2.2292, + "step": 444370 + }, + { + "epoch": 1.7178488039461273, + "grad_norm": 0.12653346359729767, + "learning_rate": 0.0006676336840042827, + "loss": 2.2173, + "step": 444380 + }, + { + "epoch": 1.7178874611495107, + "grad_norm": 0.11451555788516998, + "learning_rate": 0.0006674835835908062, + "loss": 2.2211, + "step": 444390 + }, + { + "epoch": 1.717926118352894, + "grad_norm": 0.12210563570261002, + "learning_rate": 0.0006673335000833856, + "loss": 2.2222, + "step": 444400 + }, + { + "epoch": 1.7179647755562772, + "grad_norm": 0.13091878592967987, + "learning_rate": 0.0006671834334763091, + "loss": 2.2273, + "step": 444410 + }, + { + "epoch": 1.7180034327596605, + "grad_norm": 0.13674227893352509, + "learning_rate": 0.0006670333837638694, + "loss": 2.2236, + "step": 444420 + }, + { + "epoch": 1.7180420899630438, + "grad_norm": 0.13061882555484772, + "learning_rate": 0.0006668833509403614, + "loss": 2.2297, + "step": 444430 + }, + { + "epoch": 1.718080747166427, + "grad_norm": 0.11517611145973206, + "learning_rate": 0.0006667333350000833, + "loss": 2.2195, + "step": 444440 + }, + { + "epoch": 1.7181194043698103, + "grad_norm": 0.12318595498800278, + "learning_rate": 0.0006665833359373372, + "loss": 2.2204, + "step": 444450 + }, + { + "epoch": 1.7181580615731935, + "grad_norm": 0.11372929066419601, + "learning_rate": 0.000666433353746428, + "loss": 2.2293, + "step": 444460 + }, + { + "epoch": 1.7181967187765768, + "grad_norm": 0.13681162893772125, + "learning_rate": 0.0006662833884216633, + "loss": 2.2225, + "step": 444470 + }, + { + "epoch": 1.71823537597996, + "grad_norm": 0.12110299617052078, + "learning_rate": 0.0006661334399573547, + "loss": 2.2333, + "step": 444480 + }, + { + "epoch": 1.7182740331833433, + "grad_norm": 0.11480377614498138, + "learning_rate": 0.0006659835083478165, + "loss": 2.2093, + "step": 444490 + }, + { + "epoch": 1.7183126903867265, + "grad_norm": 0.11662760376930237, + "learning_rate": 0.0006658335935873668, + "loss": 2.2338, + "step": 444500 + }, + { + "epoch": 1.71835134759011, + "grad_norm": 0.1263725608587265, + "learning_rate": 0.0006656836956703258, + "loss": 2.2405, + "step": 444510 + }, + { + "epoch": 1.7183900047934932, + "grad_norm": 0.1221756860613823, + "learning_rate": 0.000665533814591018, + "loss": 2.1957, + "step": 444520 + }, + { + "epoch": 1.7184286619968765, + "grad_norm": 0.12357661873102188, + "learning_rate": 0.0006653839503437702, + "loss": 2.2182, + "step": 444530 + }, + { + "epoch": 1.7184673192002597, + "grad_norm": 0.11788289994001389, + "learning_rate": 0.0006652341029229131, + "loss": 2.2142, + "step": 444540 + }, + { + "epoch": 1.7185059764036432, + "grad_norm": 0.11715588718652725, + "learning_rate": 0.0006650842723227806, + "loss": 2.2243, + "step": 444550 + }, + { + "epoch": 1.7185446336070265, + "grad_norm": 0.12393707782030106, + "learning_rate": 0.000664934458537709, + "loss": 2.2029, + "step": 444560 + }, + { + "epoch": 1.7185832908104097, + "grad_norm": 0.11894424259662628, + "learning_rate": 0.0006647846615620385, + "loss": 2.2181, + "step": 444570 + }, + { + "epoch": 1.718621948013793, + "grad_norm": 0.124919593334198, + "learning_rate": 0.0006646348813901121, + "loss": 2.2404, + "step": 444580 + }, + { + "epoch": 1.7186606052171762, + "grad_norm": 0.1154218316078186, + "learning_rate": 0.000664485118016276, + "loss": 2.223, + "step": 444590 + }, + { + "epoch": 1.7186992624205595, + "grad_norm": 0.11173471808433533, + "learning_rate": 0.0006643353714348801, + "loss": 2.2221, + "step": 444600 + }, + { + "epoch": 1.7187379196239427, + "grad_norm": 0.11307603120803833, + "learning_rate": 0.0006641856416402765, + "loss": 2.2244, + "step": 444610 + }, + { + "epoch": 1.718776576827326, + "grad_norm": 0.12163484841585159, + "learning_rate": 0.0006640359286268212, + "loss": 2.2129, + "step": 444620 + }, + { + "epoch": 1.7188152340307092, + "grad_norm": 0.11416550725698471, + "learning_rate": 0.0006638862323888733, + "loss": 2.2057, + "step": 444630 + }, + { + "epoch": 1.7188538912340925, + "grad_norm": 0.13622942566871643, + "learning_rate": 0.0006637365529207948, + "loss": 2.2229, + "step": 444640 + }, + { + "epoch": 1.7188925484374757, + "grad_norm": 0.12028181552886963, + "learning_rate": 0.0006635868902169508, + "loss": 2.1998, + "step": 444650 + }, + { + "epoch": 1.718931205640859, + "grad_norm": 0.12066903710365295, + "learning_rate": 0.0006634372442717103, + "loss": 2.231, + "step": 444660 + }, + { + "epoch": 1.7189698628442422, + "grad_norm": 0.13217706978321075, + "learning_rate": 0.0006632876150794442, + "loss": 2.2073, + "step": 444670 + }, + { + "epoch": 1.7190085200476257, + "grad_norm": 0.12672095000743866, + "learning_rate": 0.0006631380026345278, + "loss": 2.2151, + "step": 444680 + }, + { + "epoch": 1.719047177251009, + "grad_norm": 0.12009608000516891, + "learning_rate": 0.0006629884069313385, + "loss": 2.2226, + "step": 444690 + }, + { + "epoch": 1.7190858344543922, + "grad_norm": 0.1271923929452896, + "learning_rate": 0.0006628388279642576, + "loss": 2.2376, + "step": 444700 + }, + { + "epoch": 1.7191244916577755, + "grad_norm": 0.11179065704345703, + "learning_rate": 0.0006626892657276695, + "loss": 2.2257, + "step": 444710 + }, + { + "epoch": 1.719163148861159, + "grad_norm": 0.11001638323068619, + "learning_rate": 0.0006625397202159611, + "loss": 2.2217, + "step": 444720 + }, + { + "epoch": 1.7192018060645422, + "grad_norm": 0.12382563203573227, + "learning_rate": 0.0006623901914235229, + "loss": 2.2224, + "step": 444730 + }, + { + "epoch": 1.7192404632679255, + "grad_norm": 0.13235601782798767, + "learning_rate": 0.0006622406793447485, + "loss": 2.2283, + "step": 444740 + }, + { + "epoch": 1.7192791204713087, + "grad_norm": 0.12467984110116959, + "learning_rate": 0.0006620911839740349, + "loss": 2.2097, + "step": 444750 + }, + { + "epoch": 1.719317777674692, + "grad_norm": 0.11879763007164001, + "learning_rate": 0.0006619417053057814, + "loss": 2.215, + "step": 444760 + }, + { + "epoch": 1.7193564348780752, + "grad_norm": 0.11860118806362152, + "learning_rate": 0.0006617922433343917, + "loss": 2.2388, + "step": 444770 + }, + { + "epoch": 1.7193950920814585, + "grad_norm": 0.12211289256811142, + "learning_rate": 0.0006616427980542714, + "loss": 2.2081, + "step": 444780 + }, + { + "epoch": 1.7194337492848417, + "grad_norm": 0.1151881217956543, + "learning_rate": 0.0006614933694598298, + "loss": 2.232, + "step": 444790 + }, + { + "epoch": 1.719472406488225, + "grad_norm": 0.11083666235208511, + "learning_rate": 0.0006613439575454791, + "loss": 2.2128, + "step": 444800 + }, + { + "epoch": 1.7195110636916082, + "grad_norm": 0.12252917885780334, + "learning_rate": 0.000661194562305635, + "loss": 2.2166, + "step": 444810 + }, + { + "epoch": 1.7195497208949915, + "grad_norm": 0.1250115931034088, + "learning_rate": 0.0006610451837347162, + "loss": 2.2224, + "step": 444820 + }, + { + "epoch": 1.7195883780983747, + "grad_norm": 0.11606471240520477, + "learning_rate": 0.0006608958218271441, + "loss": 2.2206, + "step": 444830 + }, + { + "epoch": 1.719627035301758, + "grad_norm": 0.11997007578611374, + "learning_rate": 0.0006607464765773435, + "loss": 2.2247, + "step": 444840 + }, + { + "epoch": 1.7196656925051415, + "grad_norm": 0.15658508241176605, + "learning_rate": 0.0006605971479797424, + "loss": 2.2172, + "step": 444850 + }, + { + "epoch": 1.7197043497085247, + "grad_norm": 0.12012956291437149, + "learning_rate": 0.000660447836028772, + "loss": 2.2259, + "step": 444860 + }, + { + "epoch": 1.719743006911908, + "grad_norm": 0.12634234130382538, + "learning_rate": 0.000660298540718866, + "loss": 2.2114, + "step": 444870 + }, + { + "epoch": 1.7197816641152912, + "grad_norm": 0.11848779022693634, + "learning_rate": 0.000660149262044462, + "loss": 2.2296, + "step": 444880 + }, + { + "epoch": 1.7198203213186747, + "grad_norm": 0.11735141277313232, + "learning_rate": 0.0006599999999999999, + "loss": 2.2307, + "step": 444890 + }, + { + "epoch": 1.719858978522058, + "grad_norm": 0.12146174907684326, + "learning_rate": 0.0006598507545799237, + "loss": 2.237, + "step": 444900 + }, + { + "epoch": 1.7198976357254412, + "grad_norm": 0.11710981279611588, + "learning_rate": 0.0006597015257786793, + "loss": 2.2072, + "step": 444910 + }, + { + "epoch": 1.7199362929288244, + "grad_norm": 0.10774870216846466, + "learning_rate": 0.0006595523135907169, + "loss": 2.2138, + "step": 444920 + }, + { + "epoch": 1.7199749501322077, + "grad_norm": 0.10720662027597427, + "learning_rate": 0.0006594031180104887, + "loss": 2.2166, + "step": 444930 + }, + { + "epoch": 1.720013607335591, + "grad_norm": 0.12658320367336273, + "learning_rate": 0.0006592539390324506, + "loss": 2.2122, + "step": 444940 + }, + { + "epoch": 1.7200522645389742, + "grad_norm": 0.12777094542980194, + "learning_rate": 0.0006591047766510614, + "loss": 2.22, + "step": 444950 + }, + { + "epoch": 1.7200909217423574, + "grad_norm": 0.12313709408044815, + "learning_rate": 0.0006589556308607831, + "loss": 2.2184, + "step": 444960 + }, + { + "epoch": 1.7201295789457407, + "grad_norm": 0.12547916173934937, + "learning_rate": 0.0006588065016560809, + "loss": 2.2357, + "step": 444970 + }, + { + "epoch": 1.720168236149124, + "grad_norm": 0.12291128933429718, + "learning_rate": 0.0006586573890314227, + "loss": 2.2213, + "step": 444980 + }, + { + "epoch": 1.7202068933525072, + "grad_norm": 0.12009264528751373, + "learning_rate": 0.0006585082929812798, + "loss": 2.2199, + "step": 444990 + }, + { + "epoch": 1.7202455505558905, + "grad_norm": 0.12587212026119232, + "learning_rate": 0.0006583592135001262, + "loss": 2.2265, + "step": 445000 + }, + { + "epoch": 1.7202842077592737, + "grad_norm": 0.11582623422145844, + "learning_rate": 0.0006582101505824394, + "loss": 2.226, + "step": 445010 + }, + { + "epoch": 1.7203228649626572, + "grad_norm": 0.11064667254686356, + "learning_rate": 0.0006580611042226998, + "loss": 2.2131, + "step": 445020 + }, + { + "epoch": 1.7203615221660404, + "grad_norm": 0.129017174243927, + "learning_rate": 0.0006579120744153906, + "loss": 2.2159, + "step": 445030 + }, + { + "epoch": 1.7204001793694237, + "grad_norm": 0.12192103266716003, + "learning_rate": 0.0006577630611549985, + "loss": 2.217, + "step": 445040 + }, + { + "epoch": 1.720438836572807, + "grad_norm": 0.12519840896129608, + "learning_rate": 0.0006576140644360133, + "loss": 2.2237, + "step": 445050 + }, + { + "epoch": 1.7204774937761904, + "grad_norm": 0.12600666284561157, + "learning_rate": 0.0006574650842529271, + "loss": 2.2137, + "step": 445060 + }, + { + "epoch": 1.7205161509795737, + "grad_norm": 0.12854309380054474, + "learning_rate": 0.0006573161206002361, + "loss": 2.2217, + "step": 445070 + }, + { + "epoch": 1.720554808182957, + "grad_norm": 0.13512104749679565, + "learning_rate": 0.0006571671734724385, + "loss": 2.2124, + "step": 445080 + }, + { + "epoch": 1.7205934653863402, + "grad_norm": 0.12210392951965332, + "learning_rate": 0.0006570182428640366, + "loss": 2.2163, + "step": 445090 + }, + { + "epoch": 1.7206321225897234, + "grad_norm": 0.12861166894435883, + "learning_rate": 0.0006568693287695348, + "loss": 2.2293, + "step": 445100 + }, + { + "epoch": 1.7206707797931067, + "grad_norm": 0.13264372944831848, + "learning_rate": 0.0006567204311834411, + "loss": 2.2396, + "step": 445110 + }, + { + "epoch": 1.72070943699649, + "grad_norm": 0.11817973107099533, + "learning_rate": 0.0006565715501002669, + "loss": 2.2088, + "step": 445120 + }, + { + "epoch": 1.7207480941998732, + "grad_norm": 0.11759193986654282, + "learning_rate": 0.0006564226855145254, + "loss": 2.2167, + "step": 445130 + }, + { + "epoch": 1.7207867514032564, + "grad_norm": 0.12298763543367386, + "learning_rate": 0.000656273837420734, + "loss": 2.2151, + "step": 445140 + }, + { + "epoch": 1.7208254086066397, + "grad_norm": 0.11796259880065918, + "learning_rate": 0.0006561250058134127, + "loss": 2.2232, + "step": 445150 + }, + { + "epoch": 1.720864065810023, + "grad_norm": 0.12086260318756104, + "learning_rate": 0.0006559761906870847, + "loss": 2.2131, + "step": 445160 + }, + { + "epoch": 1.7209027230134062, + "grad_norm": 0.1197434514760971, + "learning_rate": 0.0006558273920362758, + "loss": 2.2348, + "step": 445170 + }, + { + "epoch": 1.7209413802167894, + "grad_norm": 0.1237284317612648, + "learning_rate": 0.0006556786098555152, + "loss": 2.2158, + "step": 445180 + }, + { + "epoch": 1.720980037420173, + "grad_norm": 0.1284116953611374, + "learning_rate": 0.0006555298441393353, + "loss": 2.2081, + "step": 445190 + }, + { + "epoch": 1.7210186946235562, + "grad_norm": 0.11554042249917984, + "learning_rate": 0.0006553810948822711, + "loss": 2.2232, + "step": 445200 + }, + { + "epoch": 1.7210573518269394, + "grad_norm": 0.11683430522680283, + "learning_rate": 0.000655232362078861, + "loss": 2.2306, + "step": 445210 + }, + { + "epoch": 1.7210960090303227, + "grad_norm": 0.12061230093240738, + "learning_rate": 0.0006550836457236458, + "loss": 2.2214, + "step": 445220 + }, + { + "epoch": 1.7211346662337061, + "grad_norm": 0.11342422664165497, + "learning_rate": 0.0006549349458111702, + "loss": 2.2395, + "step": 445230 + }, + { + "epoch": 1.7211733234370894, + "grad_norm": 0.11453254520893097, + "learning_rate": 0.000654786262335981, + "loss": 2.2273, + "step": 445240 + }, + { + "epoch": 1.7212119806404726, + "grad_norm": 0.11891185492277145, + "learning_rate": 0.0006546375952926289, + "loss": 2.2225, + "step": 445250 + }, + { + "epoch": 1.721250637843856, + "grad_norm": 0.12148097902536392, + "learning_rate": 0.0006544889446756672, + "loss": 2.2243, + "step": 445260 + }, + { + "epoch": 1.7212892950472392, + "grad_norm": 0.11537045985460281, + "learning_rate": 0.0006543403104796518, + "loss": 2.2045, + "step": 445270 + }, + { + "epoch": 1.7213279522506224, + "grad_norm": 0.11426082253456116, + "learning_rate": 0.0006541916926991422, + "loss": 2.2174, + "step": 445280 + }, + { + "epoch": 1.7213666094540057, + "grad_norm": 0.14135867357254028, + "learning_rate": 0.0006540430913287008, + "loss": 2.221, + "step": 445290 + }, + { + "epoch": 1.721405266657389, + "grad_norm": 0.12051600217819214, + "learning_rate": 0.0006538945063628927, + "loss": 2.2112, + "step": 445300 + }, + { + "epoch": 1.7214439238607722, + "grad_norm": 0.12268465012311935, + "learning_rate": 0.0006537459377962864, + "loss": 2.2138, + "step": 445310 + }, + { + "epoch": 1.7214825810641554, + "grad_norm": 0.1147061213850975, + "learning_rate": 0.0006535973856234534, + "loss": 2.2405, + "step": 445320 + }, + { + "epoch": 1.7215212382675387, + "grad_norm": 0.12111300975084305, + "learning_rate": 0.0006534488498389675, + "loss": 2.2157, + "step": 445330 + }, + { + "epoch": 1.721559895470922, + "grad_norm": 0.1415233463048935, + "learning_rate": 0.0006533003304374058, + "loss": 2.2129, + "step": 445340 + }, + { + "epoch": 1.7215985526743052, + "grad_norm": 0.1263972520828247, + "learning_rate": 0.0006531518274133495, + "loss": 2.2091, + "step": 445350 + }, + { + "epoch": 1.7216372098776886, + "grad_norm": 0.12139548361301422, + "learning_rate": 0.0006530033407613811, + "loss": 2.2188, + "step": 445360 + }, + { + "epoch": 1.721675867081072, + "grad_norm": 0.11739082634449005, + "learning_rate": 0.0006528548704760871, + "loss": 2.2273, + "step": 445370 + }, + { + "epoch": 1.7217145242844552, + "grad_norm": 0.11622051149606705, + "learning_rate": 0.0006527064165520569, + "loss": 2.2207, + "step": 445380 + }, + { + "epoch": 1.7217531814878384, + "grad_norm": 0.12785564363002777, + "learning_rate": 0.0006525579789838822, + "loss": 2.2097, + "step": 445390 + }, + { + "epoch": 1.7217918386912219, + "grad_norm": 0.10811824351549149, + "learning_rate": 0.0006524095577661586, + "loss": 2.2177, + "step": 445400 + }, + { + "epoch": 1.7218304958946051, + "grad_norm": 0.12352680414915085, + "learning_rate": 0.0006522611528934843, + "loss": 2.2056, + "step": 445410 + }, + { + "epoch": 1.7218691530979884, + "grad_norm": 0.11806277185678482, + "learning_rate": 0.0006521127643604603, + "loss": 2.2267, + "step": 445420 + }, + { + "epoch": 1.7219078103013716, + "grad_norm": 0.1163213849067688, + "learning_rate": 0.0006519643921616907, + "loss": 2.2342, + "step": 445430 + }, + { + "epoch": 1.7219464675047549, + "grad_norm": 0.12437734007835388, + "learning_rate": 0.0006518160362917827, + "loss": 2.2137, + "step": 445440 + }, + { + "epoch": 1.7219851247081381, + "grad_norm": 0.1166505515575409, + "learning_rate": 0.0006516676967453461, + "loss": 2.2165, + "step": 445450 + }, + { + "epoch": 1.7220237819115214, + "grad_norm": 0.12200496345758438, + "learning_rate": 0.0006515193735169942, + "loss": 2.2214, + "step": 445460 + }, + { + "epoch": 1.7220624391149046, + "grad_norm": 0.1296072155237198, + "learning_rate": 0.0006513710666013428, + "loss": 2.2285, + "step": 445470 + }, + { + "epoch": 1.722101096318288, + "grad_norm": 0.11958147585391998, + "learning_rate": 0.0006512227759930108, + "loss": 2.2189, + "step": 445480 + }, + { + "epoch": 1.7221397535216711, + "grad_norm": 0.11607889831066132, + "learning_rate": 0.0006510745016866202, + "loss": 2.2212, + "step": 445490 + }, + { + "epoch": 1.7221784107250544, + "grad_norm": 0.12207704037427902, + "learning_rate": 0.0006509262436767958, + "loss": 2.2218, + "step": 445500 + }, + { + "epoch": 1.7222170679284377, + "grad_norm": 0.11960524320602417, + "learning_rate": 0.0006507780019581655, + "loss": 2.2181, + "step": 445510 + }, + { + "epoch": 1.722255725131821, + "grad_norm": 0.13021332025527954, + "learning_rate": 0.00065062977652536, + "loss": 2.2144, + "step": 445520 + }, + { + "epoch": 1.7222943823352044, + "grad_norm": 0.12092035263776779, + "learning_rate": 0.0006504815673730129, + "loss": 2.2227, + "step": 445530 + }, + { + "epoch": 1.7223330395385876, + "grad_norm": 0.12823942303657532, + "learning_rate": 0.0006503333744957609, + "loss": 2.2265, + "step": 445540 + }, + { + "epoch": 1.7223716967419709, + "grad_norm": 0.11256150901317596, + "learning_rate": 0.0006501851978882436, + "loss": 2.2281, + "step": 445550 + }, + { + "epoch": 1.7224103539453541, + "grad_norm": 0.12659218907356262, + "learning_rate": 0.0006500370375451036, + "loss": 2.2213, + "step": 445560 + }, + { + "epoch": 1.7224490111487376, + "grad_norm": 0.12542758882045746, + "learning_rate": 0.0006498888934609863, + "loss": 2.211, + "step": 445570 + }, + { + "epoch": 1.7224876683521209, + "grad_norm": 0.11898112297058105, + "learning_rate": 0.0006497407656305401, + "loss": 2.2196, + "step": 445580 + }, + { + "epoch": 1.7225263255555041, + "grad_norm": 0.12938259541988373, + "learning_rate": 0.0006495926540484163, + "loss": 2.2249, + "step": 445590 + }, + { + "epoch": 1.7225649827588874, + "grad_norm": 0.1213701143860817, + "learning_rate": 0.0006494445587092695, + "loss": 2.2273, + "step": 445600 + }, + { + "epoch": 1.7226036399622706, + "grad_norm": 0.11537019908428192, + "learning_rate": 0.0006492964796077566, + "loss": 2.2111, + "step": 445610 + }, + { + "epoch": 1.7226422971656539, + "grad_norm": 0.11240558326244354, + "learning_rate": 0.0006491484167385376, + "loss": 2.2243, + "step": 445620 + }, + { + "epoch": 1.7226809543690371, + "grad_norm": 0.11864791810512543, + "learning_rate": 0.0006490003700962756, + "loss": 2.2224, + "step": 445630 + }, + { + "epoch": 1.7227196115724204, + "grad_norm": 0.11303960531949997, + "learning_rate": 0.0006488523396756371, + "loss": 2.2231, + "step": 445640 + }, + { + "epoch": 1.7227582687758036, + "grad_norm": 0.11428240686655045, + "learning_rate": 0.0006487043254712906, + "loss": 2.2177, + "step": 445650 + }, + { + "epoch": 1.7227969259791869, + "grad_norm": 0.12397156655788422, + "learning_rate": 0.0006485563274779079, + "loss": 2.2114, + "step": 445660 + }, + { + "epoch": 1.7228355831825701, + "grad_norm": 0.13758109509944916, + "learning_rate": 0.0006484083456901637, + "loss": 2.2248, + "step": 445670 + }, + { + "epoch": 1.7228742403859534, + "grad_norm": 0.12868931889533997, + "learning_rate": 0.0006482603801027361, + "loss": 2.2144, + "step": 445680 + }, + { + "epoch": 1.7229128975893366, + "grad_norm": 0.11709859222173691, + "learning_rate": 0.000648112430710305, + "loss": 2.2239, + "step": 445690 + }, + { + "epoch": 1.72295155479272, + "grad_norm": 0.11064182966947556, + "learning_rate": 0.0006479644975075543, + "loss": 2.2251, + "step": 445700 + }, + { + "epoch": 1.7229902119961034, + "grad_norm": 0.12446606904268265, + "learning_rate": 0.0006478165804891705, + "loss": 2.213, + "step": 445710 + }, + { + "epoch": 1.7230288691994866, + "grad_norm": 0.12083175778388977, + "learning_rate": 0.0006476686796498426, + "loss": 2.2172, + "step": 445720 + }, + { + "epoch": 1.7230675264028699, + "grad_norm": 0.11902615427970886, + "learning_rate": 0.0006475207949842629, + "loss": 2.2191, + "step": 445730 + }, + { + "epoch": 1.7231061836062533, + "grad_norm": 0.11912401020526886, + "learning_rate": 0.0006473729264871266, + "loss": 2.2099, + "step": 445740 + }, + { + "epoch": 1.7231448408096366, + "grad_norm": 0.13067235052585602, + "learning_rate": 0.0006472250741531316, + "loss": 2.2282, + "step": 445750 + }, + { + "epoch": 1.7231834980130198, + "grad_norm": 0.12016969174146652, + "learning_rate": 0.000647077237976979, + "loss": 2.2329, + "step": 445760 + }, + { + "epoch": 1.723222155216403, + "grad_norm": 0.12072800099849701, + "learning_rate": 0.0006469294179533722, + "loss": 2.2306, + "step": 445770 + }, + { + "epoch": 1.7232608124197863, + "grad_norm": 0.12538011372089386, + "learning_rate": 0.0006467816140770182, + "loss": 2.2163, + "step": 445780 + }, + { + "epoch": 1.7232994696231696, + "grad_norm": 0.12371477484703064, + "learning_rate": 0.0006466338263426266, + "loss": 2.2263, + "step": 445790 + }, + { + "epoch": 1.7233381268265529, + "grad_norm": 0.11195444315671921, + "learning_rate": 0.0006464860547449096, + "loss": 2.2203, + "step": 445800 + }, + { + "epoch": 1.723376784029936, + "grad_norm": 0.11557309329509735, + "learning_rate": 0.0006463382992785827, + "loss": 2.2286, + "step": 445810 + }, + { + "epoch": 1.7234154412333194, + "grad_norm": 0.130571186542511, + "learning_rate": 0.0006461905599383642, + "loss": 2.2137, + "step": 445820 + }, + { + "epoch": 1.7234540984367026, + "grad_norm": 0.12194689363241196, + "learning_rate": 0.000646042836718975, + "loss": 2.2364, + "step": 445830 + }, + { + "epoch": 1.7234927556400859, + "grad_norm": 0.1201247125864029, + "learning_rate": 0.0006458951296151394, + "loss": 2.2271, + "step": 445840 + }, + { + "epoch": 1.7235314128434691, + "grad_norm": 0.1234743744134903, + "learning_rate": 0.000645747438621584, + "loss": 2.2252, + "step": 445850 + }, + { + "epoch": 1.7235700700468524, + "grad_norm": 0.1283080130815506, + "learning_rate": 0.0006455997637330389, + "loss": 2.2164, + "step": 445860 + }, + { + "epoch": 1.7236087272502358, + "grad_norm": 0.1170172318816185, + "learning_rate": 0.0006454521049442364, + "loss": 2.2252, + "step": 445870 + }, + { + "epoch": 1.723647384453619, + "grad_norm": 0.12163842469453812, + "learning_rate": 0.000645304462249912, + "loss": 2.2164, + "step": 445880 + }, + { + "epoch": 1.7236860416570023, + "grad_norm": 0.11199560016393661, + "learning_rate": 0.0006451568356448044, + "loss": 2.2202, + "step": 445890 + }, + { + "epoch": 1.7237246988603856, + "grad_norm": 0.12853404879570007, + "learning_rate": 0.0006450092251236541, + "loss": 2.22, + "step": 445900 + }, + { + "epoch": 1.723763356063769, + "grad_norm": 0.14157810807228088, + "learning_rate": 0.0006448616306812061, + "loss": 2.2222, + "step": 445910 + }, + { + "epoch": 1.7238020132671523, + "grad_norm": 0.11695588380098343, + "learning_rate": 0.0006447140523122068, + "loss": 2.2137, + "step": 445920 + }, + { + "epoch": 1.7238406704705356, + "grad_norm": 0.1321401298046112, + "learning_rate": 0.0006445664900114061, + "loss": 2.2065, + "step": 445930 + }, + { + "epoch": 1.7238793276739188, + "grad_norm": 0.1265600621700287, + "learning_rate": 0.0006444189437735566, + "loss": 2.2215, + "step": 445940 + }, + { + "epoch": 1.723917984877302, + "grad_norm": 0.12171867489814758, + "learning_rate": 0.000644271413593414, + "loss": 2.2215, + "step": 445950 + }, + { + "epoch": 1.7239566420806853, + "grad_norm": 0.12688489258289337, + "learning_rate": 0.0006441238994657366, + "loss": 2.2183, + "step": 445960 + }, + { + "epoch": 1.7239952992840686, + "grad_norm": 0.12067555636167526, + "learning_rate": 0.0006439764013852856, + "loss": 2.2144, + "step": 445970 + }, + { + "epoch": 1.7240339564874518, + "grad_norm": 0.12304933369159698, + "learning_rate": 0.0006438289193468252, + "loss": 2.2178, + "step": 445980 + }, + { + "epoch": 1.724072613690835, + "grad_norm": 0.12591667473316193, + "learning_rate": 0.0006436814533451223, + "loss": 2.2231, + "step": 445990 + }, + { + "epoch": 1.7241112708942183, + "grad_norm": 0.12514787912368774, + "learning_rate": 0.0006435340033749463, + "loss": 2.2262, + "step": 446000 + }, + { + "epoch": 1.7241499280976016, + "grad_norm": 0.11285724490880966, + "learning_rate": 0.0006433865694310704, + "loss": 2.2122, + "step": 446010 + }, + { + "epoch": 1.7241885853009848, + "grad_norm": 0.12354803830385208, + "learning_rate": 0.0006432391515082696, + "loss": 2.2238, + "step": 446020 + }, + { + "epoch": 1.7242272425043683, + "grad_norm": 0.1407475471496582, + "learning_rate": 0.0006430917496013227, + "loss": 2.2238, + "step": 446030 + }, + { + "epoch": 1.7242658997077516, + "grad_norm": 0.11316511034965515, + "learning_rate": 0.0006429443637050101, + "loss": 2.2267, + "step": 446040 + }, + { + "epoch": 1.7243045569111348, + "grad_norm": 0.13038979470729828, + "learning_rate": 0.0006427969938141162, + "loss": 2.2036, + "step": 446050 + }, + { + "epoch": 1.724343214114518, + "grad_norm": 0.12853805720806122, + "learning_rate": 0.0006426496399234279, + "loss": 2.2179, + "step": 446060 + }, + { + "epoch": 1.7243818713179013, + "grad_norm": 0.12553642690181732, + "learning_rate": 0.0006425023020277345, + "loss": 2.2264, + "step": 446070 + }, + { + "epoch": 1.7244205285212848, + "grad_norm": 0.12129683792591095, + "learning_rate": 0.0006423549801218287, + "loss": 2.2099, + "step": 446080 + }, + { + "epoch": 1.724459185724668, + "grad_norm": 0.12941096723079681, + "learning_rate": 0.0006422076742005059, + "loss": 2.2182, + "step": 446090 + }, + { + "epoch": 1.7244978429280513, + "grad_norm": 0.12963566184043884, + "learning_rate": 0.0006420603842585635, + "loss": 2.2186, + "step": 446100 + }, + { + "epoch": 1.7245365001314346, + "grad_norm": 0.12317825853824615, + "learning_rate": 0.0006419131102908031, + "loss": 2.2073, + "step": 446110 + }, + { + "epoch": 1.7245751573348178, + "grad_norm": 0.11838335543870926, + "learning_rate": 0.0006417658522920285, + "loss": 2.2142, + "step": 446120 + }, + { + "epoch": 1.724613814538201, + "grad_norm": 0.14520931243896484, + "learning_rate": 0.0006416186102570456, + "loss": 2.2245, + "step": 446130 + }, + { + "epoch": 1.7246524717415843, + "grad_norm": 0.11642967164516449, + "learning_rate": 0.0006414713841806645, + "loss": 2.2206, + "step": 446140 + }, + { + "epoch": 1.7246911289449676, + "grad_norm": 0.11772605031728745, + "learning_rate": 0.0006413241740576967, + "loss": 2.2062, + "step": 446150 + }, + { + "epoch": 1.7247297861483508, + "grad_norm": 0.12353634089231491, + "learning_rate": 0.0006411769798829577, + "loss": 2.2219, + "step": 446160 + }, + { + "epoch": 1.724768443351734, + "grad_norm": 0.12665092945098877, + "learning_rate": 0.000641029801651265, + "loss": 2.2258, + "step": 446170 + }, + { + "epoch": 1.7248071005551173, + "grad_norm": 0.11800059676170349, + "learning_rate": 0.0006408826393574394, + "loss": 2.2209, + "step": 446180 + }, + { + "epoch": 1.7248457577585006, + "grad_norm": 0.10817782580852509, + "learning_rate": 0.0006407354929963043, + "loss": 2.2159, + "step": 446190 + }, + { + "epoch": 1.724884414961884, + "grad_norm": 0.11082755029201508, + "learning_rate": 0.0006405883625626856, + "loss": 2.2224, + "step": 446200 + }, + { + "epoch": 1.7249230721652673, + "grad_norm": 0.11737308651208878, + "learning_rate": 0.0006404412480514128, + "loss": 2.2266, + "step": 446210 + }, + { + "epoch": 1.7249617293686506, + "grad_norm": 0.121700718998909, + "learning_rate": 0.0006402941494573173, + "loss": 2.2221, + "step": 446220 + }, + { + "epoch": 1.7250003865720338, + "grad_norm": 0.11579973995685577, + "learning_rate": 0.0006401470667752341, + "loss": 2.2178, + "step": 446230 + }, + { + "epoch": 1.725039043775417, + "grad_norm": 0.11861163377761841, + "learning_rate": 0.0006400000000000002, + "loss": 2.2232, + "step": 446240 + }, + { + "epoch": 1.7250777009788005, + "grad_norm": 0.11449922621250153, + "learning_rate": 0.0006398529491264557, + "loss": 2.2155, + "step": 446250 + }, + { + "epoch": 1.7251163581821838, + "grad_norm": 0.12742897868156433, + "learning_rate": 0.0006397059141494439, + "loss": 2.2319, + "step": 446260 + }, + { + "epoch": 1.725155015385567, + "grad_norm": 0.11458654701709747, + "learning_rate": 0.0006395588950638105, + "loss": 2.2206, + "step": 446270 + }, + { + "epoch": 1.7251936725889503, + "grad_norm": 0.13417372107505798, + "learning_rate": 0.000639411891864404, + "loss": 2.2128, + "step": 446280 + }, + { + "epoch": 1.7252323297923335, + "grad_norm": 0.1364544928073883, + "learning_rate": 0.0006392649045460759, + "loss": 2.219, + "step": 446290 + }, + { + "epoch": 1.7252709869957168, + "grad_norm": 0.10574916005134583, + "learning_rate": 0.0006391179331036801, + "loss": 2.2163, + "step": 446300 + }, + { + "epoch": 1.7253096441991, + "grad_norm": 0.12731365859508514, + "learning_rate": 0.0006389709775320733, + "loss": 2.2058, + "step": 446310 + }, + { + "epoch": 1.7253483014024833, + "grad_norm": 0.12686818838119507, + "learning_rate": 0.0006388240378261157, + "loss": 2.2266, + "step": 446320 + }, + { + "epoch": 1.7253869586058665, + "grad_norm": 0.11585552990436554, + "learning_rate": 0.0006386771139806693, + "loss": 2.2223, + "step": 446330 + }, + { + "epoch": 1.7254256158092498, + "grad_norm": 0.12526173889636993, + "learning_rate": 0.0006385302059905993, + "loss": 2.2305, + "step": 446340 + }, + { + "epoch": 1.725464273012633, + "grad_norm": 0.12107162177562714, + "learning_rate": 0.000638383313850774, + "loss": 2.2051, + "step": 446350 + }, + { + "epoch": 1.7255029302160163, + "grad_norm": 0.10977458953857422, + "learning_rate": 0.0006382364375560639, + "loss": 2.2154, + "step": 446360 + }, + { + "epoch": 1.7255415874193998, + "grad_norm": 0.12415580451488495, + "learning_rate": 0.0006380895771013426, + "loss": 2.211, + "step": 446370 + }, + { + "epoch": 1.725580244622783, + "grad_norm": 0.12037225067615509, + "learning_rate": 0.0006379427324814863, + "loss": 2.2301, + "step": 446380 + }, + { + "epoch": 1.7256189018261663, + "grad_norm": 0.1225074976682663, + "learning_rate": 0.0006377959036913743, + "loss": 2.2222, + "step": 446390 + }, + { + "epoch": 1.7256575590295495, + "grad_norm": 0.11446145921945572, + "learning_rate": 0.0006376490907258879, + "loss": 2.2202, + "step": 446400 + }, + { + "epoch": 1.7256962162329328, + "grad_norm": 0.11765024811029434, + "learning_rate": 0.0006375022935799122, + "loss": 2.1948, + "step": 446410 + }, + { + "epoch": 1.7257348734363163, + "grad_norm": 0.1386559158563614, + "learning_rate": 0.0006373555122483341, + "loss": 2.2254, + "step": 446420 + }, + { + "epoch": 1.7257735306396995, + "grad_norm": 0.1227501630783081, + "learning_rate": 0.0006372087467260438, + "loss": 2.2166, + "step": 446430 + }, + { + "epoch": 1.7258121878430828, + "grad_norm": 0.1231738030910492, + "learning_rate": 0.0006370619970079343, + "loss": 2.2255, + "step": 446440 + }, + { + "epoch": 1.725850845046466, + "grad_norm": 0.11780079454183578, + "learning_rate": 0.000636915263088901, + "loss": 2.2198, + "step": 446450 + }, + { + "epoch": 1.7258895022498493, + "grad_norm": 0.12161796540021896, + "learning_rate": 0.0006367685449638421, + "loss": 2.2301, + "step": 446460 + }, + { + "epoch": 1.7259281594532325, + "grad_norm": 0.11437132954597473, + "learning_rate": 0.0006366218426276589, + "loss": 2.2177, + "step": 446470 + }, + { + "epoch": 1.7259668166566158, + "grad_norm": 0.12394043058156967, + "learning_rate": 0.0006364751560752551, + "loss": 2.2125, + "step": 446480 + }, + { + "epoch": 1.726005473859999, + "grad_norm": 0.1160041093826294, + "learning_rate": 0.0006363284853015372, + "loss": 2.2283, + "step": 446490 + }, + { + "epoch": 1.7260441310633823, + "grad_norm": 0.1226138100028038, + "learning_rate": 0.0006361818303014144, + "loss": 2.2327, + "step": 446500 + }, + { + "epoch": 1.7260827882667655, + "grad_norm": 0.1209820955991745, + "learning_rate": 0.0006360351910697988, + "loss": 2.2324, + "step": 446510 + }, + { + "epoch": 1.7261214454701488, + "grad_norm": 0.16416043043136597, + "learning_rate": 0.0006358885676016053, + "loss": 2.2182, + "step": 446520 + }, + { + "epoch": 1.726160102673532, + "grad_norm": 0.11868501454591751, + "learning_rate": 0.0006357419598917514, + "loss": 2.2337, + "step": 446530 + }, + { + "epoch": 1.7261987598769155, + "grad_norm": 0.11926791816949844, + "learning_rate": 0.0006355953679351569, + "loss": 2.2073, + "step": 446540 + }, + { + "epoch": 1.7262374170802988, + "grad_norm": 0.12692750990390778, + "learning_rate": 0.0006354487917267451, + "loss": 2.2077, + "step": 446550 + }, + { + "epoch": 1.726276074283682, + "grad_norm": 0.11207547783851624, + "learning_rate": 0.0006353022312614416, + "loss": 2.2183, + "step": 446560 + }, + { + "epoch": 1.7263147314870653, + "grad_norm": 0.11618570238351822, + "learning_rate": 0.0006351556865341748, + "loss": 2.2015, + "step": 446570 + }, + { + "epoch": 1.7263533886904487, + "grad_norm": 0.13771525025367737, + "learning_rate": 0.0006350091575398757, + "loss": 2.2115, + "step": 446580 + }, + { + "epoch": 1.726392045893832, + "grad_norm": 0.11748812347650528, + "learning_rate": 0.0006348626442734784, + "loss": 2.226, + "step": 446590 + }, + { + "epoch": 1.7264307030972152, + "grad_norm": 0.13634926080703735, + "learning_rate": 0.0006347161467299189, + "loss": 2.2148, + "step": 446600 + }, + { + "epoch": 1.7264693603005985, + "grad_norm": 0.11584623903036118, + "learning_rate": 0.0006345696649041373, + "loss": 2.2066, + "step": 446610 + }, + { + "epoch": 1.7265080175039818, + "grad_norm": 0.11857582628726959, + "learning_rate": 0.0006344231987910749, + "loss": 2.2201, + "step": 446620 + }, + { + "epoch": 1.726546674707365, + "grad_norm": 0.1365407258272171, + "learning_rate": 0.0006342767483856768, + "loss": 2.2263, + "step": 446630 + }, + { + "epoch": 1.7265853319107483, + "grad_norm": 0.12051232159137726, + "learning_rate": 0.0006341303136828902, + "loss": 2.1958, + "step": 446640 + }, + { + "epoch": 1.7266239891141315, + "grad_norm": 0.11777618527412415, + "learning_rate": 0.0006339838946776653, + "loss": 2.2354, + "step": 446650 + }, + { + "epoch": 1.7266626463175148, + "grad_norm": 0.13300010561943054, + "learning_rate": 0.0006338374913649547, + "loss": 2.2211, + "step": 446660 + }, + { + "epoch": 1.726701303520898, + "grad_norm": 0.11809588968753815, + "learning_rate": 0.0006336911037397144, + "loss": 2.2186, + "step": 446670 + }, + { + "epoch": 1.7267399607242813, + "grad_norm": 0.11888469755649567, + "learning_rate": 0.0006335447317969023, + "loss": 2.2236, + "step": 446680 + }, + { + "epoch": 1.7267786179276645, + "grad_norm": 0.12028637528419495, + "learning_rate": 0.0006333983755314791, + "loss": 2.2162, + "step": 446690 + }, + { + "epoch": 1.7268172751310478, + "grad_norm": 0.12548750638961792, + "learning_rate": 0.000633252034938409, + "loss": 2.2181, + "step": 446700 + }, + { + "epoch": 1.7268559323344312, + "grad_norm": 0.12346018850803375, + "learning_rate": 0.0006331057100126578, + "loss": 2.2081, + "step": 446710 + }, + { + "epoch": 1.7268945895378145, + "grad_norm": 0.12081938236951828, + "learning_rate": 0.000632959400749195, + "loss": 2.2296, + "step": 446720 + }, + { + "epoch": 1.7269332467411977, + "grad_norm": 0.11280404776334763, + "learning_rate": 0.0006328131071429919, + "loss": 2.2113, + "step": 446730 + }, + { + "epoch": 1.726971903944581, + "grad_norm": 0.13663756847381592, + "learning_rate": 0.0006326668291890232, + "loss": 2.2204, + "step": 446740 + }, + { + "epoch": 1.7270105611479645, + "grad_norm": 0.118993379175663, + "learning_rate": 0.0006325205668822655, + "loss": 2.1998, + "step": 446750 + }, + { + "epoch": 1.7270492183513477, + "grad_norm": 0.11974366009235382, + "learning_rate": 0.0006323743202176993, + "loss": 2.2148, + "step": 446760 + }, + { + "epoch": 1.727087875554731, + "grad_norm": 0.11485361307859421, + "learning_rate": 0.0006322280891903065, + "loss": 2.2232, + "step": 446770 + }, + { + "epoch": 1.7271265327581142, + "grad_norm": 0.12438970804214478, + "learning_rate": 0.0006320818737950725, + "loss": 2.2116, + "step": 446780 + }, + { + "epoch": 1.7271651899614975, + "grad_norm": 0.1330035924911499, + "learning_rate": 0.0006319356740269851, + "loss": 2.2095, + "step": 446790 + }, + { + "epoch": 1.7272038471648807, + "grad_norm": 0.12693437933921814, + "learning_rate": 0.0006317894898810344, + "loss": 2.2123, + "step": 446800 + }, + { + "epoch": 1.727242504368264, + "grad_norm": 0.5244444608688354, + "learning_rate": 0.0006316433213522141, + "loss": 2.2197, + "step": 446810 + }, + { + "epoch": 1.7272811615716472, + "grad_norm": 0.11980904638767242, + "learning_rate": 0.0006314971684355197, + "loss": 2.2133, + "step": 446820 + }, + { + "epoch": 1.7273198187750305, + "grad_norm": 0.13215787708759308, + "learning_rate": 0.0006313510311259501, + "loss": 2.219, + "step": 446830 + }, + { + "epoch": 1.7273584759784137, + "grad_norm": 0.1250036060810089, + "learning_rate": 0.0006312049094185061, + "loss": 2.2113, + "step": 446840 + }, + { + "epoch": 1.727397133181797, + "grad_norm": 0.10954178869724274, + "learning_rate": 0.0006310588033081917, + "loss": 2.2151, + "step": 446850 + }, + { + "epoch": 1.7274357903851802, + "grad_norm": 0.11306620389223099, + "learning_rate": 0.0006309127127900136, + "loss": 2.2184, + "step": 446860 + }, + { + "epoch": 1.7274744475885635, + "grad_norm": 0.12376853823661804, + "learning_rate": 0.0006307666378589807, + "loss": 2.2204, + "step": 446870 + }, + { + "epoch": 1.727513104791947, + "grad_norm": 0.13085810840129852, + "learning_rate": 0.0006306205785101049, + "loss": 2.2203, + "step": 446880 + }, + { + "epoch": 1.7275517619953302, + "grad_norm": 0.11990802735090256, + "learning_rate": 0.0006304745347384007, + "loss": 2.2085, + "step": 446890 + }, + { + "epoch": 1.7275904191987135, + "grad_norm": 0.11620058864355087, + "learning_rate": 0.0006303285065388855, + "loss": 2.2054, + "step": 446900 + }, + { + "epoch": 1.7276290764020967, + "grad_norm": 0.12165633589029312, + "learning_rate": 0.0006301824939065788, + "loss": 2.2177, + "step": 446910 + }, + { + "epoch": 1.7276677336054802, + "grad_norm": 0.12509748339653015, + "learning_rate": 0.0006300364968365033, + "loss": 2.2255, + "step": 446920 + }, + { + "epoch": 1.7277063908088635, + "grad_norm": 0.13173136115074158, + "learning_rate": 0.000629890515323684, + "loss": 2.1986, + "step": 446930 + }, + { + "epoch": 1.7277450480122467, + "grad_norm": 0.14559979736804962, + "learning_rate": 0.0006297445493631488, + "loss": 2.2201, + "step": 446940 + }, + { + "epoch": 1.72778370521563, + "grad_norm": 0.13024260103702545, + "learning_rate": 0.0006295985989499283, + "loss": 2.2112, + "step": 446950 + }, + { + "epoch": 1.7278223624190132, + "grad_norm": 0.11923927813768387, + "learning_rate": 0.0006294526640790547, + "loss": 2.2082, + "step": 446960 + }, + { + "epoch": 1.7278610196223965, + "grad_norm": 0.12412068992853165, + "learning_rate": 0.000629306744745565, + "loss": 2.2242, + "step": 446970 + }, + { + "epoch": 1.7278996768257797, + "grad_norm": 0.14609181880950928, + "learning_rate": 0.0006291608409444965, + "loss": 2.2167, + "step": 446980 + }, + { + "epoch": 1.727938334029163, + "grad_norm": 0.12911032140254974, + "learning_rate": 0.0006290149526708909, + "loss": 2.2289, + "step": 446990 + }, + { + "epoch": 1.7279769912325462, + "grad_norm": 0.12092292308807373, + "learning_rate": 0.0006288690799197912, + "loss": 2.1986, + "step": 447000 + }, + { + "epoch": 1.7280156484359295, + "grad_norm": 0.1374613344669342, + "learning_rate": 0.0006287232226862441, + "loss": 2.2165, + "step": 447010 + }, + { + "epoch": 1.7280543056393127, + "grad_norm": 0.13300615549087524, + "learning_rate": 0.0006285773809652985, + "loss": 2.209, + "step": 447020 + }, + { + "epoch": 1.728092962842696, + "grad_norm": 0.12323442101478577, + "learning_rate": 0.0006284315547520059, + "loss": 2.2132, + "step": 447030 + }, + { + "epoch": 1.7281316200460792, + "grad_norm": 0.11762463301420212, + "learning_rate": 0.0006282857440414203, + "loss": 2.2134, + "step": 447040 + }, + { + "epoch": 1.7281702772494627, + "grad_norm": 0.12495895475149155, + "learning_rate": 0.0006281399488285986, + "loss": 2.2217, + "step": 447050 + }, + { + "epoch": 1.728208934452846, + "grad_norm": 0.11891249567270279, + "learning_rate": 0.0006279941691086004, + "loss": 2.2145, + "step": 447060 + }, + { + "epoch": 1.7282475916562292, + "grad_norm": 0.13742129504680634, + "learning_rate": 0.0006278484048764874, + "loss": 2.2091, + "step": 447070 + }, + { + "epoch": 1.7282862488596125, + "grad_norm": 0.13152971863746643, + "learning_rate": 0.0006277026561273247, + "loss": 2.21, + "step": 447080 + }, + { + "epoch": 1.728324906062996, + "grad_norm": 0.12500789761543274, + "learning_rate": 0.0006275569228561791, + "loss": 2.2193, + "step": 447090 + }, + { + "epoch": 1.7283635632663792, + "grad_norm": 0.11646206676959991, + "learning_rate": 0.0006274112050581209, + "loss": 2.2038, + "step": 447100 + }, + { + "epoch": 1.7284022204697624, + "grad_norm": 0.11558002233505249, + "learning_rate": 0.0006272655027282223, + "loss": 2.2186, + "step": 447110 + }, + { + "epoch": 1.7284408776731457, + "grad_norm": 0.1294437050819397, + "learning_rate": 0.000627119815861559, + "loss": 2.2197, + "step": 447120 + }, + { + "epoch": 1.728479534876529, + "grad_norm": 0.11961176246404648, + "learning_rate": 0.0006269741444532079, + "loss": 2.2251, + "step": 447130 + }, + { + "epoch": 1.7285181920799122, + "grad_norm": 0.12001509964466095, + "learning_rate": 0.0006268284884982503, + "loss": 2.2222, + "step": 447140 + }, + { + "epoch": 1.7285568492832954, + "grad_norm": 0.11960349977016449, + "learning_rate": 0.0006266828479917685, + "loss": 2.2247, + "step": 447150 + }, + { + "epoch": 1.7285955064866787, + "grad_norm": 0.12857332825660706, + "learning_rate": 0.0006265372229288484, + "loss": 2.2133, + "step": 447160 + }, + { + "epoch": 1.728634163690062, + "grad_norm": 0.11253602057695389, + "learning_rate": 0.0006263916133045779, + "loss": 2.2307, + "step": 447170 + }, + { + "epoch": 1.7286728208934452, + "grad_norm": 0.12653528153896332, + "learning_rate": 0.0006262460191140482, + "loss": 2.2116, + "step": 447180 + }, + { + "epoch": 1.7287114780968285, + "grad_norm": 0.12356472760438919, + "learning_rate": 0.0006261004403523524, + "loss": 2.2344, + "step": 447190 + }, + { + "epoch": 1.7287501353002117, + "grad_norm": 0.11877021193504333, + "learning_rate": 0.0006259548770145867, + "loss": 2.2024, + "step": 447200 + }, + { + "epoch": 1.728788792503595, + "grad_norm": 0.11799889802932739, + "learning_rate": 0.0006258093290958493, + "loss": 2.2131, + "step": 447210 + }, + { + "epoch": 1.7288274497069784, + "grad_norm": 0.12331540137529373, + "learning_rate": 0.0006256637965912417, + "loss": 2.2295, + "step": 447220 + }, + { + "epoch": 1.7288661069103617, + "grad_norm": 0.12345191091299057, + "learning_rate": 0.0006255182794958676, + "loss": 2.2403, + "step": 447230 + }, + { + "epoch": 1.728904764113745, + "grad_norm": 0.12320110946893692, + "learning_rate": 0.0006253727778048335, + "loss": 2.2158, + "step": 447240 + }, + { + "epoch": 1.7289434213171282, + "grad_norm": 0.1262044906616211, + "learning_rate": 0.000625227291513248, + "loss": 2.227, + "step": 447250 + }, + { + "epoch": 1.7289820785205117, + "grad_norm": 0.12599673867225647, + "learning_rate": 0.000625081820616223, + "loss": 2.2218, + "step": 447260 + }, + { + "epoch": 1.729020735723895, + "grad_norm": 0.11664579063653946, + "learning_rate": 0.0006249363651088724, + "loss": 2.2173, + "step": 447270 + }, + { + "epoch": 1.7290593929272782, + "grad_norm": 0.11499513685703278, + "learning_rate": 0.0006247909249863133, + "loss": 2.2268, + "step": 447280 + }, + { + "epoch": 1.7290980501306614, + "grad_norm": 0.1192687600851059, + "learning_rate": 0.0006246455002436644, + "loss": 2.2244, + "step": 447290 + }, + { + "epoch": 1.7291367073340447, + "grad_norm": 0.12507928907871246, + "learning_rate": 0.0006245000908760481, + "loss": 2.2109, + "step": 447300 + }, + { + "epoch": 1.729175364537428, + "grad_norm": 0.1301724910736084, + "learning_rate": 0.0006243546968785885, + "loss": 2.2207, + "step": 447310 + }, + { + "epoch": 1.7292140217408112, + "grad_norm": 0.11521703004837036, + "learning_rate": 0.0006242093182464128, + "loss": 2.2104, + "step": 447320 + }, + { + "epoch": 1.7292526789441944, + "grad_norm": 0.1285400092601776, + "learning_rate": 0.0006240639549746507, + "loss": 2.2045, + "step": 447330 + }, + { + "epoch": 1.7292913361475777, + "grad_norm": 0.13523685932159424, + "learning_rate": 0.0006239186070584342, + "loss": 2.2077, + "step": 447340 + }, + { + "epoch": 1.729329993350961, + "grad_norm": 0.11786042153835297, + "learning_rate": 0.0006237732744928981, + "loss": 2.2301, + "step": 447350 + }, + { + "epoch": 1.7293686505543442, + "grad_norm": 0.1203950047492981, + "learning_rate": 0.0006236279572731797, + "loss": 2.2259, + "step": 447360 + }, + { + "epoch": 1.7294073077577274, + "grad_norm": 0.12070076167583466, + "learning_rate": 0.0006234826553944188, + "loss": 2.2325, + "step": 447370 + }, + { + "epoch": 1.7294459649611107, + "grad_norm": 0.12071788311004639, + "learning_rate": 0.0006233373688517583, + "loss": 2.2153, + "step": 447380 + }, + { + "epoch": 1.7294846221644942, + "grad_norm": 0.12840916216373444, + "learning_rate": 0.0006231920976403427, + "loss": 2.2167, + "step": 447390 + }, + { + "epoch": 1.7295232793678774, + "grad_norm": 0.12395832687616348, + "learning_rate": 0.0006230468417553197, + "loss": 2.2113, + "step": 447400 + }, + { + "epoch": 1.7295619365712607, + "grad_norm": 0.1331869661808014, + "learning_rate": 0.0006229016011918394, + "loss": 2.2126, + "step": 447410 + }, + { + "epoch": 1.729600593774644, + "grad_norm": 0.11826955527067184, + "learning_rate": 0.0006227563759450545, + "loss": 2.2116, + "step": 447420 + }, + { + "epoch": 1.7296392509780274, + "grad_norm": 0.131355881690979, + "learning_rate": 0.0006226111660101204, + "loss": 2.2146, + "step": 447430 + }, + { + "epoch": 1.7296779081814107, + "grad_norm": 0.12471216917037964, + "learning_rate": 0.000622465971382195, + "loss": 2.2117, + "step": 447440 + }, + { + "epoch": 1.729716565384794, + "grad_norm": 0.12048184871673584, + "learning_rate": 0.0006223207920564382, + "loss": 2.2216, + "step": 447450 + }, + { + "epoch": 1.7297552225881772, + "grad_norm": 0.11597990989685059, + "learning_rate": 0.000622175628028013, + "loss": 2.2262, + "step": 447460 + }, + { + "epoch": 1.7297938797915604, + "grad_norm": 0.12440779060125351, + "learning_rate": 0.0006220304792920855, + "loss": 2.2227, + "step": 447470 + }, + { + "epoch": 1.7298325369949437, + "grad_norm": 0.13827833533287048, + "learning_rate": 0.0006218853458438227, + "loss": 2.2121, + "step": 447480 + }, + { + "epoch": 1.729871194198327, + "grad_norm": 0.12489471584558487, + "learning_rate": 0.0006217402276783959, + "loss": 2.2203, + "step": 447490 + }, + { + "epoch": 1.7299098514017102, + "grad_norm": 0.12917496263980865, + "learning_rate": 0.0006215951247909779, + "loss": 2.2124, + "step": 447500 + }, + { + "epoch": 1.7299485086050934, + "grad_norm": 0.1222403421998024, + "learning_rate": 0.0006214500371767442, + "loss": 2.2361, + "step": 447510 + }, + { + "epoch": 1.7299871658084767, + "grad_norm": 0.11857359856367111, + "learning_rate": 0.0006213049648308731, + "loss": 2.2152, + "step": 447520 + }, + { + "epoch": 1.73002582301186, + "grad_norm": 0.12194454669952393, + "learning_rate": 0.0006211599077485452, + "loss": 2.2267, + "step": 447530 + }, + { + "epoch": 1.7300644802152432, + "grad_norm": 0.12156818807125092, + "learning_rate": 0.0006210148659249442, + "loss": 2.2068, + "step": 447540 + }, + { + "epoch": 1.7301031374186264, + "grad_norm": 0.1164856106042862, + "learning_rate": 0.0006208698393552552, + "loss": 2.1847, + "step": 447550 + }, + { + "epoch": 1.73014179462201, + "grad_norm": 0.13245020806789398, + "learning_rate": 0.0006207248280346667, + "loss": 2.2223, + "step": 447560 + }, + { + "epoch": 1.7301804518253932, + "grad_norm": 0.11174594610929489, + "learning_rate": 0.0006205798319583695, + "loss": 2.2077, + "step": 447570 + }, + { + "epoch": 1.7302191090287764, + "grad_norm": 0.12638911604881287, + "learning_rate": 0.0006204348511215571, + "loss": 2.2113, + "step": 447580 + }, + { + "epoch": 1.7302577662321597, + "grad_norm": 0.13453972339630127, + "learning_rate": 0.0006202898855194255, + "loss": 2.2295, + "step": 447590 + }, + { + "epoch": 1.7302964234355431, + "grad_norm": 0.12614595890045166, + "learning_rate": 0.0006201449351471729, + "loss": 2.217, + "step": 447600 + }, + { + "epoch": 1.7303350806389264, + "grad_norm": 0.12270817905664444, + "learning_rate": 0.0006199999999999999, + "loss": 2.2219, + "step": 447610 + }, + { + "epoch": 1.7303737378423096, + "grad_norm": 0.12108409404754639, + "learning_rate": 0.0006198550800731107, + "loss": 2.2281, + "step": 447620 + }, + { + "epoch": 1.7304123950456929, + "grad_norm": 0.13020919263362885, + "learning_rate": 0.0006197101753617105, + "loss": 2.2101, + "step": 447630 + }, + { + "epoch": 1.7304510522490761, + "grad_norm": 0.12765903770923615, + "learning_rate": 0.0006195652858610081, + "loss": 2.2209, + "step": 447640 + }, + { + "epoch": 1.7304897094524594, + "grad_norm": 0.13279370963573456, + "learning_rate": 0.0006194204115662148, + "loss": 2.2142, + "step": 447650 + }, + { + "epoch": 1.7305283666558426, + "grad_norm": 0.11896040290594101, + "learning_rate": 0.0006192755524725435, + "loss": 2.2109, + "step": 447660 + }, + { + "epoch": 1.730567023859226, + "grad_norm": 0.11061503738164902, + "learning_rate": 0.0006191307085752105, + "loss": 2.2193, + "step": 447670 + }, + { + "epoch": 1.7306056810626091, + "grad_norm": 0.11819742619991302, + "learning_rate": 0.0006189858798694345, + "loss": 2.2226, + "step": 447680 + }, + { + "epoch": 1.7306443382659924, + "grad_norm": 0.1292245090007782, + "learning_rate": 0.0006188410663504363, + "loss": 2.2278, + "step": 447690 + }, + { + "epoch": 1.7306829954693757, + "grad_norm": 0.11678581684827805, + "learning_rate": 0.0006186962680134394, + "loss": 2.2191, + "step": 447700 + }, + { + "epoch": 1.730721652672759, + "grad_norm": 0.13057443499565125, + "learning_rate": 0.00061855148485367, + "loss": 2.2129, + "step": 447710 + }, + { + "epoch": 1.7307603098761422, + "grad_norm": 0.13378068804740906, + "learning_rate": 0.0006184067168663565, + "loss": 2.2068, + "step": 447720 + }, + { + "epoch": 1.7307989670795256, + "grad_norm": 0.12511661648750305, + "learning_rate": 0.00061826196404673, + "loss": 2.2216, + "step": 447730 + }, + { + "epoch": 1.7308376242829089, + "grad_norm": 0.1140625923871994, + "learning_rate": 0.0006181172263900241, + "loss": 2.2254, + "step": 447740 + }, + { + "epoch": 1.7308762814862921, + "grad_norm": 0.12888774275779724, + "learning_rate": 0.0006179725038914747, + "loss": 2.2218, + "step": 447750 + }, + { + "epoch": 1.7309149386896754, + "grad_norm": 0.1261301040649414, + "learning_rate": 0.0006178277965463204, + "loss": 2.2167, + "step": 447760 + }, + { + "epoch": 1.7309535958930589, + "grad_norm": 0.12283816933631897, + "learning_rate": 0.0006176831043498021, + "loss": 2.202, + "step": 447770 + }, + { + "epoch": 1.7309922530964421, + "grad_norm": 0.12190502136945724, + "learning_rate": 0.0006175384272971636, + "loss": 2.2091, + "step": 447780 + }, + { + "epoch": 1.7310309102998254, + "grad_norm": 0.13062943518161774, + "learning_rate": 0.0006173937653836506, + "loss": 2.2211, + "step": 447790 + }, + { + "epoch": 1.7310695675032086, + "grad_norm": 0.12369003146886826, + "learning_rate": 0.0006172491186045117, + "loss": 2.2243, + "step": 447800 + }, + { + "epoch": 1.7311082247065919, + "grad_norm": 0.1110776886343956, + "learning_rate": 0.0006171044869549977, + "loss": 2.2149, + "step": 447810 + }, + { + "epoch": 1.7311468819099751, + "grad_norm": 0.11940490454435349, + "learning_rate": 0.0006169598704303623, + "loss": 2.2093, + "step": 447820 + }, + { + "epoch": 1.7311855391133584, + "grad_norm": 0.11867483705282211, + "learning_rate": 0.0006168152690258615, + "loss": 2.2007, + "step": 447830 + }, + { + "epoch": 1.7312241963167416, + "grad_norm": 0.1168961450457573, + "learning_rate": 0.0006166706827367535, + "loss": 2.2347, + "step": 447840 + }, + { + "epoch": 1.7312628535201249, + "grad_norm": 0.11691093444824219, + "learning_rate": 0.0006165261115582992, + "loss": 2.2129, + "step": 447850 + }, + { + "epoch": 1.7313015107235081, + "grad_norm": 0.1283358633518219, + "learning_rate": 0.0006163815554857617, + "loss": 2.236, + "step": 447860 + }, + { + "epoch": 1.7313401679268914, + "grad_norm": 0.12356679141521454, + "learning_rate": 0.0006162370145144076, + "loss": 2.2175, + "step": 447870 + }, + { + "epoch": 1.7313788251302746, + "grad_norm": 0.11469010263681412, + "learning_rate": 0.0006160924886395046, + "loss": 2.2193, + "step": 447880 + }, + { + "epoch": 1.731417482333658, + "grad_norm": 0.11554548144340515, + "learning_rate": 0.0006159479778563235, + "loss": 2.2131, + "step": 447890 + }, + { + "epoch": 1.7314561395370414, + "grad_norm": 0.12214815616607666, + "learning_rate": 0.0006158034821601379, + "loss": 2.2112, + "step": 447900 + }, + { + "epoch": 1.7314947967404246, + "grad_norm": 0.10934476554393768, + "learning_rate": 0.0006156590015462231, + "loss": 2.2148, + "step": 447910 + }, + { + "epoch": 1.7315334539438079, + "grad_norm": 0.1100868284702301, + "learning_rate": 0.0006155145360098575, + "loss": 2.2088, + "step": 447920 + }, + { + "epoch": 1.7315721111471911, + "grad_norm": 0.1273280680179596, + "learning_rate": 0.0006153700855463218, + "loss": 2.2182, + "step": 447930 + }, + { + "epoch": 1.7316107683505746, + "grad_norm": 0.13480187952518463, + "learning_rate": 0.0006152256501508992, + "loss": 2.2206, + "step": 447940 + }, + { + "epoch": 1.7316494255539578, + "grad_norm": 0.11295372247695923, + "learning_rate": 0.000615081229818875, + "loss": 2.2338, + "step": 447950 + }, + { + "epoch": 1.731688082757341, + "grad_norm": 0.11729270219802856, + "learning_rate": 0.0006149368245455372, + "loss": 2.2163, + "step": 447960 + }, + { + "epoch": 1.7317267399607243, + "grad_norm": 0.11652059853076935, + "learning_rate": 0.0006147924343261766, + "loss": 2.2358, + "step": 447970 + }, + { + "epoch": 1.7317653971641076, + "grad_norm": 0.12786738574504852, + "learning_rate": 0.0006146480591560857, + "loss": 2.2123, + "step": 447980 + }, + { + "epoch": 1.7318040543674909, + "grad_norm": 0.1158597394824028, + "learning_rate": 0.0006145036990305604, + "loss": 2.213, + "step": 447990 + }, + { + "epoch": 1.731842711570874, + "grad_norm": 0.11124974489212036, + "learning_rate": 0.0006143593539448982, + "loss": 2.2188, + "step": 448000 + }, + { + "epoch": 1.7318813687742574, + "grad_norm": 0.1128598302602768, + "learning_rate": 0.0006142150238943994, + "loss": 2.2263, + "step": 448010 + }, + { + "epoch": 1.7319200259776406, + "grad_norm": 0.11894000321626663, + "learning_rate": 0.0006140707088743668, + "loss": 2.2184, + "step": 448020 + }, + { + "epoch": 1.7319586831810239, + "grad_norm": 0.12482871115207672, + "learning_rate": 0.0006139264088801058, + "loss": 2.2137, + "step": 448030 + }, + { + "epoch": 1.7319973403844071, + "grad_norm": 0.12437481433153152, + "learning_rate": 0.0006137821239069235, + "loss": 2.2008, + "step": 448040 + }, + { + "epoch": 1.7320359975877904, + "grad_norm": 0.11717011779546738, + "learning_rate": 0.0006136378539501304, + "loss": 2.2206, + "step": 448050 + }, + { + "epoch": 1.7320746547911738, + "grad_norm": 0.11646430939435959, + "learning_rate": 0.0006134935990050387, + "loss": 2.2076, + "step": 448060 + }, + { + "epoch": 1.732113311994557, + "grad_norm": 0.13522832095623016, + "learning_rate": 0.0006133493590669639, + "loss": 2.2138, + "step": 448070 + }, + { + "epoch": 1.7321519691979403, + "grad_norm": 0.1385970264673233, + "learning_rate": 0.0006132051341312229, + "loss": 2.2113, + "step": 448080 + }, + { + "epoch": 1.7321906264013236, + "grad_norm": 0.1351778656244278, + "learning_rate": 0.0006130609241931353, + "loss": 2.2277, + "step": 448090 + }, + { + "epoch": 1.7322292836047068, + "grad_norm": 0.12390823662281036, + "learning_rate": 0.0006129167292480238, + "loss": 2.2221, + "step": 448100 + }, + { + "epoch": 1.7322679408080903, + "grad_norm": 0.12284976989030838, + "learning_rate": 0.000612772549291213, + "loss": 2.2234, + "step": 448110 + }, + { + "epoch": 1.7323065980114736, + "grad_norm": 0.12573711574077606, + "learning_rate": 0.00061262838431803, + "loss": 2.2208, + "step": 448120 + }, + { + "epoch": 1.7323452552148568, + "grad_norm": 0.11867741495370865, + "learning_rate": 0.0006124842343238042, + "loss": 2.2038, + "step": 448130 + }, + { + "epoch": 1.73238391241824, + "grad_norm": 0.13029922544956207, + "learning_rate": 0.0006123400993038677, + "loss": 2.2247, + "step": 448140 + }, + { + "epoch": 1.7324225696216233, + "grad_norm": 0.13197588920593262, + "learning_rate": 0.0006121959792535547, + "loss": 2.2099, + "step": 448150 + }, + { + "epoch": 1.7324612268250066, + "grad_norm": 0.11653072386980057, + "learning_rate": 0.0006120518741682022, + "loss": 2.2156, + "step": 448160 + }, + { + "epoch": 1.7324998840283898, + "grad_norm": 0.115585096180439, + "learning_rate": 0.0006119077840431493, + "loss": 2.1938, + "step": 448170 + }, + { + "epoch": 1.732538541231773, + "grad_norm": 0.12081106007099152, + "learning_rate": 0.0006117637088737378, + "loss": 2.2054, + "step": 448180 + }, + { + "epoch": 1.7325771984351563, + "grad_norm": 0.12495361268520355, + "learning_rate": 0.0006116196486553119, + "loss": 2.2058, + "step": 448190 + }, + { + "epoch": 1.7326158556385396, + "grad_norm": 0.12298168241977692, + "learning_rate": 0.0006114756033832174, + "loss": 2.218, + "step": 448200 + }, + { + "epoch": 1.7326545128419228, + "grad_norm": 0.11500807851552963, + "learning_rate": 0.000611331573052804, + "loss": 2.2318, + "step": 448210 + }, + { + "epoch": 1.732693170045306, + "grad_norm": 0.12313594669103622, + "learning_rate": 0.0006111875576594223, + "loss": 2.2102, + "step": 448220 + }, + { + "epoch": 1.7327318272486896, + "grad_norm": 0.11742036789655685, + "learning_rate": 0.0006110435571984267, + "loss": 2.1994, + "step": 448230 + }, + { + "epoch": 1.7327704844520728, + "grad_norm": 0.11603717505931854, + "learning_rate": 0.000610899571665173, + "loss": 2.2122, + "step": 448240 + }, + { + "epoch": 1.732809141655456, + "grad_norm": 0.11510173976421356, + "learning_rate": 0.0006107556010550195, + "loss": 2.2255, + "step": 448250 + }, + { + "epoch": 1.7328477988588393, + "grad_norm": 0.14430029690265656, + "learning_rate": 0.0006106116453633275, + "loss": 2.2189, + "step": 448260 + }, + { + "epoch": 1.7328864560622226, + "grad_norm": 0.11953294277191162, + "learning_rate": 0.0006104677045854601, + "loss": 2.2153, + "step": 448270 + }, + { + "epoch": 1.732925113265606, + "grad_norm": 0.12118013948202133, + "learning_rate": 0.0006103237787167833, + "loss": 2.2345, + "step": 448280 + }, + { + "epoch": 1.7329637704689893, + "grad_norm": 0.11401266604661942, + "learning_rate": 0.0006101798677526648, + "loss": 2.2085, + "step": 448290 + }, + { + "epoch": 1.7330024276723726, + "grad_norm": 0.11563297361135483, + "learning_rate": 0.0006100359716884758, + "loss": 2.2193, + "step": 448300 + }, + { + "epoch": 1.7330410848757558, + "grad_norm": 0.12193930894136429, + "learning_rate": 0.0006098920905195886, + "loss": 2.207, + "step": 448310 + }, + { + "epoch": 1.733079742079139, + "grad_norm": 0.12167327105998993, + "learning_rate": 0.0006097482242413785, + "loss": 2.2036, + "step": 448320 + }, + { + "epoch": 1.7331183992825223, + "grad_norm": 0.11306396871805191, + "learning_rate": 0.000609604372849224, + "loss": 2.2245, + "step": 448330 + }, + { + "epoch": 1.7331570564859056, + "grad_norm": 0.13491253554821014, + "learning_rate": 0.0006094605363385044, + "loss": 2.2217, + "step": 448340 + }, + { + "epoch": 1.7331957136892888, + "grad_norm": 0.12088317424058914, + "learning_rate": 0.0006093167147046026, + "loss": 2.2149, + "step": 448350 + }, + { + "epoch": 1.733234370892672, + "grad_norm": 0.12405755370855331, + "learning_rate": 0.0006091729079429033, + "loss": 2.2224, + "step": 448360 + }, + { + "epoch": 1.7332730280960553, + "grad_norm": 0.13035620748996735, + "learning_rate": 0.0006090291160487937, + "loss": 2.2282, + "step": 448370 + }, + { + "epoch": 1.7333116852994386, + "grad_norm": 0.1224067360162735, + "learning_rate": 0.0006088853390176639, + "loss": 2.2041, + "step": 448380 + }, + { + "epoch": 1.7333503425028218, + "grad_norm": 0.11730349063873291, + "learning_rate": 0.0006087415768449056, + "loss": 2.2087, + "step": 448390 + }, + { + "epoch": 1.7333889997062053, + "grad_norm": 0.12100856751203537, + "learning_rate": 0.0006085978295259131, + "loss": 2.2237, + "step": 448400 + }, + { + "epoch": 1.7334276569095886, + "grad_norm": 0.11594785749912262, + "learning_rate": 0.0006084540970560835, + "loss": 2.2147, + "step": 448410 + }, + { + "epoch": 1.7334663141129718, + "grad_norm": 0.12283364683389664, + "learning_rate": 0.0006083103794308158, + "loss": 2.221, + "step": 448420 + }, + { + "epoch": 1.733504971316355, + "grad_norm": 0.436133474111557, + "learning_rate": 0.0006081666766455116, + "loss": 2.2137, + "step": 448430 + }, + { + "epoch": 1.7335436285197385, + "grad_norm": 0.1321917474269867, + "learning_rate": 0.0006080229886955748, + "loss": 2.2198, + "step": 448440 + }, + { + "epoch": 1.7335822857231218, + "grad_norm": 0.13761860132217407, + "learning_rate": 0.0006078793155764118, + "loss": 2.2164, + "step": 448450 + }, + { + "epoch": 1.733620942926505, + "grad_norm": 0.1234583705663681, + "learning_rate": 0.0006077356572834309, + "loss": 2.2194, + "step": 448460 + }, + { + "epoch": 1.7336596001298883, + "grad_norm": 0.12498248368501663, + "learning_rate": 0.0006075920138120436, + "loss": 2.2087, + "step": 448470 + }, + { + "epoch": 1.7336982573332715, + "grad_norm": 0.11420132219791412, + "learning_rate": 0.000607448385157663, + "loss": 2.2153, + "step": 448480 + }, + { + "epoch": 1.7337369145366548, + "grad_norm": 0.1190991923213005, + "learning_rate": 0.0006073047713157053, + "loss": 2.2134, + "step": 448490 + }, + { + "epoch": 1.733775571740038, + "grad_norm": 0.11640120297670364, + "learning_rate": 0.0006071611722815881, + "loss": 2.2098, + "step": 448500 + }, + { + "epoch": 1.7338142289434213, + "grad_norm": 0.11825591325759888, + "learning_rate": 0.0006070175880507321, + "loss": 2.2108, + "step": 448510 + }, + { + "epoch": 1.7338528861468046, + "grad_norm": 0.12558484077453613, + "learning_rate": 0.0006068740186185602, + "loss": 2.2028, + "step": 448520 + }, + { + "epoch": 1.7338915433501878, + "grad_norm": 0.13272039592266083, + "learning_rate": 0.0006067304639804973, + "loss": 2.2234, + "step": 448530 + }, + { + "epoch": 1.733930200553571, + "grad_norm": 0.11638154089450836, + "learning_rate": 0.0006065869241319715, + "loss": 2.2005, + "step": 448540 + }, + { + "epoch": 1.7339688577569543, + "grad_norm": 0.1183534637093544, + "learning_rate": 0.0006064433990684123, + "loss": 2.211, + "step": 448550 + }, + { + "epoch": 1.7340075149603376, + "grad_norm": 0.12037092447280884, + "learning_rate": 0.0006062998887852525, + "loss": 2.211, + "step": 448560 + }, + { + "epoch": 1.734046172163721, + "grad_norm": 0.12053132802248001, + "learning_rate": 0.0006061563932779259, + "loss": 2.2033, + "step": 448570 + }, + { + "epoch": 1.7340848293671043, + "grad_norm": 0.14399179816246033, + "learning_rate": 0.00060601291254187, + "loss": 2.227, + "step": 448580 + }, + { + "epoch": 1.7341234865704875, + "grad_norm": 0.1422278881072998, + "learning_rate": 0.0006058694465725243, + "loss": 2.2382, + "step": 448590 + }, + { + "epoch": 1.7341621437738708, + "grad_norm": 0.1431090086698532, + "learning_rate": 0.00060572599536533, + "loss": 2.2108, + "step": 448600 + }, + { + "epoch": 1.7342008009772543, + "grad_norm": 0.13537760078907013, + "learning_rate": 0.0006055825589157313, + "loss": 2.2011, + "step": 448610 + }, + { + "epoch": 1.7342394581806375, + "grad_norm": 0.11662229150533676, + "learning_rate": 0.0006054391372191745, + "loss": 2.2125, + "step": 448620 + }, + { + "epoch": 1.7342781153840208, + "grad_norm": 0.11179200559854507, + "learning_rate": 0.0006052957302711086, + "loss": 2.2193, + "step": 448630 + }, + { + "epoch": 1.734316772587404, + "grad_norm": 0.11571834236383438, + "learning_rate": 0.0006051523380669845, + "loss": 2.2138, + "step": 448640 + }, + { + "epoch": 1.7343554297907873, + "grad_norm": 0.13290511071681976, + "learning_rate": 0.0006050089606022553, + "loss": 2.2322, + "step": 448650 + }, + { + "epoch": 1.7343940869941705, + "grad_norm": 0.12603338062763214, + "learning_rate": 0.0006048655978723772, + "loss": 2.2116, + "step": 448660 + }, + { + "epoch": 1.7344327441975538, + "grad_norm": 0.12034429609775543, + "learning_rate": 0.0006047222498728076, + "loss": 2.2063, + "step": 448670 + }, + { + "epoch": 1.734471401400937, + "grad_norm": 0.12012781202793121, + "learning_rate": 0.0006045789165990075, + "loss": 2.2177, + "step": 448680 + }, + { + "epoch": 1.7345100586043203, + "grad_norm": 0.16808289289474487, + "learning_rate": 0.0006044355980464391, + "loss": 2.2166, + "step": 448690 + }, + { + "epoch": 1.7345487158077035, + "grad_norm": 0.12908385694026947, + "learning_rate": 0.000604292294210568, + "loss": 2.2184, + "step": 448700 + }, + { + "epoch": 1.7345873730110868, + "grad_norm": 0.11955581605434418, + "learning_rate": 0.000604149005086861, + "loss": 2.2119, + "step": 448710 + }, + { + "epoch": 1.73462603021447, + "grad_norm": 0.12239424139261246, + "learning_rate": 0.0006040057306707882, + "loss": 2.2173, + "step": 448720 + }, + { + "epoch": 1.7346646874178533, + "grad_norm": 0.12347353249788284, + "learning_rate": 0.0006038624709578215, + "loss": 2.2297, + "step": 448730 + }, + { + "epoch": 1.7347033446212368, + "grad_norm": 0.12630808353424072, + "learning_rate": 0.0006037192259434353, + "loss": 2.2182, + "step": 448740 + }, + { + "epoch": 1.73474200182462, + "grad_norm": 0.11457516252994537, + "learning_rate": 0.000603575995623106, + "loss": 2.2195, + "step": 448750 + }, + { + "epoch": 1.7347806590280033, + "grad_norm": 0.12003987282514572, + "learning_rate": 0.0006034327799923127, + "loss": 2.2304, + "step": 448760 + }, + { + "epoch": 1.7348193162313865, + "grad_norm": 0.12798136472702026, + "learning_rate": 0.0006032895790465369, + "loss": 2.1916, + "step": 448770 + }, + { + "epoch": 1.73485797343477, + "grad_norm": 0.1289728581905365, + "learning_rate": 0.0006031463927812622, + "loss": 2.2131, + "step": 448780 + }, + { + "epoch": 1.7348966306381532, + "grad_norm": 0.1295253038406372, + "learning_rate": 0.0006030032211919743, + "loss": 2.2124, + "step": 448790 + }, + { + "epoch": 1.7349352878415365, + "grad_norm": 0.12527629733085632, + "learning_rate": 0.0006028600642741615, + "loss": 2.2248, + "step": 448800 + }, + { + "epoch": 1.7349739450449198, + "grad_norm": 0.12497811764478683, + "learning_rate": 0.0006027169220233147, + "loss": 2.2218, + "step": 448810 + }, + { + "epoch": 1.735012602248303, + "grad_norm": 0.1233152449131012, + "learning_rate": 0.0006025737944349261, + "loss": 2.2148, + "step": 448820 + }, + { + "epoch": 1.7350512594516863, + "grad_norm": 0.12288574874401093, + "learning_rate": 0.0006024306815044916, + "loss": 2.2177, + "step": 448830 + }, + { + "epoch": 1.7350899166550695, + "grad_norm": 0.11841818690299988, + "learning_rate": 0.0006022875832275081, + "loss": 2.204, + "step": 448840 + }, + { + "epoch": 1.7351285738584528, + "grad_norm": 0.11461430042982101, + "learning_rate": 0.0006021444995994758, + "loss": 2.2094, + "step": 448850 + }, + { + "epoch": 1.735167231061836, + "grad_norm": 0.13102898001670837, + "learning_rate": 0.0006020014306158965, + "loss": 2.2057, + "step": 448860 + }, + { + "epoch": 1.7352058882652193, + "grad_norm": 0.12146873772144318, + "learning_rate": 0.0006018583762722747, + "loss": 2.1905, + "step": 448870 + }, + { + "epoch": 1.7352445454686025, + "grad_norm": 0.1266041398048401, + "learning_rate": 0.0006017153365641172, + "loss": 2.213, + "step": 448880 + }, + { + "epoch": 1.7352832026719858, + "grad_norm": 0.13279256224632263, + "learning_rate": 0.0006015723114869329, + "loss": 2.2169, + "step": 448890 + }, + { + "epoch": 1.735321859875369, + "grad_norm": 0.13102956116199493, + "learning_rate": 0.0006014293010362331, + "loss": 2.2103, + "step": 448900 + }, + { + "epoch": 1.7353605170787525, + "grad_norm": 0.12755140662193298, + "learning_rate": 0.000601286305207531, + "loss": 2.2107, + "step": 448910 + }, + { + "epoch": 1.7353991742821357, + "grad_norm": 0.12142160534858704, + "learning_rate": 0.000601143323996343, + "loss": 2.2136, + "step": 448920 + }, + { + "epoch": 1.735437831485519, + "grad_norm": 0.11610239744186401, + "learning_rate": 0.0006010003573981873, + "loss": 2.2124, + "step": 448930 + }, + { + "epoch": 1.7354764886889023, + "grad_norm": 0.11581006646156311, + "learning_rate": 0.0006008574054085838, + "loss": 2.2107, + "step": 448940 + }, + { + "epoch": 1.7355151458922857, + "grad_norm": 0.1240667849779129, + "learning_rate": 0.0006007144680230557, + "loss": 2.2187, + "step": 448950 + }, + { + "epoch": 1.735553803095669, + "grad_norm": 0.12831291556358337, + "learning_rate": 0.0006005715452371278, + "loss": 2.2144, + "step": 448960 + }, + { + "epoch": 1.7355924602990522, + "grad_norm": 0.12649348378181458, + "learning_rate": 0.0006004286370463277, + "loss": 2.2068, + "step": 448970 + }, + { + "epoch": 1.7356311175024355, + "grad_norm": 0.11840377002954483, + "learning_rate": 0.0006002857434461846, + "loss": 2.2232, + "step": 448980 + }, + { + "epoch": 1.7356697747058187, + "grad_norm": 0.12125206738710403, + "learning_rate": 0.0006001428644322307, + "loss": 2.2108, + "step": 448990 + }, + { + "epoch": 1.735708431909202, + "grad_norm": 0.12650954723358154, + "learning_rate": 0.0006000000000000001, + "loss": 2.2199, + "step": 449000 + }, + { + "epoch": 1.7357470891125852, + "grad_norm": 0.1308407336473465, + "learning_rate": 0.000599857150145029, + "loss": 2.2146, + "step": 449010 + }, + { + "epoch": 1.7357857463159685, + "grad_norm": 0.11478245258331299, + "learning_rate": 0.0005997143148628563, + "loss": 2.2307, + "step": 449020 + }, + { + "epoch": 1.7358244035193517, + "grad_norm": 0.11808864772319794, + "learning_rate": 0.0005995714941490229, + "loss": 2.2122, + "step": 449030 + }, + { + "epoch": 1.735863060722735, + "grad_norm": 0.122349813580513, + "learning_rate": 0.0005994286879990723, + "loss": 2.2138, + "step": 449040 + }, + { + "epoch": 1.7359017179261182, + "grad_norm": 0.12328719347715378, + "learning_rate": 0.0005992858964085497, + "loss": 2.206, + "step": 449050 + }, + { + "epoch": 1.7359403751295015, + "grad_norm": 0.11808309704065323, + "learning_rate": 0.0005991431193730032, + "loss": 2.2194, + "step": 449060 + }, + { + "epoch": 1.7359790323328848, + "grad_norm": 0.12056610733270645, + "learning_rate": 0.0005990003568879827, + "loss": 2.2169, + "step": 449070 + }, + { + "epoch": 1.7360176895362682, + "grad_norm": 0.11538299173116684, + "learning_rate": 0.0005988576089490406, + "loss": 2.2127, + "step": 449080 + }, + { + "epoch": 1.7360563467396515, + "grad_norm": 0.1362743377685547, + "learning_rate": 0.0005987148755517314, + "loss": 2.2164, + "step": 449090 + }, + { + "epoch": 1.7360950039430347, + "grad_norm": 0.13896846771240234, + "learning_rate": 0.0005985721566916121, + "loss": 2.2146, + "step": 449100 + }, + { + "epoch": 1.736133661146418, + "grad_norm": 0.12329170852899551, + "learning_rate": 0.0005984294523642415, + "loss": 2.211, + "step": 449110 + }, + { + "epoch": 1.7361723183498015, + "grad_norm": 0.14520715177059174, + "learning_rate": 0.0005982867625651813, + "loss": 2.221, + "step": 449120 + }, + { + "epoch": 1.7362109755531847, + "grad_norm": 0.12350068241357803, + "learning_rate": 0.0005981440872899954, + "loss": 2.2143, + "step": 449130 + }, + { + "epoch": 1.736249632756568, + "grad_norm": 0.12957386672496796, + "learning_rate": 0.0005980014265342492, + "loss": 2.2216, + "step": 449140 + }, + { + "epoch": 1.7362882899599512, + "grad_norm": 0.11654391139745712, + "learning_rate": 0.0005978587802935112, + "loss": 2.2209, + "step": 449150 + }, + { + "epoch": 1.7363269471633345, + "grad_norm": 0.12524645030498505, + "learning_rate": 0.0005977161485633518, + "loss": 2.2174, + "step": 449160 + }, + { + "epoch": 1.7363656043667177, + "grad_norm": 0.12301818281412125, + "learning_rate": 0.0005975735313393433, + "loss": 2.2034, + "step": 449170 + }, + { + "epoch": 1.736404261570101, + "grad_norm": 0.11993245035409927, + "learning_rate": 0.000597430928617061, + "loss": 2.2224, + "step": 449180 + }, + { + "epoch": 1.7364429187734842, + "grad_norm": 0.12051360309123993, + "learning_rate": 0.0005972883403920819, + "loss": 2.2104, + "step": 449190 + }, + { + "epoch": 1.7364815759768675, + "grad_norm": 0.128607839345932, + "learning_rate": 0.0005971457666599856, + "loss": 2.1977, + "step": 449200 + }, + { + "epoch": 1.7365202331802507, + "grad_norm": 0.2069489061832428, + "learning_rate": 0.0005970032074163534, + "loss": 2.2098, + "step": 449210 + }, + { + "epoch": 1.736558890383634, + "grad_norm": 0.11907302588224411, + "learning_rate": 0.0005968606626567695, + "loss": 2.1977, + "step": 449220 + }, + { + "epoch": 1.7365975475870172, + "grad_norm": 0.13132734596729279, + "learning_rate": 0.00059671813237682, + "loss": 2.2094, + "step": 449230 + }, + { + "epoch": 1.7366362047904005, + "grad_norm": 0.2081662267446518, + "learning_rate": 0.0005965756165720932, + "loss": 2.2004, + "step": 449240 + }, + { + "epoch": 1.736674861993784, + "grad_norm": 0.1204073578119278, + "learning_rate": 0.0005964331152381802, + "loss": 2.2146, + "step": 449250 + }, + { + "epoch": 1.7367135191971672, + "grad_norm": 0.12416879087686539, + "learning_rate": 0.0005962906283706731, + "loss": 2.1914, + "step": 449260 + }, + { + "epoch": 1.7367521764005505, + "grad_norm": 0.11044029891490936, + "learning_rate": 0.0005961481559651673, + "loss": 2.2234, + "step": 449270 + }, + { + "epoch": 1.7367908336039337, + "grad_norm": 0.11880560219287872, + "learning_rate": 0.0005960056980172606, + "loss": 2.2078, + "step": 449280 + }, + { + "epoch": 1.7368294908073172, + "grad_norm": 0.13040591776371002, + "learning_rate": 0.000595863254522552, + "loss": 2.2239, + "step": 449290 + }, + { + "epoch": 1.7368681480107004, + "grad_norm": 0.12390241771936417, + "learning_rate": 0.0005957208254766433, + "loss": 2.1982, + "step": 449300 + }, + { + "epoch": 1.7369068052140837, + "grad_norm": 0.11649197340011597, + "learning_rate": 0.000595578410875139, + "loss": 2.2188, + "step": 449310 + }, + { + "epoch": 1.736945462417467, + "grad_norm": 0.13983146846294403, + "learning_rate": 0.000595436010713645, + "loss": 2.2209, + "step": 449320 + }, + { + "epoch": 1.7369841196208502, + "grad_norm": 0.12697160243988037, + "learning_rate": 0.0005952936249877699, + "loss": 2.2083, + "step": 449330 + }, + { + "epoch": 1.7370227768242334, + "grad_norm": 0.13057808578014374, + "learning_rate": 0.0005951512536931245, + "loss": 2.2, + "step": 449340 + }, + { + "epoch": 1.7370614340276167, + "grad_norm": 0.1150062158703804, + "learning_rate": 0.0005950088968253215, + "loss": 2.2219, + "step": 449350 + }, + { + "epoch": 1.737100091231, + "grad_norm": 0.12175679206848145, + "learning_rate": 0.0005948665543799763, + "loss": 2.2185, + "step": 449360 + }, + { + "epoch": 1.7371387484343832, + "grad_norm": 0.12388991564512253, + "learning_rate": 0.0005947242263527061, + "loss": 2.2012, + "step": 449370 + }, + { + "epoch": 1.7371774056377665, + "grad_norm": 0.12117626518011093, + "learning_rate": 0.0005945819127391308, + "loss": 2.2063, + "step": 449380 + }, + { + "epoch": 1.7372160628411497, + "grad_norm": 0.14415040612220764, + "learning_rate": 0.000594439613534872, + "loss": 2.2086, + "step": 449390 + }, + { + "epoch": 1.737254720044533, + "grad_norm": 0.1306789666414261, + "learning_rate": 0.0005942973287355538, + "loss": 2.2124, + "step": 449400 + }, + { + "epoch": 1.7372933772479162, + "grad_norm": 0.11676350980997086, + "learning_rate": 0.0005941550583368023, + "loss": 2.224, + "step": 449410 + }, + { + "epoch": 1.7373320344512997, + "grad_norm": 0.14833512902259827, + "learning_rate": 0.0005940128023342461, + "loss": 2.2216, + "step": 449420 + }, + { + "epoch": 1.737370691654683, + "grad_norm": 0.11934653669595718, + "learning_rate": 0.0005938705607235157, + "loss": 2.2032, + "step": 449430 + }, + { + "epoch": 1.7374093488580662, + "grad_norm": 0.11536265164613724, + "learning_rate": 0.0005937283335002443, + "loss": 2.226, + "step": 449440 + }, + { + "epoch": 1.7374480060614494, + "grad_norm": 0.11779036372900009, + "learning_rate": 0.0005935861206600668, + "loss": 2.2144, + "step": 449450 + }, + { + "epoch": 1.737486663264833, + "grad_norm": 0.13895969092845917, + "learning_rate": 0.0005934439221986208, + "loss": 2.2101, + "step": 449460 + }, + { + "epoch": 1.7375253204682162, + "grad_norm": 0.1290643811225891, + "learning_rate": 0.000593301738111545, + "loss": 2.21, + "step": 449470 + }, + { + "epoch": 1.7375639776715994, + "grad_norm": 0.1520080864429474, + "learning_rate": 0.0005931595683944822, + "loss": 2.2248, + "step": 449480 + }, + { + "epoch": 1.7376026348749827, + "grad_norm": 0.12581993639469147, + "learning_rate": 0.0005930174130430755, + "loss": 2.1992, + "step": 449490 + }, + { + "epoch": 1.737641292078366, + "grad_norm": 0.12312814593315125, + "learning_rate": 0.0005928752720529711, + "loss": 2.2185, + "step": 449500 + }, + { + "epoch": 1.7376799492817492, + "grad_norm": 0.12611688673496246, + "learning_rate": 0.0005927331454198177, + "loss": 2.2151, + "step": 449510 + }, + { + "epoch": 1.7377186064851324, + "grad_norm": 0.13197670876979828, + "learning_rate": 0.0005925910331392657, + "loss": 2.2227, + "step": 449520 + }, + { + "epoch": 1.7377572636885157, + "grad_norm": 0.13880470395088196, + "learning_rate": 0.0005924489352069673, + "loss": 2.2194, + "step": 449530 + }, + { + "epoch": 1.737795920891899, + "grad_norm": 0.11517778784036636, + "learning_rate": 0.0005923068516185781, + "loss": 2.217, + "step": 449540 + }, + { + "epoch": 1.7378345780952822, + "grad_norm": 0.12511885166168213, + "learning_rate": 0.0005921647823697547, + "loss": 2.2034, + "step": 449550 + }, + { + "epoch": 1.7378732352986654, + "grad_norm": 0.11904377490282059, + "learning_rate": 0.0005920227274561567, + "loss": 2.207, + "step": 449560 + }, + { + "epoch": 1.7379118925020487, + "grad_norm": 0.12568025290966034, + "learning_rate": 0.0005918806868734454, + "loss": 2.2043, + "step": 449570 + }, + { + "epoch": 1.737950549705432, + "grad_norm": 0.13138815760612488, + "learning_rate": 0.0005917386606172844, + "loss": 2.2226, + "step": 449580 + }, + { + "epoch": 1.7379892069088154, + "grad_norm": 0.1285579949617386, + "learning_rate": 0.0005915966486833397, + "loss": 2.2, + "step": 449590 + }, + { + "epoch": 1.7380278641121987, + "grad_norm": 0.12767644226551056, + "learning_rate": 0.0005914546510672793, + "loss": 2.2297, + "step": 449600 + }, + { + "epoch": 1.738066521315582, + "grad_norm": 0.11568326503038406, + "learning_rate": 0.0005913126677647734, + "loss": 2.2072, + "step": 449610 + }, + { + "epoch": 1.7381051785189652, + "grad_norm": 0.13093574345111847, + "learning_rate": 0.0005911706987714942, + "loss": 2.2219, + "step": 449620 + }, + { + "epoch": 1.7381438357223487, + "grad_norm": 0.11886344105005264, + "learning_rate": 0.0005910287440831166, + "loss": 2.2273, + "step": 449630 + }, + { + "epoch": 1.738182492925732, + "grad_norm": 0.1266063153743744, + "learning_rate": 0.000590886803695317, + "loss": 2.213, + "step": 449640 + }, + { + "epoch": 1.7382211501291152, + "grad_norm": 0.130299374461174, + "learning_rate": 0.0005907448776037747, + "loss": 2.2096, + "step": 449650 + }, + { + "epoch": 1.7382598073324984, + "grad_norm": 0.1166926845908165, + "learning_rate": 0.0005906029658041705, + "loss": 2.2258, + "step": 449660 + }, + { + "epoch": 1.7382984645358817, + "grad_norm": 0.1253158152103424, + "learning_rate": 0.000590461068292188, + "loss": 2.2087, + "step": 449670 + }, + { + "epoch": 1.738337121739265, + "grad_norm": 0.11593757569789886, + "learning_rate": 0.0005903191850635123, + "loss": 2.2053, + "step": 449680 + }, + { + "epoch": 1.7383757789426482, + "grad_norm": 0.12687519192695618, + "learning_rate": 0.0005901773161138313, + "loss": 2.2177, + "step": 449690 + }, + { + "epoch": 1.7384144361460314, + "grad_norm": 0.12391290068626404, + "learning_rate": 0.0005900354614388346, + "loss": 2.2005, + "step": 449700 + }, + { + "epoch": 1.7384530933494147, + "grad_norm": 0.12147536873817444, + "learning_rate": 0.0005898936210342143, + "loss": 2.1999, + "step": 449710 + }, + { + "epoch": 1.738491750552798, + "grad_norm": 0.1351267397403717, + "learning_rate": 0.0005897517948956646, + "loss": 2.2156, + "step": 449720 + }, + { + "epoch": 1.7385304077561812, + "grad_norm": 0.14201191067695618, + "learning_rate": 0.0005896099830188815, + "loss": 2.2097, + "step": 449730 + }, + { + "epoch": 1.7385690649595644, + "grad_norm": 0.12384200096130371, + "learning_rate": 0.0005894681853995635, + "loss": 2.2043, + "step": 449740 + }, + { + "epoch": 1.7386077221629477, + "grad_norm": 0.11880270391702652, + "learning_rate": 0.0005893264020334115, + "loss": 2.2133, + "step": 449750 + }, + { + "epoch": 1.7386463793663312, + "grad_norm": 0.12355902045965195, + "learning_rate": 0.0005891846329161281, + "loss": 2.2097, + "step": 449760 + }, + { + "epoch": 1.7386850365697144, + "grad_norm": 0.12222316116094589, + "learning_rate": 0.0005890428780434185, + "loss": 2.2234, + "step": 449770 + }, + { + "epoch": 1.7387236937730977, + "grad_norm": 0.11164572834968567, + "learning_rate": 0.0005889011374109891, + "loss": 2.2085, + "step": 449780 + }, + { + "epoch": 1.738762350976481, + "grad_norm": 0.1121785044670105, + "learning_rate": 0.00058875941101455, + "loss": 2.2242, + "step": 449790 + }, + { + "epoch": 1.7388010081798644, + "grad_norm": 0.1273416131734848, + "learning_rate": 0.0005886176988498119, + "loss": 2.1992, + "step": 449800 + }, + { + "epoch": 1.7388396653832476, + "grad_norm": 0.14521710574626923, + "learning_rate": 0.0005884760009124889, + "loss": 2.2091, + "step": 449810 + }, + { + "epoch": 1.7388783225866309, + "grad_norm": 0.12968093156814575, + "learning_rate": 0.0005883343171982964, + "loss": 2.2175, + "step": 449820 + }, + { + "epoch": 1.7389169797900141, + "grad_norm": 0.1272241622209549, + "learning_rate": 0.0005881926477029524, + "loss": 2.2064, + "step": 449830 + }, + { + "epoch": 1.7389556369933974, + "grad_norm": 0.12387470155954361, + "learning_rate": 0.0005880509924221767, + "loss": 2.2229, + "step": 449840 + }, + { + "epoch": 1.7389942941967806, + "grad_norm": 0.13222984969615936, + "learning_rate": 0.0005879093513516917, + "loss": 2.2201, + "step": 449850 + }, + { + "epoch": 1.739032951400164, + "grad_norm": 0.12421976029872894, + "learning_rate": 0.0005877677244872217, + "loss": 2.2115, + "step": 449860 + }, + { + "epoch": 1.7390716086035471, + "grad_norm": 0.12273195385932922, + "learning_rate": 0.000587626111824493, + "loss": 2.2061, + "step": 449870 + }, + { + "epoch": 1.7391102658069304, + "grad_norm": 0.1302752047777176, + "learning_rate": 0.0005874845133592339, + "loss": 2.2158, + "step": 449880 + }, + { + "epoch": 1.7391489230103137, + "grad_norm": 0.1115131750702858, + "learning_rate": 0.0005873429290871759, + "loss": 2.201, + "step": 449890 + }, + { + "epoch": 1.739187580213697, + "grad_norm": 0.1338716298341751, + "learning_rate": 0.0005872013590040513, + "loss": 2.2057, + "step": 449900 + }, + { + "epoch": 1.7392262374170802, + "grad_norm": 0.12222873419523239, + "learning_rate": 0.000587059803105595, + "loss": 2.2239, + "step": 449910 + }, + { + "epoch": 1.7392648946204636, + "grad_norm": 0.12289751321077347, + "learning_rate": 0.0005869182613875446, + "loss": 2.1973, + "step": 449920 + }, + { + "epoch": 1.7393035518238469, + "grad_norm": 0.12132169306278229, + "learning_rate": 0.0005867767338456389, + "loss": 2.1977, + "step": 449930 + }, + { + "epoch": 1.7393422090272301, + "grad_norm": 0.12245948612689972, + "learning_rate": 0.0005866352204756198, + "loss": 2.2134, + "step": 449940 + }, + { + "epoch": 1.7393808662306134, + "grad_norm": 0.1194300726056099, + "learning_rate": 0.0005864937212732304, + "loss": 2.2151, + "step": 449950 + }, + { + "epoch": 1.7394195234339966, + "grad_norm": 0.129800483584404, + "learning_rate": 0.0005863522362342166, + "loss": 2.2056, + "step": 449960 + }, + { + "epoch": 1.7394581806373801, + "grad_norm": 0.11177778989076614, + "learning_rate": 0.0005862107653543262, + "loss": 2.2063, + "step": 449970 + }, + { + "epoch": 1.7394968378407634, + "grad_norm": 0.11829909682273865, + "learning_rate": 0.000586069308629309, + "loss": 2.2196, + "step": 449980 + }, + { + "epoch": 1.7395354950441466, + "grad_norm": 0.12138839066028595, + "learning_rate": 0.0005859278660549172, + "loss": 2.2104, + "step": 449990 + }, + { + "epoch": 1.7395741522475299, + "grad_norm": 0.12623794376850128, + "learning_rate": 0.0005857864376269049, + "loss": 2.2161, + "step": 450000 + }, + { + "epoch": 1.7396128094509131, + "grad_norm": 0.1256103515625, + "learning_rate": 0.0005856450233410284, + "loss": 2.2011, + "step": 450010 + }, + { + "epoch": 1.7396514666542964, + "grad_norm": 0.11964461952447891, + "learning_rate": 0.0005855036231930461, + "loss": 2.2061, + "step": 450020 + }, + { + "epoch": 1.7396901238576796, + "grad_norm": 0.11546836048364639, + "learning_rate": 0.0005853622371787186, + "loss": 2.2067, + "step": 450030 + }, + { + "epoch": 1.7397287810610629, + "grad_norm": 0.1272667646408081, + "learning_rate": 0.0005852208652938085, + "loss": 2.1995, + "step": 450040 + }, + { + "epoch": 1.7397674382644461, + "grad_norm": 0.13499830663204193, + "learning_rate": 0.0005850795075340807, + "loss": 2.2311, + "step": 450050 + }, + { + "epoch": 1.7398060954678294, + "grad_norm": 0.1112581118941307, + "learning_rate": 0.0005849381638953018, + "loss": 2.1999, + "step": 450060 + }, + { + "epoch": 1.7398447526712126, + "grad_norm": 0.12504318356513977, + "learning_rate": 0.0005847968343732408, + "loss": 2.2002, + "step": 450070 + }, + { + "epoch": 1.7398834098745959, + "grad_norm": 0.12915337085723877, + "learning_rate": 0.0005846555189636694, + "loss": 2.2103, + "step": 450080 + }, + { + "epoch": 1.7399220670779794, + "grad_norm": 0.12873782217502594, + "learning_rate": 0.0005845142176623603, + "loss": 2.2116, + "step": 450090 + }, + { + "epoch": 1.7399607242813626, + "grad_norm": 0.12179477512836456, + "learning_rate": 0.0005843729304650889, + "loss": 2.2131, + "step": 450100 + }, + { + "epoch": 1.7399993814847459, + "grad_norm": 0.11743723601102829, + "learning_rate": 0.0005842316573676327, + "loss": 2.208, + "step": 450110 + }, + { + "epoch": 1.7400380386881291, + "grad_norm": 0.12051267176866531, + "learning_rate": 0.0005840903983657715, + "loss": 2.216, + "step": 450120 + }, + { + "epoch": 1.7400766958915124, + "grad_norm": 0.13697391748428345, + "learning_rate": 0.0005839491534552865, + "loss": 2.2053, + "step": 450130 + }, + { + "epoch": 1.7401153530948958, + "grad_norm": 0.12576167285442352, + "learning_rate": 0.0005838079226319617, + "loss": 2.2214, + "step": 450140 + }, + { + "epoch": 1.740154010298279, + "grad_norm": 0.12356912344694138, + "learning_rate": 0.000583666705891583, + "loss": 2.2187, + "step": 450150 + }, + { + "epoch": 1.7401926675016623, + "grad_norm": 0.1240406483411789, + "learning_rate": 0.0005835255032299381, + "loss": 2.2062, + "step": 450160 + }, + { + "epoch": 1.7402313247050456, + "grad_norm": 0.12559963762760162, + "learning_rate": 0.0005833843146428175, + "loss": 2.2052, + "step": 450170 + }, + { + "epoch": 1.7402699819084289, + "grad_norm": 0.1147528812289238, + "learning_rate": 0.000583243140126013, + "loss": 2.2193, + "step": 450180 + }, + { + "epoch": 1.740308639111812, + "grad_norm": 0.12529990077018738, + "learning_rate": 0.000583101979675319, + "loss": 2.2089, + "step": 450190 + }, + { + "epoch": 1.7403472963151954, + "grad_norm": 0.1308630257844925, + "learning_rate": 0.0005829608332865319, + "loss": 2.2181, + "step": 450200 + }, + { + "epoch": 1.7403859535185786, + "grad_norm": 0.12569352984428406, + "learning_rate": 0.0005828197009554501, + "loss": 2.2027, + "step": 450210 + }, + { + "epoch": 1.7404246107219619, + "grad_norm": 0.1284223347902298, + "learning_rate": 0.000582678582677874, + "loss": 2.2122, + "step": 450220 + }, + { + "epoch": 1.7404632679253451, + "grad_norm": 0.12393103539943695, + "learning_rate": 0.0005825374784496065, + "loss": 2.2113, + "step": 450230 + }, + { + "epoch": 1.7405019251287284, + "grad_norm": 0.11839126795530319, + "learning_rate": 0.000582396388266452, + "loss": 2.2164, + "step": 450240 + }, + { + "epoch": 1.7405405823321116, + "grad_norm": 0.12693467736244202, + "learning_rate": 0.0005822553121242176, + "loss": 2.2165, + "step": 450250 + }, + { + "epoch": 1.740579239535495, + "grad_norm": 0.11685807257890701, + "learning_rate": 0.0005821142500187118, + "loss": 2.2087, + "step": 450260 + }, + { + "epoch": 1.7406178967388783, + "grad_norm": 0.12128729373216629, + "learning_rate": 0.000581973201945746, + "loss": 2.2145, + "step": 450270 + }, + { + "epoch": 1.7406565539422616, + "grad_norm": 0.12367042899131775, + "learning_rate": 0.0005818321679011331, + "loss": 2.2069, + "step": 450280 + }, + { + "epoch": 1.7406952111456448, + "grad_norm": 0.12804968655109406, + "learning_rate": 0.0005816911478806881, + "loss": 2.2099, + "step": 450290 + }, + { + "epoch": 1.7407338683490283, + "grad_norm": 0.12371846288442612, + "learning_rate": 0.0005815501418802285, + "loss": 2.2068, + "step": 450300 + }, + { + "epoch": 1.7407725255524116, + "grad_norm": 0.13237400352954865, + "learning_rate": 0.0005814091498955733, + "loss": 2.2264, + "step": 450310 + }, + { + "epoch": 1.7408111827557948, + "grad_norm": 0.12525179982185364, + "learning_rate": 0.0005812681719225441, + "loss": 2.2056, + "step": 450320 + }, + { + "epoch": 1.740849839959178, + "grad_norm": 0.1195155680179596, + "learning_rate": 0.0005811272079569642, + "loss": 2.2177, + "step": 450330 + }, + { + "epoch": 1.7408884971625613, + "grad_norm": 0.1262855976819992, + "learning_rate": 0.0005809862579946592, + "loss": 2.1999, + "step": 450340 + }, + { + "epoch": 1.7409271543659446, + "grad_norm": 0.13950762152671814, + "learning_rate": 0.0005808453220314567, + "loss": 2.2182, + "step": 450350 + }, + { + "epoch": 1.7409658115693278, + "grad_norm": 0.12949655950069427, + "learning_rate": 0.0005807044000631863, + "loss": 2.2177, + "step": 450360 + }, + { + "epoch": 1.741004468772711, + "grad_norm": 0.11451691389083862, + "learning_rate": 0.0005805634920856797, + "loss": 2.21, + "step": 450370 + }, + { + "epoch": 1.7410431259760943, + "grad_norm": 0.11944366991519928, + "learning_rate": 0.000580422598094771, + "loss": 2.2095, + "step": 450380 + }, + { + "epoch": 1.7410817831794776, + "grad_norm": 0.13329607248306274, + "learning_rate": 0.0005802817180862958, + "loss": 2.213, + "step": 450390 + }, + { + "epoch": 1.7411204403828608, + "grad_norm": 0.1350400447845459, + "learning_rate": 0.0005801408520560923, + "loss": 2.2152, + "step": 450400 + }, + { + "epoch": 1.741159097586244, + "grad_norm": 0.12450750172138214, + "learning_rate": 0.0005800000000000001, + "loss": 2.2142, + "step": 450410 + }, + { + "epoch": 1.7411977547896273, + "grad_norm": 0.11537111550569534, + "learning_rate": 0.0005798591619138616, + "loss": 2.209, + "step": 450420 + }, + { + "epoch": 1.7412364119930108, + "grad_norm": 0.11340397596359253, + "learning_rate": 0.0005797183377935207, + "loss": 2.2062, + "step": 450430 + }, + { + "epoch": 1.741275069196394, + "grad_norm": 0.12494982779026031, + "learning_rate": 0.0005795775276348238, + "loss": 2.2113, + "step": 450440 + }, + { + "epoch": 1.7413137263997773, + "grad_norm": 0.12394999712705612, + "learning_rate": 0.0005794367314336191, + "loss": 2.2185, + "step": 450450 + }, + { + "epoch": 1.7413523836031606, + "grad_norm": 0.13133755326271057, + "learning_rate": 0.0005792959491857565, + "loss": 2.2064, + "step": 450460 + }, + { + "epoch": 1.741391040806544, + "grad_norm": 0.13078813254833221, + "learning_rate": 0.0005791551808870892, + "loss": 2.2135, + "step": 450470 + }, + { + "epoch": 1.7414296980099273, + "grad_norm": 0.12275271862745285, + "learning_rate": 0.0005790144265334712, + "loss": 2.2116, + "step": 450480 + }, + { + "epoch": 1.7414683552133106, + "grad_norm": 0.12438642978668213, + "learning_rate": 0.0005788736861207587, + "loss": 2.2078, + "step": 450490 + }, + { + "epoch": 1.7415070124166938, + "grad_norm": 0.13157707452774048, + "learning_rate": 0.0005787329596448105, + "loss": 2.1979, + "step": 450500 + }, + { + "epoch": 1.741545669620077, + "grad_norm": 0.12612107396125793, + "learning_rate": 0.000578592247101487, + "loss": 2.2122, + "step": 450510 + }, + { + "epoch": 1.7415843268234603, + "grad_norm": 0.12490899860858917, + "learning_rate": 0.0005784515484866513, + "loss": 2.2124, + "step": 450520 + }, + { + "epoch": 1.7416229840268436, + "grad_norm": 0.1354866623878479, + "learning_rate": 0.0005783108637961674, + "loss": 2.2024, + "step": 450530 + }, + { + "epoch": 1.7416616412302268, + "grad_norm": 0.11852476745843887, + "learning_rate": 0.0005781701930259023, + "loss": 2.2092, + "step": 450540 + }, + { + "epoch": 1.74170029843361, + "grad_norm": 0.11969569325447083, + "learning_rate": 0.0005780295361717249, + "loss": 2.2063, + "step": 450550 + }, + { + "epoch": 1.7417389556369933, + "grad_norm": 0.12728244066238403, + "learning_rate": 0.0005778888932295057, + "loss": 2.2148, + "step": 450560 + }, + { + "epoch": 1.7417776128403766, + "grad_norm": 0.1304050236940384, + "learning_rate": 0.0005777482641951179, + "loss": 2.2052, + "step": 450570 + }, + { + "epoch": 1.7418162700437598, + "grad_norm": 0.12834100425243378, + "learning_rate": 0.0005776076490644362, + "loss": 2.206, + "step": 450580 + }, + { + "epoch": 1.741854927247143, + "grad_norm": 0.12126423418521881, + "learning_rate": 0.0005774670478333375, + "loss": 2.2091, + "step": 450590 + }, + { + "epoch": 1.7418935844505266, + "grad_norm": 0.1424793004989624, + "learning_rate": 0.0005773264604977009, + "loss": 2.2037, + "step": 450600 + }, + { + "epoch": 1.7419322416539098, + "grad_norm": 0.14608636498451233, + "learning_rate": 0.000577185887053407, + "loss": 2.2184, + "step": 450610 + }, + { + "epoch": 1.741970898857293, + "grad_norm": 0.120540089905262, + "learning_rate": 0.0005770453274963394, + "loss": 2.2063, + "step": 450620 + }, + { + "epoch": 1.7420095560606763, + "grad_norm": 0.12794199585914612, + "learning_rate": 0.0005769047818223827, + "loss": 2.221, + "step": 450630 + }, + { + "epoch": 1.7420482132640598, + "grad_norm": 0.11833221465349197, + "learning_rate": 0.0005767642500274243, + "loss": 2.2074, + "step": 450640 + }, + { + "epoch": 1.742086870467443, + "grad_norm": 0.12499483674764633, + "learning_rate": 0.0005766237321073533, + "loss": 2.2001, + "step": 450650 + }, + { + "epoch": 1.7421255276708263, + "grad_norm": 0.17976893484592438, + "learning_rate": 0.0005764832280580603, + "loss": 2.2064, + "step": 450660 + }, + { + "epoch": 1.7421641848742095, + "grad_norm": 0.13779857754707336, + "learning_rate": 0.0005763427378754393, + "loss": 2.1962, + "step": 450670 + }, + { + "epoch": 1.7422028420775928, + "grad_norm": 0.12299429625272751, + "learning_rate": 0.000576202261555385, + "loss": 2.2107, + "step": 450680 + }, + { + "epoch": 1.742241499280976, + "grad_norm": 0.12656106054782867, + "learning_rate": 0.0005760617990937948, + "loss": 2.2029, + "step": 450690 + }, + { + "epoch": 1.7422801564843593, + "grad_norm": 0.13690079748630524, + "learning_rate": 0.0005759213504865681, + "loss": 2.2119, + "step": 450700 + }, + { + "epoch": 1.7423188136877426, + "grad_norm": 0.12545296549797058, + "learning_rate": 0.0005757809157296059, + "loss": 2.2108, + "step": 450710 + }, + { + "epoch": 1.7423574708911258, + "grad_norm": 0.1948815882205963, + "learning_rate": 0.0005756404948188117, + "loss": 2.2058, + "step": 450720 + }, + { + "epoch": 1.742396128094509, + "grad_norm": 0.1338101625442505, + "learning_rate": 0.0005755000877500905, + "loss": 2.2384, + "step": 450730 + }, + { + "epoch": 1.7424347852978923, + "grad_norm": 0.11879181861877441, + "learning_rate": 0.00057535969451935, + "loss": 2.2143, + "step": 450740 + }, + { + "epoch": 1.7424734425012756, + "grad_norm": 0.11900530010461807, + "learning_rate": 0.0005752193151224994, + "loss": 2.2085, + "step": 450750 + }, + { + "epoch": 1.7425120997046588, + "grad_norm": 0.12955817580223083, + "learning_rate": 0.0005750789495554498, + "loss": 2.2031, + "step": 450760 + }, + { + "epoch": 1.7425507569080423, + "grad_norm": 0.11215945333242416, + "learning_rate": 0.0005749385978141153, + "loss": 2.21, + "step": 450770 + }, + { + "epoch": 1.7425894141114255, + "grad_norm": 0.11732306331396103, + "learning_rate": 0.0005747982598944106, + "loss": 2.226, + "step": 450780 + }, + { + "epoch": 1.7426280713148088, + "grad_norm": 0.12146113067865372, + "learning_rate": 0.0005746579357922535, + "loss": 2.2001, + "step": 450790 + }, + { + "epoch": 1.742666728518192, + "grad_norm": 0.13044078648090363, + "learning_rate": 0.0005745176255035631, + "loss": 2.2073, + "step": 450800 + }, + { + "epoch": 1.7427053857215755, + "grad_norm": 0.12934516370296478, + "learning_rate": 0.0005743773290242611, + "loss": 2.2063, + "step": 450810 + }, + { + "epoch": 1.7427440429249588, + "grad_norm": 0.1249857172369957, + "learning_rate": 0.0005742370463502709, + "loss": 2.2067, + "step": 450820 + }, + { + "epoch": 1.742782700128342, + "grad_norm": 0.1371041238307953, + "learning_rate": 0.0005740967774775177, + "loss": 2.2142, + "step": 450830 + }, + { + "epoch": 1.7428213573317253, + "grad_norm": 0.12729176878929138, + "learning_rate": 0.0005739565224019289, + "loss": 2.2039, + "step": 450840 + }, + { + "epoch": 1.7428600145351085, + "grad_norm": 0.12598155438899994, + "learning_rate": 0.0005738162811194345, + "loss": 2.2005, + "step": 450850 + }, + { + "epoch": 1.7428986717384918, + "grad_norm": 0.1471673548221588, + "learning_rate": 0.0005736760536259653, + "loss": 2.2131, + "step": 450860 + }, + { + "epoch": 1.742937328941875, + "grad_norm": 0.11984395235776901, + "learning_rate": 0.000573535839917455, + "loss": 2.2269, + "step": 450870 + }, + { + "epoch": 1.7429759861452583, + "grad_norm": 0.12927258014678955, + "learning_rate": 0.0005733956399898392, + "loss": 2.1974, + "step": 450880 + }, + { + "epoch": 1.7430146433486415, + "grad_norm": 0.13765715062618256, + "learning_rate": 0.0005732554538390553, + "loss": 2.2144, + "step": 450890 + }, + { + "epoch": 1.7430533005520248, + "grad_norm": 0.13073231279850006, + "learning_rate": 0.0005731152814610425, + "loss": 2.2167, + "step": 450900 + }, + { + "epoch": 1.743091957755408, + "grad_norm": 0.11769286543130875, + "learning_rate": 0.0005729751228517423, + "loss": 2.2048, + "step": 450910 + }, + { + "epoch": 1.7431306149587913, + "grad_norm": 0.13146555423736572, + "learning_rate": 0.0005728349780070983, + "loss": 2.211, + "step": 450920 + }, + { + "epoch": 1.7431692721621745, + "grad_norm": 0.120726577937603, + "learning_rate": 0.0005726948469230556, + "loss": 2.2024, + "step": 450930 + }, + { + "epoch": 1.743207929365558, + "grad_norm": 0.1122736856341362, + "learning_rate": 0.0005725547295955618, + "loss": 2.2047, + "step": 450940 + }, + { + "epoch": 1.7432465865689413, + "grad_norm": 0.13733603060245514, + "learning_rate": 0.0005724146260205663, + "loss": 2.2107, + "step": 450950 + }, + { + "epoch": 1.7432852437723245, + "grad_norm": 0.1244693174958229, + "learning_rate": 0.0005722745361940203, + "loss": 2.2065, + "step": 450960 + }, + { + "epoch": 1.7433239009757078, + "grad_norm": 0.12130724638700485, + "learning_rate": 0.0005721344601118772, + "loss": 2.2024, + "step": 450970 + }, + { + "epoch": 1.7433625581790912, + "grad_norm": 0.1321234107017517, + "learning_rate": 0.0005719943977700927, + "loss": 2.2016, + "step": 450980 + }, + { + "epoch": 1.7434012153824745, + "grad_norm": 0.12718746066093445, + "learning_rate": 0.000571854349164624, + "loss": 2.2084, + "step": 450990 + }, + { + "epoch": 1.7434398725858578, + "grad_norm": 0.3911479413509369, + "learning_rate": 0.0005717143142914301, + "loss": 2.2242, + "step": 451000 + }, + { + "epoch": 1.743478529789241, + "grad_norm": 0.13979850709438324, + "learning_rate": 0.0005715742931464724, + "loss": 2.2095, + "step": 451010 + }, + { + "epoch": 1.7435171869926243, + "grad_norm": 0.1296544075012207, + "learning_rate": 0.0005714342857257145, + "loss": 2.2119, + "step": 451020 + }, + { + "epoch": 1.7435558441960075, + "grad_norm": 0.12337645888328552, + "learning_rate": 0.0005712942920251212, + "loss": 2.212, + "step": 451030 + }, + { + "epoch": 1.7435945013993908, + "grad_norm": 0.12149360775947571, + "learning_rate": 0.00057115431204066, + "loss": 2.1986, + "step": 451040 + }, + { + "epoch": 1.743633158602774, + "grad_norm": 0.1235906183719635, + "learning_rate": 0.0005710143457683, + "loss": 2.2207, + "step": 451050 + }, + { + "epoch": 1.7436718158061573, + "grad_norm": 0.12550729513168335, + "learning_rate": 0.0005708743932040124, + "loss": 2.198, + "step": 451060 + }, + { + "epoch": 1.7437104730095405, + "grad_norm": 0.12192264944314957, + "learning_rate": 0.0005707344543437702, + "loss": 2.2223, + "step": 451070 + }, + { + "epoch": 1.7437491302129238, + "grad_norm": 0.11551124602556229, + "learning_rate": 0.0005705945291835491, + "loss": 2.2179, + "step": 451080 + }, + { + "epoch": 1.743787787416307, + "grad_norm": 0.12643487751483917, + "learning_rate": 0.0005704546177193255, + "loss": 2.199, + "step": 451090 + }, + { + "epoch": 1.7438264446196903, + "grad_norm": 0.1214127466082573, + "learning_rate": 0.0005703147199470787, + "loss": 2.2017, + "step": 451100 + }, + { + "epoch": 1.7438651018230737, + "grad_norm": 0.125314861536026, + "learning_rate": 0.0005701748358627898, + "loss": 2.2147, + "step": 451110 + }, + { + "epoch": 1.743903759026457, + "grad_norm": 0.1230613961815834, + "learning_rate": 0.000570034965462442, + "loss": 2.2113, + "step": 451120 + }, + { + "epoch": 1.7439424162298403, + "grad_norm": 0.12328998744487762, + "learning_rate": 0.0005698951087420197, + "loss": 2.2204, + "step": 451130 + }, + { + "epoch": 1.7439810734332235, + "grad_norm": 0.3015275001525879, + "learning_rate": 0.0005697552656975102, + "loss": 2.2021, + "step": 451140 + }, + { + "epoch": 1.744019730636607, + "grad_norm": 0.13240598142147064, + "learning_rate": 0.0005696154363249022, + "loss": 2.2112, + "step": 451150 + }, + { + "epoch": 1.7440583878399902, + "grad_norm": 0.12521402537822723, + "learning_rate": 0.0005694756206201866, + "loss": 2.2096, + "step": 451160 + }, + { + "epoch": 1.7440970450433735, + "grad_norm": 0.1339920163154602, + "learning_rate": 0.0005693358185793564, + "loss": 2.1992, + "step": 451170 + }, + { + "epoch": 1.7441357022467567, + "grad_norm": 0.1233472228050232, + "learning_rate": 0.0005691960301984063, + "loss": 2.2051, + "step": 451180 + }, + { + "epoch": 1.74417435945014, + "grad_norm": 0.12355933338403702, + "learning_rate": 0.0005690562554733328, + "loss": 2.1977, + "step": 451190 + }, + { + "epoch": 1.7442130166535232, + "grad_norm": 0.12268158048391342, + "learning_rate": 0.0005689164944001346, + "loss": 2.1982, + "step": 451200 + }, + { + "epoch": 1.7442516738569065, + "grad_norm": 0.1282660961151123, + "learning_rate": 0.0005687767469748124, + "loss": 2.2058, + "step": 451210 + }, + { + "epoch": 1.7442903310602897, + "grad_norm": 0.11994381994009018, + "learning_rate": 0.0005686370131933689, + "loss": 2.1929, + "step": 451220 + }, + { + "epoch": 1.744328988263673, + "grad_norm": 0.12090937048196793, + "learning_rate": 0.0005684972930518085, + "loss": 2.2087, + "step": 451230 + }, + { + "epoch": 1.7443676454670562, + "grad_norm": 0.1255774199962616, + "learning_rate": 0.0005683575865461376, + "loss": 2.2111, + "step": 451240 + }, + { + "epoch": 1.7444063026704395, + "grad_norm": 0.1352381408214569, + "learning_rate": 0.0005682178936723647, + "loss": 2.2018, + "step": 451250 + }, + { + "epoch": 1.7444449598738228, + "grad_norm": 0.12163727730512619, + "learning_rate": 0.0005680782144265002, + "loss": 2.2189, + "step": 451260 + }, + { + "epoch": 1.744483617077206, + "grad_norm": 0.1344509720802307, + "learning_rate": 0.0005679385488045563, + "loss": 2.2011, + "step": 451270 + }, + { + "epoch": 1.7445222742805895, + "grad_norm": 0.1113949567079544, + "learning_rate": 0.0005677988968025474, + "loss": 2.216, + "step": 451280 + }, + { + "epoch": 1.7445609314839727, + "grad_norm": 0.12429040670394897, + "learning_rate": 0.0005676592584164897, + "loss": 2.2099, + "step": 451290 + }, + { + "epoch": 1.744599588687356, + "grad_norm": 0.12263821810483932, + "learning_rate": 0.0005675196336424013, + "loss": 2.198, + "step": 451300 + }, + { + "epoch": 1.7446382458907392, + "grad_norm": 0.120047427713871, + "learning_rate": 0.0005673800224763024, + "loss": 2.2053, + "step": 451310 + }, + { + "epoch": 1.7446769030941227, + "grad_norm": 0.12776467204093933, + "learning_rate": 0.0005672404249142147, + "loss": 2.2238, + "step": 451320 + }, + { + "epoch": 1.744715560297506, + "grad_norm": 0.12136024981737137, + "learning_rate": 0.0005671008409521626, + "loss": 2.1972, + "step": 451330 + }, + { + "epoch": 1.7447542175008892, + "grad_norm": 0.12847566604614258, + "learning_rate": 0.0005669612705861716, + "loss": 2.2046, + "step": 451340 + }, + { + "epoch": 1.7447928747042725, + "grad_norm": 0.11636916548013687, + "learning_rate": 0.0005668217138122697, + "loss": 2.2228, + "step": 451350 + }, + { + "epoch": 1.7448315319076557, + "grad_norm": 0.13120609521865845, + "learning_rate": 0.0005666821706264867, + "loss": 2.2101, + "step": 451360 + }, + { + "epoch": 1.744870189111039, + "grad_norm": 0.12237390875816345, + "learning_rate": 0.0005665426410248542, + "loss": 2.2067, + "step": 451370 + }, + { + "epoch": 1.7449088463144222, + "grad_norm": 0.13978280127048492, + "learning_rate": 0.0005664031250034059, + "loss": 2.2129, + "step": 451380 + }, + { + "epoch": 1.7449475035178055, + "grad_norm": 0.13153356313705444, + "learning_rate": 0.0005662636225581774, + "loss": 2.2129, + "step": 451390 + }, + { + "epoch": 1.7449861607211887, + "grad_norm": 0.12097728997468948, + "learning_rate": 0.0005661241336852063, + "loss": 2.2106, + "step": 451400 + }, + { + "epoch": 1.745024817924572, + "grad_norm": 0.1258837878704071, + "learning_rate": 0.0005659846583805319, + "loss": 2.2322, + "step": 451410 + }, + { + "epoch": 1.7450634751279552, + "grad_norm": 0.13201965391635895, + "learning_rate": 0.0005658451966401954, + "loss": 2.2006, + "step": 451420 + }, + { + "epoch": 1.7451021323313385, + "grad_norm": 0.13621696829795837, + "learning_rate": 0.0005657057484602402, + "loss": 2.217, + "step": 451430 + }, + { + "epoch": 1.7451407895347217, + "grad_norm": 0.12259390950202942, + "learning_rate": 0.0005655663138367115, + "loss": 2.201, + "step": 451440 + }, + { + "epoch": 1.7451794467381052, + "grad_norm": 0.11789116263389587, + "learning_rate": 0.0005654268927656563, + "loss": 2.2053, + "step": 451450 + }, + { + "epoch": 1.7452181039414885, + "grad_norm": 0.13373173773288727, + "learning_rate": 0.0005652874852431238, + "loss": 2.2009, + "step": 451460 + }, + { + "epoch": 1.7452567611448717, + "grad_norm": 0.1411893516778946, + "learning_rate": 0.0005651480912651647, + "loss": 2.209, + "step": 451470 + }, + { + "epoch": 1.745295418348255, + "grad_norm": 0.12485019117593765, + "learning_rate": 0.0005650087108278321, + "loss": 2.2159, + "step": 451480 + }, + { + "epoch": 1.7453340755516384, + "grad_norm": 0.13727016746997833, + "learning_rate": 0.0005648693439271811, + "loss": 2.2193, + "step": 451490 + }, + { + "epoch": 1.7453727327550217, + "grad_norm": 0.11968348175287247, + "learning_rate": 0.0005647299905592676, + "loss": 2.2009, + "step": 451500 + }, + { + "epoch": 1.745411389958405, + "grad_norm": 0.11810345947742462, + "learning_rate": 0.0005645906507201508, + "loss": 2.2111, + "step": 451510 + }, + { + "epoch": 1.7454500471617882, + "grad_norm": 0.12262141704559326, + "learning_rate": 0.0005644513244058911, + "loss": 2.2229, + "step": 451520 + }, + { + "epoch": 1.7454887043651715, + "grad_norm": 0.12217133492231369, + "learning_rate": 0.0005643120116125511, + "loss": 2.2015, + "step": 451530 + }, + { + "epoch": 1.7455273615685547, + "grad_norm": 0.1324571669101715, + "learning_rate": 0.0005641727123361949, + "loss": 2.1986, + "step": 451540 + }, + { + "epoch": 1.745566018771938, + "grad_norm": 0.12952816486358643, + "learning_rate": 0.0005640334265728886, + "loss": 2.2094, + "step": 451550 + }, + { + "epoch": 1.7456046759753212, + "grad_norm": 0.1282138228416443, + "learning_rate": 0.0005638941543187008, + "loss": 2.2228, + "step": 451560 + }, + { + "epoch": 1.7456433331787045, + "grad_norm": 0.12335231155157089, + "learning_rate": 0.0005637548955697012, + "loss": 2.2075, + "step": 451570 + }, + { + "epoch": 1.7456819903820877, + "grad_norm": 0.13678689301013947, + "learning_rate": 0.000563615650321962, + "loss": 2.2099, + "step": 451580 + }, + { + "epoch": 1.745720647585471, + "grad_norm": 0.1274346113204956, + "learning_rate": 0.0005634764185715571, + "loss": 2.2113, + "step": 451590 + }, + { + "epoch": 1.7457593047888542, + "grad_norm": 0.1281769573688507, + "learning_rate": 0.0005633372003145623, + "loss": 2.2036, + "step": 451600 + }, + { + "epoch": 1.7457979619922375, + "grad_norm": 0.12230178713798523, + "learning_rate": 0.0005631979955470551, + "loss": 2.1982, + "step": 451610 + }, + { + "epoch": 1.745836619195621, + "grad_norm": 0.12558716535568237, + "learning_rate": 0.0005630588042651155, + "loss": 2.2187, + "step": 451620 + }, + { + "epoch": 1.7458752763990042, + "grad_norm": 0.14368413388729095, + "learning_rate": 0.0005629196264648244, + "loss": 2.22, + "step": 451630 + }, + { + "epoch": 1.7459139336023874, + "grad_norm": 0.13558371365070343, + "learning_rate": 0.0005627804621422657, + "loss": 2.198, + "step": 451640 + }, + { + "epoch": 1.7459525908057707, + "grad_norm": 0.11974722892045975, + "learning_rate": 0.0005626413112935241, + "loss": 2.2166, + "step": 451650 + }, + { + "epoch": 1.7459912480091542, + "grad_norm": 0.12421214580535889, + "learning_rate": 0.0005625021739146874, + "loss": 2.2027, + "step": 451660 + }, + { + "epoch": 1.7460299052125374, + "grad_norm": 0.13483013212680817, + "learning_rate": 0.0005623630500018442, + "loss": 2.207, + "step": 451670 + }, + { + "epoch": 1.7460685624159207, + "grad_norm": 0.12035472691059113, + "learning_rate": 0.0005622239395510857, + "loss": 2.1978, + "step": 451680 + }, + { + "epoch": 1.746107219619304, + "grad_norm": 0.12705770134925842, + "learning_rate": 0.0005620848425585048, + "loss": 2.2167, + "step": 451690 + }, + { + "epoch": 1.7461458768226872, + "grad_norm": 0.1203598827123642, + "learning_rate": 0.000561945759020196, + "loss": 2.2198, + "step": 451700 + }, + { + "epoch": 1.7461845340260704, + "grad_norm": 0.1211920827627182, + "learning_rate": 0.0005618066889322562, + "loss": 2.2095, + "step": 451710 + }, + { + "epoch": 1.7462231912294537, + "grad_norm": 0.1302730292081833, + "learning_rate": 0.0005616676322907839, + "loss": 2.1989, + "step": 451720 + }, + { + "epoch": 1.746261848432837, + "grad_norm": 0.1250298172235489, + "learning_rate": 0.0005615285890918791, + "loss": 2.2241, + "step": 451730 + }, + { + "epoch": 1.7463005056362202, + "grad_norm": 0.1284303367137909, + "learning_rate": 0.0005613895593316445, + "loss": 2.2192, + "step": 451740 + }, + { + "epoch": 1.7463391628396034, + "grad_norm": 0.12830610573291779, + "learning_rate": 0.0005612505430061843, + "loss": 2.2104, + "step": 451750 + }, + { + "epoch": 1.7463778200429867, + "grad_norm": 0.12058718502521515, + "learning_rate": 0.0005611115401116043, + "loss": 2.2115, + "step": 451760 + }, + { + "epoch": 1.74641647724637, + "grad_norm": 0.12984438240528107, + "learning_rate": 0.0005609725506440122, + "loss": 2.2087, + "step": 451770 + }, + { + "epoch": 1.7464551344497534, + "grad_norm": 0.12947475910186768, + "learning_rate": 0.0005608335745995183, + "loss": 2.2224, + "step": 451780 + }, + { + "epoch": 1.7464937916531367, + "grad_norm": 0.11166150122880936, + "learning_rate": 0.0005606946119742342, + "loss": 2.194, + "step": 451790 + }, + { + "epoch": 1.74653244885652, + "grad_norm": 0.13033993542194366, + "learning_rate": 0.0005605556627642734, + "loss": 2.2127, + "step": 451800 + }, + { + "epoch": 1.7465711060599032, + "grad_norm": 0.12836289405822754, + "learning_rate": 0.0005604167269657514, + "loss": 2.1921, + "step": 451810 + }, + { + "epoch": 1.7466097632632864, + "grad_norm": 0.13571980595588684, + "learning_rate": 0.0005602778045747854, + "loss": 2.2235, + "step": 451820 + }, + { + "epoch": 1.74664842046667, + "grad_norm": 0.13120993971824646, + "learning_rate": 0.0005601388955874946, + "loss": 2.2069, + "step": 451830 + }, + { + "epoch": 1.7466870776700532, + "grad_norm": 0.13109606504440308, + "learning_rate": 0.0005600000000000001, + "loss": 2.2094, + "step": 451840 + }, + { + "epoch": 1.7467257348734364, + "grad_norm": 0.12630265951156616, + "learning_rate": 0.0005598611178084247, + "loss": 2.2149, + "step": 451850 + }, + { + "epoch": 1.7467643920768197, + "grad_norm": 0.12479433417320251, + "learning_rate": 0.0005597222490088936, + "loss": 2.2069, + "step": 451860 + }, + { + "epoch": 1.746803049280203, + "grad_norm": 0.12622688710689545, + "learning_rate": 0.0005595833935975327, + "loss": 2.2114, + "step": 451870 + }, + { + "epoch": 1.7468417064835862, + "grad_norm": 0.1303582340478897, + "learning_rate": 0.0005594445515704713, + "loss": 2.2146, + "step": 451880 + }, + { + "epoch": 1.7468803636869694, + "grad_norm": 0.12129681557416916, + "learning_rate": 0.0005593057229238397, + "loss": 2.2068, + "step": 451890 + }, + { + "epoch": 1.7469190208903527, + "grad_norm": 0.12745247781276703, + "learning_rate": 0.00055916690765377, + "loss": 2.2152, + "step": 451900 + }, + { + "epoch": 1.746957678093736, + "grad_norm": 0.14266474545001984, + "learning_rate": 0.0005590281057563962, + "loss": 2.225, + "step": 451910 + }, + { + "epoch": 1.7469963352971192, + "grad_norm": 0.14710094034671783, + "learning_rate": 0.0005588893172278544, + "loss": 2.2181, + "step": 451920 + }, + { + "epoch": 1.7470349925005024, + "grad_norm": 0.12489454448223114, + "learning_rate": 0.0005587505420642825, + "loss": 2.2114, + "step": 451930 + }, + { + "epoch": 1.7470736497038857, + "grad_norm": 0.11751807481050491, + "learning_rate": 0.00055861178026182, + "loss": 2.2161, + "step": 451940 + }, + { + "epoch": 1.7471123069072692, + "grad_norm": 0.12177549302577972, + "learning_rate": 0.0005584730318166087, + "loss": 2.2063, + "step": 451950 + }, + { + "epoch": 1.7471509641106524, + "grad_norm": 0.12565843760967255, + "learning_rate": 0.0005583342967247922, + "loss": 2.2211, + "step": 451960 + }, + { + "epoch": 1.7471896213140357, + "grad_norm": 0.1387435495853424, + "learning_rate": 0.0005581955749825152, + "loss": 2.2068, + "step": 451970 + }, + { + "epoch": 1.747228278517419, + "grad_norm": 0.12651048600673676, + "learning_rate": 0.0005580568665859253, + "loss": 2.1967, + "step": 451980 + }, + { + "epoch": 1.7472669357208022, + "grad_norm": 0.12083800137042999, + "learning_rate": 0.0005579181715311714, + "loss": 2.2148, + "step": 451990 + }, + { + "epoch": 1.7473055929241856, + "grad_norm": 0.11948376148939133, + "learning_rate": 0.0005577794898144042, + "loss": 2.2078, + "step": 452000 + }, + { + "epoch": 1.7473442501275689, + "grad_norm": 0.13278955221176147, + "learning_rate": 0.0005576408214317767, + "loss": 2.1954, + "step": 452010 + }, + { + "epoch": 1.7473829073309521, + "grad_norm": 0.127226322889328, + "learning_rate": 0.0005575021663794431, + "loss": 2.2025, + "step": 452020 + }, + { + "epoch": 1.7474215645343354, + "grad_norm": 0.12776382267475128, + "learning_rate": 0.00055736352465356, + "loss": 2.2093, + "step": 452030 + }, + { + "epoch": 1.7474602217377186, + "grad_norm": 0.1194329634308815, + "learning_rate": 0.0005572248962502853, + "loss": 2.1928, + "step": 452040 + }, + { + "epoch": 1.747498878941102, + "grad_norm": 0.12772822380065918, + "learning_rate": 0.0005570862811657795, + "loss": 2.1951, + "step": 452050 + }, + { + "epoch": 1.7475375361444851, + "grad_norm": 0.1270766258239746, + "learning_rate": 0.0005569476793962042, + "loss": 2.2269, + "step": 452060 + }, + { + "epoch": 1.7475761933478684, + "grad_norm": 0.1367577165365219, + "learning_rate": 0.0005568090909377235, + "loss": 2.2082, + "step": 452070 + }, + { + "epoch": 1.7476148505512517, + "grad_norm": 0.12789234519004822, + "learning_rate": 0.0005566705157865026, + "loss": 2.2093, + "step": 452080 + }, + { + "epoch": 1.747653507754635, + "grad_norm": 0.12344441562891006, + "learning_rate": 0.0005565319539387095, + "loss": 2.2084, + "step": 452090 + }, + { + "epoch": 1.7476921649580182, + "grad_norm": 0.1296205371618271, + "learning_rate": 0.0005563934053905129, + "loss": 2.2148, + "step": 452100 + }, + { + "epoch": 1.7477308221614014, + "grad_norm": 0.1363183856010437, + "learning_rate": 0.0005562548701380843, + "loss": 2.2012, + "step": 452110 + }, + { + "epoch": 1.7477694793647849, + "grad_norm": 0.13961145281791687, + "learning_rate": 0.0005561163481775964, + "loss": 2.207, + "step": 452120 + }, + { + "epoch": 1.7478081365681681, + "grad_norm": 0.14444896578788757, + "learning_rate": 0.0005559778395052242, + "loss": 2.2128, + "step": 452130 + }, + { + "epoch": 1.7478467937715514, + "grad_norm": 0.12200967967510223, + "learning_rate": 0.0005558393441171443, + "loss": 2.2039, + "step": 452140 + }, + { + "epoch": 1.7478854509749346, + "grad_norm": 0.12712444365024567, + "learning_rate": 0.0005557008620095352, + "loss": 2.2076, + "step": 452150 + }, + { + "epoch": 1.747924108178318, + "grad_norm": 0.12497568875551224, + "learning_rate": 0.0005555623931785769, + "loss": 2.2101, + "step": 452160 + }, + { + "epoch": 1.7479627653817014, + "grad_norm": 0.1273370087146759, + "learning_rate": 0.000555423937620452, + "loss": 2.2146, + "step": 452170 + }, + { + "epoch": 1.7480014225850846, + "grad_norm": 0.1270856112241745, + "learning_rate": 0.000555285495331344, + "loss": 2.213, + "step": 452180 + }, + { + "epoch": 1.7480400797884679, + "grad_norm": 0.12017151713371277, + "learning_rate": 0.0005551470663074389, + "loss": 2.2232, + "step": 452190 + }, + { + "epoch": 1.7480787369918511, + "grad_norm": 0.12553970515727997, + "learning_rate": 0.0005550086505449245, + "loss": 2.2053, + "step": 452200 + }, + { + "epoch": 1.7481173941952344, + "grad_norm": 0.1256796270608902, + "learning_rate": 0.0005548702480399901, + "loss": 2.2238, + "step": 452210 + }, + { + "epoch": 1.7481560513986176, + "grad_norm": 0.13260310888290405, + "learning_rate": 0.0005547318587888265, + "loss": 2.1899, + "step": 452220 + }, + { + "epoch": 1.7481947086020009, + "grad_norm": 0.13431571424007416, + "learning_rate": 0.0005545934827876277, + "loss": 2.199, + "step": 452230 + }, + { + "epoch": 1.7482333658053841, + "grad_norm": 0.13198308646678925, + "learning_rate": 0.0005544551200325878, + "loss": 2.2265, + "step": 452240 + }, + { + "epoch": 1.7482720230087674, + "grad_norm": 0.11962399631738663, + "learning_rate": 0.000554316770519904, + "loss": 2.2126, + "step": 452250 + }, + { + "epoch": 1.7483106802121506, + "grad_norm": 0.1246422603726387, + "learning_rate": 0.0005541784342457748, + "loss": 2.2041, + "step": 452260 + }, + { + "epoch": 1.7483493374155339, + "grad_norm": 0.1360652893781662, + "learning_rate": 0.0005540401112064, + "loss": 2.2057, + "step": 452270 + }, + { + "epoch": 1.7483879946189171, + "grad_norm": 0.11979234218597412, + "learning_rate": 0.0005539018013979825, + "loss": 2.2141, + "step": 452280 + }, + { + "epoch": 1.7484266518223006, + "grad_norm": 0.1446179449558258, + "learning_rate": 0.0005537635048167259, + "loss": 2.2118, + "step": 452290 + }, + { + "epoch": 1.7484653090256839, + "grad_norm": 0.13093620538711548, + "learning_rate": 0.0005536252214588364, + "loss": 2.2136, + "step": 452300 + }, + { + "epoch": 1.7485039662290671, + "grad_norm": 0.11662276089191437, + "learning_rate": 0.0005534869513205212, + "loss": 2.1986, + "step": 452310 + }, + { + "epoch": 1.7485426234324504, + "grad_norm": 0.1224411353468895, + "learning_rate": 0.00055334869439799, + "loss": 2.2184, + "step": 452320 + }, + { + "epoch": 1.7485812806358338, + "grad_norm": 0.1330765187740326, + "learning_rate": 0.000553210450687454, + "loss": 2.2062, + "step": 452330 + }, + { + "epoch": 1.748619937839217, + "grad_norm": 0.12264736741781235, + "learning_rate": 0.0005530722201851261, + "loss": 2.2153, + "step": 452340 + }, + { + "epoch": 1.7486585950426003, + "grad_norm": 0.13567610085010529, + "learning_rate": 0.0005529340028872216, + "loss": 2.1991, + "step": 452350 + }, + { + "epoch": 1.7486972522459836, + "grad_norm": 0.13990464806556702, + "learning_rate": 0.0005527957987899565, + "loss": 2.2066, + "step": 452360 + }, + { + "epoch": 1.7487359094493669, + "grad_norm": 0.12277691811323166, + "learning_rate": 0.0005526576078895498, + "loss": 2.2136, + "step": 452370 + }, + { + "epoch": 1.74877456665275, + "grad_norm": 0.1335594654083252, + "learning_rate": 0.0005525194301822216, + "loss": 2.2146, + "step": 452380 + }, + { + "epoch": 1.7488132238561334, + "grad_norm": 0.11977743357419968, + "learning_rate": 0.0005523812656641942, + "loss": 2.2034, + "step": 452390 + }, + { + "epoch": 1.7488518810595166, + "grad_norm": 0.14657334983348846, + "learning_rate": 0.0005522431143316913, + "loss": 2.203, + "step": 452400 + }, + { + "epoch": 1.7488905382628999, + "grad_norm": 0.12392140924930573, + "learning_rate": 0.0005521049761809387, + "loss": 2.2205, + "step": 452410 + }, + { + "epoch": 1.7489291954662831, + "grad_norm": 0.12456782907247543, + "learning_rate": 0.0005519668512081639, + "loss": 2.2103, + "step": 452420 + }, + { + "epoch": 1.7489678526696664, + "grad_norm": 0.12019629776477814, + "learning_rate": 0.000551828739409596, + "loss": 2.2114, + "step": 452430 + }, + { + "epoch": 1.7490065098730496, + "grad_norm": 0.1146310493350029, + "learning_rate": 0.0005516906407814662, + "loss": 2.2189, + "step": 452440 + }, + { + "epoch": 1.7490451670764329, + "grad_norm": 0.12347836792469025, + "learning_rate": 0.0005515525553200076, + "loss": 2.2096, + "step": 452450 + }, + { + "epoch": 1.7490838242798163, + "grad_norm": 0.1218772679567337, + "learning_rate": 0.0005514144830214545, + "loss": 2.2048, + "step": 452460 + }, + { + "epoch": 1.7491224814831996, + "grad_norm": 0.1261073648929596, + "learning_rate": 0.0005512764238820436, + "loss": 2.195, + "step": 452470 + }, + { + "epoch": 1.7491611386865829, + "grad_norm": 0.1243288516998291, + "learning_rate": 0.0005511383778980133, + "loss": 2.2038, + "step": 452480 + }, + { + "epoch": 1.749199795889966, + "grad_norm": 0.1278071403503418, + "learning_rate": 0.0005510003450656036, + "loss": 2.2095, + "step": 452490 + }, + { + "epoch": 1.7492384530933496, + "grad_norm": 0.12030568718910217, + "learning_rate": 0.000550862325381056, + "loss": 2.2037, + "step": 452500 + }, + { + "epoch": 1.7492771102967328, + "grad_norm": 0.13277964293956757, + "learning_rate": 0.0005507243188406148, + "loss": 2.2189, + "step": 452510 + }, + { + "epoch": 1.749315767500116, + "grad_norm": 0.11886333674192429, + "learning_rate": 0.000550586325440525, + "loss": 2.1993, + "step": 452520 + }, + { + "epoch": 1.7493544247034993, + "grad_norm": 0.11295314133167267, + "learning_rate": 0.0005504483451770337, + "loss": 2.2119, + "step": 452530 + }, + { + "epoch": 1.7493930819068826, + "grad_norm": 0.12786784768104553, + "learning_rate": 0.0005503103780463902, + "loss": 2.2215, + "step": 452540 + }, + { + "epoch": 1.7494317391102658, + "grad_norm": 0.12659190595149994, + "learning_rate": 0.0005501724240448454, + "loss": 2.2226, + "step": 452550 + }, + { + "epoch": 1.749470396313649, + "grad_norm": 0.1250794380903244, + "learning_rate": 0.0005500344831686515, + "loss": 2.2043, + "step": 452560 + }, + { + "epoch": 1.7495090535170323, + "grad_norm": 0.12980180978775024, + "learning_rate": 0.000549896555414063, + "loss": 2.207, + "step": 452570 + }, + { + "epoch": 1.7495477107204156, + "grad_norm": 0.11602694541215897, + "learning_rate": 0.0005497586407773358, + "loss": 2.2181, + "step": 452580 + }, + { + "epoch": 1.7495863679237988, + "grad_norm": 0.12851059436798096, + "learning_rate": 0.0005496207392547285, + "loss": 2.2058, + "step": 452590 + }, + { + "epoch": 1.749625025127182, + "grad_norm": 0.1294124275445938, + "learning_rate": 0.0005494828508425003, + "loss": 2.2064, + "step": 452600 + }, + { + "epoch": 1.7496636823305654, + "grad_norm": 0.122859887778759, + "learning_rate": 0.0005493449755369129, + "loss": 2.2065, + "step": 452610 + }, + { + "epoch": 1.7497023395339486, + "grad_norm": 0.13860875368118286, + "learning_rate": 0.0005492071133342294, + "loss": 2.2032, + "step": 452620 + }, + { + "epoch": 1.749740996737332, + "grad_norm": 0.14647898077964783, + "learning_rate": 0.0005490692642307147, + "loss": 2.2083, + "step": 452630 + }, + { + "epoch": 1.7497796539407153, + "grad_norm": 0.13019074499607086, + "learning_rate": 0.0005489314282226356, + "loss": 2.1914, + "step": 452640 + }, + { + "epoch": 1.7498183111440986, + "grad_norm": 0.1232682466506958, + "learning_rate": 0.0005487936053062612, + "loss": 2.2194, + "step": 452650 + }, + { + "epoch": 1.7498569683474818, + "grad_norm": 0.1230897456407547, + "learning_rate": 0.0005486557954778613, + "loss": 2.1953, + "step": 452660 + }, + { + "epoch": 1.7498956255508653, + "grad_norm": 0.11922044306993484, + "learning_rate": 0.0005485179987337081, + "loss": 2.2055, + "step": 452670 + }, + { + "epoch": 1.7499342827542486, + "grad_norm": 0.12729021906852722, + "learning_rate": 0.0005483802150700756, + "loss": 2.2174, + "step": 452680 + }, + { + "epoch": 1.7499729399576318, + "grad_norm": 0.11684662103652954, + "learning_rate": 0.0005482424444832395, + "loss": 2.2029, + "step": 452690 + }, + { + "epoch": 1.750011597161015, + "grad_norm": 0.1304619461297989, + "learning_rate": 0.0005481046869694771, + "loss": 2.2144, + "step": 452700 + }, + { + "epoch": 1.7500502543643983, + "grad_norm": 0.12067972123622894, + "learning_rate": 0.0005479669425250677, + "loss": 2.1993, + "step": 452710 + }, + { + "epoch": 1.7500889115677816, + "grad_norm": 0.12739378213882446, + "learning_rate": 0.0005478292111462922, + "loss": 2.2076, + "step": 452720 + }, + { + "epoch": 1.7501275687711648, + "grad_norm": 0.1453160047531128, + "learning_rate": 0.0005476914928294334, + "loss": 2.1889, + "step": 452730 + }, + { + "epoch": 1.750166225974548, + "grad_norm": 0.1349991261959076, + "learning_rate": 0.0005475537875707754, + "loss": 2.1994, + "step": 452740 + }, + { + "epoch": 1.7502048831779313, + "grad_norm": 0.12166766822338104, + "learning_rate": 0.0005474160953666049, + "loss": 2.1917, + "step": 452750 + }, + { + "epoch": 1.7502435403813146, + "grad_norm": 0.12614476680755615, + "learning_rate": 0.0005472784162132098, + "loss": 2.2207, + "step": 452760 + }, + { + "epoch": 1.7502821975846978, + "grad_norm": 0.11877798289060593, + "learning_rate": 0.0005471407501068799, + "loss": 2.2097, + "step": 452770 + }, + { + "epoch": 1.750320854788081, + "grad_norm": 0.11452053487300873, + "learning_rate": 0.0005470030970439061, + "loss": 2.194, + "step": 452780 + }, + { + "epoch": 1.7503595119914643, + "grad_norm": 0.14666223526000977, + "learning_rate": 0.0005468654570205826, + "loss": 2.2067, + "step": 452790 + }, + { + "epoch": 1.7503981691948478, + "grad_norm": 0.12348919361829758, + "learning_rate": 0.000546727830033204, + "loss": 2.2057, + "step": 452800 + }, + { + "epoch": 1.750436826398231, + "grad_norm": 0.12077920883893967, + "learning_rate": 0.0005465902160780669, + "loss": 2.2069, + "step": 452810 + }, + { + "epoch": 1.7504754836016143, + "grad_norm": 0.1277742087841034, + "learning_rate": 0.0005464526151514701, + "loss": 2.1998, + "step": 452820 + }, + { + "epoch": 1.7505141408049976, + "grad_norm": 0.12332631647586823, + "learning_rate": 0.0005463150272497139, + "loss": 2.2114, + "step": 452830 + }, + { + "epoch": 1.750552798008381, + "grad_norm": 0.1287272572517395, + "learning_rate": 0.0005461774523691001, + "loss": 2.2085, + "step": 452840 + }, + { + "epoch": 1.7505914552117643, + "grad_norm": 0.131668359041214, + "learning_rate": 0.0005460398905059327, + "loss": 2.2095, + "step": 452850 + }, + { + "epoch": 1.7506301124151475, + "grad_norm": 0.13040921092033386, + "learning_rate": 0.0005459023416565172, + "loss": 2.199, + "step": 452860 + }, + { + "epoch": 1.7506687696185308, + "grad_norm": 0.12430719286203384, + "learning_rate": 0.000545764805817161, + "loss": 2.1903, + "step": 452870 + }, + { + "epoch": 1.750707426821914, + "grad_norm": 0.1209225133061409, + "learning_rate": 0.0005456272829841727, + "loss": 2.1949, + "step": 452880 + }, + { + "epoch": 1.7507460840252973, + "grad_norm": 0.12247639149427414, + "learning_rate": 0.0005454897731538633, + "loss": 2.2034, + "step": 452890 + }, + { + "epoch": 1.7507847412286806, + "grad_norm": 0.1321645826101303, + "learning_rate": 0.0005453522763225456, + "loss": 2.2039, + "step": 452900 + }, + { + "epoch": 1.7508233984320638, + "grad_norm": 0.12608574330806732, + "learning_rate": 0.0005452147924865333, + "loss": 2.208, + "step": 452910 + }, + { + "epoch": 1.750862055635447, + "grad_norm": 0.12137756496667862, + "learning_rate": 0.0005450773216421432, + "loss": 2.2106, + "step": 452920 + }, + { + "epoch": 1.7509007128388303, + "grad_norm": 0.12233705073595047, + "learning_rate": 0.0005449398637856922, + "loss": 2.2127, + "step": 452930 + }, + { + "epoch": 1.7509393700422136, + "grad_norm": 0.14403283596038818, + "learning_rate": 0.0005448024189135004, + "loss": 2.1966, + "step": 452940 + }, + { + "epoch": 1.7509780272455968, + "grad_norm": 0.1319127082824707, + "learning_rate": 0.0005446649870218885, + "loss": 2.1946, + "step": 452950 + }, + { + "epoch": 1.75101668444898, + "grad_norm": 0.13181868195533752, + "learning_rate": 0.00054452756810718, + "loss": 2.2135, + "step": 452960 + }, + { + "epoch": 1.7510553416523635, + "grad_norm": 0.118177130818367, + "learning_rate": 0.000544390162165699, + "loss": 2.2074, + "step": 452970 + }, + { + "epoch": 1.7510939988557468, + "grad_norm": 0.13542814552783966, + "learning_rate": 0.0005442527691937724, + "loss": 2.2117, + "step": 452980 + }, + { + "epoch": 1.75113265605913, + "grad_norm": 0.12567058205604553, + "learning_rate": 0.0005441153891877282, + "loss": 2.1741, + "step": 452990 + }, + { + "epoch": 1.7511713132625133, + "grad_norm": 0.12935258448123932, + "learning_rate": 0.0005439780221438964, + "loss": 2.1909, + "step": 453000 + }, + { + "epoch": 1.7512099704658968, + "grad_norm": 0.12639570236206055, + "learning_rate": 0.0005438406680586083, + "loss": 2.2082, + "step": 453010 + }, + { + "epoch": 1.75124862766928, + "grad_norm": 0.13248620927333832, + "learning_rate": 0.0005437033269281977, + "loss": 2.1987, + "step": 453020 + }, + { + "epoch": 1.7512872848726633, + "grad_norm": 0.16943858563899994, + "learning_rate": 0.0005435659987489994, + "loss": 2.2054, + "step": 453030 + }, + { + "epoch": 1.7513259420760465, + "grad_norm": 0.12565425038337708, + "learning_rate": 0.0005434286835173501, + "loss": 2.2227, + "step": 453040 + }, + { + "epoch": 1.7513645992794298, + "grad_norm": 0.12456222623586655, + "learning_rate": 0.0005432913812295886, + "loss": 2.2046, + "step": 453050 + }, + { + "epoch": 1.751403256482813, + "grad_norm": 0.1141451820731163, + "learning_rate": 0.000543154091882055, + "loss": 2.2103, + "step": 453060 + }, + { + "epoch": 1.7514419136861963, + "grad_norm": 0.11694903671741486, + "learning_rate": 0.0005430168154710915, + "loss": 2.1978, + "step": 453070 + }, + { + "epoch": 1.7514805708895795, + "grad_norm": 0.122064508497715, + "learning_rate": 0.0005428795519930411, + "loss": 2.2294, + "step": 453080 + }, + { + "epoch": 1.7515192280929628, + "grad_norm": 0.1250613033771515, + "learning_rate": 0.0005427423014442503, + "loss": 2.1978, + "step": 453090 + }, + { + "epoch": 1.751557885296346, + "grad_norm": 0.1284988522529602, + "learning_rate": 0.0005426050638210656, + "loss": 2.2149, + "step": 453100 + }, + { + "epoch": 1.7515965424997293, + "grad_norm": 0.11649057269096375, + "learning_rate": 0.0005424678391198362, + "loss": 2.2019, + "step": 453110 + }, + { + "epoch": 1.7516351997031125, + "grad_norm": 0.12649868428707123, + "learning_rate": 0.0005423306273369122, + "loss": 2.2269, + "step": 453120 + }, + { + "epoch": 1.7516738569064958, + "grad_norm": 0.13215883076190948, + "learning_rate": 0.0005421934284686463, + "loss": 2.1992, + "step": 453130 + }, + { + "epoch": 1.7517125141098793, + "grad_norm": 0.11886328458786011, + "learning_rate": 0.0005420562425113924, + "loss": 2.1983, + "step": 453140 + }, + { + "epoch": 1.7517511713132625, + "grad_norm": 0.1284274458885193, + "learning_rate": 0.0005419190694615062, + "loss": 2.1949, + "step": 453150 + }, + { + "epoch": 1.7517898285166458, + "grad_norm": 0.13134321570396423, + "learning_rate": 0.0005417819093153453, + "loss": 2.2101, + "step": 453160 + }, + { + "epoch": 1.751828485720029, + "grad_norm": 0.12084176391363144, + "learning_rate": 0.0005416447620692688, + "loss": 2.1954, + "step": 453170 + }, + { + "epoch": 1.7518671429234125, + "grad_norm": 0.12745612859725952, + "learning_rate": 0.0005415076277196373, + "loss": 2.2209, + "step": 453180 + }, + { + "epoch": 1.7519058001267958, + "grad_norm": 0.1288878470659256, + "learning_rate": 0.0005413705062628138, + "loss": 2.197, + "step": 453190 + }, + { + "epoch": 1.751944457330179, + "grad_norm": 0.12644262611865997, + "learning_rate": 0.0005412333976951624, + "loss": 2.1995, + "step": 453200 + }, + { + "epoch": 1.7519831145335623, + "grad_norm": 0.12212696671485901, + "learning_rate": 0.0005410963020130493, + "loss": 2.211, + "step": 453210 + }, + { + "epoch": 1.7520217717369455, + "grad_norm": 0.12275119870901108, + "learning_rate": 0.0005409592192128418, + "loss": 2.2088, + "step": 453220 + }, + { + "epoch": 1.7520604289403288, + "grad_norm": 0.12905056774616241, + "learning_rate": 0.0005408221492909098, + "loss": 2.2058, + "step": 453230 + }, + { + "epoch": 1.752099086143712, + "grad_norm": 0.1202598512172699, + "learning_rate": 0.0005406850922436241, + "loss": 2.2178, + "step": 453240 + }, + { + "epoch": 1.7521377433470953, + "grad_norm": 0.11684327572584152, + "learning_rate": 0.0005405480480673577, + "loss": 2.2047, + "step": 453250 + }, + { + "epoch": 1.7521764005504785, + "grad_norm": 0.1299309879541397, + "learning_rate": 0.0005404110167584848, + "loss": 2.2119, + "step": 453260 + }, + { + "epoch": 1.7522150577538618, + "grad_norm": 0.133416086435318, + "learning_rate": 0.0005402739983133822, + "loss": 2.2059, + "step": 453270 + }, + { + "epoch": 1.752253714957245, + "grad_norm": 0.13269919157028198, + "learning_rate": 0.0005401369927284272, + "loss": 2.2089, + "step": 453280 + }, + { + "epoch": 1.7522923721606283, + "grad_norm": 0.1308826357126236, + "learning_rate": 0.00054, + "loss": 2.1902, + "step": 453290 + }, + { + "epoch": 1.7523310293640115, + "grad_norm": 0.2655637264251709, + "learning_rate": 0.0005398630201244816, + "loss": 2.2136, + "step": 453300 + }, + { + "epoch": 1.752369686567395, + "grad_norm": 0.12664833664894104, + "learning_rate": 0.0005397260530982551, + "loss": 2.2168, + "step": 453310 + }, + { + "epoch": 1.7524083437707783, + "grad_norm": 0.11643911898136139, + "learning_rate": 0.0005395890989177052, + "loss": 2.2128, + "step": 453320 + }, + { + "epoch": 1.7524470009741615, + "grad_norm": 0.13293546438217163, + "learning_rate": 0.0005394521575792186, + "loss": 2.1997, + "step": 453330 + }, + { + "epoch": 1.7524856581775448, + "grad_norm": 0.1252000778913498, + "learning_rate": 0.000539315229079183, + "loss": 2.2026, + "step": 453340 + }, + { + "epoch": 1.7525243153809282, + "grad_norm": 0.11817177385091782, + "learning_rate": 0.0005391783134139883, + "loss": 2.208, + "step": 453350 + }, + { + "epoch": 1.7525629725843115, + "grad_norm": 0.12528595328330994, + "learning_rate": 0.0005390414105800261, + "loss": 2.2127, + "step": 453360 + }, + { + "epoch": 1.7526016297876947, + "grad_norm": 0.13388140499591827, + "learning_rate": 0.0005389045205736895, + "loss": 2.2181, + "step": 453370 + }, + { + "epoch": 1.752640286991078, + "grad_norm": 0.14031468331813812, + "learning_rate": 0.0005387676433913735, + "loss": 2.1913, + "step": 453380 + }, + { + "epoch": 1.7526789441944612, + "grad_norm": 0.12155263125896454, + "learning_rate": 0.0005386307790294746, + "loss": 2.2062, + "step": 453390 + }, + { + "epoch": 1.7527176013978445, + "grad_norm": 0.1317920684814453, + "learning_rate": 0.0005384939274843912, + "loss": 2.2134, + "step": 453400 + }, + { + "epoch": 1.7527562586012277, + "grad_norm": 0.1260206401348114, + "learning_rate": 0.0005383570887525229, + "loss": 2.2299, + "step": 453410 + }, + { + "epoch": 1.752794915804611, + "grad_norm": 0.12761461734771729, + "learning_rate": 0.0005382202628302717, + "loss": 2.2078, + "step": 453420 + }, + { + "epoch": 1.7528335730079943, + "grad_norm": 0.12996332347393036, + "learning_rate": 0.0005380834497140406, + "loss": 2.2013, + "step": 453430 + }, + { + "epoch": 1.7528722302113775, + "grad_norm": 0.15279492735862732, + "learning_rate": 0.0005379466494002347, + "loss": 2.2176, + "step": 453440 + }, + { + "epoch": 1.7529108874147608, + "grad_norm": 0.1288621574640274, + "learning_rate": 0.000537809861885261, + "loss": 2.1883, + "step": 453450 + }, + { + "epoch": 1.752949544618144, + "grad_norm": 0.13647779822349548, + "learning_rate": 0.0005376730871655271, + "loss": 2.2179, + "step": 453460 + }, + { + "epoch": 1.7529882018215273, + "grad_norm": 0.12741248309612274, + "learning_rate": 0.0005375363252374439, + "loss": 2.2043, + "step": 453470 + }, + { + "epoch": 1.7530268590249107, + "grad_norm": 0.12702925503253937, + "learning_rate": 0.0005373995760974222, + "loss": 2.1951, + "step": 453480 + }, + { + "epoch": 1.753065516228294, + "grad_norm": 0.13216041028499603, + "learning_rate": 0.0005372628397418761, + "loss": 2.2046, + "step": 453490 + }, + { + "epoch": 1.7531041734316772, + "grad_norm": 0.12956835329532623, + "learning_rate": 0.0005371261161672206, + "loss": 2.1989, + "step": 453500 + }, + { + "epoch": 1.7531428306350605, + "grad_norm": 0.11891122162342072, + "learning_rate": 0.0005369894053698723, + "loss": 2.204, + "step": 453510 + }, + { + "epoch": 1.753181487838444, + "grad_norm": 0.13319365680217743, + "learning_rate": 0.0005368527073462495, + "loss": 2.2232, + "step": 453520 + }, + { + "epoch": 1.7532201450418272, + "grad_norm": 0.12692497670650482, + "learning_rate": 0.0005367160220927723, + "loss": 2.2106, + "step": 453530 + }, + { + "epoch": 1.7532588022452105, + "grad_norm": 0.11499441415071487, + "learning_rate": 0.0005365793496058626, + "loss": 2.202, + "step": 453540 + }, + { + "epoch": 1.7532974594485937, + "grad_norm": 0.12278398871421814, + "learning_rate": 0.0005364426898819439, + "loss": 2.2035, + "step": 453550 + }, + { + "epoch": 1.753336116651977, + "grad_norm": 0.12266360968351364, + "learning_rate": 0.0005363060429174412, + "loss": 2.2051, + "step": 453560 + }, + { + "epoch": 1.7533747738553602, + "grad_norm": 0.13310161232948303, + "learning_rate": 0.000536169408708781, + "loss": 2.2049, + "step": 453570 + }, + { + "epoch": 1.7534134310587435, + "grad_norm": 0.12809118628501892, + "learning_rate": 0.000536032787252392, + "loss": 2.1987, + "step": 453580 + }, + { + "epoch": 1.7534520882621267, + "grad_norm": 0.12067703902721405, + "learning_rate": 0.0005358961785447043, + "loss": 2.2204, + "step": 453590 + }, + { + "epoch": 1.75349074546551, + "grad_norm": 0.12874580919742584, + "learning_rate": 0.0005357595825821497, + "loss": 2.2005, + "step": 453600 + }, + { + "epoch": 1.7535294026688932, + "grad_norm": 0.12450389564037323, + "learning_rate": 0.0005356229993611617, + "loss": 2.1904, + "step": 453610 + }, + { + "epoch": 1.7535680598722765, + "grad_norm": 0.12393300235271454, + "learning_rate": 0.0005354864288781753, + "loss": 2.2067, + "step": 453620 + }, + { + "epoch": 1.7536067170756597, + "grad_norm": 0.12177273631095886, + "learning_rate": 0.0005353498711296272, + "loss": 2.213, + "step": 453630 + }, + { + "epoch": 1.7536453742790432, + "grad_norm": 0.12027515470981598, + "learning_rate": 0.0005352133261119556, + "loss": 2.19, + "step": 453640 + }, + { + "epoch": 1.7536840314824265, + "grad_norm": 0.12833796441555023, + "learning_rate": 0.000535076793821601, + "loss": 2.1944, + "step": 453650 + }, + { + "epoch": 1.7537226886858097, + "grad_norm": 0.1365223377943039, + "learning_rate": 0.0005349402742550051, + "loss": 2.1946, + "step": 453660 + }, + { + "epoch": 1.753761345889193, + "grad_norm": 0.1281602382659912, + "learning_rate": 0.000534803767408611, + "loss": 2.2129, + "step": 453670 + }, + { + "epoch": 1.7538000030925762, + "grad_norm": 0.13165245950222015, + "learning_rate": 0.0005346672732788638, + "loss": 2.1915, + "step": 453680 + }, + { + "epoch": 1.7538386602959597, + "grad_norm": 0.12608040869235992, + "learning_rate": 0.0005345307918622104, + "loss": 2.1995, + "step": 453690 + }, + { + "epoch": 1.753877317499343, + "grad_norm": 0.13146719336509705, + "learning_rate": 0.0005343943231550991, + "loss": 2.2035, + "step": 453700 + }, + { + "epoch": 1.7539159747027262, + "grad_norm": 0.11518460512161255, + "learning_rate": 0.0005342578671539799, + "loss": 2.2079, + "step": 453710 + }, + { + "epoch": 1.7539546319061095, + "grad_norm": 0.14398086071014404, + "learning_rate": 0.0005341214238553045, + "loss": 2.2013, + "step": 453720 + }, + { + "epoch": 1.7539932891094927, + "grad_norm": 0.12497788667678833, + "learning_rate": 0.000533984993255526, + "loss": 2.2126, + "step": 453730 + }, + { + "epoch": 1.754031946312876, + "grad_norm": 0.13252729177474976, + "learning_rate": 0.0005338485753510997, + "loss": 2.2209, + "step": 453740 + }, + { + "epoch": 1.7540706035162592, + "grad_norm": 0.12597353756427765, + "learning_rate": 0.000533712170138482, + "loss": 2.1939, + "step": 453750 + }, + { + "epoch": 1.7541092607196425, + "grad_norm": 0.12324507534503937, + "learning_rate": 0.0005335757776141312, + "loss": 2.2004, + "step": 453760 + }, + { + "epoch": 1.7541479179230257, + "grad_norm": 0.13148833811283112, + "learning_rate": 0.0005334393977745074, + "loss": 2.1953, + "step": 453770 + }, + { + "epoch": 1.754186575126409, + "grad_norm": 0.12210629880428314, + "learning_rate": 0.0005333030306160718, + "loss": 2.1935, + "step": 453780 + }, + { + "epoch": 1.7542252323297922, + "grad_norm": 0.13100388646125793, + "learning_rate": 0.0005331666761352876, + "loss": 2.2096, + "step": 453790 + }, + { + "epoch": 1.7542638895331755, + "grad_norm": 0.12396988272666931, + "learning_rate": 0.00053303033432862, + "loss": 2.2063, + "step": 453800 + }, + { + "epoch": 1.754302546736559, + "grad_norm": 0.12156442552804947, + "learning_rate": 0.0005328940051925355, + "loss": 2.2074, + "step": 453810 + }, + { + "epoch": 1.7543412039399422, + "grad_norm": 0.12706667184829712, + "learning_rate": 0.000532757688723502, + "loss": 2.1943, + "step": 453820 + }, + { + "epoch": 1.7543798611433254, + "grad_norm": 0.14350438117980957, + "learning_rate": 0.0005326213849179892, + "loss": 2.2274, + "step": 453830 + }, + { + "epoch": 1.7544185183467087, + "grad_norm": 0.1357300728559494, + "learning_rate": 0.0005324850937724688, + "loss": 2.211, + "step": 453840 + }, + { + "epoch": 1.754457175550092, + "grad_norm": 0.12557262182235718, + "learning_rate": 0.0005323488152834135, + "loss": 2.2091, + "step": 453850 + }, + { + "epoch": 1.7544958327534754, + "grad_norm": 0.13544535636901855, + "learning_rate": 0.0005322125494472982, + "loss": 2.2037, + "step": 453860 + }, + { + "epoch": 1.7545344899568587, + "grad_norm": 0.13692021369934082, + "learning_rate": 0.0005320762962605994, + "loss": 2.2041, + "step": 453870 + }, + { + "epoch": 1.754573147160242, + "grad_norm": 0.13866128027439117, + "learning_rate": 0.0005319400557197946, + "loss": 2.2113, + "step": 453880 + }, + { + "epoch": 1.7546118043636252, + "grad_norm": 0.12811279296875, + "learning_rate": 0.0005318038278213635, + "loss": 2.2096, + "step": 453890 + }, + { + "epoch": 1.7546504615670084, + "grad_norm": 0.13107416033744812, + "learning_rate": 0.0005316676125617879, + "loss": 2.2015, + "step": 453900 + }, + { + "epoch": 1.7546891187703917, + "grad_norm": 0.13009685277938843, + "learning_rate": 0.0005315314099375499, + "loss": 2.1995, + "step": 453910 + }, + { + "epoch": 1.754727775973775, + "grad_norm": 0.14282618463039398, + "learning_rate": 0.0005313952199451344, + "loss": 2.201, + "step": 453920 + }, + { + "epoch": 1.7547664331771582, + "grad_norm": 0.11497281491756439, + "learning_rate": 0.0005312590425810275, + "loss": 2.2115, + "step": 453930 + }, + { + "epoch": 1.7548050903805414, + "grad_norm": 0.12178318947553635, + "learning_rate": 0.0005311228778417167, + "loss": 2.1942, + "step": 453940 + }, + { + "epoch": 1.7548437475839247, + "grad_norm": 0.13076919317245483, + "learning_rate": 0.0005309867257236918, + "loss": 2.2137, + "step": 453950 + }, + { + "epoch": 1.754882404787308, + "grad_norm": 0.12824806571006775, + "learning_rate": 0.0005308505862234434, + "loss": 2.2139, + "step": 453960 + }, + { + "epoch": 1.7549210619906912, + "grad_norm": 0.12367754429578781, + "learning_rate": 0.0005307144593374644, + "loss": 2.2248, + "step": 453970 + }, + { + "epoch": 1.7549597191940747, + "grad_norm": 0.13095413148403168, + "learning_rate": 0.0005305783450622488, + "loss": 2.2029, + "step": 453980 + }, + { + "epoch": 1.754998376397458, + "grad_norm": 0.12684781849384308, + "learning_rate": 0.0005304422433942924, + "loss": 2.1942, + "step": 453990 + }, + { + "epoch": 1.7550370336008412, + "grad_norm": 0.14733237028121948, + "learning_rate": 0.0005303061543300931, + "loss": 2.1993, + "step": 454000 + }, + { + "epoch": 1.7550756908042244, + "grad_norm": 0.1254054307937622, + "learning_rate": 0.0005301700778661498, + "loss": 2.2158, + "step": 454010 + }, + { + "epoch": 1.7551143480076077, + "grad_norm": 0.15444502234458923, + "learning_rate": 0.0005300340139989632, + "loss": 2.1988, + "step": 454020 + }, + { + "epoch": 1.7551530052109912, + "grad_norm": 0.13979749381542206, + "learning_rate": 0.000529897962725036, + "loss": 2.1916, + "step": 454030 + }, + { + "epoch": 1.7551916624143744, + "grad_norm": 0.11630989611148834, + "learning_rate": 0.0005297619240408715, + "loss": 2.207, + "step": 454040 + }, + { + "epoch": 1.7552303196177577, + "grad_norm": 0.13520605862140656, + "learning_rate": 0.000529625897942976, + "loss": 2.217, + "step": 454050 + }, + { + "epoch": 1.755268976821141, + "grad_norm": 0.126438707113266, + "learning_rate": 0.0005294898844278562, + "loss": 2.2166, + "step": 454060 + }, + { + "epoch": 1.7553076340245242, + "grad_norm": 0.12552592158317566, + "learning_rate": 0.0005293538834920211, + "loss": 2.2159, + "step": 454070 + }, + { + "epoch": 1.7553462912279074, + "grad_norm": 0.14067593216896057, + "learning_rate": 0.0005292178951319812, + "loss": 2.2056, + "step": 454080 + }, + { + "epoch": 1.7553849484312907, + "grad_norm": 0.13103902339935303, + "learning_rate": 0.0005290819193442484, + "loss": 2.1993, + "step": 454090 + }, + { + "epoch": 1.755423605634674, + "grad_norm": 0.11966156214475632, + "learning_rate": 0.0005289459561253367, + "loss": 2.1988, + "step": 454100 + }, + { + "epoch": 1.7554622628380572, + "grad_norm": 0.14210577309131622, + "learning_rate": 0.0005288100054717608, + "loss": 2.2031, + "step": 454110 + }, + { + "epoch": 1.7555009200414404, + "grad_norm": 0.134195476770401, + "learning_rate": 0.0005286740673800383, + "loss": 2.2114, + "step": 454120 + }, + { + "epoch": 1.7555395772448237, + "grad_norm": 0.13276775181293488, + "learning_rate": 0.0005285381418466872, + "loss": 2.191, + "step": 454130 + }, + { + "epoch": 1.755578234448207, + "grad_norm": 0.13156795501708984, + "learning_rate": 0.0005284022288682277, + "loss": 2.1898, + "step": 454140 + }, + { + "epoch": 1.7556168916515904, + "grad_norm": 0.1209663450717926, + "learning_rate": 0.0005282663284411817, + "loss": 2.2067, + "step": 454150 + }, + { + "epoch": 1.7556555488549737, + "grad_norm": 0.1239723488688469, + "learning_rate": 0.0005281304405620722, + "loss": 2.1923, + "step": 454160 + }, + { + "epoch": 1.755694206058357, + "grad_norm": 0.13002730906009674, + "learning_rate": 0.0005279945652274243, + "loss": 2.1867, + "step": 454170 + }, + { + "epoch": 1.7557328632617402, + "grad_norm": 0.13225910067558289, + "learning_rate": 0.0005278587024337644, + "loss": 2.21, + "step": 454180 + }, + { + "epoch": 1.7557715204651236, + "grad_norm": 0.13071177899837494, + "learning_rate": 0.0005277228521776207, + "loss": 2.2045, + "step": 454190 + }, + { + "epoch": 1.7558101776685069, + "grad_norm": 0.13750162720680237, + "learning_rate": 0.0005275870144555231, + "loss": 2.192, + "step": 454200 + }, + { + "epoch": 1.7558488348718901, + "grad_norm": 0.12981943786144257, + "learning_rate": 0.0005274511892640025, + "loss": 2.211, + "step": 454210 + }, + { + "epoch": 1.7558874920752734, + "grad_norm": 0.11927240341901779, + "learning_rate": 0.0005273153765995926, + "loss": 2.1964, + "step": 454220 + }, + { + "epoch": 1.7559261492786566, + "grad_norm": 0.1346549540758133, + "learning_rate": 0.000527179576458827, + "loss": 2.2015, + "step": 454230 + }, + { + "epoch": 1.75596480648204, + "grad_norm": 0.11863110214471817, + "learning_rate": 0.0005270437888382425, + "loss": 2.1989, + "step": 454240 + }, + { + "epoch": 1.7560034636854231, + "grad_norm": 0.11934219300746918, + "learning_rate": 0.0005269080137343765, + "loss": 2.2103, + "step": 454250 + }, + { + "epoch": 1.7560421208888064, + "grad_norm": 0.1372785121202469, + "learning_rate": 0.0005267722511437684, + "loss": 2.2004, + "step": 454260 + }, + { + "epoch": 1.7560807780921897, + "grad_norm": 0.12632456421852112, + "learning_rate": 0.0005266365010629591, + "loss": 2.1843, + "step": 454270 + }, + { + "epoch": 1.756119435295573, + "grad_norm": 0.12038110196590424, + "learning_rate": 0.0005265007634884911, + "loss": 2.2167, + "step": 454280 + }, + { + "epoch": 1.7561580924989562, + "grad_norm": 0.11933927237987518, + "learning_rate": 0.0005263650384169083, + "loss": 2.2137, + "step": 454290 + }, + { + "epoch": 1.7561967497023394, + "grad_norm": 0.11876744776964188, + "learning_rate": 0.0005262293258447568, + "loss": 2.1949, + "step": 454300 + }, + { + "epoch": 1.7562354069057227, + "grad_norm": 0.12975801527500153, + "learning_rate": 0.0005260936257685836, + "loss": 2.2105, + "step": 454310 + }, + { + "epoch": 1.7562740641091061, + "grad_norm": 0.12426824867725372, + "learning_rate": 0.0005259579381849378, + "loss": 2.1841, + "step": 454320 + }, + { + "epoch": 1.7563127213124894, + "grad_norm": 0.12224451452493668, + "learning_rate": 0.0005258222630903695, + "loss": 2.2126, + "step": 454330 + }, + { + "epoch": 1.7563513785158726, + "grad_norm": 0.12690091133117676, + "learning_rate": 0.0005256866004814309, + "loss": 2.199, + "step": 454340 + }, + { + "epoch": 1.756390035719256, + "grad_norm": 0.12292948365211487, + "learning_rate": 0.0005255509503546758, + "loss": 2.1995, + "step": 454350 + }, + { + "epoch": 1.7564286929226394, + "grad_norm": 0.12324800342321396, + "learning_rate": 0.0005254153127066592, + "loss": 2.2003, + "step": 454360 + }, + { + "epoch": 1.7564673501260226, + "grad_norm": 0.11988487094640732, + "learning_rate": 0.0005252796875339378, + "loss": 2.2043, + "step": 454370 + }, + { + "epoch": 1.7565060073294059, + "grad_norm": 0.12712682783603668, + "learning_rate": 0.0005251440748330705, + "loss": 2.1982, + "step": 454380 + }, + { + "epoch": 1.7565446645327891, + "grad_norm": 0.13616231083869934, + "learning_rate": 0.0005250084746006165, + "loss": 2.2174, + "step": 454390 + }, + { + "epoch": 1.7565833217361724, + "grad_norm": 0.1232861876487732, + "learning_rate": 0.0005248728868331381, + "loss": 2.1907, + "step": 454400 + }, + { + "epoch": 1.7566219789395556, + "grad_norm": 0.13185040652751923, + "learning_rate": 0.0005247373115271979, + "loss": 2.179, + "step": 454410 + }, + { + "epoch": 1.7566606361429389, + "grad_norm": 0.122291199862957, + "learning_rate": 0.0005246017486793609, + "loss": 2.2056, + "step": 454420 + }, + { + "epoch": 1.7566992933463221, + "grad_norm": 0.12516425549983978, + "learning_rate": 0.0005244661982861931, + "loss": 2.2214, + "step": 454430 + }, + { + "epoch": 1.7567379505497054, + "grad_norm": 0.12709055840969086, + "learning_rate": 0.0005243306603442626, + "loss": 2.2089, + "step": 454440 + }, + { + "epoch": 1.7567766077530886, + "grad_norm": 0.1386878341436386, + "learning_rate": 0.0005241951348501388, + "loss": 2.2162, + "step": 454450 + }, + { + "epoch": 1.756815264956472, + "grad_norm": 0.12939682602882385, + "learning_rate": 0.0005240596218003928, + "loss": 2.2067, + "step": 454460 + }, + { + "epoch": 1.7568539221598551, + "grad_norm": 0.1294785439968109, + "learning_rate": 0.0005239241211915968, + "loss": 2.2099, + "step": 454470 + }, + { + "epoch": 1.7568925793632384, + "grad_norm": 0.12862272560596466, + "learning_rate": 0.0005237886330203255, + "loss": 2.2114, + "step": 454480 + }, + { + "epoch": 1.7569312365666219, + "grad_norm": 0.11318708956241608, + "learning_rate": 0.0005236531572831537, + "loss": 2.1954, + "step": 454490 + }, + { + "epoch": 1.7569698937700051, + "grad_norm": 0.12838613986968994, + "learning_rate": 0.0005235176939766599, + "loss": 2.2058, + "step": 454500 + }, + { + "epoch": 1.7570085509733884, + "grad_norm": 0.12900970876216888, + "learning_rate": 0.0005233822430974223, + "loss": 2.209, + "step": 454510 + }, + { + "epoch": 1.7570472081767716, + "grad_norm": 0.13854677975177765, + "learning_rate": 0.0005232468046420213, + "loss": 2.1932, + "step": 454520 + }, + { + "epoch": 1.757085865380155, + "grad_norm": 0.13848355412483215, + "learning_rate": 0.0005231113786070394, + "loss": 2.2199, + "step": 454530 + }, + { + "epoch": 1.7571245225835384, + "grad_norm": 0.12348717451095581, + "learning_rate": 0.0005229759649890595, + "loss": 2.2052, + "step": 454540 + }, + { + "epoch": 1.7571631797869216, + "grad_norm": 0.1331428736448288, + "learning_rate": 0.0005228405637846672, + "loss": 2.1957, + "step": 454550 + }, + { + "epoch": 1.7572018369903049, + "grad_norm": 0.11971791088581085, + "learning_rate": 0.000522705174990449, + "loss": 2.1854, + "step": 454560 + }, + { + "epoch": 1.757240494193688, + "grad_norm": 0.1279265582561493, + "learning_rate": 0.0005225697986029934, + "loss": 2.2107, + "step": 454570 + }, + { + "epoch": 1.7572791513970714, + "grad_norm": 0.12536872923374176, + "learning_rate": 0.0005224344346188898, + "loss": 2.2119, + "step": 454580 + }, + { + "epoch": 1.7573178086004546, + "grad_norm": 0.12962239980697632, + "learning_rate": 0.0005222990830347299, + "loss": 2.2129, + "step": 454590 + }, + { + "epoch": 1.7573564658038379, + "grad_norm": 0.12700778245925903, + "learning_rate": 0.0005221637438471067, + "loss": 2.2072, + "step": 454600 + }, + { + "epoch": 1.7573951230072211, + "grad_norm": 0.13613589107990265, + "learning_rate": 0.0005220284170526146, + "loss": 2.2107, + "step": 454610 + }, + { + "epoch": 1.7574337802106044, + "grad_norm": 0.11709623038768768, + "learning_rate": 0.0005218931026478497, + "loss": 2.2034, + "step": 454620 + }, + { + "epoch": 1.7574724374139876, + "grad_norm": 0.24251140654087067, + "learning_rate": 0.0005217578006294097, + "loss": 2.2029, + "step": 454630 + }, + { + "epoch": 1.7575110946173709, + "grad_norm": 0.13349416851997375, + "learning_rate": 0.0005216225109938937, + "loss": 2.2222, + "step": 454640 + }, + { + "epoch": 1.7575497518207541, + "grad_norm": 0.1423693746328354, + "learning_rate": 0.0005214872337379025, + "loss": 2.2006, + "step": 454650 + }, + { + "epoch": 1.7575884090241376, + "grad_norm": 0.12413844466209412, + "learning_rate": 0.0005213519688580382, + "loss": 2.2021, + "step": 454660 + }, + { + "epoch": 1.7576270662275209, + "grad_norm": 0.1254565715789795, + "learning_rate": 0.0005212167163509049, + "loss": 2.2068, + "step": 454670 + }, + { + "epoch": 1.757665723430904, + "grad_norm": 0.12610286474227905, + "learning_rate": 0.000521081476213108, + "loss": 2.1984, + "step": 454680 + }, + { + "epoch": 1.7577043806342874, + "grad_norm": 0.12193763256072998, + "learning_rate": 0.0005209462484412541, + "loss": 2.2037, + "step": 454690 + }, + { + "epoch": 1.7577430378376708, + "grad_norm": 0.14640814065933228, + "learning_rate": 0.0005208110330319522, + "loss": 2.2038, + "step": 454700 + }, + { + "epoch": 1.757781695041054, + "grad_norm": 0.12274169921875, + "learning_rate": 0.0005206758299818122, + "loss": 2.1971, + "step": 454710 + }, + { + "epoch": 1.7578203522444373, + "grad_norm": 0.12056424468755722, + "learning_rate": 0.0005205406392874457, + "loss": 2.2161, + "step": 454720 + }, + { + "epoch": 1.7578590094478206, + "grad_norm": 0.13092705607414246, + "learning_rate": 0.0005204054609454655, + "loss": 2.2075, + "step": 454730 + }, + { + "epoch": 1.7578976666512038, + "grad_norm": 0.11801334470510483, + "learning_rate": 0.0005202702949524869, + "loss": 2.2114, + "step": 454740 + }, + { + "epoch": 1.757936323854587, + "grad_norm": 0.1536809206008911, + "learning_rate": 0.0005201351413051258, + "loss": 2.2239, + "step": 454750 + }, + { + "epoch": 1.7579749810579703, + "grad_norm": 0.12560512125492096, + "learning_rate": 0.0005200000000000001, + "loss": 2.2159, + "step": 454760 + }, + { + "epoch": 1.7580136382613536, + "grad_norm": 0.13057692348957062, + "learning_rate": 0.0005198648710337291, + "loss": 2.2035, + "step": 454770 + }, + { + "epoch": 1.7580522954647368, + "grad_norm": 0.15517479181289673, + "learning_rate": 0.0005197297544029334, + "loss": 2.2156, + "step": 454780 + }, + { + "epoch": 1.75809095266812, + "grad_norm": 0.12616227567195892, + "learning_rate": 0.0005195946501042357, + "loss": 2.1918, + "step": 454790 + }, + { + "epoch": 1.7581296098715034, + "grad_norm": 0.13498133420944214, + "learning_rate": 0.0005194595581342603, + "loss": 2.2042, + "step": 454800 + }, + { + "epoch": 1.7581682670748866, + "grad_norm": 0.12483587116003036, + "learning_rate": 0.0005193244784896321, + "loss": 2.1974, + "step": 454810 + }, + { + "epoch": 1.7582069242782699, + "grad_norm": 0.1263275444507599, + "learning_rate": 0.0005191894111669784, + "loss": 2.215, + "step": 454820 + }, + { + "epoch": 1.7582455814816533, + "grad_norm": 0.12117032706737518, + "learning_rate": 0.000519054356162928, + "loss": 2.2084, + "step": 454830 + }, + { + "epoch": 1.7582842386850366, + "grad_norm": 0.1304064393043518, + "learning_rate": 0.0005189193134741106, + "loss": 2.2004, + "step": 454840 + }, + { + "epoch": 1.7583228958884198, + "grad_norm": 0.12708914279937744, + "learning_rate": 0.0005187842830971581, + "loss": 2.1867, + "step": 454850 + }, + { + "epoch": 1.758361553091803, + "grad_norm": 0.1348961889743805, + "learning_rate": 0.0005186492650287036, + "loss": 2.2099, + "step": 454860 + }, + { + "epoch": 1.7584002102951866, + "grad_norm": 0.13336822390556335, + "learning_rate": 0.000518514259265382, + "loss": 2.2037, + "step": 454870 + }, + { + "epoch": 1.7584388674985698, + "grad_norm": 0.14478354156017303, + "learning_rate": 0.0005183792658038294, + "loss": 2.1886, + "step": 454880 + }, + { + "epoch": 1.758477524701953, + "grad_norm": 0.12498698383569717, + "learning_rate": 0.0005182442846406834, + "loss": 2.1837, + "step": 454890 + }, + { + "epoch": 1.7585161819053363, + "grad_norm": 0.12127989530563354, + "learning_rate": 0.0005181093157725835, + "loss": 2.1984, + "step": 454900 + }, + { + "epoch": 1.7585548391087196, + "grad_norm": 0.12991923093795776, + "learning_rate": 0.000517974359196171, + "loss": 2.1798, + "step": 454910 + }, + { + "epoch": 1.7585934963121028, + "grad_norm": 0.12539005279541016, + "learning_rate": 0.0005178394149080876, + "loss": 2.2072, + "step": 454920 + }, + { + "epoch": 1.758632153515486, + "grad_norm": 0.12650595605373383, + "learning_rate": 0.0005177044829049775, + "loss": 2.1956, + "step": 454930 + }, + { + "epoch": 1.7586708107188693, + "grad_norm": 0.12115669995546341, + "learning_rate": 0.0005175695631834862, + "loss": 2.1798, + "step": 454940 + }, + { + "epoch": 1.7587094679222526, + "grad_norm": 0.1294897347688675, + "learning_rate": 0.0005174346557402605, + "loss": 2.2077, + "step": 454950 + }, + { + "epoch": 1.7587481251256358, + "grad_norm": 0.1396237313747406, + "learning_rate": 0.000517299760571949, + "loss": 2.2059, + "step": 454960 + }, + { + "epoch": 1.758786782329019, + "grad_norm": 0.12254752218723297, + "learning_rate": 0.0005171648776752016, + "loss": 2.1985, + "step": 454970 + }, + { + "epoch": 1.7588254395324023, + "grad_norm": 0.1284838616847992, + "learning_rate": 0.00051703000704667, + "loss": 2.1999, + "step": 454980 + }, + { + "epoch": 1.7588640967357856, + "grad_norm": 0.12287527322769165, + "learning_rate": 0.000516895148683007, + "loss": 2.1966, + "step": 454990 + }, + { + "epoch": 1.758902753939169, + "grad_norm": 0.1231754943728447, + "learning_rate": 0.0005167603025808674, + "loss": 2.2017, + "step": 455000 + }, + { + "epoch": 1.7589414111425523, + "grad_norm": 0.14277738332748413, + "learning_rate": 0.0005166254687369072, + "loss": 2.1753, + "step": 455010 + }, + { + "epoch": 1.7589800683459356, + "grad_norm": 0.12003131955862045, + "learning_rate": 0.0005164906471477841, + "loss": 2.1935, + "step": 455020 + }, + { + "epoch": 1.7590187255493188, + "grad_norm": 0.12798646092414856, + "learning_rate": 0.0005163558378101574, + "loss": 2.2036, + "step": 455030 + }, + { + "epoch": 1.7590573827527023, + "grad_norm": 0.12923835217952728, + "learning_rate": 0.0005162210407206873, + "loss": 2.2028, + "step": 455040 + }, + { + "epoch": 1.7590960399560855, + "grad_norm": 0.1290251612663269, + "learning_rate": 0.0005160862558760364, + "loss": 2.1966, + "step": 455050 + }, + { + "epoch": 1.7591346971594688, + "grad_norm": 0.12686462700366974, + "learning_rate": 0.0005159514832728682, + "loss": 2.1959, + "step": 455060 + }, + { + "epoch": 1.759173354362852, + "grad_norm": 0.12811657786369324, + "learning_rate": 0.0005158167229078478, + "loss": 2.1898, + "step": 455070 + }, + { + "epoch": 1.7592120115662353, + "grad_norm": 0.11983395367860794, + "learning_rate": 0.0005156819747776422, + "loss": 2.2102, + "step": 455080 + }, + { + "epoch": 1.7592506687696186, + "grad_norm": 0.15635327994823456, + "learning_rate": 0.0005155472388789196, + "loss": 2.2058, + "step": 455090 + }, + { + "epoch": 1.7592893259730018, + "grad_norm": 0.12226098775863647, + "learning_rate": 0.0005154125152083491, + "loss": 2.1989, + "step": 455100 + }, + { + "epoch": 1.759327983176385, + "grad_norm": 0.11969849467277527, + "learning_rate": 0.0005152778037626029, + "loss": 2.2051, + "step": 455110 + }, + { + "epoch": 1.7593666403797683, + "grad_norm": 0.11924776434898376, + "learning_rate": 0.0005151431045383533, + "loss": 2.2042, + "step": 455120 + }, + { + "epoch": 1.7594052975831516, + "grad_norm": 0.11855210363864899, + "learning_rate": 0.0005150084175322744, + "loss": 2.1958, + "step": 455130 + }, + { + "epoch": 1.7594439547865348, + "grad_norm": 0.122954361140728, + "learning_rate": 0.0005148737427410422, + "loss": 2.2136, + "step": 455140 + }, + { + "epoch": 1.759482611989918, + "grad_norm": 0.11402136832475662, + "learning_rate": 0.0005147390801613341, + "loss": 2.1967, + "step": 455150 + }, + { + "epoch": 1.7595212691933013, + "grad_norm": 0.12220233678817749, + "learning_rate": 0.0005146044297898287, + "loss": 2.2009, + "step": 455160 + }, + { + "epoch": 1.7595599263966848, + "grad_norm": 0.1369936317205429, + "learning_rate": 0.0005144697916232064, + "loss": 2.2066, + "step": 455170 + }, + { + "epoch": 1.759598583600068, + "grad_norm": 0.13836544752120972, + "learning_rate": 0.000514335165658149, + "loss": 2.1852, + "step": 455180 + }, + { + "epoch": 1.7596372408034513, + "grad_norm": 0.13634607195854187, + "learning_rate": 0.0005142005518913395, + "loss": 2.2191, + "step": 455190 + }, + { + "epoch": 1.7596758980068345, + "grad_norm": 0.12321195751428604, + "learning_rate": 0.0005140659503194632, + "loss": 2.1931, + "step": 455200 + }, + { + "epoch": 1.759714555210218, + "grad_norm": 0.13414765894412994, + "learning_rate": 0.000513931360939206, + "loss": 2.2071, + "step": 455210 + }, + { + "epoch": 1.7597532124136013, + "grad_norm": 0.1208004504442215, + "learning_rate": 0.0005137967837472562, + "loss": 2.2096, + "step": 455220 + }, + { + "epoch": 1.7597918696169845, + "grad_norm": 0.13674625754356384, + "learning_rate": 0.0005136622187403026, + "loss": 2.1978, + "step": 455230 + }, + { + "epoch": 1.7598305268203678, + "grad_norm": 0.13066132366657257, + "learning_rate": 0.0005135276659150363, + "loss": 2.2067, + "step": 455240 + }, + { + "epoch": 1.759869184023751, + "grad_norm": 0.12996459007263184, + "learning_rate": 0.0005133931252681494, + "loss": 2.2065, + "step": 455250 + }, + { + "epoch": 1.7599078412271343, + "grad_norm": 0.14105352759361267, + "learning_rate": 0.0005132585967963359, + "loss": 2.194, + "step": 455260 + }, + { + "epoch": 1.7599464984305175, + "grad_norm": 0.12938223779201508, + "learning_rate": 0.0005131240804962911, + "loss": 2.1989, + "step": 455270 + }, + { + "epoch": 1.7599851556339008, + "grad_norm": 0.12563085556030273, + "learning_rate": 0.0005129895763647116, + "loss": 2.2171, + "step": 455280 + }, + { + "epoch": 1.760023812837284, + "grad_norm": 0.13781751692295074, + "learning_rate": 0.0005128550843982957, + "loss": 2.193, + "step": 455290 + }, + { + "epoch": 1.7600624700406673, + "grad_norm": 0.1279023140668869, + "learning_rate": 0.0005127206045937434, + "loss": 2.2161, + "step": 455300 + }, + { + "epoch": 1.7601011272440505, + "grad_norm": 0.1227601021528244, + "learning_rate": 0.0005125861369477557, + "loss": 2.203, + "step": 455310 + }, + { + "epoch": 1.7601397844474338, + "grad_norm": 0.14425310492515564, + "learning_rate": 0.0005124516814570359, + "loss": 2.1953, + "step": 455320 + }, + { + "epoch": 1.760178441650817, + "grad_norm": 0.1337219476699829, + "learning_rate": 0.0005123172381182874, + "loss": 2.1978, + "step": 455330 + }, + { + "epoch": 1.7602170988542005, + "grad_norm": 0.11427665501832962, + "learning_rate": 0.0005121828069282169, + "loss": 2.1928, + "step": 455340 + }, + { + "epoch": 1.7602557560575838, + "grad_norm": 0.12122835963964462, + "learning_rate": 0.0005120483878835307, + "loss": 2.2033, + "step": 455350 + }, + { + "epoch": 1.760294413260967, + "grad_norm": 0.12823830544948578, + "learning_rate": 0.0005119139809809381, + "loss": 2.205, + "step": 455360 + }, + { + "epoch": 1.7603330704643503, + "grad_norm": 0.12689967453479767, + "learning_rate": 0.000511779586217149, + "loss": 2.2061, + "step": 455370 + }, + { + "epoch": 1.7603717276677338, + "grad_norm": 0.1314159780740738, + "learning_rate": 0.0005116452035888755, + "loss": 2.2063, + "step": 455380 + }, + { + "epoch": 1.760410384871117, + "grad_norm": 0.12121304869651794, + "learning_rate": 0.0005115108330928305, + "loss": 2.2017, + "step": 455390 + }, + { + "epoch": 1.7604490420745003, + "grad_norm": 0.1249544620513916, + "learning_rate": 0.0005113764747257284, + "loss": 2.1912, + "step": 455400 + }, + { + "epoch": 1.7604876992778835, + "grad_norm": 0.13628491759300232, + "learning_rate": 0.0005112421284842857, + "loss": 2.2247, + "step": 455410 + }, + { + "epoch": 1.7605263564812668, + "grad_norm": 0.1296730488538742, + "learning_rate": 0.00051110779436522, + "loss": 2.2115, + "step": 455420 + }, + { + "epoch": 1.76056501368465, + "grad_norm": 0.12934458255767822, + "learning_rate": 0.0005109734723652504, + "loss": 2.1927, + "step": 455430 + }, + { + "epoch": 1.7606036708880333, + "grad_norm": 0.13591471314430237, + "learning_rate": 0.0005108391624810973, + "loss": 2.2021, + "step": 455440 + }, + { + "epoch": 1.7606423280914165, + "grad_norm": 0.132755845785141, + "learning_rate": 0.0005107048647094828, + "loss": 2.2093, + "step": 455450 + }, + { + "epoch": 1.7606809852947998, + "grad_norm": 0.13273707032203674, + "learning_rate": 0.0005105705790471307, + "loss": 2.2104, + "step": 455460 + }, + { + "epoch": 1.760719642498183, + "grad_norm": 0.12009256333112717, + "learning_rate": 0.0005104363054907655, + "loss": 2.2, + "step": 455470 + }, + { + "epoch": 1.7607582997015663, + "grad_norm": 0.12511485815048218, + "learning_rate": 0.0005103020440371143, + "loss": 2.1958, + "step": 455480 + }, + { + "epoch": 1.7607969569049495, + "grad_norm": 0.14407357573509216, + "learning_rate": 0.0005101677946829046, + "loss": 2.2029, + "step": 455490 + }, + { + "epoch": 1.7608356141083328, + "grad_norm": 0.12807932496070862, + "learning_rate": 0.000510033557424866, + "loss": 2.1967, + "step": 455500 + }, + { + "epoch": 1.7608742713117163, + "grad_norm": 0.1200571283698082, + "learning_rate": 0.0005098993322597294, + "loss": 2.2116, + "step": 455510 + }, + { + "epoch": 1.7609129285150995, + "grad_norm": 0.1264566034078598, + "learning_rate": 0.0005097651191842274, + "loss": 2.1993, + "step": 455520 + }, + { + "epoch": 1.7609515857184828, + "grad_norm": 0.17501239478588104, + "learning_rate": 0.0005096309181950936, + "loss": 2.2031, + "step": 455530 + }, + { + "epoch": 1.760990242921866, + "grad_norm": 0.12370411306619644, + "learning_rate": 0.0005094967292890633, + "loss": 2.2032, + "step": 455540 + }, + { + "epoch": 1.7610289001252495, + "grad_norm": 0.12493716180324554, + "learning_rate": 0.0005093625524628734, + "loss": 2.196, + "step": 455550 + }, + { + "epoch": 1.7610675573286327, + "grad_norm": 0.12528210878372192, + "learning_rate": 0.0005092283877132622, + "loss": 2.202, + "step": 455560 + }, + { + "epoch": 1.761106214532016, + "grad_norm": 0.12978526949882507, + "learning_rate": 0.0005090942350369691, + "loss": 2.1923, + "step": 455570 + }, + { + "epoch": 1.7611448717353992, + "grad_norm": 0.12521392107009888, + "learning_rate": 0.0005089600944307359, + "loss": 2.2155, + "step": 455580 + }, + { + "epoch": 1.7611835289387825, + "grad_norm": 0.12959663569927216, + "learning_rate": 0.0005088259658913048, + "loss": 2.1988, + "step": 455590 + }, + { + "epoch": 1.7612221861421657, + "grad_norm": 0.1173701286315918, + "learning_rate": 0.00050869184941542, + "loss": 2.2132, + "step": 455600 + }, + { + "epoch": 1.761260843345549, + "grad_norm": 0.13058489561080933, + "learning_rate": 0.0005085577449998273, + "loss": 2.2075, + "step": 455610 + }, + { + "epoch": 1.7612995005489323, + "grad_norm": 0.12429569661617279, + "learning_rate": 0.0005084236526412735, + "loss": 2.1938, + "step": 455620 + }, + { + "epoch": 1.7613381577523155, + "grad_norm": 0.1280798465013504, + "learning_rate": 0.0005082895723365073, + "loss": 2.1824, + "step": 455630 + }, + { + "epoch": 1.7613768149556988, + "grad_norm": 0.11434909701347351, + "learning_rate": 0.0005081555040822788, + "loss": 2.2017, + "step": 455640 + }, + { + "epoch": 1.761415472159082, + "grad_norm": 0.13100238144397736, + "learning_rate": 0.0005080214478753389, + "loss": 2.1949, + "step": 455650 + }, + { + "epoch": 1.7614541293624653, + "grad_norm": 0.12328257411718369, + "learning_rate": 0.000507887403712441, + "loss": 2.1991, + "step": 455660 + }, + { + "epoch": 1.7614927865658487, + "grad_norm": 0.1999645233154297, + "learning_rate": 0.0005077533715903393, + "loss": 2.205, + "step": 455670 + }, + { + "epoch": 1.761531443769232, + "grad_norm": 0.12685048580169678, + "learning_rate": 0.0005076193515057896, + "loss": 2.1929, + "step": 455680 + }, + { + "epoch": 1.7615701009726152, + "grad_norm": 0.11934152245521545, + "learning_rate": 0.0005074853434555493, + "loss": 2.22, + "step": 455690 + }, + { + "epoch": 1.7616087581759985, + "grad_norm": 0.11731681227684021, + "learning_rate": 0.0005073513474363767, + "loss": 2.2034, + "step": 455700 + }, + { + "epoch": 1.7616474153793817, + "grad_norm": 0.12497756630182266, + "learning_rate": 0.0005072173634450326, + "loss": 2.2034, + "step": 455710 + }, + { + "epoch": 1.7616860725827652, + "grad_norm": 0.12380430847406387, + "learning_rate": 0.0005070833914782781, + "loss": 2.1856, + "step": 455720 + }, + { + "epoch": 1.7617247297861485, + "grad_norm": 0.12082111090421677, + "learning_rate": 0.0005069494315328766, + "loss": 2.2045, + "step": 455730 + }, + { + "epoch": 1.7617633869895317, + "grad_norm": 0.12536385655403137, + "learning_rate": 0.0005068154836055927, + "loss": 2.1895, + "step": 455740 + }, + { + "epoch": 1.761802044192915, + "grad_norm": 0.12240228801965714, + "learning_rate": 0.000506681547693192, + "loss": 2.1965, + "step": 455750 + }, + { + "epoch": 1.7618407013962982, + "grad_norm": 0.1273859590291977, + "learning_rate": 0.0005065476237924425, + "loss": 2.2206, + "step": 455760 + }, + { + "epoch": 1.7618793585996815, + "grad_norm": 0.131776824593544, + "learning_rate": 0.0005064137119001127, + "loss": 2.2186, + "step": 455770 + }, + { + "epoch": 1.7619180158030647, + "grad_norm": 0.12719875574111938, + "learning_rate": 0.0005062798120129728, + "loss": 2.2156, + "step": 455780 + }, + { + "epoch": 1.761956673006448, + "grad_norm": 0.13013708591461182, + "learning_rate": 0.0005061459241277951, + "loss": 2.2029, + "step": 455790 + }, + { + "epoch": 1.7619953302098312, + "grad_norm": 0.13835778832435608, + "learning_rate": 0.0005060120482413521, + "loss": 2.1965, + "step": 455800 + }, + { + "epoch": 1.7620339874132145, + "grad_norm": 0.12558209896087646, + "learning_rate": 0.0005058781843504192, + "loss": 2.198, + "step": 455810 + }, + { + "epoch": 1.7620726446165977, + "grad_norm": 0.1262013167142868, + "learning_rate": 0.0005057443324517721, + "loss": 2.2012, + "step": 455820 + }, + { + "epoch": 1.762111301819981, + "grad_norm": 0.12975828349590302, + "learning_rate": 0.0005056104925421888, + "loss": 2.2104, + "step": 455830 + }, + { + "epoch": 1.7621499590233645, + "grad_norm": 0.13392338156700134, + "learning_rate": 0.0005054766646184477, + "loss": 2.1919, + "step": 455840 + }, + { + "epoch": 1.7621886162267477, + "grad_norm": 0.1283750832080841, + "learning_rate": 0.0005053428486773297, + "loss": 2.2028, + "step": 455850 + }, + { + "epoch": 1.762227273430131, + "grad_norm": 0.12651102244853973, + "learning_rate": 0.0005052090447156164, + "loss": 2.2032, + "step": 455860 + }, + { + "epoch": 1.7622659306335142, + "grad_norm": 0.12298119813203812, + "learning_rate": 0.0005050752527300914, + "loss": 2.2079, + "step": 455870 + }, + { + "epoch": 1.7623045878368975, + "grad_norm": 0.13039034605026245, + "learning_rate": 0.0005049414727175394, + "loss": 2.2161, + "step": 455880 + }, + { + "epoch": 1.762343245040281, + "grad_norm": 0.1351780742406845, + "learning_rate": 0.0005048077046747465, + "loss": 2.1956, + "step": 455890 + }, + { + "epoch": 1.7623819022436642, + "grad_norm": 0.13028337061405182, + "learning_rate": 0.0005046739485985004, + "loss": 2.1939, + "step": 455900 + }, + { + "epoch": 1.7624205594470475, + "grad_norm": 0.12082945555448532, + "learning_rate": 0.0005045402044855903, + "loss": 2.1899, + "step": 455910 + }, + { + "epoch": 1.7624592166504307, + "grad_norm": 0.1275576949119568, + "learning_rate": 0.0005044064723328065, + "loss": 2.2041, + "step": 455920 + }, + { + "epoch": 1.762497873853814, + "grad_norm": 0.12402354925870895, + "learning_rate": 0.0005042727521369412, + "loss": 2.1986, + "step": 455930 + }, + { + "epoch": 1.7625365310571972, + "grad_norm": 0.12115433067083359, + "learning_rate": 0.0005041390438947877, + "loss": 2.2154, + "step": 455940 + }, + { + "epoch": 1.7625751882605805, + "grad_norm": 0.11511493474245071, + "learning_rate": 0.0005040053476031408, + "loss": 2.1881, + "step": 455950 + }, + { + "epoch": 1.7626138454639637, + "grad_norm": 0.12377490848302841, + "learning_rate": 0.0005038716632587965, + "loss": 2.211, + "step": 455960 + }, + { + "epoch": 1.762652502667347, + "grad_norm": 0.1281753033399582, + "learning_rate": 0.0005037379908585528, + "loss": 2.2096, + "step": 455970 + }, + { + "epoch": 1.7626911598707302, + "grad_norm": 0.13222567737102509, + "learning_rate": 0.000503604330399209, + "loss": 2.2073, + "step": 455980 + }, + { + "epoch": 1.7627298170741135, + "grad_norm": 0.12830236554145813, + "learning_rate": 0.0005034706818775652, + "loss": 2.1993, + "step": 455990 + }, + { + "epoch": 1.7627684742774967, + "grad_norm": 0.12811852991580963, + "learning_rate": 0.0005033370452904234, + "loss": 2.2049, + "step": 456000 + }, + { + "epoch": 1.7628071314808802, + "grad_norm": 0.1232878714799881, + "learning_rate": 0.0005032034206345872, + "loss": 2.2021, + "step": 456010 + }, + { + "epoch": 1.7628457886842634, + "grad_norm": 0.14702750742435455, + "learning_rate": 0.0005030698079068616, + "loss": 2.1981, + "step": 456020 + }, + { + "epoch": 1.7628844458876467, + "grad_norm": 0.12906567752361298, + "learning_rate": 0.0005029362071040526, + "loss": 2.1987, + "step": 456030 + }, + { + "epoch": 1.76292310309103, + "grad_norm": 0.5849501490592957, + "learning_rate": 0.0005028026182229679, + "loss": 2.1847, + "step": 456040 + }, + { + "epoch": 1.7629617602944134, + "grad_norm": 0.13289207220077515, + "learning_rate": 0.0005026690412604167, + "loss": 2.2058, + "step": 456050 + }, + { + "epoch": 1.7630004174977967, + "grad_norm": 0.1222372055053711, + "learning_rate": 0.0005025354762132093, + "loss": 2.1919, + "step": 456060 + }, + { + "epoch": 1.76303907470118, + "grad_norm": 0.12752196192741394, + "learning_rate": 0.0005024019230781578, + "loss": 2.217, + "step": 456070 + }, + { + "epoch": 1.7630777319045632, + "grad_norm": 0.12730394303798676, + "learning_rate": 0.0005022683818520757, + "loss": 2.2028, + "step": 456080 + }, + { + "epoch": 1.7631163891079464, + "grad_norm": 0.12045823037624359, + "learning_rate": 0.0005021348525317775, + "loss": 2.2016, + "step": 456090 + }, + { + "epoch": 1.7631550463113297, + "grad_norm": 0.12794707715511322, + "learning_rate": 0.0005020013351140796, + "loss": 2.1824, + "step": 456100 + }, + { + "epoch": 1.763193703514713, + "grad_norm": 0.11931639164686203, + "learning_rate": 0.0005018678295957996, + "loss": 2.1882, + "step": 456110 + }, + { + "epoch": 1.7632323607180962, + "grad_norm": 0.12232300639152527, + "learning_rate": 0.0005017343359737565, + "loss": 2.216, + "step": 456120 + }, + { + "epoch": 1.7632710179214794, + "grad_norm": 0.12490812689065933, + "learning_rate": 0.000501600854244771, + "loss": 2.1968, + "step": 456130 + }, + { + "epoch": 1.7633096751248627, + "grad_norm": 0.12569020688533783, + "learning_rate": 0.0005014673844056648, + "loss": 2.1944, + "step": 456140 + }, + { + "epoch": 1.763348332328246, + "grad_norm": 0.13292300701141357, + "learning_rate": 0.0005013339264532609, + "loss": 2.2009, + "step": 456150 + }, + { + "epoch": 1.7633869895316292, + "grad_norm": 0.12760011851787567, + "learning_rate": 0.0005012004803843843, + "loss": 2.2126, + "step": 456160 + }, + { + "epoch": 1.7634256467350125, + "grad_norm": 0.12860462069511414, + "learning_rate": 0.0005010670461958613, + "loss": 2.2051, + "step": 456170 + }, + { + "epoch": 1.763464303938396, + "grad_norm": 0.11949852854013443, + "learning_rate": 0.0005009336238845192, + "loss": 2.1977, + "step": 456180 + }, + { + "epoch": 1.7635029611417792, + "grad_norm": 0.13162904977798462, + "learning_rate": 0.0005008002134471871, + "loss": 2.198, + "step": 456190 + }, + { + "epoch": 1.7635416183451624, + "grad_norm": 0.1294323205947876, + "learning_rate": 0.000500666814880695, + "loss": 2.203, + "step": 456200 + }, + { + "epoch": 1.7635802755485457, + "grad_norm": 0.13341623544692993, + "learning_rate": 0.000500533428181875, + "loss": 2.1982, + "step": 456210 + }, + { + "epoch": 1.7636189327519292, + "grad_norm": 0.1469571441411972, + "learning_rate": 0.0005004000533475603, + "loss": 2.2157, + "step": 456220 + }, + { + "epoch": 1.7636575899553124, + "grad_norm": 0.14060497283935547, + "learning_rate": 0.0005002666903745854, + "loss": 2.1957, + "step": 456230 + }, + { + "epoch": 1.7636962471586957, + "grad_norm": 0.13437320291996002, + "learning_rate": 0.000500133339259786, + "loss": 2.1918, + "step": 456240 + }, + { + "epoch": 1.763734904362079, + "grad_norm": 0.11874886602163315, + "learning_rate": 0.0005, + "loss": 2.201, + "step": 456250 + }, + { + "epoch": 1.7637735615654622, + "grad_norm": 0.13021759688854218, + "learning_rate": 0.000499866672592066, + "loss": 2.2052, + "step": 456260 + }, + { + "epoch": 1.7638122187688454, + "grad_norm": 0.13554084300994873, + "learning_rate": 0.0004997333570328239, + "loss": 2.1993, + "step": 456270 + }, + { + "epoch": 1.7638508759722287, + "grad_norm": 0.12686790525913239, + "learning_rate": 0.0004996000533191158, + "loss": 2.2008, + "step": 456280 + }, + { + "epoch": 1.763889533175612, + "grad_norm": 0.14258117973804474, + "learning_rate": 0.0004994667614477846, + "loss": 2.1982, + "step": 456290 + }, + { + "epoch": 1.7639281903789952, + "grad_norm": 0.1296672224998474, + "learning_rate": 0.0004993334814156745, + "loss": 2.1993, + "step": 456300 + }, + { + "epoch": 1.7639668475823784, + "grad_norm": 0.14639268815517426, + "learning_rate": 0.0004992002132196314, + "loss": 2.1921, + "step": 456310 + }, + { + "epoch": 1.7640055047857617, + "grad_norm": 0.15819869935512543, + "learning_rate": 0.0004990669568565025, + "loss": 2.1994, + "step": 456320 + }, + { + "epoch": 1.764044161989145, + "grad_norm": 0.1342620998620987, + "learning_rate": 0.0004989337123231366, + "loss": 2.188, + "step": 456330 + }, + { + "epoch": 1.7640828191925282, + "grad_norm": 0.11846864968538284, + "learning_rate": 0.0004988004796163836, + "loss": 2.184, + "step": 456340 + }, + { + "epoch": 1.7641214763959117, + "grad_norm": 0.12967292964458466, + "learning_rate": 0.0004986672587330949, + "loss": 2.2067, + "step": 456350 + }, + { + "epoch": 1.764160133599295, + "grad_norm": 0.12366357445716858, + "learning_rate": 0.0004985340496701232, + "loss": 2.1873, + "step": 456360 + }, + { + "epoch": 1.7641987908026782, + "grad_norm": 0.1255825161933899, + "learning_rate": 0.0004984008524243228, + "loss": 2.2074, + "step": 456370 + }, + { + "epoch": 1.7642374480060614, + "grad_norm": 0.1342882215976715, + "learning_rate": 0.0004982676669925497, + "loss": 2.1949, + "step": 456380 + }, + { + "epoch": 1.764276105209445, + "grad_norm": 0.14513757824897766, + "learning_rate": 0.0004981344933716603, + "loss": 2.1977, + "step": 456390 + }, + { + "epoch": 1.7643147624128281, + "grad_norm": 0.1229807436466217, + "learning_rate": 0.000498001331558513, + "loss": 2.1863, + "step": 456400 + }, + { + "epoch": 1.7643534196162114, + "grad_norm": 0.1304650902748108, + "learning_rate": 0.0004978681815499679, + "loss": 2.1815, + "step": 456410 + }, + { + "epoch": 1.7643920768195946, + "grad_norm": 0.12147001177072525, + "learning_rate": 0.0004977350433428863, + "loss": 2.2013, + "step": 456420 + }, + { + "epoch": 1.764430734022978, + "grad_norm": 0.12276685982942581, + "learning_rate": 0.0004976019169341302, + "loss": 2.2108, + "step": 456430 + }, + { + "epoch": 1.7644693912263612, + "grad_norm": 0.13635388016700745, + "learning_rate": 0.000497468802320564, + "loss": 2.1976, + "step": 456440 + }, + { + "epoch": 1.7645080484297444, + "grad_norm": 0.13049404323101044, + "learning_rate": 0.0004973356994990532, + "loss": 2.195, + "step": 456450 + }, + { + "epoch": 1.7645467056331277, + "grad_norm": 0.12726396322250366, + "learning_rate": 0.000497202608466464, + "loss": 2.2044, + "step": 456460 + }, + { + "epoch": 1.764585362836511, + "grad_norm": 0.1332763433456421, + "learning_rate": 0.0004970695292196648, + "loss": 2.1888, + "step": 456470 + }, + { + "epoch": 1.7646240200398942, + "grad_norm": 0.28297457098960876, + "learning_rate": 0.0004969364617555252, + "loss": 2.1922, + "step": 456480 + }, + { + "epoch": 1.7646626772432774, + "grad_norm": 0.1367010772228241, + "learning_rate": 0.0004968034060709159, + "loss": 2.1988, + "step": 456490 + }, + { + "epoch": 1.7647013344466607, + "grad_norm": 0.12271040678024292, + "learning_rate": 0.0004966703621627094, + "loss": 2.1955, + "step": 456500 + }, + { + "epoch": 1.764739991650044, + "grad_norm": 0.14147435128688812, + "learning_rate": 0.0004965373300277789, + "loss": 2.1992, + "step": 456510 + }, + { + "epoch": 1.7647786488534274, + "grad_norm": 0.1327211707830429, + "learning_rate": 0.0004964043096629998, + "loss": 2.2041, + "step": 456520 + }, + { + "epoch": 1.7648173060568106, + "grad_norm": 0.12535947561264038, + "learning_rate": 0.0004962713010652486, + "loss": 2.1888, + "step": 456530 + }, + { + "epoch": 1.764855963260194, + "grad_norm": 0.13628923892974854, + "learning_rate": 0.0004961383042314031, + "loss": 2.2029, + "step": 456540 + }, + { + "epoch": 1.7648946204635771, + "grad_norm": 0.12428577244281769, + "learning_rate": 0.0004960053191583422, + "loss": 2.1966, + "step": 456550 + }, + { + "epoch": 1.7649332776669606, + "grad_norm": 0.1212974563241005, + "learning_rate": 0.0004958723458429468, + "loss": 2.1948, + "step": 456560 + }, + { + "epoch": 1.7649719348703439, + "grad_norm": 0.12550272047519684, + "learning_rate": 0.0004957393842820986, + "loss": 2.2202, + "step": 456570 + }, + { + "epoch": 1.7650105920737271, + "grad_norm": 0.12495537847280502, + "learning_rate": 0.0004956064344726809, + "loss": 2.2067, + "step": 456580 + }, + { + "epoch": 1.7650492492771104, + "grad_norm": 0.12538939714431763, + "learning_rate": 0.0004954734964115788, + "loss": 2.2029, + "step": 456590 + }, + { + "epoch": 1.7650879064804936, + "grad_norm": 0.11974082887172699, + "learning_rate": 0.000495340570095678, + "loss": 2.1994, + "step": 456600 + }, + { + "epoch": 1.7651265636838769, + "grad_norm": 0.12425250560045242, + "learning_rate": 0.0004952076555218656, + "loss": 2.205, + "step": 456610 + }, + { + "epoch": 1.7651652208872601, + "grad_norm": 0.13043095171451569, + "learning_rate": 0.0004950747526870312, + "loss": 2.188, + "step": 456620 + }, + { + "epoch": 1.7652038780906434, + "grad_norm": 0.127410888671875, + "learning_rate": 0.0004949418615880647, + "loss": 2.1993, + "step": 456630 + }, + { + "epoch": 1.7652425352940266, + "grad_norm": 0.1242605522274971, + "learning_rate": 0.0004948089822218576, + "loss": 2.1888, + "step": 456640 + }, + { + "epoch": 1.76528119249741, + "grad_norm": 0.125991091132164, + "learning_rate": 0.0004946761145853029, + "loss": 2.2001, + "step": 456650 + }, + { + "epoch": 1.7653198497007931, + "grad_norm": 0.14134454727172852, + "learning_rate": 0.0004945432586752949, + "loss": 2.1943, + "step": 456660 + }, + { + "epoch": 1.7653585069041764, + "grad_norm": 0.13556590676307678, + "learning_rate": 0.0004944104144887294, + "loss": 2.1929, + "step": 456670 + }, + { + "epoch": 1.7653971641075596, + "grad_norm": 0.12267480045557022, + "learning_rate": 0.0004942775820225032, + "loss": 2.2012, + "step": 456680 + }, + { + "epoch": 1.7654358213109431, + "grad_norm": 0.12042094767093658, + "learning_rate": 0.0004941447612735148, + "loss": 2.1937, + "step": 456690 + }, + { + "epoch": 1.7654744785143264, + "grad_norm": 0.13384388387203217, + "learning_rate": 0.0004940119522386642, + "loss": 2.1987, + "step": 456700 + }, + { + "epoch": 1.7655131357177096, + "grad_norm": 0.12269002944231033, + "learning_rate": 0.000493879154914852, + "loss": 2.2075, + "step": 456710 + }, + { + "epoch": 1.7655517929210929, + "grad_norm": 0.12442340701818466, + "learning_rate": 0.0004937463692989815, + "loss": 2.2074, + "step": 456720 + }, + { + "epoch": 1.7655904501244764, + "grad_norm": 0.1305408775806427, + "learning_rate": 0.0004936135953879561, + "loss": 2.1942, + "step": 456730 + }, + { + "epoch": 1.7656291073278596, + "grad_norm": 0.12869402766227722, + "learning_rate": 0.0004934808331786813, + "loss": 2.1924, + "step": 456740 + }, + { + "epoch": 1.7656677645312429, + "grad_norm": 0.13615325093269348, + "learning_rate": 0.0004933480826680636, + "loss": 2.2168, + "step": 456750 + }, + { + "epoch": 1.765706421734626, + "grad_norm": 0.13996773958206177, + "learning_rate": 0.0004932153438530109, + "loss": 2.1946, + "step": 456760 + }, + { + "epoch": 1.7657450789380094, + "grad_norm": 0.124326691031456, + "learning_rate": 0.0004930826167304327, + "loss": 2.195, + "step": 456770 + }, + { + "epoch": 1.7657837361413926, + "grad_norm": 0.12879958748817444, + "learning_rate": 0.0004929499012972397, + "loss": 2.2016, + "step": 456780 + }, + { + "epoch": 1.7658223933447759, + "grad_norm": 0.1394444704055786, + "learning_rate": 0.0004928171975503437, + "loss": 2.2002, + "step": 456790 + }, + { + "epoch": 1.7658610505481591, + "grad_norm": 0.1241043284535408, + "learning_rate": 0.0004926845054866582, + "loss": 2.2017, + "step": 456800 + }, + { + "epoch": 1.7658997077515424, + "grad_norm": 0.12370163947343826, + "learning_rate": 0.0004925518251030983, + "loss": 2.1875, + "step": 456810 + }, + { + "epoch": 1.7659383649549256, + "grad_norm": 0.13047485053539276, + "learning_rate": 0.0004924191563965798, + "loss": 2.2002, + "step": 456820 + }, + { + "epoch": 1.7659770221583089, + "grad_norm": 0.13031162321567535, + "learning_rate": 0.0004922864993640204, + "loss": 2.19, + "step": 456830 + }, + { + "epoch": 1.7660156793616921, + "grad_norm": 0.14417153596878052, + "learning_rate": 0.000492153854002339, + "loss": 2.2101, + "step": 456840 + }, + { + "epoch": 1.7660543365650754, + "grad_norm": 0.12199871242046356, + "learning_rate": 0.0004920212203084553, + "loss": 2.2007, + "step": 456850 + }, + { + "epoch": 1.7660929937684589, + "grad_norm": 0.12932102382183075, + "learning_rate": 0.0004918885982792916, + "loss": 2.1939, + "step": 456860 + }, + { + "epoch": 1.766131650971842, + "grad_norm": 0.12995269894599915, + "learning_rate": 0.0004917559879117704, + "loss": 2.1926, + "step": 456870 + }, + { + "epoch": 1.7661703081752254, + "grad_norm": 0.2126019448041916, + "learning_rate": 0.0004916233892028159, + "loss": 2.1925, + "step": 456880 + }, + { + "epoch": 1.7662089653786086, + "grad_norm": 0.12991484999656677, + "learning_rate": 0.0004914908021493538, + "loss": 2.1984, + "step": 456890 + }, + { + "epoch": 1.766247622581992, + "grad_norm": 0.13209208846092224, + "learning_rate": 0.0004913582267483112, + "loss": 2.1932, + "step": 456900 + }, + { + "epoch": 1.7662862797853753, + "grad_norm": 0.12427953630685806, + "learning_rate": 0.0004912256629966163, + "loss": 2.1839, + "step": 456910 + }, + { + "epoch": 1.7663249369887586, + "grad_norm": 0.1290198415517807, + "learning_rate": 0.0004910931108911986, + "loss": 2.2005, + "step": 456920 + }, + { + "epoch": 1.7663635941921418, + "grad_norm": 0.12674565613269806, + "learning_rate": 0.0004909605704289897, + "loss": 2.1965, + "step": 456930 + }, + { + "epoch": 1.766402251395525, + "grad_norm": 0.13432876765727997, + "learning_rate": 0.0004908280416069215, + "loss": 2.2038, + "step": 456940 + }, + { + "epoch": 1.7664409085989083, + "grad_norm": 0.12446749210357666, + "learning_rate": 0.0004906955244219275, + "loss": 2.2015, + "step": 456950 + }, + { + "epoch": 1.7664795658022916, + "grad_norm": 0.13796383142471313, + "learning_rate": 0.0004905630188709434, + "loss": 2.1983, + "step": 456960 + }, + { + "epoch": 1.7665182230056748, + "grad_norm": 0.14042441546916962, + "learning_rate": 0.0004904305249509052, + "loss": 2.2075, + "step": 456970 + }, + { + "epoch": 1.766556880209058, + "grad_norm": 0.12553489208221436, + "learning_rate": 0.0004902980426587505, + "loss": 2.1971, + "step": 456980 + }, + { + "epoch": 1.7665955374124414, + "grad_norm": 0.15119601786136627, + "learning_rate": 0.0004901655719914186, + "loss": 2.1954, + "step": 456990 + }, + { + "epoch": 1.7666341946158246, + "grad_norm": 0.14849050343036652, + "learning_rate": 0.0004900331129458502, + "loss": 2.2048, + "step": 457000 + }, + { + "epoch": 1.7666728518192079, + "grad_norm": 0.13672266900539398, + "learning_rate": 0.0004899006655189864, + "loss": 2.2095, + "step": 457010 + }, + { + "epoch": 1.766711509022591, + "grad_norm": 0.12951894104480743, + "learning_rate": 0.000489768229707771, + "loss": 2.1923, + "step": 457020 + }, + { + "epoch": 1.7667501662259746, + "grad_norm": 0.13723427057266235, + "learning_rate": 0.0004896358055091481, + "loss": 2.1984, + "step": 457030 + }, + { + "epoch": 1.7667888234293578, + "grad_norm": 0.12522564828395844, + "learning_rate": 0.0004895033929200634, + "loss": 2.1975, + "step": 457040 + }, + { + "epoch": 1.766827480632741, + "grad_norm": 0.133930966258049, + "learning_rate": 0.0004893709919374645, + "loss": 2.1949, + "step": 457050 + }, + { + "epoch": 1.7668661378361243, + "grad_norm": 0.1456451416015625, + "learning_rate": 0.0004892386025582993, + "loss": 2.1979, + "step": 457060 + }, + { + "epoch": 1.7669047950395078, + "grad_norm": 0.13085532188415527, + "learning_rate": 0.0004891062247795181, + "loss": 2.2061, + "step": 457070 + }, + { + "epoch": 1.766943452242891, + "grad_norm": 0.13043569028377533, + "learning_rate": 0.0004889738585980718, + "loss": 2.1919, + "step": 457080 + }, + { + "epoch": 1.7669821094462743, + "grad_norm": 0.1371423453092575, + "learning_rate": 0.0004888415040109129, + "loss": 2.21, + "step": 457090 + }, + { + "epoch": 1.7670207666496576, + "grad_norm": 0.12515562772750854, + "learning_rate": 0.0004887091610149952, + "loss": 2.1925, + "step": 457100 + }, + { + "epoch": 1.7670594238530408, + "grad_norm": 0.16014552116394043, + "learning_rate": 0.0004885768296072737, + "loss": 2.2099, + "step": 457110 + }, + { + "epoch": 1.767098081056424, + "grad_norm": 0.13860388100147247, + "learning_rate": 0.0004884445097847052, + "loss": 2.1975, + "step": 457120 + }, + { + "epoch": 1.7671367382598073, + "grad_norm": 0.12750791013240814, + "learning_rate": 0.0004883122015442474, + "loss": 2.1925, + "step": 457130 + }, + { + "epoch": 1.7671753954631906, + "grad_norm": 0.12645824253559113, + "learning_rate": 0.00048817990488285944, + "loss": 2.1883, + "step": 457140 + }, + { + "epoch": 1.7672140526665738, + "grad_norm": 0.12171449512243271, + "learning_rate": 0.00048804761979750165, + "loss": 2.2046, + "step": 457150 + }, + { + "epoch": 1.767252709869957, + "grad_norm": 0.13183607161045074, + "learning_rate": 0.000487915346285136, + "loss": 2.2009, + "step": 457160 + }, + { + "epoch": 1.7672913670733403, + "grad_norm": 0.1332433670759201, + "learning_rate": 0.00048778308434272577, + "loss": 2.1939, + "step": 457170 + }, + { + "epoch": 1.7673300242767236, + "grad_norm": 0.12962956726551056, + "learning_rate": 0.00048765083396723496, + "loss": 2.2094, + "step": 457180 + }, + { + "epoch": 1.7673686814801068, + "grad_norm": 0.1359495371580124, + "learning_rate": 0.0004875185951556298, + "loss": 2.1924, + "step": 457190 + }, + { + "epoch": 1.7674073386834903, + "grad_norm": 0.13195596635341644, + "learning_rate": 0.0004873863679048771, + "loss": 2.1995, + "step": 457200 + }, + { + "epoch": 1.7674459958868736, + "grad_norm": 0.13214111328125, + "learning_rate": 0.00048725415221194537, + "loss": 2.1957, + "step": 457210 + }, + { + "epoch": 1.7674846530902568, + "grad_norm": 0.13750267028808594, + "learning_rate": 0.0004871219480738045, + "loss": 2.1915, + "step": 457220 + }, + { + "epoch": 1.76752331029364, + "grad_norm": 0.13220281898975372, + "learning_rate": 0.0004869897554874256, + "loss": 2.1841, + "step": 457230 + }, + { + "epoch": 1.7675619674970235, + "grad_norm": 0.12308496981859207, + "learning_rate": 0.000486857574449781, + "loss": 2.1852, + "step": 457240 + }, + { + "epoch": 1.7676006247004068, + "grad_norm": 0.11604126542806625, + "learning_rate": 0.00048672540495784447, + "loss": 2.1906, + "step": 457250 + }, + { + "epoch": 1.76763928190379, + "grad_norm": 0.12625819444656372, + "learning_rate": 0.000486593247008591, + "loss": 2.1857, + "step": 457260 + }, + { + "epoch": 1.7676779391071733, + "grad_norm": 0.13055488467216492, + "learning_rate": 0.00048646110059899695, + "loss": 2.1866, + "step": 457270 + }, + { + "epoch": 1.7677165963105566, + "grad_norm": 0.12539438903331757, + "learning_rate": 0.0004863289657260399, + "loss": 2.2114, + "step": 457280 + }, + { + "epoch": 1.7677552535139398, + "grad_norm": 0.12512290477752686, + "learning_rate": 0.00048619684238669934, + "loss": 2.1954, + "step": 457290 + }, + { + "epoch": 1.767793910717323, + "grad_norm": 0.14459888637065887, + "learning_rate": 0.0004860647305779551, + "loss": 2.2088, + "step": 457300 + }, + { + "epoch": 1.7678325679207063, + "grad_norm": 0.13872483372688293, + "learning_rate": 0.0004859326302967888, + "loss": 2.179, + "step": 457310 + }, + { + "epoch": 1.7678712251240896, + "grad_norm": 0.13036368787288666, + "learning_rate": 0.00048580054154018405, + "loss": 2.1955, + "step": 457320 + }, + { + "epoch": 1.7679098823274728, + "grad_norm": 0.128155916929245, + "learning_rate": 0.0004856684643051246, + "loss": 2.2028, + "step": 457330 + }, + { + "epoch": 1.767948539530856, + "grad_norm": 0.13497613370418549, + "learning_rate": 0.0004855363985885959, + "loss": 2.1886, + "step": 457340 + }, + { + "epoch": 1.7679871967342393, + "grad_norm": 0.12754927575588226, + "learning_rate": 0.0004854043443875853, + "loss": 2.1898, + "step": 457350 + }, + { + "epoch": 1.7680258539376226, + "grad_norm": 0.13196970522403717, + "learning_rate": 0.000485272301699081, + "loss": 2.2183, + "step": 457360 + }, + { + "epoch": 1.768064511141006, + "grad_norm": 0.1224970743060112, + "learning_rate": 0.0004851402705200723, + "loss": 2.199, + "step": 457370 + }, + { + "epoch": 1.7681031683443893, + "grad_norm": 0.12491404265165329, + "learning_rate": 0.00048500825084754997, + "loss": 2.203, + "step": 457380 + }, + { + "epoch": 1.7681418255477725, + "grad_norm": 0.13020554184913635, + "learning_rate": 0.0004848762426785065, + "loss": 2.1925, + "step": 457390 + }, + { + "epoch": 1.7681804827511558, + "grad_norm": 0.12115182727575302, + "learning_rate": 0.00048474424600993514, + "loss": 2.1961, + "step": 457400 + }, + { + "epoch": 1.7682191399545393, + "grad_norm": 0.12314828485250473, + "learning_rate": 0.0004846122608388306, + "loss": 2.1979, + "step": 457410 + }, + { + "epoch": 1.7682577971579225, + "grad_norm": 0.13972881436347961, + "learning_rate": 0.0004844802871621894, + "loss": 2.1956, + "step": 457420 + }, + { + "epoch": 1.7682964543613058, + "grad_norm": 0.14746752381324768, + "learning_rate": 0.0004843483249770084, + "loss": 2.1894, + "step": 457430 + }, + { + "epoch": 1.768335111564689, + "grad_norm": 0.1263071745634079, + "learning_rate": 0.00048421637428028676, + "loss": 2.2025, + "step": 457440 + }, + { + "epoch": 1.7683737687680723, + "grad_norm": 0.12901432812213898, + "learning_rate": 0.0004840844350690241, + "loss": 2.1954, + "step": 457450 + }, + { + "epoch": 1.7684124259714555, + "grad_norm": 0.1326717585325241, + "learning_rate": 0.00048395250734022176, + "loss": 2.194, + "step": 457460 + }, + { + "epoch": 1.7684510831748388, + "grad_norm": 0.13518276810646057, + "learning_rate": 0.0004838205910908828, + "loss": 2.196, + "step": 457470 + }, + { + "epoch": 1.768489740378222, + "grad_norm": 0.1248217448592186, + "learning_rate": 0.0004836886863180108, + "loss": 2.2143, + "step": 457480 + }, + { + "epoch": 1.7685283975816053, + "grad_norm": 0.13813568651676178, + "learning_rate": 0.0004835567930186109, + "loss": 2.1984, + "step": 457490 + }, + { + "epoch": 1.7685670547849885, + "grad_norm": 0.12963175773620605, + "learning_rate": 0.00048342491118969, + "loss": 2.203, + "step": 457500 + }, + { + "epoch": 1.7686057119883718, + "grad_norm": 0.13578617572784424, + "learning_rate": 0.0004832930408282554, + "loss": 2.2018, + "step": 457510 + }, + { + "epoch": 1.768644369191755, + "grad_norm": 0.1339896023273468, + "learning_rate": 0.00048316118193131666, + "loss": 2.2028, + "step": 457520 + }, + { + "epoch": 1.7686830263951385, + "grad_norm": 0.1255003660917282, + "learning_rate": 0.0004830293344958843, + "loss": 2.2142, + "step": 457530 + }, + { + "epoch": 1.7687216835985218, + "grad_norm": 0.1350245177745819, + "learning_rate": 0.0004828974985189696, + "loss": 2.2086, + "step": 457540 + }, + { + "epoch": 1.768760340801905, + "grad_norm": 0.13166548311710358, + "learning_rate": 0.00048276567399758584, + "loss": 2.187, + "step": 457550 + }, + { + "epoch": 1.7687989980052883, + "grad_norm": 0.12755592167377472, + "learning_rate": 0.0004826338609287473, + "loss": 2.1914, + "step": 457560 + }, + { + "epoch": 1.7688376552086715, + "grad_norm": 0.13070148229599, + "learning_rate": 0.00048250205930946977, + "loss": 2.2107, + "step": 457570 + }, + { + "epoch": 1.768876312412055, + "grad_norm": 0.12268534302711487, + "learning_rate": 0.00048237026913677, + "loss": 2.1851, + "step": 457580 + }, + { + "epoch": 1.7689149696154383, + "grad_norm": 0.12911690771579742, + "learning_rate": 0.0004822384904076662, + "loss": 2.195, + "step": 457590 + }, + { + "epoch": 1.7689536268188215, + "grad_norm": 0.127009779214859, + "learning_rate": 0.00048210672311917803, + "loss": 2.1929, + "step": 457600 + }, + { + "epoch": 1.7689922840222048, + "grad_norm": 0.12624113261699677, + "learning_rate": 0.000481974967268326, + "loss": 2.1953, + "step": 457610 + }, + { + "epoch": 1.769030941225588, + "grad_norm": 0.14546051621437073, + "learning_rate": 0.00048184322285213234, + "loss": 2.186, + "step": 457620 + }, + { + "epoch": 1.7690695984289713, + "grad_norm": 0.12668795883655548, + "learning_rate": 0.0004817114898676207, + "loss": 2.1864, + "step": 457630 + }, + { + "epoch": 1.7691082556323545, + "grad_norm": 0.13258390128612518, + "learning_rate": 0.0004815797683118155, + "loss": 2.208, + "step": 457640 + }, + { + "epoch": 1.7691469128357378, + "grad_norm": 0.13000395894050598, + "learning_rate": 0.00048144805818174267, + "loss": 2.1986, + "step": 457650 + }, + { + "epoch": 1.769185570039121, + "grad_norm": 0.136027991771698, + "learning_rate": 0.00048131635947442963, + "loss": 2.1954, + "step": 457660 + }, + { + "epoch": 1.7692242272425043, + "grad_norm": 0.13348853588104248, + "learning_rate": 0.0004811846721869049, + "loss": 2.2083, + "step": 457670 + }, + { + "epoch": 1.7692628844458875, + "grad_norm": 0.12772707641124725, + "learning_rate": 0.00048105299631619804, + "loss": 2.2012, + "step": 457680 + }, + { + "epoch": 1.7693015416492708, + "grad_norm": 0.1307777315378189, + "learning_rate": 0.0004809213318593406, + "loss": 2.178, + "step": 457690 + }, + { + "epoch": 1.7693401988526543, + "grad_norm": 0.13382311165332794, + "learning_rate": 0.00048078967881336453, + "loss": 2.2047, + "step": 457700 + }, + { + "epoch": 1.7693788560560375, + "grad_norm": 0.13161014020442963, + "learning_rate": 0.000480658037175304, + "loss": 2.1928, + "step": 457710 + }, + { + "epoch": 1.7694175132594208, + "grad_norm": 0.14786985516548157, + "learning_rate": 0.00048052640694219376, + "loss": 2.1747, + "step": 457720 + }, + { + "epoch": 1.769456170462804, + "grad_norm": 0.1263905167579651, + "learning_rate": 0.00048039478811107, + "loss": 2.2018, + "step": 457730 + }, + { + "epoch": 1.7694948276661873, + "grad_norm": 0.1310981661081314, + "learning_rate": 0.00048026318067897035, + "loss": 2.1885, + "step": 457740 + }, + { + "epoch": 1.7695334848695707, + "grad_norm": 0.1322363317012787, + "learning_rate": 0.0004801315846429337, + "loss": 2.1894, + "step": 457750 + }, + { + "epoch": 1.769572142072954, + "grad_norm": 0.12454366683959961, + "learning_rate": 0.00048, + "loss": 2.1919, + "step": 457760 + }, + { + "epoch": 1.7696107992763372, + "grad_norm": 0.12184811383485794, + "learning_rate": 0.00047986842674721086, + "loss": 2.1935, + "step": 457770 + }, + { + "epoch": 1.7696494564797205, + "grad_norm": 0.13372832536697388, + "learning_rate": 0.00047973686488160874, + "loss": 2.1851, + "step": 457780 + }, + { + "epoch": 1.7696881136831037, + "grad_norm": 0.12762166559696198, + "learning_rate": 0.0004796053144002377, + "loss": 2.1971, + "step": 457790 + }, + { + "epoch": 1.769726770886487, + "grad_norm": 0.1250433623790741, + "learning_rate": 0.000479473775300143, + "loss": 2.2086, + "step": 457800 + }, + { + "epoch": 1.7697654280898703, + "grad_norm": 0.12746527791023254, + "learning_rate": 0.00047934224757837106, + "loss": 2.1934, + "step": 457810 + }, + { + "epoch": 1.7698040852932535, + "grad_norm": 0.12750469148159027, + "learning_rate": 0.00047921073123196977, + "loss": 2.1892, + "step": 457820 + }, + { + "epoch": 1.7698427424966368, + "grad_norm": 0.13376954197883606, + "learning_rate": 0.0004790792262579882, + "loss": 2.1917, + "step": 457830 + }, + { + "epoch": 1.76988139970002, + "grad_norm": 0.12152202427387238, + "learning_rate": 0.0004789477326534766, + "loss": 2.191, + "step": 457840 + }, + { + "epoch": 1.7699200569034033, + "grad_norm": 0.1229873076081276, + "learning_rate": 0.0004788162504154865, + "loss": 2.2052, + "step": 457850 + }, + { + "epoch": 1.7699587141067865, + "grad_norm": 0.12747801840305328, + "learning_rate": 0.0004786847795410709, + "loss": 2.1836, + "step": 457860 + }, + { + "epoch": 1.76999737131017, + "grad_norm": 0.13957123458385468, + "learning_rate": 0.000478553320027284, + "loss": 2.1998, + "step": 457870 + }, + { + "epoch": 1.7700360285135532, + "grad_norm": 0.14057600498199463, + "learning_rate": 0.00047842187187118124, + "loss": 2.1965, + "step": 457880 + }, + { + "epoch": 1.7700746857169365, + "grad_norm": 0.12310191988945007, + "learning_rate": 0.0004782904350698194, + "loss": 2.1866, + "step": 457890 + }, + { + "epoch": 1.7701133429203197, + "grad_norm": 0.13113313913345337, + "learning_rate": 0.00047815900962025617, + "loss": 2.2088, + "step": 457900 + }, + { + "epoch": 1.770152000123703, + "grad_norm": 0.12382102012634277, + "learning_rate": 0.000478027595519551, + "loss": 2.1849, + "step": 457910 + }, + { + "epoch": 1.7701906573270865, + "grad_norm": 0.12879589200019836, + "learning_rate": 0.00047789619276476406, + "loss": 2.189, + "step": 457920 + }, + { + "epoch": 1.7702293145304697, + "grad_norm": 0.1366998553276062, + "learning_rate": 0.0004777648013529579, + "loss": 2.1999, + "step": 457930 + }, + { + "epoch": 1.770267971733853, + "grad_norm": 0.13403339684009552, + "learning_rate": 0.0004776334212811948, + "loss": 2.2097, + "step": 457940 + }, + { + "epoch": 1.7703066289372362, + "grad_norm": 0.13324300944805145, + "learning_rate": 0.0004775020525465394, + "loss": 2.193, + "step": 457950 + }, + { + "epoch": 1.7703452861406195, + "grad_norm": 0.13828282058238983, + "learning_rate": 0.0004773706951460575, + "loss": 2.1866, + "step": 457960 + }, + { + "epoch": 1.7703839433440027, + "grad_norm": 0.14581134915351868, + "learning_rate": 0.00047723934907681565, + "loss": 2.1945, + "step": 457970 + }, + { + "epoch": 1.770422600547386, + "grad_norm": 0.1256742626428604, + "learning_rate": 0.00047710801433588216, + "loss": 2.2081, + "step": 457980 + }, + { + "epoch": 1.7704612577507692, + "grad_norm": 0.1261265128850937, + "learning_rate": 0.00047697669092032616, + "loss": 2.1937, + "step": 457990 + }, + { + "epoch": 1.7704999149541525, + "grad_norm": 0.13095766305923462, + "learning_rate": 0.00047684537882721845, + "loss": 2.1984, + "step": 458000 + }, + { + "epoch": 1.7705385721575357, + "grad_norm": 0.12314862012863159, + "learning_rate": 0.000476714078053631, + "loss": 2.1993, + "step": 458010 + }, + { + "epoch": 1.770577229360919, + "grad_norm": 0.13177534937858582, + "learning_rate": 0.00047658278859663653, + "loss": 2.1924, + "step": 458020 + }, + { + "epoch": 1.7706158865643022, + "grad_norm": 0.12194767594337463, + "learning_rate": 0.00047645151045331023, + "loss": 2.2062, + "step": 458030 + }, + { + "epoch": 1.7706545437676857, + "grad_norm": 0.12695930898189545, + "learning_rate": 0.00047632024362072724, + "loss": 2.1973, + "step": 458040 + }, + { + "epoch": 1.770693200971069, + "grad_norm": 0.13179419934749603, + "learning_rate": 0.0004761889880959647, + "loss": 2.1966, + "step": 458050 + }, + { + "epoch": 1.7707318581744522, + "grad_norm": 0.14159278571605682, + "learning_rate": 0.0004760577438761009, + "loss": 2.1981, + "step": 458060 + }, + { + "epoch": 1.7707705153778355, + "grad_norm": 0.12899576127529144, + "learning_rate": 0.0004759265109582149, + "loss": 2.1843, + "step": 458070 + }, + { + "epoch": 1.770809172581219, + "grad_norm": 0.12268215417861938, + "learning_rate": 0.000475795289339388, + "loss": 2.195, + "step": 458080 + }, + { + "epoch": 1.7708478297846022, + "grad_norm": 0.11523349583148956, + "learning_rate": 0.0004756640790167017, + "loss": 2.1898, + "step": 458090 + }, + { + "epoch": 1.7708864869879855, + "grad_norm": 0.14718125760555267, + "learning_rate": 0.00047553287998723957, + "loss": 2.1935, + "step": 458100 + }, + { + "epoch": 1.7709251441913687, + "grad_norm": 0.14676856994628906, + "learning_rate": 0.0004754016922480859, + "loss": 2.2079, + "step": 458110 + }, + { + "epoch": 1.770963801394752, + "grad_norm": 0.1454949676990509, + "learning_rate": 0.0004752705157963266, + "loss": 2.2149, + "step": 458120 + }, + { + "epoch": 1.7710024585981352, + "grad_norm": 0.1337183564901352, + "learning_rate": 0.00047513935062904844, + "loss": 2.2025, + "step": 458130 + }, + { + "epoch": 1.7710411158015185, + "grad_norm": 0.13073621690273285, + "learning_rate": 0.00047500819674333995, + "loss": 2.2021, + "step": 458140 + }, + { + "epoch": 1.7710797730049017, + "grad_norm": 0.12698140740394592, + "learning_rate": 0.00047487705413629035, + "loss": 2.1931, + "step": 458150 + }, + { + "epoch": 1.771118430208285, + "grad_norm": 0.1372763216495514, + "learning_rate": 0.0004747459228049904, + "loss": 2.1923, + "step": 458160 + }, + { + "epoch": 1.7711570874116682, + "grad_norm": 0.1382165402173996, + "learning_rate": 0.00047461480274653243, + "loss": 2.1947, + "step": 458170 + }, + { + "epoch": 1.7711957446150515, + "grad_norm": 0.134132981300354, + "learning_rate": 0.0004744836939580095, + "loss": 2.1894, + "step": 458180 + }, + { + "epoch": 1.7712344018184347, + "grad_norm": 0.1326516568660736, + "learning_rate": 0.00047435259643651605, + "loss": 2.1872, + "step": 458190 + }, + { + "epoch": 1.771273059021818, + "grad_norm": 0.12394417822360992, + "learning_rate": 0.00047422151017914804, + "loss": 2.1994, + "step": 458200 + }, + { + "epoch": 1.7713117162252014, + "grad_norm": 0.13924540579319, + "learning_rate": 0.0004740904351830022, + "loss": 2.1929, + "step": 458210 + }, + { + "epoch": 1.7713503734285847, + "grad_norm": 0.1372320055961609, + "learning_rate": 0.0004739593714451766, + "loss": 2.1995, + "step": 458220 + }, + { + "epoch": 1.771389030631968, + "grad_norm": 0.12690575420856476, + "learning_rate": 0.00047382831896277143, + "loss": 2.1852, + "step": 458230 + }, + { + "epoch": 1.7714276878353512, + "grad_norm": 0.12741948664188385, + "learning_rate": 0.00047369727773288696, + "loss": 2.1826, + "step": 458240 + }, + { + "epoch": 1.7714663450387347, + "grad_norm": 0.13204288482666016, + "learning_rate": 0.00047356624775262524, + "loss": 2.1832, + "step": 458250 + }, + { + "epoch": 1.771505002242118, + "grad_norm": 0.13010390102863312, + "learning_rate": 0.0004734352290190893, + "loss": 2.1916, + "step": 458260 + }, + { + "epoch": 1.7715436594455012, + "grad_norm": 0.12276510894298553, + "learning_rate": 0.0004733042215293841, + "loss": 2.1894, + "step": 458270 + }, + { + "epoch": 1.7715823166488844, + "grad_norm": 0.12860916554927826, + "learning_rate": 0.00047317322528061493, + "loss": 2.2065, + "step": 458280 + }, + { + "epoch": 1.7716209738522677, + "grad_norm": 0.143928661942482, + "learning_rate": 0.00047304224026988885, + "loss": 2.1892, + "step": 458290 + }, + { + "epoch": 1.771659631055651, + "grad_norm": 0.1406896710395813, + "learning_rate": 0.00047291126649431404, + "loss": 2.2035, + "step": 458300 + }, + { + "epoch": 1.7716982882590342, + "grad_norm": 0.12735261023044586, + "learning_rate": 0.0004727803039510001, + "loss": 2.2184, + "step": 458310 + }, + { + "epoch": 1.7717369454624174, + "grad_norm": 0.13912999629974365, + "learning_rate": 0.0004726493526370572, + "loss": 2.1947, + "step": 458320 + }, + { + "epoch": 1.7717756026658007, + "grad_norm": 0.1308005154132843, + "learning_rate": 0.00047251841254959806, + "loss": 2.1873, + "step": 458330 + }, + { + "epoch": 1.771814259869184, + "grad_norm": 0.12966060638427734, + "learning_rate": 0.00047238748368573513, + "loss": 2.1975, + "step": 458340 + }, + { + "epoch": 1.7718529170725672, + "grad_norm": 0.13258548080921173, + "learning_rate": 0.00047225656604258325, + "loss": 2.1979, + "step": 458350 + }, + { + "epoch": 1.7718915742759505, + "grad_norm": 0.14349143207073212, + "learning_rate": 0.0004721256596172576, + "loss": 2.1986, + "step": 458360 + }, + { + "epoch": 1.7719302314793337, + "grad_norm": 0.13455979526042938, + "learning_rate": 0.0004719947644068754, + "loss": 2.189, + "step": 458370 + }, + { + "epoch": 1.7719688886827172, + "grad_norm": 0.12838658690452576, + "learning_rate": 0.00047186388040855465, + "loss": 2.1984, + "step": 458380 + }, + { + "epoch": 1.7720075458861004, + "grad_norm": 0.13850080966949463, + "learning_rate": 0.0004717330076194146, + "loss": 2.1958, + "step": 458390 + }, + { + "epoch": 1.7720462030894837, + "grad_norm": 0.12754419445991516, + "learning_rate": 0.000471602146036576, + "loss": 2.191, + "step": 458400 + }, + { + "epoch": 1.772084860292867, + "grad_norm": 0.132229745388031, + "learning_rate": 0.00047147129565716054, + "loss": 2.1912, + "step": 458410 + }, + { + "epoch": 1.7721235174962504, + "grad_norm": 0.12338346987962723, + "learning_rate": 0.0004713404564782908, + "loss": 2.1925, + "step": 458420 + }, + { + "epoch": 1.7721621746996337, + "grad_norm": 0.13899773359298706, + "learning_rate": 0.0004712096284970917, + "loss": 2.2055, + "step": 458430 + }, + { + "epoch": 1.772200831903017, + "grad_norm": 0.12873870134353638, + "learning_rate": 0.00047107881171068856, + "loss": 2.2032, + "step": 458440 + }, + { + "epoch": 1.7722394891064002, + "grad_norm": 0.12777495384216309, + "learning_rate": 0.0004709480061162079, + "loss": 2.2002, + "step": 458450 + }, + { + "epoch": 1.7722781463097834, + "grad_norm": 0.1315607875585556, + "learning_rate": 0.00047081721171077787, + "loss": 2.1856, + "step": 458460 + }, + { + "epoch": 1.7723168035131667, + "grad_norm": 0.12554654479026794, + "learning_rate": 0.0004706864284915275, + "loss": 2.1838, + "step": 458470 + }, + { + "epoch": 1.77235546071655, + "grad_norm": 0.11828701198101044, + "learning_rate": 0.00047055565645558704, + "loss": 2.1797, + "step": 458480 + }, + { + "epoch": 1.7723941179199332, + "grad_norm": 0.13972051441669464, + "learning_rate": 0.0004704248956000885, + "loss": 2.216, + "step": 458490 + }, + { + "epoch": 1.7724327751233164, + "grad_norm": 0.13295525312423706, + "learning_rate": 0.00047029414592216455, + "loss": 2.1843, + "step": 458500 + }, + { + "epoch": 1.7724714323266997, + "grad_norm": 0.1282622516155243, + "learning_rate": 0.0004701634074189494, + "loss": 2.1813, + "step": 458510 + }, + { + "epoch": 1.772510089530083, + "grad_norm": 0.1341368854045868, + "learning_rate": 0.00047003268008757784, + "loss": 2.1947, + "step": 458520 + }, + { + "epoch": 1.7725487467334662, + "grad_norm": 0.1381879597902298, + "learning_rate": 0.0004699019639251869, + "loss": 2.19, + "step": 458530 + }, + { + "epoch": 1.7725874039368494, + "grad_norm": 0.13804525136947632, + "learning_rate": 0.00046977125892891424, + "loss": 2.1871, + "step": 458540 + }, + { + "epoch": 1.772626061140233, + "grad_norm": 0.12451229244470596, + "learning_rate": 0.0004696405650958988, + "loss": 2.1922, + "step": 458550 + }, + { + "epoch": 1.7726647183436162, + "grad_norm": 0.13908781111240387, + "learning_rate": 0.0004695098824232806, + "loss": 2.2021, + "step": 458560 + }, + { + "epoch": 1.7727033755469994, + "grad_norm": 0.1446731686592102, + "learning_rate": 0.00046937921090820157, + "loss": 2.2011, + "step": 458570 + }, + { + "epoch": 1.7727420327503827, + "grad_norm": 0.13333015143871307, + "learning_rate": 0.0004692485505478037, + "loss": 2.1967, + "step": 458580 + }, + { + "epoch": 1.7727806899537661, + "grad_norm": 0.13534389436244965, + "learning_rate": 0.0004691179013392313, + "loss": 2.1932, + "step": 458590 + }, + { + "epoch": 1.7728193471571494, + "grad_norm": 0.1271008998155594, + "learning_rate": 0.00046898726327962903, + "loss": 2.1878, + "step": 458600 + }, + { + "epoch": 1.7728580043605326, + "grad_norm": 0.13144579529762268, + "learning_rate": 0.0004688566363661435, + "loss": 2.1871, + "step": 458610 + }, + { + "epoch": 1.772896661563916, + "grad_norm": 0.11865829676389694, + "learning_rate": 0.0004687260205959221, + "loss": 2.1993, + "step": 458620 + }, + { + "epoch": 1.7729353187672992, + "grad_norm": 0.12198822945356369, + "learning_rate": 0.00046859541596611364, + "loss": 2.191, + "step": 458630 + }, + { + "epoch": 1.7729739759706824, + "grad_norm": 0.14284397661685944, + "learning_rate": 0.0004684648224738681, + "loss": 2.1975, + "step": 458640 + }, + { + "epoch": 1.7730126331740657, + "grad_norm": 0.12675946950912476, + "learning_rate": 0.00046833424011633664, + "loss": 2.1877, + "step": 458650 + }, + { + "epoch": 1.773051290377449, + "grad_norm": 0.14137062430381775, + "learning_rate": 0.00046820366889067146, + "loss": 2.2019, + "step": 458660 + }, + { + "epoch": 1.7730899475808322, + "grad_norm": 0.12649844586849213, + "learning_rate": 0.000468073108794026, + "loss": 2.1905, + "step": 458670 + }, + { + "epoch": 1.7731286047842154, + "grad_norm": 0.1343609243631363, + "learning_rate": 0.0004679425598235554, + "loss": 2.1982, + "step": 458680 + }, + { + "epoch": 1.7731672619875987, + "grad_norm": 0.13129524886608124, + "learning_rate": 0.0004678120219764157, + "loss": 2.2044, + "step": 458690 + }, + { + "epoch": 1.773205919190982, + "grad_norm": 0.1215507909655571, + "learning_rate": 0.0004676814952497637, + "loss": 2.1676, + "step": 458700 + }, + { + "epoch": 1.7732445763943652, + "grad_norm": 0.13051870465278625, + "learning_rate": 0.0004675509796407582, + "loss": 2.193, + "step": 458710 + }, + { + "epoch": 1.7732832335977486, + "grad_norm": 0.12099869549274445, + "learning_rate": 0.0004674204751465585, + "loss": 2.2044, + "step": 458720 + }, + { + "epoch": 1.773321890801132, + "grad_norm": 0.13086313009262085, + "learning_rate": 0.0004672899817643259, + "loss": 2.2017, + "step": 458730 + }, + { + "epoch": 1.7733605480045151, + "grad_norm": 0.13035260140895844, + "learning_rate": 0.0004671594994912223, + "loss": 2.2122, + "step": 458740 + }, + { + "epoch": 1.7733992052078984, + "grad_norm": 0.13473574817180634, + "learning_rate": 0.00046702902832441073, + "loss": 2.1852, + "step": 458750 + }, + { + "epoch": 1.7734378624112819, + "grad_norm": 0.12415792793035507, + "learning_rate": 0.00046689856826105604, + "loss": 2.1954, + "step": 458760 + }, + { + "epoch": 1.7734765196146651, + "grad_norm": 0.13153572380542755, + "learning_rate": 0.0004667681192983235, + "loss": 2.1873, + "step": 458770 + }, + { + "epoch": 1.7735151768180484, + "grad_norm": 0.13177259266376495, + "learning_rate": 0.00046663768143338015, + "loss": 2.2101, + "step": 458780 + }, + { + "epoch": 1.7735538340214316, + "grad_norm": 0.13995657861232758, + "learning_rate": 0.0004665072546633944, + "loss": 2.1976, + "step": 458790 + }, + { + "epoch": 1.7735924912248149, + "grad_norm": 0.13670065999031067, + "learning_rate": 0.00046637683898553497, + "loss": 2.1987, + "step": 458800 + }, + { + "epoch": 1.7736311484281981, + "grad_norm": 0.13310128450393677, + "learning_rate": 0.0004662464343969728, + "loss": 2.2139, + "step": 458810 + }, + { + "epoch": 1.7736698056315814, + "grad_norm": 0.130756676197052, + "learning_rate": 0.00046611604089487926, + "loss": 2.1914, + "step": 458820 + }, + { + "epoch": 1.7737084628349646, + "grad_norm": 0.12607665359973907, + "learning_rate": 0.0004659856584764273, + "loss": 2.1856, + "step": 458830 + }, + { + "epoch": 1.773747120038348, + "grad_norm": 0.12528374791145325, + "learning_rate": 0.0004658552871387913, + "loss": 2.2033, + "step": 458840 + }, + { + "epoch": 1.7737857772417311, + "grad_norm": 0.12461727857589722, + "learning_rate": 0.00046572492687914656, + "loss": 2.1869, + "step": 458850 + }, + { + "epoch": 1.7738244344451144, + "grad_norm": 0.1290518194437027, + "learning_rate": 0.0004655945776946693, + "loss": 2.182, + "step": 458860 + }, + { + "epoch": 1.7738630916484976, + "grad_norm": 0.1273990273475647, + "learning_rate": 0.00046546423958253747, + "loss": 2.2012, + "step": 458870 + }, + { + "epoch": 1.773901748851881, + "grad_norm": 0.1488787829875946, + "learning_rate": 0.00046533391253992986, + "loss": 2.211, + "step": 458880 + }, + { + "epoch": 1.7739404060552644, + "grad_norm": 0.12724000215530396, + "learning_rate": 0.00046520359656402645, + "loss": 2.1899, + "step": 458890 + }, + { + "epoch": 1.7739790632586476, + "grad_norm": 0.12783779203891754, + "learning_rate": 0.0004650732916520086, + "loss": 2.1994, + "step": 458900 + }, + { + "epoch": 1.7740177204620309, + "grad_norm": 0.12095726281404495, + "learning_rate": 0.0004649429978010589, + "loss": 2.1844, + "step": 458910 + }, + { + "epoch": 1.7740563776654141, + "grad_norm": 0.12530633807182312, + "learning_rate": 0.000464812715008361, + "loss": 2.1985, + "step": 458920 + }, + { + "epoch": 1.7740950348687976, + "grad_norm": 0.12608347833156586, + "learning_rate": 0.00046468244327109963, + "loss": 2.1984, + "step": 458930 + }, + { + "epoch": 1.7741336920721809, + "grad_norm": 0.12942135334014893, + "learning_rate": 0.00046455218258646116, + "loss": 2.2085, + "step": 458940 + }, + { + "epoch": 1.774172349275564, + "grad_norm": 0.13217510282993317, + "learning_rate": 0.0004644219329516326, + "loss": 2.1949, + "step": 458950 + }, + { + "epoch": 1.7742110064789474, + "grad_norm": 0.12174969911575317, + "learning_rate": 0.0004642916943638027, + "loss": 2.2047, + "step": 458960 + }, + { + "epoch": 1.7742496636823306, + "grad_norm": 0.1340894103050232, + "learning_rate": 0.0004641614668201608, + "loss": 2.1964, + "step": 458970 + }, + { + "epoch": 1.7742883208857139, + "grad_norm": 0.1508149355649948, + "learning_rate": 0.0004640312503178978, + "loss": 2.1945, + "step": 458980 + }, + { + "epoch": 1.7743269780890971, + "grad_norm": 0.12747126817703247, + "learning_rate": 0.00046390104485420603, + "loss": 2.2081, + "step": 458990 + }, + { + "epoch": 1.7743656352924804, + "grad_norm": 0.13547492027282715, + "learning_rate": 0.0004637708504262783, + "loss": 2.1964, + "step": 459000 + }, + { + "epoch": 1.7744042924958636, + "grad_norm": 0.1237700879573822, + "learning_rate": 0.00046364066703130934, + "loss": 2.1918, + "step": 459010 + }, + { + "epoch": 1.7744429496992469, + "grad_norm": 0.12852251529693604, + "learning_rate": 0.00046351049466649476, + "loss": 2.1965, + "step": 459020 + }, + { + "epoch": 1.7744816069026301, + "grad_norm": 0.12711748480796814, + "learning_rate": 0.00046338033332903093, + "loss": 2.1877, + "step": 459030 + }, + { + "epoch": 1.7745202641060134, + "grad_norm": 0.13563001155853271, + "learning_rate": 0.0004632501830161162, + "loss": 2.1793, + "step": 459040 + }, + { + "epoch": 1.7745589213093966, + "grad_norm": 0.1352197676897049, + "learning_rate": 0.00046312004372494985, + "loss": 2.1924, + "step": 459050 + }, + { + "epoch": 1.77459757851278, + "grad_norm": 0.1381702423095703, + "learning_rate": 0.0004629899154527319, + "loss": 2.1913, + "step": 459060 + }, + { + "epoch": 1.7746362357161634, + "grad_norm": 0.13926814496517181, + "learning_rate": 0.00046285979819666424, + "loss": 2.1881, + "step": 459070 + }, + { + "epoch": 1.7746748929195466, + "grad_norm": 0.1228603944182396, + "learning_rate": 0.0004627296919539492, + "loss": 2.1881, + "step": 459080 + }, + { + "epoch": 1.7747135501229299, + "grad_norm": 0.13640454411506653, + "learning_rate": 0.00046259959672179086, + "loss": 2.2026, + "step": 459090 + }, + { + "epoch": 1.7747522073263133, + "grad_norm": 0.13040600717067719, + "learning_rate": 0.00046246951249739453, + "loss": 2.1888, + "step": 459100 + }, + { + "epoch": 1.7747908645296966, + "grad_norm": 0.1405375450849533, + "learning_rate": 0.0004623394392779661, + "loss": 2.1886, + "step": 459110 + }, + { + "epoch": 1.7748295217330798, + "grad_norm": 0.1282537281513214, + "learning_rate": 0.0004622093770607132, + "loss": 2.1804, + "step": 459120 + }, + { + "epoch": 1.774868178936463, + "grad_norm": 0.136846661567688, + "learning_rate": 0.0004620793258428444, + "loss": 2.1889, + "step": 459130 + }, + { + "epoch": 1.7749068361398463, + "grad_norm": 0.13499949872493744, + "learning_rate": 0.0004619492856215697, + "loss": 2.1955, + "step": 459140 + }, + { + "epoch": 1.7749454933432296, + "grad_norm": 0.13540257513523102, + "learning_rate": 0.0004618192563940997, + "loss": 2.1899, + "step": 459150 + }, + { + "epoch": 1.7749841505466128, + "grad_norm": 0.1366579830646515, + "learning_rate": 0.0004616892381576472, + "loss": 2.211, + "step": 459160 + }, + { + "epoch": 1.775022807749996, + "grad_norm": 0.13207294046878815, + "learning_rate": 0.000461559230909425, + "loss": 2.1789, + "step": 459170 + }, + { + "epoch": 1.7750614649533794, + "grad_norm": 0.13058124482631683, + "learning_rate": 0.0004614292346466478, + "loss": 2.1933, + "step": 459180 + }, + { + "epoch": 1.7751001221567626, + "grad_norm": 0.12599864602088928, + "learning_rate": 0.00046129924936653136, + "loss": 2.1844, + "step": 459190 + }, + { + "epoch": 1.7751387793601459, + "grad_norm": 0.27933165431022644, + "learning_rate": 0.0004611692750662924, + "loss": 2.1865, + "step": 459200 + }, + { + "epoch": 1.775177436563529, + "grad_norm": 0.13544274866580963, + "learning_rate": 0.00046103931174314925, + "loss": 2.19, + "step": 459210 + }, + { + "epoch": 1.7752160937669124, + "grad_norm": 0.13466085493564606, + "learning_rate": 0.0004609093593943208, + "loss": 2.2043, + "step": 459220 + }, + { + "epoch": 1.7752547509702958, + "grad_norm": 0.12241952866315842, + "learning_rate": 0.0004607794180170277, + "loss": 2.1852, + "step": 459230 + }, + { + "epoch": 1.775293408173679, + "grad_norm": 0.14127551019191742, + "learning_rate": 0.00046064948760849144, + "loss": 2.2023, + "step": 459240 + }, + { + "epoch": 1.7753320653770623, + "grad_norm": 0.13037101924419403, + "learning_rate": 0.00046051956816593466, + "loss": 2.1943, + "step": 459250 + }, + { + "epoch": 1.7753707225804456, + "grad_norm": 0.13260316848754883, + "learning_rate": 0.0004603896596865815, + "loss": 2.1981, + "step": 459260 + }, + { + "epoch": 1.775409379783829, + "grad_norm": 0.13140569627285004, + "learning_rate": 0.000460259762167657, + "loss": 2.2033, + "step": 459270 + }, + { + "epoch": 1.7754480369872123, + "grad_norm": 0.13945728540420532, + "learning_rate": 0.0004601298756063874, + "loss": 2.2103, + "step": 459280 + }, + { + "epoch": 1.7754866941905956, + "grad_norm": 0.1657353937625885, + "learning_rate": 0.00045999999999999996, + "loss": 2.1856, + "step": 459290 + }, + { + "epoch": 1.7755253513939788, + "grad_norm": 0.1261647492647171, + "learning_rate": 0.00045987013534572353, + "loss": 2.1814, + "step": 459300 + }, + { + "epoch": 1.775564008597362, + "grad_norm": 0.1261998414993286, + "learning_rate": 0.0004597402816407878, + "loss": 2.1964, + "step": 459310 + }, + { + "epoch": 1.7756026658007453, + "grad_norm": 0.12563207745552063, + "learning_rate": 0.0004596104388824236, + "loss": 2.1929, + "step": 459320 + }, + { + "epoch": 1.7756413230041286, + "grad_norm": 0.13220050930976868, + "learning_rate": 0.000459480607067863, + "loss": 2.1892, + "step": 459330 + }, + { + "epoch": 1.7756799802075118, + "grad_norm": 0.12477528303861618, + "learning_rate": 0.00045935078619433936, + "loss": 2.1909, + "step": 459340 + }, + { + "epoch": 1.775718637410895, + "grad_norm": 0.12958276271820068, + "learning_rate": 0.00045922097625908734, + "loss": 2.188, + "step": 459350 + }, + { + "epoch": 1.7757572946142783, + "grad_norm": 0.14196142554283142, + "learning_rate": 0.00045909117725934225, + "loss": 2.1954, + "step": 459360 + }, + { + "epoch": 1.7757959518176616, + "grad_norm": 0.12300693243741989, + "learning_rate": 0.00045896138919234076, + "loss": 2.1936, + "step": 459370 + }, + { + "epoch": 1.7758346090210448, + "grad_norm": 0.13028477132320404, + "learning_rate": 0.0004588316120553213, + "loss": 2.1722, + "step": 459380 + }, + { + "epoch": 1.7758732662244283, + "grad_norm": 0.134907066822052, + "learning_rate": 0.0004587018458455223, + "loss": 2.2024, + "step": 459390 + }, + { + "epoch": 1.7759119234278116, + "grad_norm": 0.13216833770275116, + "learning_rate": 0.00045857209056018465, + "loss": 2.1715, + "step": 459400 + }, + { + "epoch": 1.7759505806311948, + "grad_norm": 0.12976202368736267, + "learning_rate": 0.0004584423461965492, + "loss": 2.1829, + "step": 459410 + }, + { + "epoch": 1.775989237834578, + "grad_norm": 0.1588214933872223, + "learning_rate": 0.00045831261275185885, + "loss": 2.1936, + "step": 459420 + }, + { + "epoch": 1.7760278950379613, + "grad_norm": 0.13371814787387848, + "learning_rate": 0.0004581828902233571, + "loss": 2.1858, + "step": 459430 + }, + { + "epoch": 1.7760665522413448, + "grad_norm": 0.13082599639892578, + "learning_rate": 0.00045805317860828933, + "loss": 2.1968, + "step": 459440 + }, + { + "epoch": 1.776105209444728, + "grad_norm": 0.2999109625816345, + "learning_rate": 0.0004579234779039012, + "loss": 2.1796, + "step": 459450 + }, + { + "epoch": 1.7761438666481113, + "grad_norm": 0.13211677968502045, + "learning_rate": 0.0004577937881074399, + "loss": 2.1919, + "step": 459460 + }, + { + "epoch": 1.7761825238514946, + "grad_norm": 0.12072337418794632, + "learning_rate": 0.00045766410921615396, + "loss": 2.1943, + "step": 459470 + }, + { + "epoch": 1.7762211810548778, + "grad_norm": 0.13391290605068207, + "learning_rate": 0.0004575344412272928, + "loss": 2.1965, + "step": 459480 + }, + { + "epoch": 1.776259838258261, + "grad_norm": 0.12569762766361237, + "learning_rate": 0.00045740478413810705, + "loss": 2.1858, + "step": 459490 + }, + { + "epoch": 1.7762984954616443, + "grad_norm": 0.12477441877126694, + "learning_rate": 0.00045727513794584885, + "loss": 2.1926, + "step": 459500 + }, + { + "epoch": 1.7763371526650276, + "grad_norm": 0.1322563886642456, + "learning_rate": 0.0004571455026477709, + "loss": 2.1936, + "step": 459510 + }, + { + "epoch": 1.7763758098684108, + "grad_norm": 0.12976054847240448, + "learning_rate": 0.00045701587824112735, + "loss": 2.1956, + "step": 459520 + }, + { + "epoch": 1.776414467071794, + "grad_norm": 0.13775162398815155, + "learning_rate": 0.0004568862647231733, + "loss": 2.1815, + "step": 459530 + }, + { + "epoch": 1.7764531242751773, + "grad_norm": 0.13193078339099884, + "learning_rate": 0.00045675666209116586, + "loss": 2.2069, + "step": 459540 + }, + { + "epoch": 1.7764917814785606, + "grad_norm": 0.14443346858024597, + "learning_rate": 0.00045662707034236207, + "loss": 2.1823, + "step": 459550 + }, + { + "epoch": 1.776530438681944, + "grad_norm": 0.12272831797599792, + "learning_rate": 0.00045649748947402104, + "loss": 2.196, + "step": 459560 + }, + { + "epoch": 1.7765690958853273, + "grad_norm": 0.1343587189912796, + "learning_rate": 0.0004563679194834023, + "loss": 2.1984, + "step": 459570 + }, + { + "epoch": 1.7766077530887106, + "grad_norm": 0.14258071780204773, + "learning_rate": 0.0004562383603677671, + "loss": 2.1905, + "step": 459580 + }, + { + "epoch": 1.7766464102920938, + "grad_norm": 0.13907821476459503, + "learning_rate": 0.0004561088121243777, + "loss": 2.1901, + "step": 459590 + }, + { + "epoch": 1.776685067495477, + "grad_norm": 0.12912242114543915, + "learning_rate": 0.0004559792747504974, + "loss": 2.2063, + "step": 459600 + }, + { + "epoch": 1.7767237246988605, + "grad_norm": 0.13161329925060272, + "learning_rate": 0.0004558497482433908, + "loss": 2.1862, + "step": 459610 + }, + { + "epoch": 1.7767623819022438, + "grad_norm": 0.1229773536324501, + "learning_rate": 0.0004557202326003231, + "loss": 2.2066, + "step": 459620 + }, + { + "epoch": 1.776801039105627, + "grad_norm": 0.12915240228176117, + "learning_rate": 0.00045559072781856157, + "loss": 2.1936, + "step": 459630 + }, + { + "epoch": 1.7768396963090103, + "grad_norm": 0.12329524755477905, + "learning_rate": 0.0004554612338953741, + "loss": 2.1853, + "step": 459640 + }, + { + "epoch": 1.7768783535123935, + "grad_norm": 0.12746720016002655, + "learning_rate": 0.0004553317508280297, + "loss": 2.198, + "step": 459650 + }, + { + "epoch": 1.7769170107157768, + "grad_norm": 0.14305472373962402, + "learning_rate": 0.0004552022786137986, + "loss": 2.1832, + "step": 459660 + }, + { + "epoch": 1.77695566791916, + "grad_norm": 0.13622303307056427, + "learning_rate": 0.00045507281724995206, + "loss": 2.2125, + "step": 459670 + }, + { + "epoch": 1.7769943251225433, + "grad_norm": 0.12965580821037292, + "learning_rate": 0.0004549433667337628, + "loss": 2.1963, + "step": 459680 + }, + { + "epoch": 1.7770329823259265, + "grad_norm": 0.13008244335651398, + "learning_rate": 0.0004548139270625042, + "loss": 2.1986, + "step": 459690 + }, + { + "epoch": 1.7770716395293098, + "grad_norm": 0.13434603810310364, + "learning_rate": 0.00045468449823345147, + "loss": 2.1983, + "step": 459700 + }, + { + "epoch": 1.777110296732693, + "grad_norm": 0.13101531565189362, + "learning_rate": 0.00045455508024388005, + "loss": 2.1924, + "step": 459710 + }, + { + "epoch": 1.7771489539360763, + "grad_norm": 0.13777051866054535, + "learning_rate": 0.0004544256730910674, + "loss": 2.203, + "step": 459720 + }, + { + "epoch": 1.7771876111394598, + "grad_norm": 0.12869952619075775, + "learning_rate": 0.0004542962767722916, + "loss": 2.1834, + "step": 459730 + }, + { + "epoch": 1.777226268342843, + "grad_norm": 0.14540624618530273, + "learning_rate": 0.000454166891284832, + "loss": 2.1836, + "step": 459740 + }, + { + "epoch": 1.7772649255462263, + "grad_norm": 0.14128723740577698, + "learning_rate": 0.0004540375166259694, + "loss": 2.1843, + "step": 459750 + }, + { + "epoch": 1.7773035827496095, + "grad_norm": 0.12434931099414825, + "learning_rate": 0.0004539081527929849, + "loss": 2.1968, + "step": 459760 + }, + { + "epoch": 1.7773422399529928, + "grad_norm": 0.1366758644580841, + "learning_rate": 0.00045377879978316174, + "loss": 2.1951, + "step": 459770 + }, + { + "epoch": 1.7773808971563763, + "grad_norm": 0.13118357956409454, + "learning_rate": 0.00045364945759378354, + "loss": 2.18, + "step": 459780 + }, + { + "epoch": 1.7774195543597595, + "grad_norm": 0.13108068704605103, + "learning_rate": 0.00045352012622213535, + "loss": 2.1932, + "step": 459790 + }, + { + "epoch": 1.7774582115631428, + "grad_norm": 0.13078756630420685, + "learning_rate": 0.0004533908056655036, + "loss": 2.1999, + "step": 459800 + }, + { + "epoch": 1.777496868766526, + "grad_norm": 0.12737669050693512, + "learning_rate": 0.0004532614959211754, + "loss": 2.1839, + "step": 459810 + }, + { + "epoch": 1.7775355259699093, + "grad_norm": 0.13121169805526733, + "learning_rate": 0.0004531321969864395, + "loss": 2.1935, + "step": 459820 + }, + { + "epoch": 1.7775741831732925, + "grad_norm": 0.13091163337230682, + "learning_rate": 0.00045300290885858473, + "loss": 2.1976, + "step": 459830 + }, + { + "epoch": 1.7776128403766758, + "grad_norm": 0.1273505985736847, + "learning_rate": 0.0004528736315349027, + "loss": 2.2018, + "step": 459840 + }, + { + "epoch": 1.777651497580059, + "grad_norm": 0.13054147362709045, + "learning_rate": 0.0004527443650126848, + "loss": 2.2042, + "step": 459850 + }, + { + "epoch": 1.7776901547834423, + "grad_norm": 0.13942508399486542, + "learning_rate": 0.0004526151092892241, + "loss": 2.1913, + "step": 459860 + }, + { + "epoch": 1.7777288119868255, + "grad_norm": 0.12371347099542618, + "learning_rate": 0.00045248586436181463, + "loss": 2.1769, + "step": 459870 + }, + { + "epoch": 1.7777674691902088, + "grad_norm": 0.12571966648101807, + "learning_rate": 0.00045235663022775177, + "loss": 2.2091, + "step": 459880 + }, + { + "epoch": 1.777806126393592, + "grad_norm": 0.13976377248764038, + "learning_rate": 0.0004522274068843317, + "loss": 2.1819, + "step": 459890 + }, + { + "epoch": 1.7778447835969755, + "grad_norm": 0.13170628249645233, + "learning_rate": 0.0004520981943288522, + "loss": 2.2026, + "step": 459900 + }, + { + "epoch": 1.7778834408003588, + "grad_norm": 0.1292218416929245, + "learning_rate": 0.00045196899255861166, + "loss": 2.1987, + "step": 459910 + }, + { + "epoch": 1.777922098003742, + "grad_norm": 0.12546850740909576, + "learning_rate": 0.00045183980157090974, + "loss": 2.1963, + "step": 459920 + }, + { + "epoch": 1.7779607552071253, + "grad_norm": 0.1390175074338913, + "learning_rate": 0.0004517106213630475, + "loss": 2.2001, + "step": 459930 + }, + { + "epoch": 1.7779994124105087, + "grad_norm": 0.1215895414352417, + "learning_rate": 0.0004515814519323271, + "loss": 2.199, + "step": 459940 + }, + { + "epoch": 1.778038069613892, + "grad_norm": 0.12747113406658173, + "learning_rate": 0.0004514522932760514, + "loss": 2.1829, + "step": 459950 + }, + { + "epoch": 1.7780767268172752, + "grad_norm": 0.13074922561645508, + "learning_rate": 0.0004513231453915247, + "loss": 2.1869, + "step": 459960 + }, + { + "epoch": 1.7781153840206585, + "grad_norm": 0.1335902363061905, + "learning_rate": 0.0004511940082760526, + "loss": 2.1893, + "step": 459970 + }, + { + "epoch": 1.7781540412240417, + "grad_norm": 0.1362961083650589, + "learning_rate": 0.0004510648819269414, + "loss": 2.2001, + "step": 459980 + }, + { + "epoch": 1.778192698427425, + "grad_norm": 0.137433722615242, + "learning_rate": 0.0004509357663414988, + "loss": 2.1871, + "step": 459990 + }, + { + "epoch": 1.7782313556308083, + "grad_norm": 0.12900055944919586, + "learning_rate": 0.0004508066615170332, + "loss": 2.1741, + "step": 460000 + }, + { + "epoch": 1.7782700128341915, + "grad_norm": 0.12328225374221802, + "learning_rate": 0.000450677567450855, + "loss": 2.1998, + "step": 460010 + }, + { + "epoch": 1.7783086700375748, + "grad_norm": 0.13213379681110382, + "learning_rate": 0.00045054848414027496, + "loss": 2.1951, + "step": 460020 + }, + { + "epoch": 1.778347327240958, + "grad_norm": 0.13553999364376068, + "learning_rate": 0.00045041941158260505, + "loss": 2.1872, + "step": 460030 + }, + { + "epoch": 1.7783859844443413, + "grad_norm": 0.13455605506896973, + "learning_rate": 0.0004502903497751587, + "loss": 2.1847, + "step": 460040 + }, + { + "epoch": 1.7784246416477245, + "grad_norm": 0.1322959065437317, + "learning_rate": 0.00045016129871525016, + "loss": 2.1975, + "step": 460050 + }, + { + "epoch": 1.7784632988511078, + "grad_norm": 0.129024937748909, + "learning_rate": 0.000450032258400195, + "loss": 2.1846, + "step": 460060 + }, + { + "epoch": 1.7785019560544912, + "grad_norm": 0.14066551625728607, + "learning_rate": 0.0004499032288273097, + "loss": 2.1748, + "step": 460070 + }, + { + "epoch": 1.7785406132578745, + "grad_norm": 0.14260342717170715, + "learning_rate": 0.00044977420999391197, + "loss": 2.2117, + "step": 460080 + }, + { + "epoch": 1.7785792704612577, + "grad_norm": 0.12708856165409088, + "learning_rate": 0.0004496452018973205, + "loss": 2.195, + "step": 460090 + }, + { + "epoch": 1.778617927664641, + "grad_norm": 0.12720675766468048, + "learning_rate": 0.00044951620453485553, + "loss": 2.1898, + "step": 460100 + }, + { + "epoch": 1.7786565848680245, + "grad_norm": 0.1320163458585739, + "learning_rate": 0.00044938721790383786, + "loss": 2.215, + "step": 460110 + }, + { + "epoch": 1.7786952420714077, + "grad_norm": 0.12699101865291595, + "learning_rate": 0.00044925824200158984, + "loss": 2.1794, + "step": 460120 + }, + { + "epoch": 1.778733899274791, + "grad_norm": 0.1358613818883896, + "learning_rate": 0.00044912927682543447, + "loss": 2.202, + "step": 460130 + }, + { + "epoch": 1.7787725564781742, + "grad_norm": 0.12044976651668549, + "learning_rate": 0.00044900032237269616, + "loss": 2.1867, + "step": 460140 + }, + { + "epoch": 1.7788112136815575, + "grad_norm": 0.13260480761528015, + "learning_rate": 0.0004488713786407008, + "loss": 2.1866, + "step": 460150 + }, + { + "epoch": 1.7788498708849407, + "grad_norm": 0.13614635169506073, + "learning_rate": 0.0004487424456267748, + "loss": 2.2049, + "step": 460160 + }, + { + "epoch": 1.778888528088324, + "grad_norm": 0.17420963943004608, + "learning_rate": 0.0004486135233282456, + "loss": 2.1924, + "step": 460170 + }, + { + "epoch": 1.7789271852917072, + "grad_norm": 0.13680464029312134, + "learning_rate": 0.00044848461174244237, + "loss": 2.1879, + "step": 460180 + }, + { + "epoch": 1.7789658424950905, + "grad_norm": 0.1447361707687378, + "learning_rate": 0.00044835571086669493, + "loss": 2.2007, + "step": 460190 + }, + { + "epoch": 1.7790044996984737, + "grad_norm": 0.12781181931495667, + "learning_rate": 0.0004482268206983342, + "loss": 2.1904, + "step": 460200 + }, + { + "epoch": 1.779043156901857, + "grad_norm": 0.12790387868881226, + "learning_rate": 0.0004480979412346926, + "loss": 2.1889, + "step": 460210 + }, + { + "epoch": 1.7790818141052402, + "grad_norm": 0.13949060440063477, + "learning_rate": 0.00044796907247310314, + "loss": 2.2021, + "step": 460220 + }, + { + "epoch": 1.7791204713086235, + "grad_norm": 0.12806452810764313, + "learning_rate": 0.0004478402144109004, + "loss": 2.2124, + "step": 460230 + }, + { + "epoch": 1.779159128512007, + "grad_norm": 0.13862504065036774, + "learning_rate": 0.0004477113670454196, + "loss": 2.1974, + "step": 460240 + }, + { + "epoch": 1.7791977857153902, + "grad_norm": 0.13251705467700958, + "learning_rate": 0.00044758253037399757, + "loss": 2.1849, + "step": 460250 + }, + { + "epoch": 1.7792364429187735, + "grad_norm": 0.12595920264720917, + "learning_rate": 0.0004474537043939719, + "loss": 2.1959, + "step": 460260 + }, + { + "epoch": 1.7792751001221567, + "grad_norm": 0.12909527122974396, + "learning_rate": 0.00044732488910268154, + "loss": 2.1947, + "step": 460270 + }, + { + "epoch": 1.7793137573255402, + "grad_norm": 0.13403162360191345, + "learning_rate": 0.0004471960844974663, + "loss": 2.1987, + "step": 460280 + }, + { + "epoch": 1.7793524145289235, + "grad_norm": 0.12949995696544647, + "learning_rate": 0.000447067290575667, + "loss": 2.1963, + "step": 460290 + }, + { + "epoch": 1.7793910717323067, + "grad_norm": 0.13066048920154572, + "learning_rate": 0.00044693850733462594, + "loss": 2.1802, + "step": 460300 + }, + { + "epoch": 1.77942972893569, + "grad_norm": 0.12212751060724258, + "learning_rate": 0.0004468097347716862, + "loss": 2.1827, + "step": 460310 + }, + { + "epoch": 1.7794683861390732, + "grad_norm": 0.12501434981822968, + "learning_rate": 0.00044668097288419207, + "loss": 2.1835, + "step": 460320 + }, + { + "epoch": 1.7795070433424565, + "grad_norm": 0.1318371295928955, + "learning_rate": 0.00044655222166948926, + "loss": 2.1791, + "step": 460330 + }, + { + "epoch": 1.7795457005458397, + "grad_norm": 0.1317320317029953, + "learning_rate": 0.0004464234811249237, + "loss": 2.1771, + "step": 460340 + }, + { + "epoch": 1.779584357749223, + "grad_norm": 0.12879504263401031, + "learning_rate": 0.0004462947512478437, + "loss": 2.1777, + "step": 460350 + }, + { + "epoch": 1.7796230149526062, + "grad_norm": 0.13029487431049347, + "learning_rate": 0.0004461660320355974, + "loss": 2.1747, + "step": 460360 + }, + { + "epoch": 1.7796616721559895, + "grad_norm": 0.13985006511211395, + "learning_rate": 0.00044603732348553484, + "loss": 2.1885, + "step": 460370 + }, + { + "epoch": 1.7797003293593727, + "grad_norm": 0.14560475945472717, + "learning_rate": 0.0004459086255950071, + "loss": 2.1876, + "step": 460380 + }, + { + "epoch": 1.779738986562756, + "grad_norm": 0.1299740970134735, + "learning_rate": 0.00044577993836136587, + "loss": 2.1706, + "step": 460390 + }, + { + "epoch": 1.7797776437661392, + "grad_norm": 0.12563489377498627, + "learning_rate": 0.00044565126178196414, + "loss": 2.1969, + "step": 460400 + }, + { + "epoch": 1.7798163009695227, + "grad_norm": 0.12972305715084076, + "learning_rate": 0.00044552259585415666, + "loss": 2.1906, + "step": 460410 + }, + { + "epoch": 1.779854958172906, + "grad_norm": 0.1361573189496994, + "learning_rate": 0.000445393940575298, + "loss": 2.1895, + "step": 460420 + }, + { + "epoch": 1.7798936153762892, + "grad_norm": 0.13461749255657196, + "learning_rate": 0.0004452652959427452, + "loss": 2.1983, + "step": 460430 + }, + { + "epoch": 1.7799322725796725, + "grad_norm": 0.12253117561340332, + "learning_rate": 0.00044513666195385507, + "loss": 2.1851, + "step": 460440 + }, + { + "epoch": 1.779970929783056, + "grad_norm": 0.14229539036750793, + "learning_rate": 0.00044500803860598694, + "loss": 2.1843, + "step": 460450 + }, + { + "epoch": 1.7800095869864392, + "grad_norm": 0.13668429851531982, + "learning_rate": 0.0004448794258964999, + "loss": 2.1747, + "step": 460460 + }, + { + "epoch": 1.7800482441898224, + "grad_norm": 0.13260917365550995, + "learning_rate": 0.0004447508238227547, + "loss": 2.1825, + "step": 460470 + }, + { + "epoch": 1.7800869013932057, + "grad_norm": 0.12992557883262634, + "learning_rate": 0.00044462223238211365, + "loss": 2.1952, + "step": 460480 + }, + { + "epoch": 1.780125558596589, + "grad_norm": 0.13101063668727875, + "learning_rate": 0.0004444936515719391, + "loss": 2.2045, + "step": 460490 + }, + { + "epoch": 1.7801642157999722, + "grad_norm": 0.125391885638237, + "learning_rate": 0.0004443650813895954, + "loss": 2.1854, + "step": 460500 + }, + { + "epoch": 1.7802028730033554, + "grad_norm": 0.13517551124095917, + "learning_rate": 0.0004442365218324478, + "loss": 2.1908, + "step": 460510 + }, + { + "epoch": 1.7802415302067387, + "grad_norm": 0.1373213678598404, + "learning_rate": 0.0004441079728978621, + "loss": 2.1948, + "step": 460520 + }, + { + "epoch": 1.780280187410122, + "grad_norm": 0.13275998830795288, + "learning_rate": 0.0004439794345832058, + "loss": 2.1953, + "step": 460530 + }, + { + "epoch": 1.7803188446135052, + "grad_norm": 0.13309723138809204, + "learning_rate": 0.00044385090688584715, + "loss": 2.1857, + "step": 460540 + }, + { + "epoch": 1.7803575018168885, + "grad_norm": 0.12807965278625488, + "learning_rate": 0.0004437223898031559, + "loss": 2.1946, + "step": 460550 + }, + { + "epoch": 1.7803961590202717, + "grad_norm": 0.13153384625911713, + "learning_rate": 0.0004435938833325024, + "loss": 2.2084, + "step": 460560 + }, + { + "epoch": 1.780434816223655, + "grad_norm": 0.13783849775791168, + "learning_rate": 0.0004434653874712582, + "loss": 2.1858, + "step": 460570 + }, + { + "epoch": 1.7804734734270384, + "grad_norm": 0.13290105760097504, + "learning_rate": 0.00044333690221679635, + "loss": 2.2028, + "step": 460580 + }, + { + "epoch": 1.7805121306304217, + "grad_norm": 0.14658907055854797, + "learning_rate": 0.00044320842756649025, + "loss": 2.2051, + "step": 460590 + }, + { + "epoch": 1.780550787833805, + "grad_norm": 0.14052848517894745, + "learning_rate": 0.00044307996351771497, + "loss": 2.2021, + "step": 460600 + }, + { + "epoch": 1.7805894450371882, + "grad_norm": 0.12826259434223175, + "learning_rate": 0.00044295151006784627, + "loss": 2.2002, + "step": 460610 + }, + { + "epoch": 1.7806281022405717, + "grad_norm": 0.1517890989780426, + "learning_rate": 0.0004428230672142617, + "loss": 2.1957, + "step": 460620 + }, + { + "epoch": 1.780666759443955, + "grad_norm": 0.14213670790195465, + "learning_rate": 0.0004426946349543388, + "loss": 2.1969, + "step": 460630 + }, + { + "epoch": 1.7807054166473382, + "grad_norm": 0.12808117270469666, + "learning_rate": 0.0004425662132854571, + "loss": 2.1873, + "step": 460640 + }, + { + "epoch": 1.7807440738507214, + "grad_norm": 0.12215771526098251, + "learning_rate": 0.0004424378022049971, + "loss": 2.1998, + "step": 460650 + }, + { + "epoch": 1.7807827310541047, + "grad_norm": 0.12312013655900955, + "learning_rate": 0.00044230940171033974, + "loss": 2.1841, + "step": 460660 + }, + { + "epoch": 1.780821388257488, + "grad_norm": 0.12614500522613525, + "learning_rate": 0.0004421810117988676, + "loss": 2.1705, + "step": 460670 + }, + { + "epoch": 1.7808600454608712, + "grad_norm": 0.13631653785705566, + "learning_rate": 0.00044205263246796436, + "loss": 2.2122, + "step": 460680 + }, + { + "epoch": 1.7808987026642544, + "grad_norm": 0.1294577717781067, + "learning_rate": 0.00044192426371501446, + "loss": 2.1958, + "step": 460690 + }, + { + "epoch": 1.7809373598676377, + "grad_norm": 0.12741263210773468, + "learning_rate": 0.00044179590553740367, + "loss": 2.1946, + "step": 460700 + }, + { + "epoch": 1.780976017071021, + "grad_norm": 0.13759537041187286, + "learning_rate": 0.0004416675579325187, + "loss": 2.1925, + "step": 460710 + }, + { + "epoch": 1.7810146742744042, + "grad_norm": 0.14019958674907684, + "learning_rate": 0.00044153922089774754, + "loss": 2.2015, + "step": 460720 + }, + { + "epoch": 1.7810533314777874, + "grad_norm": 0.13502752780914307, + "learning_rate": 0.0004414108944304789, + "loss": 2.1901, + "step": 460730 + }, + { + "epoch": 1.7810919886811707, + "grad_norm": 0.1355697512626648, + "learning_rate": 0.00044128257852810274, + "loss": 2.193, + "step": 460740 + }, + { + "epoch": 1.7811306458845542, + "grad_norm": 0.12670107185840607, + "learning_rate": 0.0004411542731880105, + "loss": 2.197, + "step": 460750 + }, + { + "epoch": 1.7811693030879374, + "grad_norm": 0.1347462683916092, + "learning_rate": 0.0004410259784075938, + "loss": 2.1899, + "step": 460760 + }, + { + "epoch": 1.7812079602913207, + "grad_norm": 0.13940757513046265, + "learning_rate": 0.0004408976941842462, + "loss": 2.1961, + "step": 460770 + }, + { + "epoch": 1.781246617494704, + "grad_norm": 0.13067248463630676, + "learning_rate": 0.00044076942051536204, + "loss": 2.1989, + "step": 460780 + }, + { + "epoch": 1.7812852746980874, + "grad_norm": 0.21473339200019836, + "learning_rate": 0.0004406411573983364, + "loss": 2.1975, + "step": 460790 + }, + { + "epoch": 1.7813239319014706, + "grad_norm": 0.13648192584514618, + "learning_rate": 0.00044051290483056583, + "loss": 2.1845, + "step": 460800 + }, + { + "epoch": 1.781362589104854, + "grad_norm": 0.14370782673358917, + "learning_rate": 0.0004403846628094479, + "loss": 2.2032, + "step": 460810 + }, + { + "epoch": 1.7814012463082372, + "grad_norm": 0.13856147229671478, + "learning_rate": 0.0004402564313323809, + "loss": 2.1891, + "step": 460820 + }, + { + "epoch": 1.7814399035116204, + "grad_norm": 0.12482549995183945, + "learning_rate": 0.00044012821039676476, + "loss": 2.1885, + "step": 460830 + }, + { + "epoch": 1.7814785607150037, + "grad_norm": 0.13354481756687164, + "learning_rate": 0.00043999999999999996, + "loss": 2.1937, + "step": 460840 + }, + { + "epoch": 1.781517217918387, + "grad_norm": 0.12710081040859222, + "learning_rate": 0.00043987180013948856, + "loss": 2.1897, + "step": 460850 + }, + { + "epoch": 1.7815558751217702, + "grad_norm": 0.13189521431922913, + "learning_rate": 0.00043974361081263315, + "loss": 2.1795, + "step": 460860 + }, + { + "epoch": 1.7815945323251534, + "grad_norm": 0.12823714315891266, + "learning_rate": 0.00043961543201683775, + "loss": 2.1916, + "step": 460870 + }, + { + "epoch": 1.7816331895285367, + "grad_norm": 0.14169198274612427, + "learning_rate": 0.00043948726374950733, + "loss": 2.1928, + "step": 460880 + }, + { + "epoch": 1.78167184673192, + "grad_norm": 0.12744873762130737, + "learning_rate": 0.0004393591060080477, + "loss": 2.1961, + "step": 460890 + }, + { + "epoch": 1.7817105039353032, + "grad_norm": 0.12496272474527359, + "learning_rate": 0.0004392309587898664, + "loss": 2.1796, + "step": 460900 + }, + { + "epoch": 1.7817491611386864, + "grad_norm": 0.12871375679969788, + "learning_rate": 0.0004391028220923712, + "loss": 2.1775, + "step": 460910 + }, + { + "epoch": 1.78178781834207, + "grad_norm": 0.1288425326347351, + "learning_rate": 0.00043897469591297147, + "loss": 2.1783, + "step": 460920 + }, + { + "epoch": 1.7818264755454531, + "grad_norm": 0.13534577190876007, + "learning_rate": 0.0004388465802490775, + "loss": 2.1882, + "step": 460930 + }, + { + "epoch": 1.7818651327488364, + "grad_norm": 0.13280139863491058, + "learning_rate": 0.0004387184750981006, + "loss": 2.1937, + "step": 460940 + }, + { + "epoch": 1.7819037899522197, + "grad_norm": 0.1339811384677887, + "learning_rate": 0.00043859038045745334, + "loss": 2.1827, + "step": 460950 + }, + { + "epoch": 1.7819424471556031, + "grad_norm": 0.12586566805839539, + "learning_rate": 0.00043846229632454924, + "loss": 2.1885, + "step": 460960 + }, + { + "epoch": 1.7819811043589864, + "grad_norm": 0.130745530128479, + "learning_rate": 0.0004383342226968026, + "loss": 2.1948, + "step": 460970 + }, + { + "epoch": 1.7820197615623696, + "grad_norm": 0.12698744237422943, + "learning_rate": 0.0004382061595716289, + "loss": 2.1902, + "step": 460980 + }, + { + "epoch": 1.7820584187657529, + "grad_norm": 0.13514763116836548, + "learning_rate": 0.00043807810694644525, + "loss": 2.1903, + "step": 460990 + }, + { + "epoch": 1.7820970759691361, + "grad_norm": 0.12937365472316742, + "learning_rate": 0.00043795006481866914, + "loss": 2.1831, + "step": 461000 + }, + { + "epoch": 1.7821357331725194, + "grad_norm": 0.13314153254032135, + "learning_rate": 0.00043782203318571924, + "loss": 2.1806, + "step": 461010 + }, + { + "epoch": 1.7821743903759026, + "grad_norm": 0.13866551220417023, + "learning_rate": 0.0004376940120450157, + "loss": 2.1906, + "step": 461020 + }, + { + "epoch": 1.782213047579286, + "grad_norm": 0.14468303322792053, + "learning_rate": 0.00043756600139397886, + "loss": 2.1916, + "step": 461030 + }, + { + "epoch": 1.7822517047826691, + "grad_norm": 0.14432694017887115, + "learning_rate": 0.00043743800123003117, + "loss": 2.1851, + "step": 461040 + }, + { + "epoch": 1.7822903619860524, + "grad_norm": 0.12961898744106293, + "learning_rate": 0.00043731001155059544, + "loss": 2.1895, + "step": 461050 + }, + { + "epoch": 1.7823290191894356, + "grad_norm": 0.13149994611740112, + "learning_rate": 0.0004371820323530957, + "loss": 2.1945, + "step": 461060 + }, + { + "epoch": 1.782367676392819, + "grad_norm": 0.13549242913722992, + "learning_rate": 0.00043705406363495736, + "loss": 2.1876, + "step": 461070 + }, + { + "epoch": 1.7824063335962022, + "grad_norm": 0.12361189723014832, + "learning_rate": 0.00043692610539360625, + "loss": 2.1817, + "step": 461080 + }, + { + "epoch": 1.7824449907995856, + "grad_norm": 0.1255415827035904, + "learning_rate": 0.00043679815762646945, + "loss": 2.1865, + "step": 461090 + }, + { + "epoch": 1.7824836480029689, + "grad_norm": 0.13414831459522247, + "learning_rate": 0.0004366702203309758, + "loss": 2.1791, + "step": 461100 + }, + { + "epoch": 1.7825223052063521, + "grad_norm": 0.13580499589443207, + "learning_rate": 0.000436542293504554, + "loss": 2.1991, + "step": 461110 + }, + { + "epoch": 1.7825609624097354, + "grad_norm": 0.13255441188812256, + "learning_rate": 0.00043641437714463495, + "loss": 2.203, + "step": 461120 + }, + { + "epoch": 1.7825996196131189, + "grad_norm": 0.14729438722133636, + "learning_rate": 0.00043628647124864983, + "loss": 2.1803, + "step": 461130 + }, + { + "epoch": 1.782638276816502, + "grad_norm": 0.13216754794120789, + "learning_rate": 0.00043615857581403097, + "loss": 2.1881, + "step": 461140 + }, + { + "epoch": 1.7826769340198854, + "grad_norm": 0.13753969967365265, + "learning_rate": 0.0004360306908382121, + "loss": 2.1955, + "step": 461150 + }, + { + "epoch": 1.7827155912232686, + "grad_norm": 0.12598957121372223, + "learning_rate": 0.0004359028163186278, + "loss": 2.1878, + "step": 461160 + }, + { + "epoch": 1.7827542484266519, + "grad_norm": 0.1264917254447937, + "learning_rate": 0.0004357749522527137, + "loss": 2.1832, + "step": 461170 + }, + { + "epoch": 1.7827929056300351, + "grad_norm": 0.12438520789146423, + "learning_rate": 0.00043564709863790643, + "loss": 2.2021, + "step": 461180 + }, + { + "epoch": 1.7828315628334184, + "grad_norm": 0.11906236410140991, + "learning_rate": 0.0004355192554716436, + "loss": 2.2008, + "step": 461190 + }, + { + "epoch": 1.7828702200368016, + "grad_norm": 0.135292649269104, + "learning_rate": 0.0004353914227513642, + "loss": 2.1814, + "step": 461200 + }, + { + "epoch": 1.7829088772401849, + "grad_norm": 0.12936165928840637, + "learning_rate": 0.0004352636004745081, + "loss": 2.2004, + "step": 461210 + }, + { + "epoch": 1.7829475344435681, + "grad_norm": 0.1327020227909088, + "learning_rate": 0.0004351357886385159, + "loss": 2.1905, + "step": 461220 + }, + { + "epoch": 1.7829861916469514, + "grad_norm": 0.1360083669424057, + "learning_rate": 0.00043500798724082946, + "loss": 2.1856, + "step": 461230 + }, + { + "epoch": 1.7830248488503346, + "grad_norm": 0.13193684816360474, + "learning_rate": 0.00043488019627889175, + "loss": 2.1823, + "step": 461240 + }, + { + "epoch": 1.7830635060537179, + "grad_norm": 0.13851018249988556, + "learning_rate": 0.00043475241575014725, + "loss": 2.1703, + "step": 461250 + }, + { + "epoch": 1.7831021632571014, + "grad_norm": 0.1312793642282486, + "learning_rate": 0.0004346246456520404, + "loss": 2.1843, + "step": 461260 + }, + { + "epoch": 1.7831408204604846, + "grad_norm": 0.14227105677127838, + "learning_rate": 0.00043449688598201755, + "loss": 2.1856, + "step": 461270 + }, + { + "epoch": 1.7831794776638679, + "grad_norm": 0.13117855787277222, + "learning_rate": 0.00043436913673752575, + "loss": 2.18, + "step": 461280 + }, + { + "epoch": 1.7832181348672511, + "grad_norm": 0.13411663472652435, + "learning_rate": 0.0004342413979160134, + "loss": 2.1885, + "step": 461290 + }, + { + "epoch": 1.7832567920706346, + "grad_norm": 0.11938449740409851, + "learning_rate": 0.00043411366951492925, + "loss": 2.1939, + "step": 461300 + }, + { + "epoch": 1.7832954492740178, + "grad_norm": 0.12993189692497253, + "learning_rate": 0.0004339859515317239, + "loss": 2.1905, + "step": 461310 + }, + { + "epoch": 1.783334106477401, + "grad_norm": 0.12578348815441132, + "learning_rate": 0.00043385824396384855, + "loss": 2.1811, + "step": 461320 + }, + { + "epoch": 1.7833727636807843, + "grad_norm": 0.1200806200504303, + "learning_rate": 0.00043373054680875557, + "loss": 2.2001, + "step": 461330 + }, + { + "epoch": 1.7834114208841676, + "grad_norm": 0.28573447465896606, + "learning_rate": 0.00043360286006389814, + "loss": 2.209, + "step": 461340 + }, + { + "epoch": 1.7834500780875508, + "grad_norm": 0.5567190051078796, + "learning_rate": 0.0004334751837267308, + "loss": 2.2089, + "step": 461350 + }, + { + "epoch": 1.783488735290934, + "grad_norm": 0.12286386638879776, + "learning_rate": 0.0004333475177947088, + "loss": 2.1917, + "step": 461360 + }, + { + "epoch": 1.7835273924943174, + "grad_norm": 0.12691837549209595, + "learning_rate": 0.000433219862265289, + "loss": 2.1982, + "step": 461370 + }, + { + "epoch": 1.7835660496977006, + "grad_norm": 0.13498446345329285, + "learning_rate": 0.0004330922171359286, + "loss": 2.1868, + "step": 461380 + }, + { + "epoch": 1.7836047069010839, + "grad_norm": 0.13531361520290375, + "learning_rate": 0.0004329645824040862, + "loss": 2.1926, + "step": 461390 + }, + { + "epoch": 1.783643364104467, + "grad_norm": 0.13043451309204102, + "learning_rate": 0.00043283695806722134, + "loss": 2.1929, + "step": 461400 + }, + { + "epoch": 1.7836820213078504, + "grad_norm": 0.1304284781217575, + "learning_rate": 0.00043270934412279496, + "loss": 2.1833, + "step": 461410 + }, + { + "epoch": 1.7837206785112338, + "grad_norm": 0.12887679040431976, + "learning_rate": 0.00043258174056826814, + "loss": 2.2014, + "step": 461420 + }, + { + "epoch": 1.783759335714617, + "grad_norm": 0.13068842887878418, + "learning_rate": 0.0004324541474011039, + "loss": 2.1896, + "step": 461430 + }, + { + "epoch": 1.7837979929180003, + "grad_norm": 0.1315651834011078, + "learning_rate": 0.0004323265646187662, + "loss": 2.1961, + "step": 461440 + }, + { + "epoch": 1.7838366501213836, + "grad_norm": 0.14479106664657593, + "learning_rate": 0.00043219899221871904, + "loss": 2.1788, + "step": 461450 + }, + { + "epoch": 1.7838753073247668, + "grad_norm": 0.14688260853290558, + "learning_rate": 0.000432071430198429, + "loss": 2.1864, + "step": 461460 + }, + { + "epoch": 1.7839139645281503, + "grad_norm": 0.13669735193252563, + "learning_rate": 0.0004319438785553624, + "loss": 2.2074, + "step": 461470 + }, + { + "epoch": 1.7839526217315336, + "grad_norm": 0.12653201818466187, + "learning_rate": 0.0004318163372869872, + "loss": 2.19, + "step": 461480 + }, + { + "epoch": 1.7839912789349168, + "grad_norm": 0.14503104984760284, + "learning_rate": 0.00043168880639077247, + "loss": 2.1842, + "step": 461490 + }, + { + "epoch": 1.7840299361383, + "grad_norm": 0.12771466374397278, + "learning_rate": 0.00043156128586418775, + "loss": 2.1907, + "step": 461500 + }, + { + "epoch": 1.7840685933416833, + "grad_norm": 0.1463581770658493, + "learning_rate": 0.0004314337757047042, + "loss": 2.1919, + "step": 461510 + }, + { + "epoch": 1.7841072505450666, + "grad_norm": 0.1495421826839447, + "learning_rate": 0.0004313062759097939, + "loss": 2.176, + "step": 461520 + }, + { + "epoch": 1.7841459077484498, + "grad_norm": 0.11844052374362946, + "learning_rate": 0.0004311787864769294, + "loss": 2.1923, + "step": 461530 + }, + { + "epoch": 1.784184564951833, + "grad_norm": 0.13675814867019653, + "learning_rate": 0.0004310513074035851, + "loss": 2.1839, + "step": 461540 + }, + { + "epoch": 1.7842232221552163, + "grad_norm": 0.13174942135810852, + "learning_rate": 0.0004309238386872356, + "loss": 2.1829, + "step": 461550 + }, + { + "epoch": 1.7842618793585996, + "grad_norm": 0.12208150327205658, + "learning_rate": 0.0004307963803253574, + "loss": 2.1899, + "step": 461560 + }, + { + "epoch": 1.7843005365619828, + "grad_norm": 0.1265152543783188, + "learning_rate": 0.0004306689323154276, + "loss": 2.1816, + "step": 461570 + }, + { + "epoch": 1.784339193765366, + "grad_norm": 0.13260005414485931, + "learning_rate": 0.000430541494654924, + "loss": 2.181, + "step": 461580 + }, + { + "epoch": 1.7843778509687496, + "grad_norm": 0.13829654455184937, + "learning_rate": 0.00043041406734132593, + "loss": 2.1941, + "step": 461590 + }, + { + "epoch": 1.7844165081721328, + "grad_norm": 0.13675826787948608, + "learning_rate": 0.00043028665037211325, + "loss": 2.1791, + "step": 461600 + }, + { + "epoch": 1.784455165375516, + "grad_norm": 0.2085188925266266, + "learning_rate": 0.0004301592437447677, + "loss": 2.1903, + "step": 461610 + }, + { + "epoch": 1.7844938225788993, + "grad_norm": 0.13831783831119537, + "learning_rate": 0.0004300318474567708, + "loss": 2.1827, + "step": 461620 + }, + { + "epoch": 1.7845324797822826, + "grad_norm": 0.142887681722641, + "learning_rate": 0.00042990446150560646, + "loss": 2.1777, + "step": 461630 + }, + { + "epoch": 1.784571136985666, + "grad_norm": 0.13441495597362518, + "learning_rate": 0.0004297770858887584, + "loss": 2.1861, + "step": 461640 + }, + { + "epoch": 1.7846097941890493, + "grad_norm": 0.13365398347377777, + "learning_rate": 0.0004296497206037118, + "loss": 2.1861, + "step": 461650 + }, + { + "epoch": 1.7846484513924326, + "grad_norm": 0.13537412881851196, + "learning_rate": 0.00042952236564795343, + "loss": 2.1866, + "step": 461660 + }, + { + "epoch": 1.7846871085958158, + "grad_norm": 0.13806648552417755, + "learning_rate": 0.00042939502101897053, + "loss": 2.1839, + "step": 461670 + }, + { + "epoch": 1.784725765799199, + "grad_norm": 0.1359514445066452, + "learning_rate": 0.0004292676867142511, + "loss": 2.1896, + "step": 461680 + }, + { + "epoch": 1.7847644230025823, + "grad_norm": 0.1382725089788437, + "learning_rate": 0.0004291403627312846, + "loss": 2.1877, + "step": 461690 + }, + { + "epoch": 1.7848030802059656, + "grad_norm": 0.1247827336192131, + "learning_rate": 0.00042901304906756144, + "loss": 2.1927, + "step": 461700 + }, + { + "epoch": 1.7848417374093488, + "grad_norm": 0.13510502874851227, + "learning_rate": 0.000428885745720573, + "loss": 2.1909, + "step": 461710 + }, + { + "epoch": 1.784880394612732, + "grad_norm": 0.13972440361976624, + "learning_rate": 0.00042875845268781166, + "loss": 2.1741, + "step": 461720 + }, + { + "epoch": 1.7849190518161153, + "grad_norm": 0.1431395560503006, + "learning_rate": 0.00042863116996677065, + "loss": 2.1991, + "step": 461730 + }, + { + "epoch": 1.7849577090194986, + "grad_norm": 0.13320407271385193, + "learning_rate": 0.0004285038975549447, + "loss": 2.1836, + "step": 461740 + }, + { + "epoch": 1.7849963662228818, + "grad_norm": 0.13148362934589386, + "learning_rate": 0.00042837663544982883, + "loss": 2.1817, + "step": 461750 + }, + { + "epoch": 1.7850350234262653, + "grad_norm": 0.12751802802085876, + "learning_rate": 0.00042824938364891983, + "loss": 2.1946, + "step": 461760 + }, + { + "epoch": 1.7850736806296486, + "grad_norm": 0.17811429500579834, + "learning_rate": 0.00042812214214971525, + "loss": 2.1768, + "step": 461770 + }, + { + "epoch": 1.7851123378330318, + "grad_norm": 0.1396486610174179, + "learning_rate": 0.0004279949109497132, + "loss": 2.1838, + "step": 461780 + }, + { + "epoch": 1.785150995036415, + "grad_norm": 0.1367500275373459, + "learning_rate": 0.00042786769004641336, + "loss": 2.1765, + "step": 461790 + }, + { + "epoch": 1.7851896522397985, + "grad_norm": 0.1433398723602295, + "learning_rate": 0.0004277404794373163, + "loss": 2.201, + "step": 461800 + }, + { + "epoch": 1.7852283094431818, + "grad_norm": 0.13393816351890564, + "learning_rate": 0.00042761327911992344, + "loss": 2.1902, + "step": 461810 + }, + { + "epoch": 1.785266966646565, + "grad_norm": 0.13377831876277924, + "learning_rate": 0.0004274860890917371, + "loss": 2.1767, + "step": 461820 + }, + { + "epoch": 1.7853056238499483, + "grad_norm": 0.13527078926563263, + "learning_rate": 0.00042735890935026124, + "loss": 2.1768, + "step": 461830 + }, + { + "epoch": 1.7853442810533315, + "grad_norm": 0.12564711272716522, + "learning_rate": 0.00042723173989300014, + "loss": 2.1718, + "step": 461840 + }, + { + "epoch": 1.7853829382567148, + "grad_norm": 0.13481220602989197, + "learning_rate": 0.0004271045807174592, + "loss": 2.1888, + "step": 461850 + }, + { + "epoch": 1.785421595460098, + "grad_norm": 0.1479419767856598, + "learning_rate": 0.00042697743182114524, + "loss": 2.1969, + "step": 461860 + }, + { + "epoch": 1.7854602526634813, + "grad_norm": 0.1323142945766449, + "learning_rate": 0.00042685029320156567, + "loss": 2.1843, + "step": 461870 + }, + { + "epoch": 1.7854989098668645, + "grad_norm": 0.1364038586616516, + "learning_rate": 0.0004267231648562291, + "loss": 2.1845, + "step": 461880 + }, + { + "epoch": 1.7855375670702478, + "grad_norm": 0.1245933398604393, + "learning_rate": 0.0004265960467826453, + "loss": 2.197, + "step": 461890 + }, + { + "epoch": 1.785576224273631, + "grad_norm": 0.2402573972940445, + "learning_rate": 0.0004264689389783245, + "loss": 2.2002, + "step": 461900 + }, + { + "epoch": 1.7856148814770143, + "grad_norm": 0.14551228284835815, + "learning_rate": 0.0004263418414407785, + "loss": 2.1978, + "step": 461910 + }, + { + "epoch": 1.7856535386803976, + "grad_norm": 0.12514843046665192, + "learning_rate": 0.0004262147541675199, + "loss": 2.1786, + "step": 461920 + }, + { + "epoch": 1.785692195883781, + "grad_norm": 0.12956567108631134, + "learning_rate": 0.00042608767715606225, + "loss": 2.1979, + "step": 461930 + }, + { + "epoch": 1.7857308530871643, + "grad_norm": 0.1494515985250473, + "learning_rate": 0.0004259606104039202, + "loss": 2.1979, + "step": 461940 + }, + { + "epoch": 1.7857695102905475, + "grad_norm": 0.13267038762569427, + "learning_rate": 0.0004258335539086091, + "loss": 2.1715, + "step": 461950 + }, + { + "epoch": 1.7858081674939308, + "grad_norm": 0.12581340968608856, + "learning_rate": 0.00042570650766764586, + "loss": 2.1653, + "step": 461960 + }, + { + "epoch": 1.7858468246973143, + "grad_norm": 0.1374005228281021, + "learning_rate": 0.00042557947167854793, + "loss": 2.1893, + "step": 461970 + }, + { + "epoch": 1.7858854819006975, + "grad_norm": 0.13171625137329102, + "learning_rate": 0.00042545244593883403, + "loss": 2.1875, + "step": 461980 + }, + { + "epoch": 1.7859241391040808, + "grad_norm": 0.1323147863149643, + "learning_rate": 0.0004253254304460239, + "loss": 2.1985, + "step": 461990 + }, + { + "epoch": 1.785962796307464, + "grad_norm": 0.1326800435781479, + "learning_rate": 0.00042519842519763774, + "loss": 2.1843, + "step": 462000 + }, + { + "epoch": 1.7860014535108473, + "grad_norm": 0.13129490613937378, + "learning_rate": 0.00042507143019119756, + "loss": 2.1899, + "step": 462010 + }, + { + "epoch": 1.7860401107142305, + "grad_norm": 0.1310659795999527, + "learning_rate": 0.00042494444542422573, + "loss": 2.19, + "step": 462020 + }, + { + "epoch": 1.7860787679176138, + "grad_norm": 0.1303727775812149, + "learning_rate": 0.0004248174708942458, + "loss": 2.1927, + "step": 462030 + }, + { + "epoch": 1.786117425120997, + "grad_norm": 0.13262943923473358, + "learning_rate": 0.0004246905065987827, + "loss": 2.1929, + "step": 462040 + }, + { + "epoch": 1.7861560823243803, + "grad_norm": 0.11869736760854721, + "learning_rate": 0.00042456355253536174, + "loss": 2.1919, + "step": 462050 + }, + { + "epoch": 1.7861947395277635, + "grad_norm": 0.1460510939359665, + "learning_rate": 0.0004244366087015095, + "loss": 2.1887, + "step": 462060 + }, + { + "epoch": 1.7862333967311468, + "grad_norm": 0.12706270813941956, + "learning_rate": 0.0004243096750947539, + "loss": 2.1937, + "step": 462070 + }, + { + "epoch": 1.78627205393453, + "grad_norm": 0.1253349334001541, + "learning_rate": 0.00042418275171262335, + "loss": 2.1891, + "step": 462080 + }, + { + "epoch": 1.7863107111379133, + "grad_norm": 0.3475554883480072, + "learning_rate": 0.0004240558385526472, + "loss": 2.1786, + "step": 462090 + }, + { + "epoch": 1.7863493683412968, + "grad_norm": 0.131262868642807, + "learning_rate": 0.00042392893561235637, + "loss": 2.1959, + "step": 462100 + }, + { + "epoch": 1.78638802554468, + "grad_norm": 0.12694989144802094, + "learning_rate": 0.0004238020428892824, + "loss": 2.1898, + "step": 462110 + }, + { + "epoch": 1.7864266827480633, + "grad_norm": 0.13427241146564484, + "learning_rate": 0.00042367516038095763, + "loss": 2.1913, + "step": 462120 + }, + { + "epoch": 1.7864653399514465, + "grad_norm": 0.13424880802631378, + "learning_rate": 0.0004235482880849157, + "loss": 2.184, + "step": 462130 + }, + { + "epoch": 1.78650399715483, + "grad_norm": 0.13431230187416077, + "learning_rate": 0.0004234214259986913, + "loss": 2.179, + "step": 462140 + }, + { + "epoch": 1.7865426543582132, + "grad_norm": 0.12677377462387085, + "learning_rate": 0.00042329457411982, + "loss": 2.1846, + "step": 462150 + }, + { + "epoch": 1.7865813115615965, + "grad_norm": 0.14376206696033478, + "learning_rate": 0.00042316773244583804, + "loss": 2.1829, + "step": 462160 + }, + { + "epoch": 1.7866199687649797, + "grad_norm": 0.15024666488170624, + "learning_rate": 0.00042304090097428326, + "loss": 2.1885, + "step": 462170 + }, + { + "epoch": 1.786658625968363, + "grad_norm": 0.15297283232212067, + "learning_rate": 0.00042291407970269424, + "loss": 2.178, + "step": 462180 + }, + { + "epoch": 1.7866972831717463, + "grad_norm": 0.14141041040420532, + "learning_rate": 0.00042278726862861007, + "loss": 2.1869, + "step": 462190 + }, + { + "epoch": 1.7867359403751295, + "grad_norm": 0.12803156673908234, + "learning_rate": 0.00042266046774957176, + "loss": 2.2057, + "step": 462200 + }, + { + "epoch": 1.7867745975785128, + "grad_norm": 0.13150155544281006, + "learning_rate": 0.0004225336770631203, + "loss": 2.1765, + "step": 462210 + }, + { + "epoch": 1.786813254781896, + "grad_norm": 0.13283614814281464, + "learning_rate": 0.0004224068965667987, + "loss": 2.1888, + "step": 462220 + }, + { + "epoch": 1.7868519119852793, + "grad_norm": 0.13864447176456451, + "learning_rate": 0.0004222801262581497, + "loss": 2.1821, + "step": 462230 + }, + { + "epoch": 1.7868905691886625, + "grad_norm": 0.12497738748788834, + "learning_rate": 0.00042215336613471855, + "loss": 2.196, + "step": 462240 + }, + { + "epoch": 1.7869292263920458, + "grad_norm": 0.1317017376422882, + "learning_rate": 0.00042202661619405004, + "loss": 2.1909, + "step": 462250 + }, + { + "epoch": 1.786967883595429, + "grad_norm": 0.12532906234264374, + "learning_rate": 0.00042189987643369075, + "loss": 2.1832, + "step": 462260 + }, + { + "epoch": 1.7870065407988125, + "grad_norm": 0.13995838165283203, + "learning_rate": 0.0004217731468511885, + "loss": 2.1884, + "step": 462270 + }, + { + "epoch": 1.7870451980021957, + "grad_norm": 0.13947032392024994, + "learning_rate": 0.00042164642744409143, + "loss": 2.1886, + "step": 462280 + }, + { + "epoch": 1.787083855205579, + "grad_norm": 0.13160671293735504, + "learning_rate": 0.0004215197182099486, + "loss": 2.191, + "step": 462290 + }, + { + "epoch": 1.7871225124089622, + "grad_norm": 0.12940038740634918, + "learning_rate": 0.00042139301914631067, + "loss": 2.1905, + "step": 462300 + }, + { + "epoch": 1.7871611696123457, + "grad_norm": 0.14498552680015564, + "learning_rate": 0.0004212663302507291, + "loss": 2.1898, + "step": 462310 + }, + { + "epoch": 1.787199826815729, + "grad_norm": 0.12709790468215942, + "learning_rate": 0.000421139651520756, + "loss": 2.191, + "step": 462320 + }, + { + "epoch": 1.7872384840191122, + "grad_norm": 0.13836999237537384, + "learning_rate": 0.00042101298295394465, + "loss": 2.1813, + "step": 462330 + }, + { + "epoch": 1.7872771412224955, + "grad_norm": 0.15703056752681732, + "learning_rate": 0.00042088632454784935, + "loss": 2.1906, + "step": 462340 + }, + { + "epoch": 1.7873157984258787, + "grad_norm": 0.13247722387313843, + "learning_rate": 0.0004207596763000254, + "loss": 2.1802, + "step": 462350 + }, + { + "epoch": 1.787354455629262, + "grad_norm": 0.13744854927062988, + "learning_rate": 0.00042063303820802925, + "loss": 2.173, + "step": 462360 + }, + { + "epoch": 1.7873931128326452, + "grad_norm": 0.12450604140758514, + "learning_rate": 0.00042050641026941806, + "loss": 2.1786, + "step": 462370 + }, + { + "epoch": 1.7874317700360285, + "grad_norm": 0.13161182403564453, + "learning_rate": 0.0004203797924817496, + "loss": 2.1803, + "step": 462380 + }, + { + "epoch": 1.7874704272394117, + "grad_norm": 0.13816915452480316, + "learning_rate": 0.00042025318484258367, + "loss": 2.1695, + "step": 462390 + }, + { + "epoch": 1.787509084442795, + "grad_norm": 0.13009148836135864, + "learning_rate": 0.0004201265873494802, + "loss": 2.1891, + "step": 462400 + }, + { + "epoch": 1.7875477416461782, + "grad_norm": 0.13122321665287018, + "learning_rate": 0.00041999999999999996, + "loss": 2.2087, + "step": 462410 + }, + { + "epoch": 1.7875863988495615, + "grad_norm": 0.13132932782173157, + "learning_rate": 0.0004198734227917056, + "loss": 2.1995, + "step": 462420 + }, + { + "epoch": 1.7876250560529447, + "grad_norm": 0.13773812353610992, + "learning_rate": 0.00041974685572216, + "loss": 2.175, + "step": 462430 + }, + { + "epoch": 1.7876637132563282, + "grad_norm": 0.1318158358335495, + "learning_rate": 0.00041962029878892725, + "loss": 2.179, + "step": 462440 + }, + { + "epoch": 1.7877023704597115, + "grad_norm": 0.13777051866054535, + "learning_rate": 0.00041949375198957207, + "loss": 2.1998, + "step": 462450 + }, + { + "epoch": 1.7877410276630947, + "grad_norm": 0.13231171667575836, + "learning_rate": 0.000419367215321661, + "loss": 2.1742, + "step": 462460 + }, + { + "epoch": 1.787779684866478, + "grad_norm": 0.13529814779758453, + "learning_rate": 0.0004192406887827609, + "loss": 2.1972, + "step": 462470 + }, + { + "epoch": 1.7878183420698615, + "grad_norm": 0.1315818727016449, + "learning_rate": 0.0004191141723704397, + "loss": 2.1843, + "step": 462480 + }, + { + "epoch": 1.7878569992732447, + "grad_norm": 0.15160876512527466, + "learning_rate": 0.0004189876660822662, + "loss": 2.1826, + "step": 462490 + }, + { + "epoch": 1.787895656476628, + "grad_norm": 0.14363962411880493, + "learning_rate": 0.00041886116991581024, + "loss": 2.1811, + "step": 462500 + }, + { + "epoch": 1.7879343136800112, + "grad_norm": 0.13013380765914917, + "learning_rate": 0.0004187346838686432, + "loss": 2.1926, + "step": 462510 + }, + { + "epoch": 1.7879729708833945, + "grad_norm": 0.14183056354522705, + "learning_rate": 0.0004186082079383364, + "loss": 2.1842, + "step": 462520 + }, + { + "epoch": 1.7880116280867777, + "grad_norm": 0.1285829097032547, + "learning_rate": 0.00041848174212246294, + "loss": 2.1828, + "step": 462530 + }, + { + "epoch": 1.788050285290161, + "grad_norm": 0.2322116643190384, + "learning_rate": 0.00041835528641859644, + "loss": 2.1752, + "step": 462540 + }, + { + "epoch": 1.7880889424935442, + "grad_norm": 0.1346760243177414, + "learning_rate": 0.000418228840824312, + "loss": 2.1777, + "step": 462550 + }, + { + "epoch": 1.7881275996969275, + "grad_norm": 0.13815739750862122, + "learning_rate": 0.00041810240533718493, + "loss": 2.1846, + "step": 462560 + }, + { + "epoch": 1.7881662569003107, + "grad_norm": 0.13315899670124054, + "learning_rate": 0.0004179759799547922, + "loss": 2.1878, + "step": 462570 + }, + { + "epoch": 1.788204914103694, + "grad_norm": 0.13540950417518616, + "learning_rate": 0.00041784956467471137, + "loss": 2.201, + "step": 462580 + }, + { + "epoch": 1.7882435713070772, + "grad_norm": 0.1370074301958084, + "learning_rate": 0.0004177231594945212, + "loss": 2.1808, + "step": 462590 + }, + { + "epoch": 1.7882822285104605, + "grad_norm": 0.15163642168045044, + "learning_rate": 0.0004175967644118015, + "loss": 2.1865, + "step": 462600 + }, + { + "epoch": 1.788320885713844, + "grad_norm": 0.12375643104314804, + "learning_rate": 0.00041747037942413233, + "loss": 2.1816, + "step": 462610 + }, + { + "epoch": 1.7883595429172272, + "grad_norm": 0.132793590426445, + "learning_rate": 0.00041734400452909546, + "loss": 2.1728, + "step": 462620 + }, + { + "epoch": 1.7883982001206105, + "grad_norm": 0.1314646154642105, + "learning_rate": 0.0004172176397242735, + "loss": 2.1967, + "step": 462630 + }, + { + "epoch": 1.7884368573239937, + "grad_norm": 0.14199237525463104, + "learning_rate": 0.0004170912850072497, + "loss": 2.192, + "step": 462640 + }, + { + "epoch": 1.7884755145273772, + "grad_norm": 0.12800417840480804, + "learning_rate": 0.00041696494037560884, + "loss": 2.1822, + "step": 462650 + }, + { + "epoch": 1.7885141717307604, + "grad_norm": 0.13144774734973907, + "learning_rate": 0.00041683860582693576, + "loss": 2.1896, + "step": 462660 + }, + { + "epoch": 1.7885528289341437, + "grad_norm": 0.13051064312458038, + "learning_rate": 0.00041671228135881756, + "loss": 2.1785, + "step": 462670 + }, + { + "epoch": 1.788591486137527, + "grad_norm": 0.13029839098453522, + "learning_rate": 0.000416585966968841, + "loss": 2.2002, + "step": 462680 + }, + { + "epoch": 1.7886301433409102, + "grad_norm": 0.13918250799179077, + "learning_rate": 0.00041645966265459465, + "loss": 2.1887, + "step": 462690 + }, + { + "epoch": 1.7886688005442934, + "grad_norm": 0.13804082572460175, + "learning_rate": 0.00041633336841366764, + "loss": 2.1915, + "step": 462700 + }, + { + "epoch": 1.7887074577476767, + "grad_norm": 0.1327112466096878, + "learning_rate": 0.0004162070842436503, + "loss": 2.1787, + "step": 462710 + }, + { + "epoch": 1.78874611495106, + "grad_norm": 0.13810524344444275, + "learning_rate": 0.0004160808101421336, + "loss": 2.1799, + "step": 462720 + }, + { + "epoch": 1.7887847721544432, + "grad_norm": 0.13068757951259613, + "learning_rate": 0.0004159545461067098, + "loss": 2.1706, + "step": 462730 + }, + { + "epoch": 1.7888234293578265, + "grad_norm": 0.1321135014295578, + "learning_rate": 0.00041582829213497184, + "loss": 2.1871, + "step": 462740 + }, + { + "epoch": 1.7888620865612097, + "grad_norm": 0.13681761920452118, + "learning_rate": 0.00041570204822451417, + "loss": 2.1958, + "step": 462750 + }, + { + "epoch": 1.788900743764593, + "grad_norm": 0.13599014282226562, + "learning_rate": 0.00041557581437293113, + "loss": 2.1828, + "step": 462760 + }, + { + "epoch": 1.7889394009679762, + "grad_norm": 0.13342493772506714, + "learning_rate": 0.0004154495905778195, + "loss": 2.1702, + "step": 462770 + }, + { + "epoch": 1.7889780581713597, + "grad_norm": 0.13734008371829987, + "learning_rate": 0.0004153233768367755, + "loss": 2.1812, + "step": 462780 + }, + { + "epoch": 1.789016715374743, + "grad_norm": 0.1402251124382019, + "learning_rate": 0.0004151971731473976, + "loss": 2.2038, + "step": 462790 + }, + { + "epoch": 1.7890553725781262, + "grad_norm": 0.12966884672641754, + "learning_rate": 0.000415070979507284, + "loss": 2.1917, + "step": 462800 + }, + { + "epoch": 1.7890940297815094, + "grad_norm": 0.12560537457466125, + "learning_rate": 0.00041494479591403514, + "loss": 2.1611, + "step": 462810 + }, + { + "epoch": 1.789132686984893, + "grad_norm": 0.1583850383758545, + "learning_rate": 0.00041481862236525125, + "loss": 2.1862, + "step": 462820 + }, + { + "epoch": 1.7891713441882762, + "grad_norm": 0.1309800148010254, + "learning_rate": 0.0004146924588585344, + "loss": 2.2087, + "step": 462830 + }, + { + "epoch": 1.7892100013916594, + "grad_norm": 0.14885538816452026, + "learning_rate": 0.000414566305391487, + "loss": 2.1931, + "step": 462840 + }, + { + "epoch": 1.7892486585950427, + "grad_norm": 0.1438407003879547, + "learning_rate": 0.0004144401619617126, + "loss": 2.2001, + "step": 462850 + }, + { + "epoch": 1.789287315798426, + "grad_norm": 0.13319040834903717, + "learning_rate": 0.0004143140285668161, + "loss": 2.1934, + "step": 462860 + }, + { + "epoch": 1.7893259730018092, + "grad_norm": 0.14416548609733582, + "learning_rate": 0.00041418790520440286, + "loss": 2.1855, + "step": 462870 + }, + { + "epoch": 1.7893646302051924, + "grad_norm": 0.141445130109787, + "learning_rate": 0.00041406179187207927, + "loss": 2.1994, + "step": 462880 + }, + { + "epoch": 1.7894032874085757, + "grad_norm": 0.12892618775367737, + "learning_rate": 0.00041393568856745277, + "loss": 2.1844, + "step": 462890 + }, + { + "epoch": 1.789441944611959, + "grad_norm": 0.12439202517271042, + "learning_rate": 0.0004138095952881318, + "loss": 2.1728, + "step": 462900 + }, + { + "epoch": 1.7894806018153422, + "grad_norm": 0.12896130979061127, + "learning_rate": 0.0004136835120317257, + "loss": 2.1916, + "step": 462910 + }, + { + "epoch": 1.7895192590187254, + "grad_norm": 0.13055801391601562, + "learning_rate": 0.0004135574387958447, + "loss": 2.1768, + "step": 462920 + }, + { + "epoch": 1.7895579162221087, + "grad_norm": 0.13586017489433289, + "learning_rate": 0.00041343137557810005, + "loss": 2.1835, + "step": 462930 + }, + { + "epoch": 1.789596573425492, + "grad_norm": 0.12946003675460815, + "learning_rate": 0.0004133053223761039, + "loss": 2.1799, + "step": 462940 + }, + { + "epoch": 1.7896352306288754, + "grad_norm": 0.13411448895931244, + "learning_rate": 0.0004131792791874691, + "loss": 2.1871, + "step": 462950 + }, + { + "epoch": 1.7896738878322587, + "grad_norm": 0.1489628553390503, + "learning_rate": 0.0004130532460098102, + "loss": 2.1704, + "step": 462960 + }, + { + "epoch": 1.789712545035642, + "grad_norm": 0.13088008761405945, + "learning_rate": 0.0004129272228407419, + "loss": 2.1948, + "step": 462970 + }, + { + "epoch": 1.7897512022390252, + "grad_norm": 0.14513364434242249, + "learning_rate": 0.0004128012096778804, + "loss": 2.1811, + "step": 462980 + }, + { + "epoch": 1.7897898594424086, + "grad_norm": 0.14233975112438202, + "learning_rate": 0.00041267520651884216, + "loss": 2.1659, + "step": 462990 + }, + { + "epoch": 1.789828516645792, + "grad_norm": 0.14351864159107208, + "learning_rate": 0.0004125492133612456, + "loss": 2.2001, + "step": 463000 + }, + { + "epoch": 1.7898671738491752, + "grad_norm": 0.13553811609745026, + "learning_rate": 0.00041242323020270936, + "loss": 2.1786, + "step": 463010 + }, + { + "epoch": 1.7899058310525584, + "grad_norm": 0.1296354979276657, + "learning_rate": 0.00041229725704085274, + "loss": 2.1878, + "step": 463020 + }, + { + "epoch": 1.7899444882559417, + "grad_norm": 0.13192026317119598, + "learning_rate": 0.00041217129387329687, + "loss": 2.1856, + "step": 463030 + }, + { + "epoch": 1.789983145459325, + "grad_norm": 0.14609040319919586, + "learning_rate": 0.00041204534069766343, + "loss": 2.1879, + "step": 463040 + }, + { + "epoch": 1.7900218026627082, + "grad_norm": 0.12881425023078918, + "learning_rate": 0.0004119193975115747, + "loss": 2.1879, + "step": 463050 + }, + { + "epoch": 1.7900604598660914, + "grad_norm": 0.13772840797901154, + "learning_rate": 0.00041179346431265417, + "loss": 2.1975, + "step": 463060 + }, + { + "epoch": 1.7900991170694747, + "grad_norm": 0.12743765115737915, + "learning_rate": 0.0004116675410985269, + "loss": 2.1731, + "step": 463070 + }, + { + "epoch": 1.790137774272858, + "grad_norm": 0.1352182924747467, + "learning_rate": 0.0004115416278668176, + "loss": 2.1944, + "step": 463080 + }, + { + "epoch": 1.7901764314762412, + "grad_norm": 0.13374656438827515, + "learning_rate": 0.00041141572461515286, + "loss": 2.1887, + "step": 463090 + }, + { + "epoch": 1.7902150886796244, + "grad_norm": 0.13277378678321838, + "learning_rate": 0.00041128983134115993, + "loss": 2.1919, + "step": 463100 + }, + { + "epoch": 1.7902537458830077, + "grad_norm": 0.13288132846355438, + "learning_rate": 0.00041116394804246716, + "loss": 2.1795, + "step": 463110 + }, + { + "epoch": 1.7902924030863911, + "grad_norm": 0.12260843068361282, + "learning_rate": 0.00041103807471670376, + "loss": 2.1794, + "step": 463120 + }, + { + "epoch": 1.7903310602897744, + "grad_norm": 0.13363111019134521, + "learning_rate": 0.00041091221136149936, + "loss": 2.1825, + "step": 463130 + }, + { + "epoch": 1.7903697174931577, + "grad_norm": 0.13432249426841736, + "learning_rate": 0.00041078635797448573, + "loss": 2.1814, + "step": 463140 + }, + { + "epoch": 1.790408374696541, + "grad_norm": 0.1417021006345749, + "learning_rate": 0.0004106605145532942, + "loss": 2.1927, + "step": 463150 + }, + { + "epoch": 1.7904470318999244, + "grad_norm": 0.13484080135822296, + "learning_rate": 0.0004105346810955577, + "loss": 2.1915, + "step": 463160 + }, + { + "epoch": 1.7904856891033076, + "grad_norm": 0.13451482355594635, + "learning_rate": 0.0004104088575989107, + "loss": 2.1812, + "step": 463170 + }, + { + "epoch": 1.7905243463066909, + "grad_norm": 0.1374129354953766, + "learning_rate": 0.0004102830440609875, + "loss": 2.1901, + "step": 463180 + }, + { + "epoch": 1.7905630035100741, + "grad_norm": 0.14045169949531555, + "learning_rate": 0.00041015724047942405, + "loss": 2.1895, + "step": 463190 + }, + { + "epoch": 1.7906016607134574, + "grad_norm": 0.13701249659061432, + "learning_rate": 0.00041003144685185667, + "loss": 2.1921, + "step": 463200 + }, + { + "epoch": 1.7906403179168406, + "grad_norm": 0.13817450404167175, + "learning_rate": 0.0004099056631759235, + "loss": 2.1708, + "step": 463210 + }, + { + "epoch": 1.790678975120224, + "grad_norm": 0.13564909994602203, + "learning_rate": 0.0004097798894492624, + "loss": 2.195, + "step": 463220 + }, + { + "epoch": 1.7907176323236071, + "grad_norm": 0.13489286601543427, + "learning_rate": 0.00040965412566951344, + "loss": 2.1775, + "step": 463230 + }, + { + "epoch": 1.7907562895269904, + "grad_norm": 0.1390005499124527, + "learning_rate": 0.0004095283718343166, + "loss": 2.1749, + "step": 463240 + }, + { + "epoch": 1.7907949467303736, + "grad_norm": 0.1344020962715149, + "learning_rate": 0.00040940262794131343, + "loss": 2.1894, + "step": 463250 + }, + { + "epoch": 1.790833603933757, + "grad_norm": 0.13207466900348663, + "learning_rate": 0.0004092768939881459, + "loss": 2.1758, + "step": 463260 + }, + { + "epoch": 1.7908722611371402, + "grad_norm": 0.13027574121952057, + "learning_rate": 0.0004091511699724577, + "loss": 2.1879, + "step": 463270 + }, + { + "epoch": 1.7909109183405236, + "grad_norm": 0.1281670182943344, + "learning_rate": 0.0004090254558918927, + "loss": 2.1701, + "step": 463280 + }, + { + "epoch": 1.7909495755439069, + "grad_norm": 0.15068063139915466, + "learning_rate": 0.0004088997517440958, + "loss": 2.1783, + "step": 463290 + }, + { + "epoch": 1.7909882327472901, + "grad_norm": 0.13794660568237305, + "learning_rate": 0.00040877405752671314, + "loss": 2.1744, + "step": 463300 + }, + { + "epoch": 1.7910268899506734, + "grad_norm": 0.13627295196056366, + "learning_rate": 0.0004086483732373916, + "loss": 2.1758, + "step": 463310 + }, + { + "epoch": 1.7910655471540566, + "grad_norm": 0.13109062612056732, + "learning_rate": 0.00040852269887377914, + "loss": 2.1872, + "step": 463320 + }, + { + "epoch": 1.79110420435744, + "grad_norm": 0.14107844233512878, + "learning_rate": 0.0004083970344335244, + "loss": 2.1697, + "step": 463330 + }, + { + "epoch": 1.7911428615608234, + "grad_norm": 0.13397075235843658, + "learning_rate": 0.000408271379914277, + "loss": 2.1803, + "step": 463340 + }, + { + "epoch": 1.7911815187642066, + "grad_norm": 0.14661332964897156, + "learning_rate": 0.0004081457353136877, + "loss": 2.1796, + "step": 463350 + }, + { + "epoch": 1.7912201759675899, + "grad_norm": 0.1381164938211441, + "learning_rate": 0.00040802010062940795, + "loss": 2.1815, + "step": 463360 + }, + { + "epoch": 1.7912588331709731, + "grad_norm": 0.13051655888557434, + "learning_rate": 0.0004078944758590906, + "loss": 2.1773, + "step": 463370 + }, + { + "epoch": 1.7912974903743564, + "grad_norm": 0.12441668659448624, + "learning_rate": 0.0004077688610003885, + "loss": 2.1906, + "step": 463380 + }, + { + "epoch": 1.7913361475777396, + "grad_norm": 0.15715846419334412, + "learning_rate": 0.00040764325605095644, + "loss": 2.1769, + "step": 463390 + }, + { + "epoch": 1.7913748047811229, + "grad_norm": 0.12852106988430023, + "learning_rate": 0.00040751766100844945, + "loss": 2.1843, + "step": 463400 + }, + { + "epoch": 1.7914134619845061, + "grad_norm": 0.13250230252742767, + "learning_rate": 0.00040739207587052364, + "loss": 2.1805, + "step": 463410 + }, + { + "epoch": 1.7914521191878894, + "grad_norm": 0.14088231325149536, + "learning_rate": 0.0004072665006348364, + "loss": 2.1739, + "step": 463420 + }, + { + "epoch": 1.7914907763912726, + "grad_norm": 0.13542966544628143, + "learning_rate": 0.0004071409352990454, + "loss": 2.1783, + "step": 463430 + }, + { + "epoch": 1.7915294335946559, + "grad_norm": 0.14837566018104553, + "learning_rate": 0.00040701537986080985, + "loss": 2.1788, + "step": 463440 + }, + { + "epoch": 1.7915680907980394, + "grad_norm": 0.1346118003129959, + "learning_rate": 0.00040688983431778957, + "loss": 2.1652, + "step": 463450 + }, + { + "epoch": 1.7916067480014226, + "grad_norm": 0.13045181334018707, + "learning_rate": 0.0004067642986676452, + "loss": 2.1795, + "step": 463460 + }, + { + "epoch": 1.7916454052048059, + "grad_norm": 0.14252230525016785, + "learning_rate": 0.00040663877290803897, + "loss": 2.1939, + "step": 463470 + }, + { + "epoch": 1.7916840624081891, + "grad_norm": 0.1370854675769806, + "learning_rate": 0.00040651325703663276, + "loss": 2.1773, + "step": 463480 + }, + { + "epoch": 1.7917227196115724, + "grad_norm": 0.1398986577987671, + "learning_rate": 0.0004063877510510909, + "loss": 2.1854, + "step": 463490 + }, + { + "epoch": 1.7917613768149558, + "grad_norm": 0.13030345737934113, + "learning_rate": 0.0004062622549490773, + "loss": 2.1822, + "step": 463500 + }, + { + "epoch": 1.791800034018339, + "grad_norm": 0.1358768492937088, + "learning_rate": 0.00040613676872825757, + "loss": 2.1887, + "step": 463510 + }, + { + "epoch": 1.7918386912217223, + "grad_norm": 0.13510680198669434, + "learning_rate": 0.00040601129238629797, + "loss": 2.1733, + "step": 463520 + }, + { + "epoch": 1.7918773484251056, + "grad_norm": 0.1370605230331421, + "learning_rate": 0.00040588582592086597, + "loss": 2.1913, + "step": 463530 + }, + { + "epoch": 1.7919160056284889, + "grad_norm": 0.13525953888893127, + "learning_rate": 0.00040576036932962945, + "loss": 2.1771, + "step": 463540 + }, + { + "epoch": 1.791954662831872, + "grad_norm": 0.13494880497455597, + "learning_rate": 0.0004056349226102574, + "loss": 2.1689, + "step": 463550 + }, + { + "epoch": 1.7919933200352554, + "grad_norm": 0.13117605447769165, + "learning_rate": 0.00040550948576042003, + "loss": 2.1933, + "step": 463560 + }, + { + "epoch": 1.7920319772386386, + "grad_norm": 0.14315210282802582, + "learning_rate": 0.0004053840587777884, + "loss": 2.1824, + "step": 463570 + }, + { + "epoch": 1.7920706344420219, + "grad_norm": 0.13220641016960144, + "learning_rate": 0.0004052586416600339, + "loss": 2.1862, + "step": 463580 + }, + { + "epoch": 1.7921092916454051, + "grad_norm": 0.14106839895248413, + "learning_rate": 0.00040513323440482976, + "loss": 2.1879, + "step": 463590 + }, + { + "epoch": 1.7921479488487884, + "grad_norm": 0.13774384558200836, + "learning_rate": 0.0004050078370098491, + "loss": 2.1773, + "step": 463600 + }, + { + "epoch": 1.7921866060521716, + "grad_norm": 0.13437210023403168, + "learning_rate": 0.00040488244947276697, + "loss": 2.2018, + "step": 463610 + }, + { + "epoch": 1.792225263255555, + "grad_norm": 0.14692780375480652, + "learning_rate": 0.00040475707179125854, + "loss": 2.1843, + "step": 463620 + }, + { + "epoch": 1.7922639204589383, + "grad_norm": 0.1312679797410965, + "learning_rate": 0.00040463170396300033, + "loss": 2.1742, + "step": 463630 + }, + { + "epoch": 1.7923025776623216, + "grad_norm": 0.14086544513702393, + "learning_rate": 0.0004045063459856697, + "loss": 2.1907, + "step": 463640 + }, + { + "epoch": 1.7923412348657048, + "grad_norm": 0.13703036308288574, + "learning_rate": 0.0004043809978569446, + "loss": 2.1835, + "step": 463650 + }, + { + "epoch": 1.792379892069088, + "grad_norm": 0.1371353417634964, + "learning_rate": 0.0004042556595745044, + "loss": 2.1831, + "step": 463660 + }, + { + "epoch": 1.7924185492724716, + "grad_norm": 0.1327003687620163, + "learning_rate": 0.00040413033113602913, + "loss": 2.1728, + "step": 463670 + }, + { + "epoch": 1.7924572064758548, + "grad_norm": 0.12607765197753906, + "learning_rate": 0.00040400501253919964, + "loss": 2.1761, + "step": 463680 + }, + { + "epoch": 1.792495863679238, + "grad_norm": 0.1389392912387848, + "learning_rate": 0.0004038797037816979, + "loss": 2.1914, + "step": 463690 + }, + { + "epoch": 1.7925345208826213, + "grad_norm": 0.1530158966779709, + "learning_rate": 0.0004037544048612067, + "loss": 2.182, + "step": 463700 + }, + { + "epoch": 1.7925731780860046, + "grad_norm": 0.13505354523658752, + "learning_rate": 0.00040362911577540973, + "loss": 2.1846, + "step": 463710 + }, + { + "epoch": 1.7926118352893878, + "grad_norm": 0.14173811674118042, + "learning_rate": 0.00040350383652199164, + "loss": 2.1878, + "step": 463720 + }, + { + "epoch": 1.792650492492771, + "grad_norm": 0.14959704875946045, + "learning_rate": 0.0004033785670986376, + "loss": 2.1822, + "step": 463730 + }, + { + "epoch": 1.7926891496961543, + "grad_norm": 0.13770392537117004, + "learning_rate": 0.0004032533075030342, + "loss": 2.187, + "step": 463740 + }, + { + "epoch": 1.7927278068995376, + "grad_norm": 0.14019890129566193, + "learning_rate": 0.0004031280577328689, + "loss": 2.1767, + "step": 463750 + }, + { + "epoch": 1.7927664641029208, + "grad_norm": 0.12711156904697418, + "learning_rate": 0.0004030028177858298, + "loss": 2.18, + "step": 463760 + }, + { + "epoch": 1.792805121306304, + "grad_norm": 0.12975165247917175, + "learning_rate": 0.0004028775876596058, + "loss": 2.1658, + "step": 463770 + }, + { + "epoch": 1.7928437785096873, + "grad_norm": 0.19896434247493744, + "learning_rate": 0.00040275236735188734, + "loss": 2.1886, + "step": 463780 + }, + { + "epoch": 1.7928824357130708, + "grad_norm": 0.14500820636749268, + "learning_rate": 0.0004026271568603652, + "loss": 2.1842, + "step": 463790 + }, + { + "epoch": 1.792921092916454, + "grad_norm": 0.15005642175674438, + "learning_rate": 0.00040250195618273145, + "loss": 2.188, + "step": 463800 + }, + { + "epoch": 1.7929597501198373, + "grad_norm": 0.13256584107875824, + "learning_rate": 0.00040237676531667833, + "loss": 2.1978, + "step": 463810 + }, + { + "epoch": 1.7929984073232206, + "grad_norm": 0.14087162911891937, + "learning_rate": 0.00040225158425989996, + "loss": 2.166, + "step": 463820 + }, + { + "epoch": 1.793037064526604, + "grad_norm": 0.12890873849391937, + "learning_rate": 0.0004021264130100906, + "loss": 2.1969, + "step": 463830 + }, + { + "epoch": 1.7930757217299873, + "grad_norm": 0.13375651836395264, + "learning_rate": 0.0004020012515649458, + "loss": 2.1854, + "step": 463840 + }, + { + "epoch": 1.7931143789333706, + "grad_norm": 0.12807835638523102, + "learning_rate": 0.00040187609992216203, + "loss": 2.1808, + "step": 463850 + }, + { + "epoch": 1.7931530361367538, + "grad_norm": 0.14423508942127228, + "learning_rate": 0.0004017509580794363, + "loss": 2.1934, + "step": 463860 + }, + { + "epoch": 1.793191693340137, + "grad_norm": 0.13110533356666565, + "learning_rate": 0.000401625826034467, + "loss": 2.1802, + "step": 463870 + }, + { + "epoch": 1.7932303505435203, + "grad_norm": 0.1442173719406128, + "learning_rate": 0.00040150070378495317, + "loss": 2.1688, + "step": 463880 + }, + { + "epoch": 1.7932690077469036, + "grad_norm": 0.15013518929481506, + "learning_rate": 0.0004013755913285948, + "loss": 2.1949, + "step": 463890 + }, + { + "epoch": 1.7933076649502868, + "grad_norm": 0.13914164900779724, + "learning_rate": 0.00040125048866309254, + "loss": 2.1994, + "step": 463900 + }, + { + "epoch": 1.79334632215367, + "grad_norm": 0.12703953683376312, + "learning_rate": 0.0004011253957861485, + "loss": 2.1705, + "step": 463910 + }, + { + "epoch": 1.7933849793570533, + "grad_norm": 0.12627583742141724, + "learning_rate": 0.0004010003126954653, + "loss": 2.1905, + "step": 463920 + }, + { + "epoch": 1.7934236365604366, + "grad_norm": 0.1557985097169876, + "learning_rate": 0.00040087523938874627, + "loss": 2.1901, + "step": 463930 + }, + { + "epoch": 1.7934622937638198, + "grad_norm": 0.1482551395893097, + "learning_rate": 0.0004007501758636958, + "loss": 2.1745, + "step": 463940 + }, + { + "epoch": 1.793500950967203, + "grad_norm": 0.14754800498485565, + "learning_rate": 0.00040062512211801947, + "loss": 2.1624, + "step": 463950 + }, + { + "epoch": 1.7935396081705866, + "grad_norm": 0.1354418247938156, + "learning_rate": 0.0004005000781494237, + "loss": 2.1827, + "step": 463960 + }, + { + "epoch": 1.7935782653739698, + "grad_norm": 0.13364222645759583, + "learning_rate": 0.0004003750439556151, + "loss": 2.1831, + "step": 463970 + }, + { + "epoch": 1.793616922577353, + "grad_norm": 0.14250802993774414, + "learning_rate": 0.0004002500195343024, + "loss": 2.1793, + "step": 463980 + }, + { + "epoch": 1.7936555797807363, + "grad_norm": 0.1395851969718933, + "learning_rate": 0.00040012500488319393, + "loss": 2.1871, + "step": 463990 + }, + { + "epoch": 1.7936942369841198, + "grad_norm": 0.14435814321041107, + "learning_rate": 0.0003999999999999999, + "loss": 2.1865, + "step": 464000 + }, + { + "epoch": 1.793732894187503, + "grad_norm": 0.1335323601961136, + "learning_rate": 0.0003998750048824311, + "loss": 2.1904, + "step": 464010 + }, + { + "epoch": 1.7937715513908863, + "grad_norm": 0.13234373927116394, + "learning_rate": 0.00039975001952819887, + "loss": 2.1629, + "step": 464020 + }, + { + "epoch": 1.7938102085942695, + "grad_norm": 0.13462892174720764, + "learning_rate": 0.00039962504393501597, + "loss": 2.1843, + "step": 464030 + }, + { + "epoch": 1.7938488657976528, + "grad_norm": 0.12947219610214233, + "learning_rate": 0.0003995000781005955, + "loss": 2.1693, + "step": 464040 + }, + { + "epoch": 1.793887523001036, + "grad_norm": 0.144515722990036, + "learning_rate": 0.000399375122022652, + "loss": 2.1667, + "step": 464050 + }, + { + "epoch": 1.7939261802044193, + "grad_norm": 0.13721828162670135, + "learning_rate": 0.0003992501756989009, + "loss": 2.1779, + "step": 464060 + }, + { + "epoch": 1.7939648374078025, + "grad_norm": 0.1374235600233078, + "learning_rate": 0.0003991252391270577, + "loss": 2.1787, + "step": 464070 + }, + { + "epoch": 1.7940034946111858, + "grad_norm": 0.1416328251361847, + "learning_rate": 0.00039900031230484, + "loss": 2.189, + "step": 464080 + }, + { + "epoch": 1.794042151814569, + "grad_norm": 0.14912135899066925, + "learning_rate": 0.0003988753952299653, + "loss": 2.1813, + "step": 464090 + }, + { + "epoch": 1.7940808090179523, + "grad_norm": 0.13194890320301056, + "learning_rate": 0.0003987504879001524, + "loss": 2.187, + "step": 464100 + }, + { + "epoch": 1.7941194662213356, + "grad_norm": 0.13360001146793365, + "learning_rate": 0.0003986255903131211, + "loss": 2.1908, + "step": 464110 + }, + { + "epoch": 1.7941581234247188, + "grad_norm": 0.13524281978607178, + "learning_rate": 0.0003985007024665919, + "loss": 2.1818, + "step": 464120 + }, + { + "epoch": 1.7941967806281023, + "grad_norm": 0.12458459287881851, + "learning_rate": 0.000398375824358286, + "loss": 2.1824, + "step": 464130 + }, + { + "epoch": 1.7942354378314855, + "grad_norm": 0.14129717648029327, + "learning_rate": 0.000398250955985926, + "loss": 2.1798, + "step": 464140 + }, + { + "epoch": 1.7942740950348688, + "grad_norm": 0.13172756135463715, + "learning_rate": 0.00039812609734723515, + "loss": 2.1876, + "step": 464150 + }, + { + "epoch": 1.794312752238252, + "grad_norm": 0.14094604551792145, + "learning_rate": 0.0003980012484399371, + "loss": 2.1692, + "step": 464160 + }, + { + "epoch": 1.7943514094416355, + "grad_norm": 0.14977578818798065, + "learning_rate": 0.00039787640926175727, + "loss": 2.1665, + "step": 464170 + }, + { + "epoch": 1.7943900666450188, + "grad_norm": 0.1476397067308426, + "learning_rate": 0.0003977515798104214, + "loss": 2.1795, + "step": 464180 + }, + { + "epoch": 1.794428723848402, + "grad_norm": 0.13331814110279083, + "learning_rate": 0.00039762676008365627, + "loss": 2.1777, + "step": 464190 + }, + { + "epoch": 1.7944673810517853, + "grad_norm": 0.1343361735343933, + "learning_rate": 0.00039750195007918967, + "loss": 2.1882, + "step": 464200 + }, + { + "epoch": 1.7945060382551685, + "grad_norm": 0.14114736020565033, + "learning_rate": 0.0003973771497947498, + "loss": 2.1801, + "step": 464210 + }, + { + "epoch": 1.7945446954585518, + "grad_norm": 0.14076592028141022, + "learning_rate": 0.00039725235922806614, + "loss": 2.1783, + "step": 464220 + }, + { + "epoch": 1.794583352661935, + "grad_norm": 0.1326855570077896, + "learning_rate": 0.00039712757837686907, + "loss": 2.1827, + "step": 464230 + }, + { + "epoch": 1.7946220098653183, + "grad_norm": 0.14001700282096863, + "learning_rate": 0.0003970028072388898, + "loss": 2.1758, + "step": 464240 + }, + { + "epoch": 1.7946606670687015, + "grad_norm": 0.12555120885372162, + "learning_rate": 0.00039687804581186036, + "loss": 2.1796, + "step": 464250 + }, + { + "epoch": 1.7946993242720848, + "grad_norm": 0.13253574073314667, + "learning_rate": 0.00039675329409351366, + "loss": 2.2041, + "step": 464260 + }, + { + "epoch": 1.794737981475468, + "grad_norm": 0.14871980249881744, + "learning_rate": 0.0003966285520815833, + "loss": 2.1794, + "step": 464270 + }, + { + "epoch": 1.7947766386788513, + "grad_norm": 0.16246652603149414, + "learning_rate": 0.00039650381977380443, + "loss": 2.1792, + "step": 464280 + }, + { + "epoch": 1.7948152958822345, + "grad_norm": 0.137555330991745, + "learning_rate": 0.0003963790971679122, + "loss": 2.1873, + "step": 464290 + }, + { + "epoch": 1.794853953085618, + "grad_norm": 0.14714735746383667, + "learning_rate": 0.0003962543842616435, + "loss": 2.1782, + "step": 464300 + }, + { + "epoch": 1.7948926102890013, + "grad_norm": 0.13641484081745148, + "learning_rate": 0.0003961296810527355, + "loss": 2.1937, + "step": 464310 + }, + { + "epoch": 1.7949312674923845, + "grad_norm": 0.12888626754283905, + "learning_rate": 0.0003960049875389262, + "loss": 2.1855, + "step": 464320 + }, + { + "epoch": 1.7949699246957678, + "grad_norm": 0.13622303307056427, + "learning_rate": 0.000395880303717955, + "loss": 2.1954, + "step": 464330 + }, + { + "epoch": 1.7950085818991512, + "grad_norm": 0.13224974274635315, + "learning_rate": 0.00039575562958756196, + "loss": 2.1882, + "step": 464340 + }, + { + "epoch": 1.7950472391025345, + "grad_norm": 0.12995639443397522, + "learning_rate": 0.00039563096514548746, + "loss": 2.1894, + "step": 464350 + }, + { + "epoch": 1.7950858963059177, + "grad_norm": 0.13467252254486084, + "learning_rate": 0.00039550631038947384, + "loss": 2.1652, + "step": 464360 + }, + { + "epoch": 1.795124553509301, + "grad_norm": 0.14471621811389923, + "learning_rate": 0.0003953816653172631, + "loss": 2.1806, + "step": 464370 + }, + { + "epoch": 1.7951632107126843, + "grad_norm": 0.1446365863084793, + "learning_rate": 0.00039525702992659916, + "loss": 2.1708, + "step": 464380 + }, + { + "epoch": 1.7952018679160675, + "grad_norm": 0.14779379963874817, + "learning_rate": 0.0003951324042152262, + "loss": 2.1792, + "step": 464390 + }, + { + "epoch": 1.7952405251194508, + "grad_norm": 0.13273638486862183, + "learning_rate": 0.00039500778818088954, + "loss": 2.1942, + "step": 464400 + }, + { + "epoch": 1.795279182322834, + "grad_norm": 0.14872750639915466, + "learning_rate": 0.00039488318182133545, + "loss": 2.1748, + "step": 464410 + }, + { + "epoch": 1.7953178395262173, + "grad_norm": 0.1282881498336792, + "learning_rate": 0.00039475858513431073, + "loss": 2.1705, + "step": 464420 + }, + { + "epoch": 1.7953564967296005, + "grad_norm": 0.13489757478237152, + "learning_rate": 0.0003946339981175633, + "loss": 2.188, + "step": 464430 + }, + { + "epoch": 1.7953951539329838, + "grad_norm": 0.14069312810897827, + "learning_rate": 0.0003945094207688418, + "loss": 2.1712, + "step": 464440 + }, + { + "epoch": 1.795433811136367, + "grad_norm": 0.14453202486038208, + "learning_rate": 0.000394384853085896, + "loss": 2.1757, + "step": 464450 + }, + { + "epoch": 1.7954724683397503, + "grad_norm": 0.13123004138469696, + "learning_rate": 0.0003942602950664762, + "loss": 2.1844, + "step": 464460 + }, + { + "epoch": 1.7955111255431337, + "grad_norm": 0.12672416865825653, + "learning_rate": 0.00039413574670833394, + "loss": 2.1749, + "step": 464470 + }, + { + "epoch": 1.795549782746517, + "grad_norm": 0.13173723220825195, + "learning_rate": 0.00039401120800922155, + "loss": 2.1734, + "step": 464480 + }, + { + "epoch": 1.7955884399499003, + "grad_norm": 0.13183413445949554, + "learning_rate": 0.0003938866789668918, + "loss": 2.1802, + "step": 464490 + }, + { + "epoch": 1.7956270971532835, + "grad_norm": 0.1551782786846161, + "learning_rate": 0.000393762159579099, + "loss": 2.1851, + "step": 464500 + }, + { + "epoch": 1.795665754356667, + "grad_norm": 0.14472754299640656, + "learning_rate": 0.0003936376498435976, + "loss": 2.1806, + "step": 464510 + }, + { + "epoch": 1.7957044115600502, + "grad_norm": 0.12884870171546936, + "learning_rate": 0.0003935131497581439, + "loss": 2.1793, + "step": 464520 + }, + { + "epoch": 1.7957430687634335, + "grad_norm": 0.1283208727836609, + "learning_rate": 0.0003933886593204938, + "loss": 2.1979, + "step": 464530 + }, + { + "epoch": 1.7957817259668167, + "grad_norm": 0.12944021821022034, + "learning_rate": 0.00039326417852840543, + "loss": 2.1765, + "step": 464540 + }, + { + "epoch": 1.7958203831702, + "grad_norm": 0.13233332335948944, + "learning_rate": 0.00039313970737963656, + "loss": 2.1776, + "step": 464550 + }, + { + "epoch": 1.7958590403735832, + "grad_norm": 0.13329827785491943, + "learning_rate": 0.0003930152458719467, + "loss": 2.1701, + "step": 464560 + }, + { + "epoch": 1.7958976975769665, + "grad_norm": 0.14701420068740845, + "learning_rate": 0.0003928907940030957, + "loss": 2.1762, + "step": 464570 + }, + { + "epoch": 1.7959363547803497, + "grad_norm": 0.1379716545343399, + "learning_rate": 0.0003927663517708446, + "loss": 2.1741, + "step": 464580 + }, + { + "epoch": 1.795975011983733, + "grad_norm": 0.14706137776374817, + "learning_rate": 0.00039264191917295536, + "loss": 2.177, + "step": 464590 + }, + { + "epoch": 1.7960136691871162, + "grad_norm": 0.14323627948760986, + "learning_rate": 0.00039251749620719046, + "loss": 2.1757, + "step": 464600 + }, + { + "epoch": 1.7960523263904995, + "grad_norm": 0.14055584371089935, + "learning_rate": 0.0003923930828713134, + "loss": 2.1903, + "step": 464610 + }, + { + "epoch": 1.7960909835938828, + "grad_norm": 0.14579209685325623, + "learning_rate": 0.0003922686791630885, + "loss": 2.1685, + "step": 464620 + }, + { + "epoch": 1.796129640797266, + "grad_norm": 0.14397454261779785, + "learning_rate": 0.0003921442850802812, + "loss": 2.1694, + "step": 464630 + }, + { + "epoch": 1.7961682980006495, + "grad_norm": 0.13405250012874603, + "learning_rate": 0.0003920199006206575, + "loss": 2.1845, + "step": 464640 + }, + { + "epoch": 1.7962069552040327, + "grad_norm": 0.12927570939064026, + "learning_rate": 0.00039189552578198453, + "loss": 2.1854, + "step": 464650 + }, + { + "epoch": 1.796245612407416, + "grad_norm": 0.12672555446624756, + "learning_rate": 0.00039177116056203, + "loss": 2.1822, + "step": 464660 + }, + { + "epoch": 1.7962842696107992, + "grad_norm": 0.12893100082874298, + "learning_rate": 0.00039164680495856264, + "loss": 2.1945, + "step": 464670 + }, + { + "epoch": 1.7963229268141827, + "grad_norm": 0.15712900459766388, + "learning_rate": 0.000391522458969352, + "loss": 2.188, + "step": 464680 + }, + { + "epoch": 1.796361584017566, + "grad_norm": 0.13878193497657776, + "learning_rate": 0.00039139812259216856, + "loss": 2.189, + "step": 464690 + }, + { + "epoch": 1.7964002412209492, + "grad_norm": 0.13633514940738678, + "learning_rate": 0.0003912737958247836, + "loss": 2.169, + "step": 464700 + }, + { + "epoch": 1.7964388984243325, + "grad_norm": 0.14353691041469574, + "learning_rate": 0.0003911494786649694, + "loss": 2.1845, + "step": 464710 + }, + { + "epoch": 1.7964775556277157, + "grad_norm": 0.18341241776943207, + "learning_rate": 0.00039102517111049863, + "loss": 2.1812, + "step": 464720 + }, + { + "epoch": 1.796516212831099, + "grad_norm": 0.13173289597034454, + "learning_rate": 0.0003909008731591457, + "loss": 2.168, + "step": 464730 + }, + { + "epoch": 1.7965548700344822, + "grad_norm": 0.12816593050956726, + "learning_rate": 0.00039077658480868484, + "loss": 2.1878, + "step": 464740 + }, + { + "epoch": 1.7965935272378655, + "grad_norm": 0.1346024125814438, + "learning_rate": 0.0003906523060568918, + "loss": 2.1862, + "step": 464750 + }, + { + "epoch": 1.7966321844412487, + "grad_norm": 0.14077845215797424, + "learning_rate": 0.00039052803690154336, + "loss": 2.1944, + "step": 464760 + }, + { + "epoch": 1.796670841644632, + "grad_norm": 0.1459149569272995, + "learning_rate": 0.0003904037773404163, + "loss": 2.1837, + "step": 464770 + }, + { + "epoch": 1.7967094988480152, + "grad_norm": 0.13631995022296906, + "learning_rate": 0.00039027952737128916, + "loss": 2.1902, + "step": 464780 + }, + { + "epoch": 1.7967481560513985, + "grad_norm": 0.14161813259124756, + "learning_rate": 0.0003901552869919409, + "loss": 2.1842, + "step": 464790 + }, + { + "epoch": 1.7967868132547817, + "grad_norm": 0.13641470670700073, + "learning_rate": 0.00039003105620015146, + "loss": 2.1826, + "step": 464800 + }, + { + "epoch": 1.7968254704581652, + "grad_norm": 0.13865359127521515, + "learning_rate": 0.0003899068349937014, + "loss": 2.1938, + "step": 464810 + }, + { + "epoch": 1.7968641276615485, + "grad_norm": 0.1396956443786621, + "learning_rate": 0.0003897826233703725, + "loss": 2.1741, + "step": 464820 + }, + { + "epoch": 1.7969027848649317, + "grad_norm": 0.13643811643123627, + "learning_rate": 0.00038965842132794706, + "loss": 2.1956, + "step": 464830 + }, + { + "epoch": 1.796941442068315, + "grad_norm": 0.13304957747459412, + "learning_rate": 0.0003895342288642083, + "loss": 2.1757, + "step": 464840 + }, + { + "epoch": 1.7969800992716984, + "grad_norm": 0.1358797699213028, + "learning_rate": 0.00038941004597694076, + "loss": 2.1777, + "step": 464850 + }, + { + "epoch": 1.7970187564750817, + "grad_norm": 0.139873206615448, + "learning_rate": 0.00038928587266392925, + "loss": 2.1837, + "step": 464860 + }, + { + "epoch": 1.797057413678465, + "grad_norm": 0.14250683784484863, + "learning_rate": 0.00038916170892295957, + "loss": 2.1822, + "step": 464870 + }, + { + "epoch": 1.7970960708818482, + "grad_norm": 0.1290065497159958, + "learning_rate": 0.00038903755475181855, + "loss": 2.1697, + "step": 464880 + }, + { + "epoch": 1.7971347280852314, + "grad_norm": 0.1323602795600891, + "learning_rate": 0.00038891341014829365, + "loss": 2.1984, + "step": 464890 + }, + { + "epoch": 1.7971733852886147, + "grad_norm": 0.13214319944381714, + "learning_rate": 0.0003887892751101736, + "loss": 2.171, + "step": 464900 + }, + { + "epoch": 1.797212042491998, + "grad_norm": 0.1308826357126236, + "learning_rate": 0.00038866514963524734, + "loss": 2.17, + "step": 464910 + }, + { + "epoch": 1.7972506996953812, + "grad_norm": 0.13486738502979279, + "learning_rate": 0.0003885410337213051, + "loss": 2.1815, + "step": 464920 + }, + { + "epoch": 1.7972893568987645, + "grad_norm": 0.13937675952911377, + "learning_rate": 0.00038841692736613797, + "loss": 2.1841, + "step": 464930 + }, + { + "epoch": 1.7973280141021477, + "grad_norm": 0.1473415046930313, + "learning_rate": 0.00038829283056753775, + "loss": 2.1837, + "step": 464940 + }, + { + "epoch": 1.797366671305531, + "grad_norm": 0.1369243860244751, + "learning_rate": 0.000388168743323297, + "loss": 2.1827, + "step": 464950 + }, + { + "epoch": 1.7974053285089142, + "grad_norm": 0.13933010399341583, + "learning_rate": 0.0003880446656312093, + "loss": 2.1799, + "step": 464960 + }, + { + "epoch": 1.7974439857122975, + "grad_norm": 0.13083308935165405, + "learning_rate": 0.00038792059748906895, + "loss": 2.183, + "step": 464970 + }, + { + "epoch": 1.797482642915681, + "grad_norm": 0.13724645972251892, + "learning_rate": 0.0003877965388946716, + "loss": 2.1802, + "step": 464980 + }, + { + "epoch": 1.7975213001190642, + "grad_norm": 0.1350887417793274, + "learning_rate": 0.00038767248984581304, + "loss": 2.1946, + "step": 464990 + }, + { + "epoch": 1.7975599573224474, + "grad_norm": 0.1418774127960205, + "learning_rate": 0.00038754845034028994, + "loss": 2.1783, + "step": 465000 + }, + { + "epoch": 1.7975986145258307, + "grad_norm": 0.14340370893478394, + "learning_rate": 0.0003874244203759005, + "loss": 2.1918, + "step": 465010 + }, + { + "epoch": 1.7976372717292142, + "grad_norm": 0.1408160775899887, + "learning_rate": 0.0003873003999504434, + "loss": 2.1886, + "step": 465020 + }, + { + "epoch": 1.7976759289325974, + "grad_norm": 0.14962686598300934, + "learning_rate": 0.0003871763890617177, + "loss": 2.1787, + "step": 465030 + }, + { + "epoch": 1.7977145861359807, + "grad_norm": 0.12789714336395264, + "learning_rate": 0.00038705238770752383, + "loss": 2.1711, + "step": 465040 + }, + { + "epoch": 1.797753243339364, + "grad_norm": 0.13508401811122894, + "learning_rate": 0.0003869283958856631, + "loss": 2.1685, + "step": 465050 + }, + { + "epoch": 1.7977919005427472, + "grad_norm": 0.14755527675151825, + "learning_rate": 0.00038680441359393745, + "loss": 2.1775, + "step": 465060 + }, + { + "epoch": 1.7978305577461304, + "grad_norm": 0.1413993090391159, + "learning_rate": 0.00038668044083014987, + "loss": 2.1773, + "step": 465070 + }, + { + "epoch": 1.7978692149495137, + "grad_norm": 0.13822470605373383, + "learning_rate": 0.00038655647759210354, + "loss": 2.1674, + "step": 465080 + }, + { + "epoch": 1.797907872152897, + "grad_norm": 0.1406533569097519, + "learning_rate": 0.00038643252387760365, + "loss": 2.1813, + "step": 465090 + }, + { + "epoch": 1.7979465293562802, + "grad_norm": 0.13797791302204132, + "learning_rate": 0.0003863085796844552, + "loss": 2.1735, + "step": 465100 + }, + { + "epoch": 1.7979851865596634, + "grad_norm": 0.13203437626361847, + "learning_rate": 0.00038618464501046466, + "loss": 2.1778, + "step": 465110 + }, + { + "epoch": 1.7980238437630467, + "grad_norm": 0.13330654799938202, + "learning_rate": 0.00038606071985343893, + "loss": 2.1701, + "step": 465120 + }, + { + "epoch": 1.79806250096643, + "grad_norm": 0.12800458073616028, + "learning_rate": 0.00038593680421118574, + "loss": 2.1876, + "step": 465130 + }, + { + "epoch": 1.7981011581698134, + "grad_norm": 0.14644229412078857, + "learning_rate": 0.0003858128980815143, + "loss": 2.1572, + "step": 465140 + }, + { + "epoch": 1.7981398153731967, + "grad_norm": 0.13411371409893036, + "learning_rate": 0.00038568900146223386, + "loss": 2.1805, + "step": 465150 + }, + { + "epoch": 1.79817847257658, + "grad_norm": 0.13013438880443573, + "learning_rate": 0.00038556511435115494, + "loss": 2.184, + "step": 465160 + }, + { + "epoch": 1.7982171297799632, + "grad_norm": 0.1430242508649826, + "learning_rate": 0.00038544123674608865, + "loss": 2.1809, + "step": 465170 + }, + { + "epoch": 1.7982557869833464, + "grad_norm": 0.13126738369464874, + "learning_rate": 0.0003853173686448472, + "loss": 2.1838, + "step": 465180 + }, + { + "epoch": 1.79829444418673, + "grad_norm": 0.13454361259937286, + "learning_rate": 0.0003851935100452437, + "loss": 2.1774, + "step": 465190 + }, + { + "epoch": 1.7983331013901132, + "grad_norm": 0.1343718320131302, + "learning_rate": 0.0003850696609450921, + "loss": 2.1715, + "step": 465200 + }, + { + "epoch": 1.7983717585934964, + "grad_norm": 0.13827650249004364, + "learning_rate": 0.0003849458213422066, + "loss": 2.1833, + "step": 465210 + }, + { + "epoch": 1.7984104157968797, + "grad_norm": 0.15048812329769135, + "learning_rate": 0.00038482199123440264, + "loss": 2.1765, + "step": 465220 + }, + { + "epoch": 1.798449073000263, + "grad_norm": 0.13728807866573334, + "learning_rate": 0.000384698170619497, + "loss": 2.1933, + "step": 465230 + }, + { + "epoch": 1.7984877302036462, + "grad_norm": 0.13223780691623688, + "learning_rate": 0.0003845743594953064, + "loss": 2.1672, + "step": 465240 + }, + { + "epoch": 1.7985263874070294, + "grad_norm": 0.13138167560100555, + "learning_rate": 0.0003844505578596489, + "loss": 2.1835, + "step": 465250 + }, + { + "epoch": 1.7985650446104127, + "grad_norm": 0.13611911237239838, + "learning_rate": 0.0003843267657103433, + "loss": 2.179, + "step": 465260 + }, + { + "epoch": 1.798603701813796, + "grad_norm": 0.14531941711902618, + "learning_rate": 0.0003842029830452094, + "loss": 2.1804, + "step": 465270 + }, + { + "epoch": 1.7986423590171792, + "grad_norm": 0.1326284259557724, + "learning_rate": 0.00038407920986206756, + "loss": 2.1778, + "step": 465280 + }, + { + "epoch": 1.7986810162205624, + "grad_norm": 0.13530728220939636, + "learning_rate": 0.00038395544615873914, + "loss": 2.1922, + "step": 465290 + }, + { + "epoch": 1.7987196734239457, + "grad_norm": 0.13688497245311737, + "learning_rate": 0.00038383169193304624, + "loss": 2.1731, + "step": 465300 + }, + { + "epoch": 1.7987583306273291, + "grad_norm": 0.13359713554382324, + "learning_rate": 0.000383707947182812, + "loss": 2.1863, + "step": 465310 + }, + { + "epoch": 1.7987969878307124, + "grad_norm": 0.13442204892635345, + "learning_rate": 0.0003835842119058599, + "loss": 2.1773, + "step": 465320 + }, + { + "epoch": 1.7988356450340957, + "grad_norm": 0.13345547020435333, + "learning_rate": 0.000383460486100015, + "loss": 2.1906, + "step": 465330 + }, + { + "epoch": 1.798874302237479, + "grad_norm": 0.13196028769016266, + "learning_rate": 0.0003833367697631025, + "loss": 2.1886, + "step": 465340 + }, + { + "epoch": 1.7989129594408622, + "grad_norm": 0.1448703408241272, + "learning_rate": 0.00038321306289294886, + "loss": 2.185, + "step": 465350 + }, + { + "epoch": 1.7989516166442456, + "grad_norm": 0.1323598027229309, + "learning_rate": 0.00038308936548738106, + "loss": 2.1767, + "step": 465360 + }, + { + "epoch": 1.7989902738476289, + "grad_norm": 0.13242816925048828, + "learning_rate": 0.0003829656775442274, + "loss": 2.1704, + "step": 465370 + }, + { + "epoch": 1.7990289310510121, + "grad_norm": 0.13908936083316803, + "learning_rate": 0.00038284199906131613, + "loss": 2.178, + "step": 465380 + }, + { + "epoch": 1.7990675882543954, + "grad_norm": 0.14099274575710297, + "learning_rate": 0.0003827183300364776, + "loss": 2.1736, + "step": 465390 + }, + { + "epoch": 1.7991062454577786, + "grad_norm": 0.13117602467536926, + "learning_rate": 0.0003825946704675416, + "loss": 2.1667, + "step": 465400 + }, + { + "epoch": 1.799144902661162, + "grad_norm": 0.13193495571613312, + "learning_rate": 0.00038247102035234004, + "loss": 2.1999, + "step": 465410 + }, + { + "epoch": 1.7991835598645451, + "grad_norm": 0.13038615882396698, + "learning_rate": 0.0003823473796887047, + "loss": 2.1899, + "step": 465420 + }, + { + "epoch": 1.7992222170679284, + "grad_norm": 0.13985690474510193, + "learning_rate": 0.0003822237484744684, + "loss": 2.1819, + "step": 465430 + }, + { + "epoch": 1.7992608742713116, + "grad_norm": 0.13930624723434448, + "learning_rate": 0.0003821001267074653, + "loss": 2.1834, + "step": 465440 + }, + { + "epoch": 1.799299531474695, + "grad_norm": 0.1398736536502838, + "learning_rate": 0.00038197651438552985, + "loss": 2.1811, + "step": 465450 + }, + { + "epoch": 1.7993381886780782, + "grad_norm": 0.14476174116134644, + "learning_rate": 0.00038185291150649725, + "loss": 2.1798, + "step": 465460 + }, + { + "epoch": 1.7993768458814614, + "grad_norm": 0.13657523691654205, + "learning_rate": 0.00038172931806820397, + "loss": 2.1647, + "step": 465470 + }, + { + "epoch": 1.7994155030848449, + "grad_norm": 0.13949531316757202, + "learning_rate": 0.0003816057340684873, + "loss": 2.1851, + "step": 465480 + }, + { + "epoch": 1.7994541602882281, + "grad_norm": 0.1438172459602356, + "learning_rate": 0.0003814821595051847, + "loss": 2.1769, + "step": 465490 + }, + { + "epoch": 1.7994928174916114, + "grad_norm": 0.14232169091701508, + "learning_rate": 0.00038135859437613554, + "loss": 2.1664, + "step": 465500 + }, + { + "epoch": 1.7995314746949946, + "grad_norm": 0.14083006978034973, + "learning_rate": 0.00038123503867917873, + "loss": 2.1734, + "step": 465510 + }, + { + "epoch": 1.799570131898378, + "grad_norm": 0.2224435806274414, + "learning_rate": 0.00038111149241215505, + "loss": 2.1903, + "step": 465520 + }, + { + "epoch": 1.7996087891017614, + "grad_norm": 0.13325290381908417, + "learning_rate": 0.0003809879555729057, + "loss": 2.1779, + "step": 465530 + }, + { + "epoch": 1.7996474463051446, + "grad_norm": 0.13875515758991241, + "learning_rate": 0.0003808644281592724, + "loss": 2.1839, + "step": 465540 + }, + { + "epoch": 1.7996861035085279, + "grad_norm": 0.12978360056877136, + "learning_rate": 0.0003807409101690984, + "loss": 2.1805, + "step": 465550 + }, + { + "epoch": 1.7997247607119111, + "grad_norm": 0.1447114795446396, + "learning_rate": 0.0003806174016002273, + "loss": 2.1888, + "step": 465560 + }, + { + "epoch": 1.7997634179152944, + "grad_norm": 0.14308002591133118, + "learning_rate": 0.0003804939024505034, + "loss": 2.1888, + "step": 465570 + }, + { + "epoch": 1.7998020751186776, + "grad_norm": 0.131975919008255, + "learning_rate": 0.00038037041271777206, + "loss": 2.1964, + "step": 465580 + }, + { + "epoch": 1.7998407323220609, + "grad_norm": 0.1352776437997818, + "learning_rate": 0.0003802469323998796, + "loss": 2.1995, + "step": 465590 + }, + { + "epoch": 1.7998793895254441, + "grad_norm": 0.1364874243736267, + "learning_rate": 0.00038012346149467294, + "loss": 2.1741, + "step": 465600 + }, + { + "epoch": 1.7999180467288274, + "grad_norm": 0.13249623775482178, + "learning_rate": 0.0003799999999999999, + "loss": 2.1845, + "step": 465610 + }, + { + "epoch": 1.7999567039322106, + "grad_norm": 0.13581779599189758, + "learning_rate": 0.00037987654791370916, + "loss": 2.1756, + "step": 465620 + }, + { + "epoch": 1.7999953611355939, + "grad_norm": 0.13477849960327148, + "learning_rate": 0.0003797531052336498, + "loss": 2.1781, + "step": 465630 + }, + { + "epoch": 1.8000340183389771, + "grad_norm": 0.14062079787254333, + "learning_rate": 0.00037962967195767264, + "loss": 2.1815, + "step": 465640 + }, + { + "epoch": 1.8000726755423606, + "grad_norm": 0.13062138855457306, + "learning_rate": 0.00037950624808362823, + "loss": 2.1799, + "step": 465650 + }, + { + "epoch": 1.8001113327457439, + "grad_norm": 0.1472616344690323, + "learning_rate": 0.00037938283360936854, + "loss": 2.1665, + "step": 465660 + }, + { + "epoch": 1.8001499899491271, + "grad_norm": 0.1320866197347641, + "learning_rate": 0.0003792594285327464, + "loss": 2.1736, + "step": 465670 + }, + { + "epoch": 1.8001886471525104, + "grad_norm": 0.131913959980011, + "learning_rate": 0.0003791360328516151, + "loss": 2.1878, + "step": 465680 + }, + { + "epoch": 1.8002273043558938, + "grad_norm": 0.1491318941116333, + "learning_rate": 0.0003790126465638295, + "loss": 2.1772, + "step": 465690 + }, + { + "epoch": 1.800265961559277, + "grad_norm": 0.1818619966506958, + "learning_rate": 0.00037888926966724436, + "loss": 2.1752, + "step": 465700 + }, + { + "epoch": 1.8003046187626603, + "grad_norm": 0.14308425784111023, + "learning_rate": 0.00037876590215971584, + "loss": 2.1712, + "step": 465710 + }, + { + "epoch": 1.8003432759660436, + "grad_norm": 0.1418374925851822, + "learning_rate": 0.0003786425440391006, + "loss": 2.1756, + "step": 465720 + }, + { + "epoch": 1.8003819331694269, + "grad_norm": 0.12860672175884247, + "learning_rate": 0.00037851919530325606, + "loss": 2.1726, + "step": 465730 + }, + { + "epoch": 1.80042059037281, + "grad_norm": 0.13387353718280792, + "learning_rate": 0.0003783958559500411, + "loss": 2.189, + "step": 465740 + }, + { + "epoch": 1.8004592475761934, + "grad_norm": 0.14423732459545135, + "learning_rate": 0.0003782725259773145, + "loss": 2.1842, + "step": 465750 + }, + { + "epoch": 1.8004979047795766, + "grad_norm": 0.13406185805797577, + "learning_rate": 0.0003781492053829367, + "loss": 2.1937, + "step": 465760 + }, + { + "epoch": 1.8005365619829599, + "grad_norm": 0.15299859642982483, + "learning_rate": 0.0003780258941647683, + "loss": 2.1682, + "step": 465770 + }, + { + "epoch": 1.8005752191863431, + "grad_norm": 0.14690442383289337, + "learning_rate": 0.0003779025923206707, + "loss": 2.1801, + "step": 465780 + }, + { + "epoch": 1.8006138763897264, + "grad_norm": 0.1272706538438797, + "learning_rate": 0.000377779299848507, + "loss": 2.1661, + "step": 465790 + }, + { + "epoch": 1.8006525335931096, + "grad_norm": 0.13635335862636566, + "learning_rate": 0.00037765601674614026, + "loss": 2.1781, + "step": 465800 + }, + { + "epoch": 1.8006911907964929, + "grad_norm": 0.1357850581407547, + "learning_rate": 0.00037753274301143436, + "loss": 2.1726, + "step": 465810 + }, + { + "epoch": 1.8007298479998763, + "grad_norm": 0.14069999754428864, + "learning_rate": 0.0003774094786422546, + "loss": 2.1902, + "step": 465820 + }, + { + "epoch": 1.8007685052032596, + "grad_norm": 0.15152522921562195, + "learning_rate": 0.00037728622363646645, + "loss": 2.1753, + "step": 465830 + }, + { + "epoch": 1.8008071624066428, + "grad_norm": 0.13653448224067688, + "learning_rate": 0.00037716297799193657, + "loss": 2.1876, + "step": 465840 + }, + { + "epoch": 1.800845819610026, + "grad_norm": 0.13473238050937653, + "learning_rate": 0.0003770397417065321, + "loss": 2.1756, + "step": 465850 + }, + { + "epoch": 1.8008844768134096, + "grad_norm": 0.14700768887996674, + "learning_rate": 0.0003769165147781215, + "loss": 2.1749, + "step": 465860 + }, + { + "epoch": 1.8009231340167928, + "grad_norm": 0.13021835684776306, + "learning_rate": 0.00037679329720457356, + "loss": 2.1691, + "step": 465870 + }, + { + "epoch": 1.800961791220176, + "grad_norm": 0.1447163224220276, + "learning_rate": 0.000376670088983758, + "loss": 2.1935, + "step": 465880 + }, + { + "epoch": 1.8010004484235593, + "grad_norm": 0.13230176270008087, + "learning_rate": 0.0003765468901135456, + "loss": 2.1703, + "step": 465890 + }, + { + "epoch": 1.8010391056269426, + "grad_norm": 0.13884708285331726, + "learning_rate": 0.00037642370059180763, + "loss": 2.1827, + "step": 465900 + }, + { + "epoch": 1.8010777628303258, + "grad_norm": 0.1411489099264145, + "learning_rate": 0.0003763005204164165, + "loss": 2.1745, + "step": 465910 + }, + { + "epoch": 1.801116420033709, + "grad_norm": 0.1436185985803604, + "learning_rate": 0.000376177349585245, + "loss": 2.1955, + "step": 465920 + }, + { + "epoch": 1.8011550772370923, + "grad_norm": 0.1410207450389862, + "learning_rate": 0.00037605418809616675, + "loss": 2.1815, + "step": 465930 + }, + { + "epoch": 1.8011937344404756, + "grad_norm": 0.1412554234266281, + "learning_rate": 0.0003759310359470569, + "loss": 2.1914, + "step": 465940 + }, + { + "epoch": 1.8012323916438588, + "grad_norm": 0.14789709448814392, + "learning_rate": 0.00037580789313579045, + "loss": 2.1907, + "step": 465950 + }, + { + "epoch": 1.801271048847242, + "grad_norm": 0.14720627665519714, + "learning_rate": 0.0003756847596602437, + "loss": 2.1826, + "step": 465960 + }, + { + "epoch": 1.8013097060506253, + "grad_norm": 0.1361972987651825, + "learning_rate": 0.00037556163551829403, + "loss": 2.1794, + "step": 465970 + }, + { + "epoch": 1.8013483632540086, + "grad_norm": 0.14298291504383087, + "learning_rate": 0.0003754385207078188, + "loss": 2.1696, + "step": 465980 + }, + { + "epoch": 1.801387020457392, + "grad_norm": 0.13563038408756256, + "learning_rate": 0.00037531541522669686, + "loss": 2.1697, + "step": 465990 + }, + { + "epoch": 1.8014256776607753, + "grad_norm": 0.1419105976819992, + "learning_rate": 0.00037519231907280794, + "loss": 2.1756, + "step": 466000 + }, + { + "epoch": 1.8014643348641586, + "grad_norm": 0.13791169226169586, + "learning_rate": 0.0003750692322440319, + "loss": 2.1956, + "step": 466010 + }, + { + "epoch": 1.8015029920675418, + "grad_norm": 0.12917031347751617, + "learning_rate": 0.00037494615473825, + "loss": 2.175, + "step": 466020 + }, + { + "epoch": 1.8015416492709253, + "grad_norm": 0.13964608311653137, + "learning_rate": 0.00037482308655334396, + "loss": 2.1758, + "step": 466030 + }, + { + "epoch": 1.8015803064743086, + "grad_norm": 0.15289010107517242, + "learning_rate": 0.00037470002768719656, + "loss": 2.1872, + "step": 466040 + }, + { + "epoch": 1.8016189636776918, + "grad_norm": 0.14537134766578674, + "learning_rate": 0.00037457697813769107, + "loss": 2.181, + "step": 466050 + }, + { + "epoch": 1.801657620881075, + "grad_norm": 0.13884811103343964, + "learning_rate": 0.0003744539379027121, + "loss": 2.1666, + "step": 466060 + }, + { + "epoch": 1.8016962780844583, + "grad_norm": 0.14185583591461182, + "learning_rate": 0.0003743309069801444, + "loss": 2.1697, + "step": 466070 + }, + { + "epoch": 1.8017349352878416, + "grad_norm": 0.12399603426456451, + "learning_rate": 0.0003742078853678739, + "loss": 2.1698, + "step": 466080 + }, + { + "epoch": 1.8017735924912248, + "grad_norm": 0.13981971144676208, + "learning_rate": 0.00037408487306378734, + "loss": 2.1871, + "step": 466090 + }, + { + "epoch": 1.801812249694608, + "grad_norm": 0.13385088741779327, + "learning_rate": 0.0003739618700657723, + "loss": 2.1776, + "step": 466100 + }, + { + "epoch": 1.8018509068979913, + "grad_norm": 0.14517062902450562, + "learning_rate": 0.0003738388763717169, + "loss": 2.1799, + "step": 466110 + }, + { + "epoch": 1.8018895641013746, + "grad_norm": 0.1450577825307846, + "learning_rate": 0.00037371589197951027, + "loss": 2.1745, + "step": 466120 + }, + { + "epoch": 1.8019282213047578, + "grad_norm": 0.13384157419204712, + "learning_rate": 0.000373592916887042, + "loss": 2.1802, + "step": 466130 + }, + { + "epoch": 1.801966878508141, + "grad_norm": 0.14221005141735077, + "learning_rate": 0.0003734699510922033, + "loss": 2.1944, + "step": 466140 + }, + { + "epoch": 1.8020055357115243, + "grad_norm": 0.1455065906047821, + "learning_rate": 0.0003733469945928849, + "loss": 2.1854, + "step": 466150 + }, + { + "epoch": 1.8020441929149078, + "grad_norm": 0.138362854719162, + "learning_rate": 0.00037322404738697967, + "loss": 2.1867, + "step": 466160 + }, + { + "epoch": 1.802082850118291, + "grad_norm": 0.14707984030246735, + "learning_rate": 0.0003731011094723804, + "loss": 2.1681, + "step": 466170 + }, + { + "epoch": 1.8021215073216743, + "grad_norm": 0.13016606867313385, + "learning_rate": 0.0003729781808469808, + "loss": 2.1755, + "step": 466180 + }, + { + "epoch": 1.8021601645250576, + "grad_norm": 0.1483168751001358, + "learning_rate": 0.0003728552615086758, + "loss": 2.1808, + "step": 466190 + }, + { + "epoch": 1.802198821728441, + "grad_norm": 0.13777334988117218, + "learning_rate": 0.00037273235145536066, + "loss": 2.1752, + "step": 466200 + }, + { + "epoch": 1.8022374789318243, + "grad_norm": 0.13940338790416718, + "learning_rate": 0.0003726094506849316, + "loss": 2.1796, + "step": 466210 + }, + { + "epoch": 1.8022761361352075, + "grad_norm": 0.13678772747516632, + "learning_rate": 0.0003724865591952857, + "loss": 2.1863, + "step": 466220 + }, + { + "epoch": 1.8023147933385908, + "grad_norm": 0.13423606753349304, + "learning_rate": 0.00037236367698432083, + "loss": 2.1723, + "step": 466230 + }, + { + "epoch": 1.802353450541974, + "grad_norm": 0.14014889299869537, + "learning_rate": 0.00037224080404993565, + "loss": 2.1795, + "step": 466240 + }, + { + "epoch": 1.8023921077453573, + "grad_norm": 0.14486335217952728, + "learning_rate": 0.00037211794039002945, + "loss": 2.1824, + "step": 466250 + }, + { + "epoch": 1.8024307649487405, + "grad_norm": 0.14388778805732727, + "learning_rate": 0.00037199508600250234, + "loss": 2.1685, + "step": 466260 + }, + { + "epoch": 1.8024694221521238, + "grad_norm": 0.13235856592655182, + "learning_rate": 0.0003718722408852555, + "loss": 2.1741, + "step": 466270 + }, + { + "epoch": 1.802508079355507, + "grad_norm": 0.1384734809398651, + "learning_rate": 0.00037174940503619045, + "loss": 2.1746, + "step": 466280 + }, + { + "epoch": 1.8025467365588903, + "grad_norm": 0.13937100768089294, + "learning_rate": 0.0003716265784532098, + "loss": 2.1831, + "step": 466290 + }, + { + "epoch": 1.8025853937622736, + "grad_norm": 0.14748027920722961, + "learning_rate": 0.00037150376113421736, + "loss": 2.1725, + "step": 466300 + }, + { + "epoch": 1.8026240509656568, + "grad_norm": 0.1350601464509964, + "learning_rate": 0.00037138095307711683, + "loss": 2.193, + "step": 466310 + }, + { + "epoch": 1.80266270816904, + "grad_norm": 0.14014708995819092, + "learning_rate": 0.00037125815427981344, + "loss": 2.1666, + "step": 466320 + }, + { + "epoch": 1.8027013653724235, + "grad_norm": 0.13212034106254578, + "learning_rate": 0.0003711353647402127, + "loss": 2.1827, + "step": 466330 + }, + { + "epoch": 1.8027400225758068, + "grad_norm": 0.13790425658226013, + "learning_rate": 0.0003710125844562211, + "loss": 2.1748, + "step": 466340 + }, + { + "epoch": 1.80277867977919, + "grad_norm": 0.1323886662721634, + "learning_rate": 0.00037088981342574636, + "loss": 2.1774, + "step": 466350 + }, + { + "epoch": 1.8028173369825733, + "grad_norm": 0.1366249918937683, + "learning_rate": 0.0003707670516466959, + "loss": 2.1653, + "step": 466360 + }, + { + "epoch": 1.8028559941859568, + "grad_norm": 0.13770009577274323, + "learning_rate": 0.0003706442991169792, + "loss": 2.1673, + "step": 466370 + }, + { + "epoch": 1.80289465138934, + "grad_norm": 0.13911324739456177, + "learning_rate": 0.00037052155583450566, + "loss": 2.1823, + "step": 466380 + }, + { + "epoch": 1.8029333085927233, + "grad_norm": 0.13929787278175354, + "learning_rate": 0.0003703988217971859, + "loss": 2.1691, + "step": 466390 + }, + { + "epoch": 1.8029719657961065, + "grad_norm": 0.13730750977993011, + "learning_rate": 0.00037027609700293086, + "loss": 2.1581, + "step": 466400 + }, + { + "epoch": 1.8030106229994898, + "grad_norm": 0.13073213398456573, + "learning_rate": 0.00037015338144965313, + "loss": 2.1739, + "step": 466410 + }, + { + "epoch": 1.803049280202873, + "grad_norm": 0.16473360359668732, + "learning_rate": 0.0003700306751352649, + "loss": 2.1711, + "step": 466420 + }, + { + "epoch": 1.8030879374062563, + "grad_norm": 0.12526480853557587, + "learning_rate": 0.0003699079780576804, + "loss": 2.16, + "step": 466430 + }, + { + "epoch": 1.8031265946096395, + "grad_norm": 0.13086701929569244, + "learning_rate": 0.0003697852902148136, + "loss": 2.1803, + "step": 466440 + }, + { + "epoch": 1.8031652518130228, + "grad_norm": 0.14068730175495148, + "learning_rate": 0.0003696626116045796, + "loss": 2.1749, + "step": 466450 + }, + { + "epoch": 1.803203909016406, + "grad_norm": 0.12961159646511078, + "learning_rate": 0.0003695399422248946, + "loss": 2.186, + "step": 466460 + }, + { + "epoch": 1.8032425662197893, + "grad_norm": 0.13988232612609863, + "learning_rate": 0.00036941728207367543, + "loss": 2.1866, + "step": 466470 + }, + { + "epoch": 1.8032812234231725, + "grad_norm": 0.1488853394985199, + "learning_rate": 0.00036929463114883946, + "loss": 2.1771, + "step": 466480 + }, + { + "epoch": 1.8033198806265558, + "grad_norm": 0.13858400285243988, + "learning_rate": 0.00036917198944830476, + "loss": 2.1578, + "step": 466490 + }, + { + "epoch": 1.8033585378299393, + "grad_norm": 0.1406715363264084, + "learning_rate": 0.0003690493569699909, + "loss": 2.1727, + "step": 466500 + }, + { + "epoch": 1.8033971950333225, + "grad_norm": 0.14598716795444489, + "learning_rate": 0.00036892673371181765, + "loss": 2.1956, + "step": 466510 + }, + { + "epoch": 1.8034358522367058, + "grad_norm": 0.1434890627861023, + "learning_rate": 0.00036880411967170535, + "loss": 2.1911, + "step": 466520 + }, + { + "epoch": 1.803474509440089, + "grad_norm": 0.13857607543468475, + "learning_rate": 0.00036868151484757575, + "loss": 2.165, + "step": 466530 + }, + { + "epoch": 1.8035131666434725, + "grad_norm": 0.13821934163570404, + "learning_rate": 0.00036855891923735106, + "loss": 2.1851, + "step": 466540 + }, + { + "epoch": 1.8035518238468558, + "grad_norm": 0.15694352984428406, + "learning_rate": 0.0003684363328389542, + "loss": 2.1821, + "step": 466550 + }, + { + "epoch": 1.803590481050239, + "grad_norm": 0.24491091072559357, + "learning_rate": 0.00036831375565030887, + "loss": 2.1643, + "step": 466560 + }, + { + "epoch": 1.8036291382536223, + "grad_norm": 0.13402949273586273, + "learning_rate": 0.0003681911876693398, + "loss": 2.1775, + "step": 466570 + }, + { + "epoch": 1.8036677954570055, + "grad_norm": 0.1315721869468689, + "learning_rate": 0.0003680686288939723, + "loss": 2.1647, + "step": 466580 + }, + { + "epoch": 1.8037064526603888, + "grad_norm": 0.1430567055940628, + "learning_rate": 0.0003679460793221321, + "loss": 2.1823, + "step": 466590 + }, + { + "epoch": 1.803745109863772, + "grad_norm": 0.15081314742565155, + "learning_rate": 0.0003678235389517468, + "loss": 2.1719, + "step": 466600 + }, + { + "epoch": 1.8037837670671553, + "grad_norm": 0.36761045455932617, + "learning_rate": 0.00036770100778074367, + "loss": 2.1738, + "step": 466610 + }, + { + "epoch": 1.8038224242705385, + "grad_norm": 0.13689853250980377, + "learning_rate": 0.000367578485807051, + "loss": 2.1746, + "step": 466620 + }, + { + "epoch": 1.8038610814739218, + "grad_norm": 0.14220766723155975, + "learning_rate": 0.00036745597302859843, + "loss": 2.1867, + "step": 466630 + }, + { + "epoch": 1.803899738677305, + "grad_norm": 0.13954247534275055, + "learning_rate": 0.00036733346944331594, + "loss": 2.1849, + "step": 466640 + }, + { + "epoch": 1.8039383958806883, + "grad_norm": 0.14342482388019562, + "learning_rate": 0.0003672109750491339, + "loss": 2.1891, + "step": 466650 + }, + { + "epoch": 1.8039770530840715, + "grad_norm": 0.13386207818984985, + "learning_rate": 0.00036708848984398434, + "loss": 2.1792, + "step": 466660 + }, + { + "epoch": 1.804015710287455, + "grad_norm": 0.14620913565158844, + "learning_rate": 0.0003669660138257993, + "loss": 2.1828, + "step": 466670 + }, + { + "epoch": 1.8040543674908383, + "grad_norm": 0.13415135443210602, + "learning_rate": 0.0003668435469925118, + "loss": 2.1802, + "step": 466680 + }, + { + "epoch": 1.8040930246942215, + "grad_norm": 0.14544934034347534, + "learning_rate": 0.0003667210893420561, + "loss": 2.1764, + "step": 466690 + }, + { + "epoch": 1.8041316818976048, + "grad_norm": 0.1273856908082962, + "learning_rate": 0.0003665986408723665, + "loss": 2.179, + "step": 466700 + }, + { + "epoch": 1.8041703391009882, + "grad_norm": 0.13600336015224457, + "learning_rate": 0.0003664762015813787, + "loss": 2.1661, + "step": 466710 + }, + { + "epoch": 1.8042089963043715, + "grad_norm": 0.14356358349323273, + "learning_rate": 0.0003663537714670291, + "loss": 2.1596, + "step": 466720 + }, + { + "epoch": 1.8042476535077547, + "grad_norm": 0.1356818675994873, + "learning_rate": 0.0003662313505272541, + "loss": 2.1816, + "step": 466730 + }, + { + "epoch": 1.804286310711138, + "grad_norm": 0.13973554968833923, + "learning_rate": 0.00036610893875999186, + "loss": 2.1774, + "step": 466740 + }, + { + "epoch": 1.8043249679145212, + "grad_norm": 0.1400384157896042, + "learning_rate": 0.0003659865361631809, + "loss": 2.1816, + "step": 466750 + }, + { + "epoch": 1.8043636251179045, + "grad_norm": 0.13273154199123383, + "learning_rate": 0.0003658641427347602, + "loss": 2.1762, + "step": 466760 + }, + { + "epoch": 1.8044022823212877, + "grad_norm": 0.1496812403202057, + "learning_rate": 0.00036574175847267035, + "loss": 2.1769, + "step": 466770 + }, + { + "epoch": 1.804440939524671, + "grad_norm": 0.1461144983768463, + "learning_rate": 0.0003656193833748518, + "loss": 2.1795, + "step": 466780 + }, + { + "epoch": 1.8044795967280542, + "grad_norm": 0.13886010646820068, + "learning_rate": 0.000365497017439246, + "loss": 2.1781, + "step": 466790 + }, + { + "epoch": 1.8045182539314375, + "grad_norm": 0.1413482129573822, + "learning_rate": 0.000365374660663796, + "loss": 2.1923, + "step": 466800 + }, + { + "epoch": 1.8045569111348208, + "grad_norm": 0.1350710391998291, + "learning_rate": 0.00036525231304644444, + "loss": 2.1937, + "step": 466810 + }, + { + "epoch": 1.804595568338204, + "grad_norm": 0.14257410168647766, + "learning_rate": 0.0003651299745851353, + "loss": 2.1789, + "step": 466820 + }, + { + "epoch": 1.8046342255415873, + "grad_norm": 0.1451081931591034, + "learning_rate": 0.0003650076452778135, + "loss": 2.1837, + "step": 466830 + }, + { + "epoch": 1.8046728827449707, + "grad_norm": 0.13941356539726257, + "learning_rate": 0.00036488532512242444, + "loss": 2.1796, + "step": 466840 + }, + { + "epoch": 1.804711539948354, + "grad_norm": 0.14537939429283142, + "learning_rate": 0.00036476301411691406, + "loss": 2.1741, + "step": 466850 + }, + { + "epoch": 1.8047501971517372, + "grad_norm": 0.14455877244472504, + "learning_rate": 0.0003646407122592297, + "loss": 2.1726, + "step": 466860 + }, + { + "epoch": 1.8047888543551205, + "grad_norm": 0.1392398625612259, + "learning_rate": 0.0003645184195473188, + "loss": 2.1761, + "step": 466870 + }, + { + "epoch": 1.804827511558504, + "grad_norm": 0.13840122520923615, + "learning_rate": 0.00036439613597913037, + "loss": 2.1871, + "step": 466880 + }, + { + "epoch": 1.8048661687618872, + "grad_norm": 0.13393983244895935, + "learning_rate": 0.0003642738615526131, + "loss": 2.1587, + "step": 466890 + }, + { + "epoch": 1.8049048259652705, + "grad_norm": 0.12925367057323456, + "learning_rate": 0.0003641515962657176, + "loss": 2.1718, + "step": 466900 + }, + { + "epoch": 1.8049434831686537, + "grad_norm": 0.13216130435466766, + "learning_rate": 0.00036402934011639434, + "loss": 2.1776, + "step": 466910 + }, + { + "epoch": 1.804982140372037, + "grad_norm": 0.1310267150402069, + "learning_rate": 0.0003639070931025952, + "loss": 2.1727, + "step": 466920 + }, + { + "epoch": 1.8050207975754202, + "grad_norm": 0.14622503519058228, + "learning_rate": 0.0003637848552222725, + "loss": 2.1599, + "step": 466930 + }, + { + "epoch": 1.8050594547788035, + "grad_norm": 0.1343652307987213, + "learning_rate": 0.0003636626264733791, + "loss": 2.1721, + "step": 466940 + }, + { + "epoch": 1.8050981119821867, + "grad_norm": 0.1314699500799179, + "learning_rate": 0.00036354040685386926, + "loss": 2.1659, + "step": 466950 + }, + { + "epoch": 1.80513676918557, + "grad_norm": 0.13724282383918762, + "learning_rate": 0.00036341819636169736, + "loss": 2.177, + "step": 466960 + }, + { + "epoch": 1.8051754263889532, + "grad_norm": 0.13903264701366425, + "learning_rate": 0.00036329599499481893, + "loss": 2.1807, + "step": 466970 + }, + { + "epoch": 1.8052140835923365, + "grad_norm": 0.13293766975402832, + "learning_rate": 0.0003631738027511902, + "loss": 2.1769, + "step": 466980 + }, + { + "epoch": 1.8052527407957197, + "grad_norm": 0.13913320004940033, + "learning_rate": 0.00036305161962876787, + "loss": 2.1863, + "step": 466990 + }, + { + "epoch": 1.805291397999103, + "grad_norm": 0.14354361593723297, + "learning_rate": 0.0003629294456255099, + "loss": 2.1992, + "step": 467000 + }, + { + "epoch": 1.8053300552024865, + "grad_norm": 0.14252910017967224, + "learning_rate": 0.000362807280739375, + "loss": 2.1758, + "step": 467010 + }, + { + "epoch": 1.8053687124058697, + "grad_norm": 0.14904700219631195, + "learning_rate": 0.00036268512496832184, + "loss": 2.1866, + "step": 467020 + }, + { + "epoch": 1.805407369609253, + "grad_norm": 0.12930892407894135, + "learning_rate": 0.0003625629783103106, + "loss": 2.1848, + "step": 467030 + }, + { + "epoch": 1.8054460268126362, + "grad_norm": 0.13121457397937775, + "learning_rate": 0.0003624408407633024, + "loss": 2.1684, + "step": 467040 + }, + { + "epoch": 1.8054846840160197, + "grad_norm": 0.1382637470960617, + "learning_rate": 0.0003623187123252583, + "loss": 2.171, + "step": 467050 + }, + { + "epoch": 1.805523341219403, + "grad_norm": 0.12879937887191772, + "learning_rate": 0.0003621965929941409, + "loss": 2.1775, + "step": 467060 + }, + { + "epoch": 1.8055619984227862, + "grad_norm": 0.13088010251522064, + "learning_rate": 0.000362074482767913, + "loss": 2.1683, + "step": 467070 + }, + { + "epoch": 1.8056006556261694, + "grad_norm": 0.1338772475719452, + "learning_rate": 0.0003619523816445385, + "loss": 2.1611, + "step": 467080 + }, + { + "epoch": 1.8056393128295527, + "grad_norm": 0.13470415771007538, + "learning_rate": 0.00036183028962198183, + "loss": 2.1836, + "step": 467090 + }, + { + "epoch": 1.805677970032936, + "grad_norm": 0.13731078803539276, + "learning_rate": 0.0003617082066982085, + "loss": 2.1805, + "step": 467100 + }, + { + "epoch": 1.8057166272363192, + "grad_norm": 0.1420268565416336, + "learning_rate": 0.0003615861328711845, + "loss": 2.1831, + "step": 467110 + }, + { + "epoch": 1.8057552844397025, + "grad_norm": 0.1401713788509369, + "learning_rate": 0.0003614640681388766, + "loss": 2.1768, + "step": 467120 + }, + { + "epoch": 1.8057939416430857, + "grad_norm": 0.12639404833316803, + "learning_rate": 0.00036134201249925257, + "loss": 2.1889, + "step": 467130 + }, + { + "epoch": 1.805832598846469, + "grad_norm": 0.14168035984039307, + "learning_rate": 0.0003612199659502804, + "loss": 2.1738, + "step": 467140 + }, + { + "epoch": 1.8058712560498522, + "grad_norm": 0.12691108882427216, + "learning_rate": 0.00036109792848992964, + "loss": 2.1743, + "step": 467150 + }, + { + "epoch": 1.8059099132532355, + "grad_norm": 0.15322566032409668, + "learning_rate": 0.0003609759001161699, + "loss": 2.1749, + "step": 467160 + }, + { + "epoch": 1.805948570456619, + "grad_norm": 0.1456574946641922, + "learning_rate": 0.0003608538808269717, + "loss": 2.1733, + "step": 467170 + }, + { + "epoch": 1.8059872276600022, + "grad_norm": 0.14703112840652466, + "learning_rate": 0.0003607318706203064, + "loss": 2.1685, + "step": 467180 + }, + { + "epoch": 1.8060258848633854, + "grad_norm": 0.6242895126342773, + "learning_rate": 0.0003606098694941462, + "loss": 2.1638, + "step": 467190 + }, + { + "epoch": 1.8060645420667687, + "grad_norm": 0.1482149362564087, + "learning_rate": 0.00036048787744646417, + "loss": 2.17, + "step": 467200 + }, + { + "epoch": 1.806103199270152, + "grad_norm": 0.13098423182964325, + "learning_rate": 0.0003603658944752337, + "loss": 2.1666, + "step": 467210 + }, + { + "epoch": 1.8061418564735354, + "grad_norm": 0.13495689630508423, + "learning_rate": 0.00036024392057842935, + "loss": 2.1674, + "step": 467220 + }, + { + "epoch": 1.8061805136769187, + "grad_norm": 0.13893181085586548, + "learning_rate": 0.0003601219557540263, + "loss": 2.1786, + "step": 467230 + }, + { + "epoch": 1.806219170880302, + "grad_norm": 0.13123945891857147, + "learning_rate": 0.00036000000000000013, + "loss": 2.1702, + "step": 467240 + }, + { + "epoch": 1.8062578280836852, + "grad_norm": 0.13648271560668945, + "learning_rate": 0.00035987805331432756, + "loss": 2.1849, + "step": 467250 + }, + { + "epoch": 1.8062964852870684, + "grad_norm": 0.14485813677310944, + "learning_rate": 0.000359756115694986, + "loss": 2.1662, + "step": 467260 + }, + { + "epoch": 1.8063351424904517, + "grad_norm": 0.14141114056110382, + "learning_rate": 0.000359634187139954, + "loss": 2.1704, + "step": 467270 + }, + { + "epoch": 1.806373799693835, + "grad_norm": 0.14906097948551178, + "learning_rate": 0.00035951226764720977, + "loss": 2.1653, + "step": 467280 + }, + { + "epoch": 1.8064124568972182, + "grad_norm": 0.13113735616207123, + "learning_rate": 0.00035939035721473345, + "loss": 2.1679, + "step": 467290 + }, + { + "epoch": 1.8064511141006014, + "grad_norm": 0.13685385882854462, + "learning_rate": 0.00035926845584050525, + "loss": 2.1739, + "step": 467300 + }, + { + "epoch": 1.8064897713039847, + "grad_norm": 0.1268870234489441, + "learning_rate": 0.0003591465635225064, + "loss": 2.1784, + "step": 467310 + }, + { + "epoch": 1.806528428507368, + "grad_norm": 0.13800711929798126, + "learning_rate": 0.0003590246802587187, + "loss": 2.1711, + "step": 467320 + }, + { + "epoch": 1.8065670857107512, + "grad_norm": 0.1398317515850067, + "learning_rate": 0.0003589028060471251, + "loss": 2.1831, + "step": 467330 + }, + { + "epoch": 1.8066057429141347, + "grad_norm": 0.23167477548122406, + "learning_rate": 0.00035878094088570854, + "loss": 2.1715, + "step": 467340 + }, + { + "epoch": 1.806644400117518, + "grad_norm": 0.14315100014209747, + "learning_rate": 0.0003586590847724536, + "loss": 2.1893, + "step": 467350 + }, + { + "epoch": 1.8066830573209012, + "grad_norm": 0.14273662865161896, + "learning_rate": 0.00035853723770534485, + "loss": 2.1801, + "step": 467360 + }, + { + "epoch": 1.8067217145242844, + "grad_norm": 0.13973486423492432, + "learning_rate": 0.00035841539968236804, + "loss": 2.1668, + "step": 467370 + }, + { + "epoch": 1.8067603717276677, + "grad_norm": 0.12567126750946045, + "learning_rate": 0.00035829357070150936, + "loss": 2.162, + "step": 467380 + }, + { + "epoch": 1.8067990289310512, + "grad_norm": 0.14240488409996033, + "learning_rate": 0.00035817175076075625, + "loss": 2.1722, + "step": 467390 + }, + { + "epoch": 1.8068376861344344, + "grad_norm": 0.14131249487400055, + "learning_rate": 0.00035804993985809655, + "loss": 2.1719, + "step": 467400 + }, + { + "epoch": 1.8068763433378177, + "grad_norm": 0.13042914867401123, + "learning_rate": 0.0003579281379915189, + "loss": 2.1837, + "step": 467410 + }, + { + "epoch": 1.806915000541201, + "grad_norm": 0.13685810565948486, + "learning_rate": 0.00035780634515901255, + "loss": 2.1775, + "step": 467420 + }, + { + "epoch": 1.8069536577445842, + "grad_norm": 0.12795154750347137, + "learning_rate": 0.00035768456135856774, + "loss": 2.1866, + "step": 467430 + }, + { + "epoch": 1.8069923149479674, + "grad_norm": 0.14272700250148773, + "learning_rate": 0.0003575627865881752, + "loss": 2.1739, + "step": 467440 + }, + { + "epoch": 1.8070309721513507, + "grad_norm": 0.267858624458313, + "learning_rate": 0.00035744102084582676, + "loss": 2.1697, + "step": 467450 + }, + { + "epoch": 1.807069629354734, + "grad_norm": 0.13735882937908173, + "learning_rate": 0.0003573192641295144, + "loss": 2.1647, + "step": 467460 + }, + { + "epoch": 1.8071082865581172, + "grad_norm": 0.14670196175575256, + "learning_rate": 0.00035719751643723165, + "loss": 2.1725, + "step": 467470 + }, + { + "epoch": 1.8071469437615004, + "grad_norm": 0.1381799876689911, + "learning_rate": 0.00035707577776697197, + "loss": 2.1564, + "step": 467480 + }, + { + "epoch": 1.8071856009648837, + "grad_norm": 0.14322498440742493, + "learning_rate": 0.00035695404811673015, + "loss": 2.1885, + "step": 467490 + }, + { + "epoch": 1.807224258168267, + "grad_norm": 0.15391521155834198, + "learning_rate": 0.00035683232748450155, + "loss": 2.1776, + "step": 467500 + }, + { + "epoch": 1.8072629153716504, + "grad_norm": 0.13412201404571533, + "learning_rate": 0.00035671061586828225, + "loss": 2.1704, + "step": 467510 + }, + { + "epoch": 1.8073015725750337, + "grad_norm": 0.13290143013000488, + "learning_rate": 0.0003565889132660689, + "loss": 2.1778, + "step": 467520 + }, + { + "epoch": 1.807340229778417, + "grad_norm": 0.1417669951915741, + "learning_rate": 0.0003564672196758594, + "loss": 2.1694, + "step": 467530 + }, + { + "epoch": 1.8073788869818002, + "grad_norm": 0.13997195661067963, + "learning_rate": 0.0003563455350956515, + "loss": 2.1687, + "step": 467540 + }, + { + "epoch": 1.8074175441851836, + "grad_norm": 0.14239998161792755, + "learning_rate": 0.00035622385952344484, + "loss": 2.1887, + "step": 467550 + }, + { + "epoch": 1.8074562013885669, + "grad_norm": 0.13999323546886444, + "learning_rate": 0.0003561021929572386, + "loss": 2.1588, + "step": 467560 + }, + { + "epoch": 1.8074948585919501, + "grad_norm": 0.1441546231508255, + "learning_rate": 0.00035598053539503385, + "loss": 2.1766, + "step": 467570 + }, + { + "epoch": 1.8075335157953334, + "grad_norm": 0.13809379935264587, + "learning_rate": 0.00035585888683483134, + "loss": 2.1899, + "step": 467580 + }, + { + "epoch": 1.8075721729987166, + "grad_norm": 0.13348519802093506, + "learning_rate": 0.0003557372472746334, + "loss": 2.1814, + "step": 467590 + }, + { + "epoch": 1.8076108302021, + "grad_norm": 0.13278010487556458, + "learning_rate": 0.0003556156167124427, + "loss": 2.1669, + "step": 467600 + }, + { + "epoch": 1.8076494874054831, + "grad_norm": 0.1412791609764099, + "learning_rate": 0.0003554939951462628, + "loss": 2.1643, + "step": 467610 + }, + { + "epoch": 1.8076881446088664, + "grad_norm": 0.16468937695026398, + "learning_rate": 0.00035537238257409774, + "loss": 2.1733, + "step": 467620 + }, + { + "epoch": 1.8077268018122497, + "grad_norm": 0.14936642348766327, + "learning_rate": 0.0003552507789939525, + "loss": 2.1666, + "step": 467630 + }, + { + "epoch": 1.807765459015633, + "grad_norm": 0.1426035612821579, + "learning_rate": 0.0003551291844038329, + "loss": 2.1801, + "step": 467640 + }, + { + "epoch": 1.8078041162190162, + "grad_norm": 0.1369057148694992, + "learning_rate": 0.0003550075988017452, + "loss": 2.1801, + "step": 467650 + }, + { + "epoch": 1.8078427734223994, + "grad_norm": 0.16422870755195618, + "learning_rate": 0.00035488602218569664, + "loss": 2.1837, + "step": 467660 + }, + { + "epoch": 1.8078814306257827, + "grad_norm": 0.12718096375465393, + "learning_rate": 0.00035476445455369523, + "loss": 2.1663, + "step": 467670 + }, + { + "epoch": 1.8079200878291661, + "grad_norm": 0.13378946483135223, + "learning_rate": 0.0003546428959037495, + "loss": 2.1678, + "step": 467680 + }, + { + "epoch": 1.8079587450325494, + "grad_norm": 0.1549948751926422, + "learning_rate": 0.0003545213462338688, + "loss": 2.1786, + "step": 467690 + }, + { + "epoch": 1.8079974022359326, + "grad_norm": 0.13492682576179504, + "learning_rate": 0.000354399805542063, + "loss": 2.1713, + "step": 467700 + }, + { + "epoch": 1.808036059439316, + "grad_norm": 0.14016228914260864, + "learning_rate": 0.00035427827382634324, + "loss": 2.1588, + "step": 467710 + }, + { + "epoch": 1.8080747166426994, + "grad_norm": 0.1409558802843094, + "learning_rate": 0.0003541567510847212, + "loss": 2.1883, + "step": 467720 + }, + { + "epoch": 1.8081133738460826, + "grad_norm": 0.14013123512268066, + "learning_rate": 0.00035403523731520914, + "loss": 2.1679, + "step": 467730 + }, + { + "epoch": 1.8081520310494659, + "grad_norm": 0.13527102768421173, + "learning_rate": 0.00035391373251581973, + "loss": 2.1759, + "step": 467740 + }, + { + "epoch": 1.8081906882528491, + "grad_norm": 0.13502882421016693, + "learning_rate": 0.00035379223668456714, + "loss": 2.163, + "step": 467750 + }, + { + "epoch": 1.8082293454562324, + "grad_norm": 0.13633443415164948, + "learning_rate": 0.0003536707498194658, + "loss": 2.1783, + "step": 467760 + }, + { + "epoch": 1.8082680026596156, + "grad_norm": 0.13043592870235443, + "learning_rate": 0.0003535492719185309, + "loss": 2.1717, + "step": 467770 + }, + { + "epoch": 1.8083066598629989, + "grad_norm": 0.14720727503299713, + "learning_rate": 0.0003534278029797784, + "loss": 2.1837, + "step": 467780 + }, + { + "epoch": 1.8083453170663821, + "grad_norm": 0.1348484605550766, + "learning_rate": 0.00035330634300122487, + "loss": 2.1618, + "step": 467790 + }, + { + "epoch": 1.8083839742697654, + "grad_norm": 0.1376534253358841, + "learning_rate": 0.0003531848919808878, + "loss": 2.1715, + "step": 467800 + }, + { + "epoch": 1.8084226314731486, + "grad_norm": 0.15582869946956635, + "learning_rate": 0.0003530634499167855, + "loss": 2.1807, + "step": 467810 + }, + { + "epoch": 1.8084612886765319, + "grad_norm": 0.1453525424003601, + "learning_rate": 0.00035294201680693705, + "loss": 2.1715, + "step": 467820 + }, + { + "epoch": 1.8084999458799151, + "grad_norm": 0.13753698766231537, + "learning_rate": 0.0003528205926493617, + "loss": 2.1673, + "step": 467830 + }, + { + "epoch": 1.8085386030832984, + "grad_norm": 0.1529999077320099, + "learning_rate": 0.00035269917744208, + "loss": 2.1673, + "step": 467840 + }, + { + "epoch": 1.8085772602866819, + "grad_norm": 0.13831478357315063, + "learning_rate": 0.00035257777118311283, + "loss": 2.164, + "step": 467850 + }, + { + "epoch": 1.8086159174900651, + "grad_norm": 0.13578453660011292, + "learning_rate": 0.0003524563738704822, + "loss": 2.1694, + "step": 467860 + }, + { + "epoch": 1.8086545746934484, + "grad_norm": 0.1331843137741089, + "learning_rate": 0.00035233498550221086, + "loss": 2.1779, + "step": 467870 + }, + { + "epoch": 1.8086932318968316, + "grad_norm": 0.15052971243858337, + "learning_rate": 0.0003522136060763217, + "loss": 2.1831, + "step": 467880 + }, + { + "epoch": 1.808731889100215, + "grad_norm": 0.14466112852096558, + "learning_rate": 0.0003520922355908387, + "loss": 2.1874, + "step": 467890 + }, + { + "epoch": 1.8087705463035983, + "grad_norm": 0.12750951945781708, + "learning_rate": 0.0003519708740437868, + "loss": 2.1797, + "step": 467900 + }, + { + "epoch": 1.8088092035069816, + "grad_norm": 0.1318550854921341, + "learning_rate": 0.0003518495214331914, + "loss": 2.1606, + "step": 467910 + }, + { + "epoch": 1.8088478607103649, + "grad_norm": 0.13449975848197937, + "learning_rate": 0.0003517281777570789, + "loss": 2.1563, + "step": 467920 + }, + { + "epoch": 1.808886517913748, + "grad_norm": 0.15311302244663239, + "learning_rate": 0.0003516068430134758, + "loss": 2.1857, + "step": 467930 + }, + { + "epoch": 1.8089251751171314, + "grad_norm": 0.14139986038208008, + "learning_rate": 0.0003514855172004099, + "loss": 2.1793, + "step": 467940 + }, + { + "epoch": 1.8089638323205146, + "grad_norm": 0.14440183341503143, + "learning_rate": 0.00035136420031590964, + "loss": 2.1634, + "step": 467950 + }, + { + "epoch": 1.8090024895238979, + "grad_norm": 0.14607249200344086, + "learning_rate": 0.0003512428923580042, + "loss": 2.1742, + "step": 467960 + }, + { + "epoch": 1.8090411467272811, + "grad_norm": 0.14335627853870392, + "learning_rate": 0.00035112159332472315, + "loss": 2.1647, + "step": 467970 + }, + { + "epoch": 1.8090798039306644, + "grad_norm": 0.14517302811145782, + "learning_rate": 0.00035100030321409713, + "loss": 2.1622, + "step": 467980 + }, + { + "epoch": 1.8091184611340476, + "grad_norm": 0.14808966219425201, + "learning_rate": 0.00035087902202415736, + "loss": 2.1737, + "step": 467990 + }, + { + "epoch": 1.8091571183374309, + "grad_norm": 0.14284609258174896, + "learning_rate": 0.0003507577497529357, + "loss": 2.1733, + "step": 468000 + }, + { + "epoch": 1.8091957755408141, + "grad_norm": 0.13765506446361542, + "learning_rate": 0.0003506364863984652, + "loss": 2.1635, + "step": 468010 + }, + { + "epoch": 1.8092344327441976, + "grad_norm": 0.14198483526706696, + "learning_rate": 0.0003505152319587792, + "loss": 2.1669, + "step": 468020 + }, + { + "epoch": 1.8092730899475808, + "grad_norm": 0.1519058495759964, + "learning_rate": 0.0003503939864319117, + "loss": 2.1671, + "step": 468030 + }, + { + "epoch": 1.809311747150964, + "grad_norm": 0.14194536209106445, + "learning_rate": 0.0003502727498158975, + "loss": 2.1746, + "step": 468040 + }, + { + "epoch": 1.8093504043543474, + "grad_norm": 0.13198499381542206, + "learning_rate": 0.00035015152210877255, + "loss": 2.15, + "step": 468050 + }, + { + "epoch": 1.8093890615577308, + "grad_norm": 0.13800212740898132, + "learning_rate": 0.000350030303308573, + "loss": 2.1659, + "step": 468060 + }, + { + "epoch": 1.809427718761114, + "grad_norm": 0.14429043233394623, + "learning_rate": 0.0003499090934133355, + "loss": 2.1731, + "step": 468070 + }, + { + "epoch": 1.8094663759644973, + "grad_norm": 0.13230136036872864, + "learning_rate": 0.00034978789242109867, + "loss": 2.1691, + "step": 468080 + }, + { + "epoch": 1.8095050331678806, + "grad_norm": 0.14464101195335388, + "learning_rate": 0.00034966670032990013, + "loss": 2.1799, + "step": 468090 + }, + { + "epoch": 1.8095436903712638, + "grad_norm": 0.1464579999446869, + "learning_rate": 0.00034954551713777926, + "loss": 2.1786, + "step": 468100 + }, + { + "epoch": 1.809582347574647, + "grad_norm": 0.1446916162967682, + "learning_rate": 0.00034942434284277656, + "loss": 2.1968, + "step": 468110 + }, + { + "epoch": 1.8096210047780303, + "grad_norm": 0.1317659467458725, + "learning_rate": 0.00034930317744293207, + "loss": 2.1828, + "step": 468120 + }, + { + "epoch": 1.8096596619814136, + "grad_norm": 0.14055480062961578, + "learning_rate": 0.0003491820209362875, + "loss": 2.1885, + "step": 468130 + }, + { + "epoch": 1.8096983191847968, + "grad_norm": 0.13689568638801575, + "learning_rate": 0.0003490608733208847, + "loss": 2.1813, + "step": 468140 + }, + { + "epoch": 1.80973697638818, + "grad_norm": 0.1372600942850113, + "learning_rate": 0.00034893973459476646, + "loss": 2.1628, + "step": 468150 + }, + { + "epoch": 1.8097756335915633, + "grad_norm": 0.12364128232002258, + "learning_rate": 0.0003488186047559765, + "loss": 2.1619, + "step": 468160 + }, + { + "epoch": 1.8098142907949466, + "grad_norm": 0.13477951288223267, + "learning_rate": 0.00034869748380255897, + "loss": 2.1813, + "step": 468170 + }, + { + "epoch": 1.8098529479983299, + "grad_norm": 0.1300884187221527, + "learning_rate": 0.00034857637173255876, + "loss": 2.1549, + "step": 468180 + }, + { + "epoch": 1.8098916052017133, + "grad_norm": 0.12880156934261322, + "learning_rate": 0.00034845526854402167, + "loss": 2.171, + "step": 468190 + }, + { + "epoch": 1.8099302624050966, + "grad_norm": 0.15667806565761566, + "learning_rate": 0.0003483341742349937, + "loss": 2.178, + "step": 468200 + }, + { + "epoch": 1.8099689196084798, + "grad_norm": 0.1452578604221344, + "learning_rate": 0.0003482130888035224, + "loss": 2.1857, + "step": 468210 + }, + { + "epoch": 1.810007576811863, + "grad_norm": 0.14344045519828796, + "learning_rate": 0.0003480920122476554, + "loss": 2.1651, + "step": 468220 + }, + { + "epoch": 1.8100462340152466, + "grad_norm": 0.13554508984088898, + "learning_rate": 0.00034797094456544134, + "loss": 2.1758, + "step": 468230 + }, + { + "epoch": 1.8100848912186298, + "grad_norm": 0.1435360163450241, + "learning_rate": 0.00034784988575492927, + "loss": 2.1536, + "step": 468240 + }, + { + "epoch": 1.810123548422013, + "grad_norm": 0.12887118756771088, + "learning_rate": 0.00034772883581416945, + "loss": 2.164, + "step": 468250 + }, + { + "epoch": 1.8101622056253963, + "grad_norm": 0.14329533278942108, + "learning_rate": 0.0003476077947412122, + "loss": 2.1841, + "step": 468260 + }, + { + "epoch": 1.8102008628287796, + "grad_norm": 0.14588004350662231, + "learning_rate": 0.00034748676253410914, + "loss": 2.159, + "step": 468270 + }, + { + "epoch": 1.8102395200321628, + "grad_norm": 0.1433817744255066, + "learning_rate": 0.0003473657391909122, + "loss": 2.1649, + "step": 468280 + }, + { + "epoch": 1.810278177235546, + "grad_norm": 0.13485339283943176, + "learning_rate": 0.0003472447247096744, + "loss": 2.1753, + "step": 468290 + }, + { + "epoch": 1.8103168344389293, + "grad_norm": 0.1369163542985916, + "learning_rate": 0.00034712371908844907, + "loss": 2.1589, + "step": 468300 + }, + { + "epoch": 1.8103554916423126, + "grad_norm": 0.13304588198661804, + "learning_rate": 0.00034700272232529074, + "loss": 2.1755, + "step": 468310 + }, + { + "epoch": 1.8103941488456958, + "grad_norm": 0.12897934019565582, + "learning_rate": 0.00034688173441825397, + "loss": 2.1661, + "step": 468320 + }, + { + "epoch": 1.810432806049079, + "grad_norm": 0.1374320536851883, + "learning_rate": 0.00034676075536539485, + "loss": 2.1839, + "step": 468330 + }, + { + "epoch": 1.8104714632524623, + "grad_norm": 0.13342377543449402, + "learning_rate": 0.00034663978516476934, + "loss": 2.171, + "step": 468340 + }, + { + "epoch": 1.8105101204558456, + "grad_norm": 0.14364196360111237, + "learning_rate": 0.0003465188238144348, + "loss": 2.175, + "step": 468350 + }, + { + "epoch": 1.810548777659229, + "grad_norm": 0.13240283727645874, + "learning_rate": 0.0003463978713124489, + "loss": 2.1552, + "step": 468360 + }, + { + "epoch": 1.8105874348626123, + "grad_norm": 0.13555388152599335, + "learning_rate": 0.00034627692765687044, + "loss": 2.1646, + "step": 468370 + }, + { + "epoch": 1.8106260920659956, + "grad_norm": 0.1307956725358963, + "learning_rate": 0.00034615599284575825, + "loss": 2.1846, + "step": 468380 + }, + { + "epoch": 1.8106647492693788, + "grad_norm": 0.13531222939491272, + "learning_rate": 0.00034603506687717233, + "loss": 2.1697, + "step": 468390 + }, + { + "epoch": 1.8107034064727623, + "grad_norm": 0.14121176302433014, + "learning_rate": 0.00034591414974917336, + "loss": 2.1897, + "step": 468400 + }, + { + "epoch": 1.8107420636761455, + "grad_norm": 0.15340138971805573, + "learning_rate": 0.00034579324145982284, + "loss": 2.1687, + "step": 468410 + }, + { + "epoch": 1.8107807208795288, + "grad_norm": 0.13489243388175964, + "learning_rate": 0.00034567234200718255, + "loss": 2.1619, + "step": 468420 + }, + { + "epoch": 1.810819378082912, + "grad_norm": 0.14203903079032898, + "learning_rate": 0.00034555145138931567, + "loss": 2.1627, + "step": 468430 + }, + { + "epoch": 1.8108580352862953, + "grad_norm": 0.16522355377674103, + "learning_rate": 0.0003454305696042852, + "loss": 2.167, + "step": 468440 + }, + { + "epoch": 1.8108966924896785, + "grad_norm": 0.1337309032678604, + "learning_rate": 0.00034530969665015566, + "loss": 2.174, + "step": 468450 + }, + { + "epoch": 1.8109353496930618, + "grad_norm": 0.1513516902923584, + "learning_rate": 0.00034518883252499167, + "loss": 2.1809, + "step": 468460 + }, + { + "epoch": 1.810974006896445, + "grad_norm": 0.1391746699810028, + "learning_rate": 0.000345067977226859, + "loss": 2.1625, + "step": 468470 + }, + { + "epoch": 1.8110126640998283, + "grad_norm": 0.14154408872127533, + "learning_rate": 0.0003449471307538239, + "loss": 2.1849, + "step": 468480 + }, + { + "epoch": 1.8110513213032116, + "grad_norm": 0.13919097185134888, + "learning_rate": 0.0003448262931039534, + "loss": 2.1749, + "step": 468490 + }, + { + "epoch": 1.8110899785065948, + "grad_norm": 0.13251088559627533, + "learning_rate": 0.00034470546427531513, + "loss": 2.1609, + "step": 468500 + }, + { + "epoch": 1.811128635709978, + "grad_norm": 0.13987651467323303, + "learning_rate": 0.0003445846442659775, + "loss": 2.1784, + "step": 468510 + }, + { + "epoch": 1.8111672929133613, + "grad_norm": 0.13735197484493256, + "learning_rate": 0.00034446383307400977, + "loss": 2.1842, + "step": 468520 + }, + { + "epoch": 1.8112059501167448, + "grad_norm": 0.14323697984218597, + "learning_rate": 0.00034434303069748177, + "loss": 2.1885, + "step": 468530 + }, + { + "epoch": 1.811244607320128, + "grad_norm": 0.1448112279176712, + "learning_rate": 0.00034422223713446363, + "loss": 2.1646, + "step": 468540 + }, + { + "epoch": 1.8112832645235113, + "grad_norm": 0.1370207518339157, + "learning_rate": 0.0003441014523830273, + "loss": 2.1916, + "step": 468550 + }, + { + "epoch": 1.8113219217268945, + "grad_norm": 0.143899604678154, + "learning_rate": 0.00034398067644124386, + "loss": 2.1805, + "step": 468560 + }, + { + "epoch": 1.811360578930278, + "grad_norm": 0.12973351776599884, + "learning_rate": 0.0003438599093071868, + "loss": 2.1663, + "step": 468570 + }, + { + "epoch": 1.8113992361336613, + "grad_norm": 0.13267174363136292, + "learning_rate": 0.0003437391509789287, + "loss": 2.1478, + "step": 468580 + }, + { + "epoch": 1.8114378933370445, + "grad_norm": 0.13831119239330292, + "learning_rate": 0.00034361840145454405, + "loss": 2.1778, + "step": 468590 + }, + { + "epoch": 1.8114765505404278, + "grad_norm": 0.15274249017238617, + "learning_rate": 0.00034349766073210744, + "loss": 2.1776, + "step": 468600 + }, + { + "epoch": 1.811515207743811, + "grad_norm": 0.12548744678497314, + "learning_rate": 0.0003433769288096944, + "loss": 2.1673, + "step": 468610 + }, + { + "epoch": 1.8115538649471943, + "grad_norm": 0.14468833804130554, + "learning_rate": 0.00034325620568538117, + "loss": 2.1685, + "step": 468620 + }, + { + "epoch": 1.8115925221505775, + "grad_norm": 0.14792367815971375, + "learning_rate": 0.00034313549135724444, + "loss": 2.1722, + "step": 468630 + }, + { + "epoch": 1.8116311793539608, + "grad_norm": 0.14683176577091217, + "learning_rate": 0.00034301478582336167, + "loss": 2.1904, + "step": 468640 + }, + { + "epoch": 1.811669836557344, + "grad_norm": 0.14537638425827026, + "learning_rate": 0.0003428940890818113, + "loss": 2.1845, + "step": 468650 + }, + { + "epoch": 1.8117084937607273, + "grad_norm": 0.14245370030403137, + "learning_rate": 0.00034277340113067224, + "loss": 2.1565, + "step": 468660 + }, + { + "epoch": 1.8117471509641105, + "grad_norm": 0.14482244849205017, + "learning_rate": 0.000342652721968024, + "loss": 2.1833, + "step": 468670 + }, + { + "epoch": 1.8117858081674938, + "grad_norm": 0.13195568323135376, + "learning_rate": 0.00034253205159194723, + "loss": 2.1453, + "step": 468680 + }, + { + "epoch": 1.811824465370877, + "grad_norm": 0.15000492334365845, + "learning_rate": 0.0003424113900005226, + "loss": 2.1771, + "step": 468690 + }, + { + "epoch": 1.8118631225742605, + "grad_norm": 0.13357426226139069, + "learning_rate": 0.0003422907371918318, + "loss": 2.1556, + "step": 468700 + }, + { + "epoch": 1.8119017797776438, + "grad_norm": 0.13514158129692078, + "learning_rate": 0.00034217009316395776, + "loss": 2.1813, + "step": 468710 + }, + { + "epoch": 1.811940436981027, + "grad_norm": 0.13902096450328827, + "learning_rate": 0.0003420494579149835, + "loss": 2.1665, + "step": 468720 + }, + { + "epoch": 1.8119790941844103, + "grad_norm": 0.1330592930316925, + "learning_rate": 0.0003419288314429927, + "loss": 2.1671, + "step": 468730 + }, + { + "epoch": 1.8120177513877938, + "grad_norm": 0.133706197142601, + "learning_rate": 0.00034180821374606965, + "loss": 2.1595, + "step": 468740 + }, + { + "epoch": 1.812056408591177, + "grad_norm": 0.13246847689151764, + "learning_rate": 0.0003416876048223001, + "loss": 2.1694, + "step": 468750 + }, + { + "epoch": 1.8120950657945603, + "grad_norm": 0.13621580600738525, + "learning_rate": 0.00034156700466976965, + "loss": 2.1671, + "step": 468760 + }, + { + "epoch": 1.8121337229979435, + "grad_norm": 0.14098776876926422, + "learning_rate": 0.0003414464132865649, + "loss": 2.16, + "step": 468770 + }, + { + "epoch": 1.8121723802013268, + "grad_norm": 0.15192024409770966, + "learning_rate": 0.0003413258306707734, + "loss": 2.1746, + "step": 468780 + }, + { + "epoch": 1.81221103740471, + "grad_norm": 0.14788542687892914, + "learning_rate": 0.0003412052568204831, + "loss": 2.1594, + "step": 468790 + }, + { + "epoch": 1.8122496946080933, + "grad_norm": 0.13699501752853394, + "learning_rate": 0.0003410846917337824, + "loss": 2.1766, + "step": 468800 + }, + { + "epoch": 1.8122883518114765, + "grad_norm": 0.14366194605827332, + "learning_rate": 0.00034096413540876094, + "loss": 2.1682, + "step": 468810 + }, + { + "epoch": 1.8123270090148598, + "grad_norm": 0.9558488726615906, + "learning_rate": 0.000340843587843509, + "loss": 2.1689, + "step": 468820 + }, + { + "epoch": 1.812365666218243, + "grad_norm": 0.16100719571113586, + "learning_rate": 0.00034072304903611704, + "loss": 2.1775, + "step": 468830 + }, + { + "epoch": 1.8124043234216263, + "grad_norm": 0.14918309450149536, + "learning_rate": 0.00034060251898467686, + "loss": 2.1761, + "step": 468840 + }, + { + "epoch": 1.8124429806250095, + "grad_norm": 0.14374400675296783, + "learning_rate": 0.0003404819976872804, + "loss": 2.1675, + "step": 468850 + }, + { + "epoch": 1.8124816378283928, + "grad_norm": 0.13421660661697388, + "learning_rate": 0.0003403614851420205, + "loss": 2.1881, + "step": 468860 + }, + { + "epoch": 1.8125202950317763, + "grad_norm": 0.14474283158779144, + "learning_rate": 0.0003402409813469909, + "loss": 2.1628, + "step": 468870 + }, + { + "epoch": 1.8125589522351595, + "grad_norm": 0.1359967589378357, + "learning_rate": 0.0003401204863002858, + "loss": 2.1704, + "step": 468880 + }, + { + "epoch": 1.8125976094385428, + "grad_norm": 0.13450078666210175, + "learning_rate": 0.0003400000000000001, + "loss": 2.1702, + "step": 468890 + }, + { + "epoch": 1.812636266641926, + "grad_norm": 0.13271546363830566, + "learning_rate": 0.0003398795224442295, + "loss": 2.1689, + "step": 468900 + }, + { + "epoch": 1.8126749238453095, + "grad_norm": 0.1268029659986496, + "learning_rate": 0.00033975905363106993, + "loss": 2.1692, + "step": 468910 + }, + { + "epoch": 1.8127135810486927, + "grad_norm": 0.14645883440971375, + "learning_rate": 0.00033963859355861926, + "loss": 2.173, + "step": 468920 + }, + { + "epoch": 1.812752238252076, + "grad_norm": 0.13335394859313965, + "learning_rate": 0.00033951814222497444, + "loss": 2.1859, + "step": 468930 + }, + { + "epoch": 1.8127908954554592, + "grad_norm": 0.15273331105709076, + "learning_rate": 0.0003393976996282342, + "loss": 2.1771, + "step": 468940 + }, + { + "epoch": 1.8128295526588425, + "grad_norm": 0.145224928855896, + "learning_rate": 0.00033927726576649776, + "loss": 2.1615, + "step": 468950 + }, + { + "epoch": 1.8128682098622257, + "grad_norm": 0.14967118203639984, + "learning_rate": 0.00033915684063786446, + "loss": 2.1573, + "step": 468960 + }, + { + "epoch": 1.812906867065609, + "grad_norm": 0.14878715574741364, + "learning_rate": 0.0003390364242404351, + "loss": 2.1606, + "step": 468970 + }, + { + "epoch": 1.8129455242689922, + "grad_norm": 0.13415095210075378, + "learning_rate": 0.0003389160165723106, + "loss": 2.1638, + "step": 468980 + }, + { + "epoch": 1.8129841814723755, + "grad_norm": 0.13814513385295868, + "learning_rate": 0.0003387956176315932, + "loss": 2.1731, + "step": 468990 + }, + { + "epoch": 1.8130228386757588, + "grad_norm": 0.1519918441772461, + "learning_rate": 0.0003386752274163851, + "loss": 2.1596, + "step": 469000 + }, + { + "epoch": 1.813061495879142, + "grad_norm": 0.13185402750968933, + "learning_rate": 0.00033855484592478933, + "loss": 2.1661, + "step": 469010 + }, + { + "epoch": 1.8131001530825253, + "grad_norm": 0.13390794396400452, + "learning_rate": 0.00033843447315491047, + "loss": 2.1766, + "step": 469020 + }, + { + "epoch": 1.8131388102859087, + "grad_norm": 0.14997097849845886, + "learning_rate": 0.00033831410910485247, + "loss": 2.1679, + "step": 469030 + }, + { + "epoch": 1.813177467489292, + "grad_norm": 0.14121918380260468, + "learning_rate": 0.00033819375377272087, + "loss": 2.1631, + "step": 469040 + }, + { + "epoch": 1.8132161246926752, + "grad_norm": 0.1386699229478836, + "learning_rate": 0.0003380734071566218, + "loss": 2.1764, + "step": 469050 + }, + { + "epoch": 1.8132547818960585, + "grad_norm": 0.13607648015022278, + "learning_rate": 0.0003379530692546615, + "loss": 2.1766, + "step": 469060 + }, + { + "epoch": 1.8132934390994417, + "grad_norm": 0.1364724189043045, + "learning_rate": 0.0003378327400649477, + "loss": 2.1688, + "step": 469070 + }, + { + "epoch": 1.8133320963028252, + "grad_norm": 0.13681846857070923, + "learning_rate": 0.00033771241958558806, + "loss": 2.1722, + "step": 469080 + }, + { + "epoch": 1.8133707535062085, + "grad_norm": 0.14456836879253387, + "learning_rate": 0.00033759210781469175, + "loss": 2.1639, + "step": 469090 + }, + { + "epoch": 1.8134094107095917, + "grad_norm": 0.13703112304210663, + "learning_rate": 0.0003374718047503675, + "loss": 2.158, + "step": 469100 + }, + { + "epoch": 1.813448067912975, + "grad_norm": 0.1477336883544922, + "learning_rate": 0.0003373515103907261, + "loss": 2.1578, + "step": 469110 + }, + { + "epoch": 1.8134867251163582, + "grad_norm": 0.145752415060997, + "learning_rate": 0.0003372312247338778, + "loss": 2.1589, + "step": 469120 + }, + { + "epoch": 1.8135253823197415, + "grad_norm": 0.14201414585113525, + "learning_rate": 0.00033711094777793415, + "loss": 2.1756, + "step": 469130 + }, + { + "epoch": 1.8135640395231247, + "grad_norm": 0.1439470797777176, + "learning_rate": 0.0003369906795210076, + "loss": 2.1647, + "step": 469140 + }, + { + "epoch": 1.813602696726508, + "grad_norm": 0.13308900594711304, + "learning_rate": 0.0003368704199612107, + "loss": 2.1674, + "step": 469150 + }, + { + "epoch": 1.8136413539298912, + "grad_norm": 0.1444803923368454, + "learning_rate": 0.0003367501690966568, + "loss": 2.1459, + "step": 469160 + }, + { + "epoch": 1.8136800111332745, + "grad_norm": 0.13094887137413025, + "learning_rate": 0.00033662992692546023, + "loss": 2.1732, + "step": 469170 + }, + { + "epoch": 1.8137186683366577, + "grad_norm": 0.13859966397285461, + "learning_rate": 0.0003365096934457359, + "loss": 2.1773, + "step": 469180 + }, + { + "epoch": 1.813757325540041, + "grad_norm": 0.14027716219425201, + "learning_rate": 0.00033638946865559906, + "loss": 2.1661, + "step": 469190 + }, + { + "epoch": 1.8137959827434245, + "grad_norm": 0.13711123168468475, + "learning_rate": 0.0003362692525531663, + "loss": 2.1804, + "step": 469200 + }, + { + "epoch": 1.8138346399468077, + "grad_norm": 0.1386299878358841, + "learning_rate": 0.0003361490451365543, + "loss": 2.185, + "step": 469210 + }, + { + "epoch": 1.813873297150191, + "grad_norm": 0.1353200078010559, + "learning_rate": 0.0003360288464038808, + "loss": 2.1647, + "step": 469220 + }, + { + "epoch": 1.8139119543535742, + "grad_norm": 0.13703496754169464, + "learning_rate": 0.00033590865635326384, + "loss": 2.17, + "step": 469230 + }, + { + "epoch": 1.8139506115569575, + "grad_norm": 0.142286017537117, + "learning_rate": 0.0003357884749828224, + "loss": 2.1633, + "step": 469240 + }, + { + "epoch": 1.813989268760341, + "grad_norm": 0.13622930645942688, + "learning_rate": 0.0003356683022906761, + "loss": 2.1674, + "step": 469250 + }, + { + "epoch": 1.8140279259637242, + "grad_norm": 0.23010209202766418, + "learning_rate": 0.00033554813827494544, + "loss": 2.1682, + "step": 469260 + }, + { + "epoch": 1.8140665831671074, + "grad_norm": 0.14340844750404358, + "learning_rate": 0.0003354279829337512, + "loss": 2.1779, + "step": 469270 + }, + { + "epoch": 1.8141052403704907, + "grad_norm": 0.1332901418209076, + "learning_rate": 0.0003353078362652151, + "loss": 2.169, + "step": 469280 + }, + { + "epoch": 1.814143897573874, + "grad_norm": 0.12993653118610382, + "learning_rate": 0.00033518769826745934, + "loss": 2.1811, + "step": 469290 + }, + { + "epoch": 1.8141825547772572, + "grad_norm": 0.14110475778579712, + "learning_rate": 0.00033506756893860715, + "loss": 2.1651, + "step": 469300 + }, + { + "epoch": 1.8142212119806405, + "grad_norm": 0.14228901267051697, + "learning_rate": 0.0003349474482767818, + "loss": 2.1658, + "step": 469310 + }, + { + "epoch": 1.8142598691840237, + "grad_norm": 0.13981829583644867, + "learning_rate": 0.0003348273362801082, + "loss": 2.1691, + "step": 469320 + }, + { + "epoch": 1.814298526387407, + "grad_norm": 0.13701368868350983, + "learning_rate": 0.0003347072329467109, + "loss": 2.1591, + "step": 469330 + }, + { + "epoch": 1.8143371835907902, + "grad_norm": 0.14498670399188995, + "learning_rate": 0.00033458713827471586, + "loss": 2.1706, + "step": 469340 + }, + { + "epoch": 1.8143758407941735, + "grad_norm": 0.1400134116411209, + "learning_rate": 0.0003344670522622495, + "loss": 2.1715, + "step": 469350 + }, + { + "epoch": 1.8144144979975567, + "grad_norm": 0.15969963371753693, + "learning_rate": 0.0003343469749074388, + "loss": 2.1675, + "step": 469360 + }, + { + "epoch": 1.8144531552009402, + "grad_norm": 0.1523132175207138, + "learning_rate": 0.00033422690620841156, + "loss": 2.1751, + "step": 469370 + }, + { + "epoch": 1.8144918124043234, + "grad_norm": 0.1331949532032013, + "learning_rate": 0.00033410684616329614, + "loss": 2.1861, + "step": 469380 + }, + { + "epoch": 1.8145304696077067, + "grad_norm": 0.14200444519519806, + "learning_rate": 0.0003339867947702215, + "loss": 2.1604, + "step": 469390 + }, + { + "epoch": 1.81456912681109, + "grad_norm": 0.1326495110988617, + "learning_rate": 0.00033386675202731774, + "loss": 2.1813, + "step": 469400 + }, + { + "epoch": 1.8146077840144732, + "grad_norm": 0.14433759450912476, + "learning_rate": 0.00033374671793271474, + "loss": 2.1593, + "step": 469410 + }, + { + "epoch": 1.8146464412178567, + "grad_norm": 0.1497911512851715, + "learning_rate": 0.00033362669248454416, + "loss": 2.1541, + "step": 469420 + }, + { + "epoch": 1.81468509842124, + "grad_norm": 0.14449547231197357, + "learning_rate": 0.0003335066756809375, + "loss": 2.1869, + "step": 469430 + }, + { + "epoch": 1.8147237556246232, + "grad_norm": 0.14384634792804718, + "learning_rate": 0.00033338666752002723, + "loss": 2.1579, + "step": 469440 + }, + { + "epoch": 1.8147624128280064, + "grad_norm": 0.13797500729560852, + "learning_rate": 0.0003332666679999468, + "loss": 2.1512, + "step": 469450 + }, + { + "epoch": 1.8148010700313897, + "grad_norm": 0.1408056914806366, + "learning_rate": 0.0003331466771188294, + "loss": 2.1769, + "step": 469460 + }, + { + "epoch": 1.814839727234773, + "grad_norm": 0.1476975828409195, + "learning_rate": 0.00033302669487481, + "loss": 2.1623, + "step": 469470 + }, + { + "epoch": 1.8148783844381562, + "grad_norm": 0.14249102771282196, + "learning_rate": 0.0003329067212660235, + "loss": 2.1705, + "step": 469480 + }, + { + "epoch": 1.8149170416415394, + "grad_norm": 0.1411779820919037, + "learning_rate": 0.0003327867562906057, + "loss": 2.1735, + "step": 469490 + }, + { + "epoch": 1.8149556988449227, + "grad_norm": 0.14658279716968536, + "learning_rate": 0.0003326667999466935, + "loss": 2.1882, + "step": 469500 + }, + { + "epoch": 1.814994356048306, + "grad_norm": 0.1431387960910797, + "learning_rate": 0.00033254685223242333, + "loss": 2.17, + "step": 469510 + }, + { + "epoch": 1.8150330132516892, + "grad_norm": 0.13615825772285461, + "learning_rate": 0.0003324269131459334, + "loss": 2.183, + "step": 469520 + }, + { + "epoch": 1.8150716704550725, + "grad_norm": 0.139374777674675, + "learning_rate": 0.00033230698268536243, + "loss": 2.1777, + "step": 469530 + }, + { + "epoch": 1.815110327658456, + "grad_norm": 0.13844828307628632, + "learning_rate": 0.0003321870608488493, + "loss": 2.166, + "step": 469540 + }, + { + "epoch": 1.8151489848618392, + "grad_norm": 0.14447201788425446, + "learning_rate": 0.00033206714763453385, + "loss": 2.1627, + "step": 469550 + }, + { + "epoch": 1.8151876420652224, + "grad_norm": 0.14412400126457214, + "learning_rate": 0.00033194724304055656, + "loss": 2.1684, + "step": 469560 + }, + { + "epoch": 1.8152262992686057, + "grad_norm": 0.1350231021642685, + "learning_rate": 0.0003318273470650588, + "loss": 2.1779, + "step": 469570 + }, + { + "epoch": 1.8152649564719892, + "grad_norm": 0.14220920205116272, + "learning_rate": 0.00033170745970618223, + "loss": 2.1611, + "step": 469580 + }, + { + "epoch": 1.8153036136753724, + "grad_norm": 0.13567551970481873, + "learning_rate": 0.0003315875809620692, + "loss": 2.1804, + "step": 469590 + }, + { + "epoch": 1.8153422708787557, + "grad_norm": 0.14831498265266418, + "learning_rate": 0.00033146771083086325, + "loss": 2.165, + "step": 469600 + }, + { + "epoch": 1.815380928082139, + "grad_norm": 0.15242017805576324, + "learning_rate": 0.0003313478493107078, + "loss": 2.1671, + "step": 469610 + }, + { + "epoch": 1.8154195852855222, + "grad_norm": 0.1507246196269989, + "learning_rate": 0.0003312279963997478, + "loss": 2.1537, + "step": 469620 + }, + { + "epoch": 1.8154582424889054, + "grad_norm": 0.13301242887973785, + "learning_rate": 0.0003311081520961281, + "loss": 2.1598, + "step": 469630 + }, + { + "epoch": 1.8154968996922887, + "grad_norm": 0.14124217629432678, + "learning_rate": 0.00033098831639799477, + "loss": 2.1686, + "step": 469640 + }, + { + "epoch": 1.815535556895672, + "grad_norm": 0.138587087392807, + "learning_rate": 0.0003308684893034941, + "loss": 2.171, + "step": 469650 + }, + { + "epoch": 1.8155742140990552, + "grad_norm": 0.147024005651474, + "learning_rate": 0.0003307486708107734, + "loss": 2.1485, + "step": 469660 + }, + { + "epoch": 1.8156128713024384, + "grad_norm": 0.14949698746204376, + "learning_rate": 0.00033062886091798037, + "loss": 2.1686, + "step": 469670 + }, + { + "epoch": 1.8156515285058217, + "grad_norm": 0.13735251128673553, + "learning_rate": 0.0003305090596232636, + "loss": 2.1759, + "step": 469680 + }, + { + "epoch": 1.815690185709205, + "grad_norm": 0.13064594566822052, + "learning_rate": 0.0003303892669247721, + "loss": 2.1772, + "step": 469690 + }, + { + "epoch": 1.8157288429125882, + "grad_norm": 0.15359649062156677, + "learning_rate": 0.00033026948282065586, + "loss": 2.1486, + "step": 469700 + }, + { + "epoch": 1.8157675001159717, + "grad_norm": 0.14719156920909882, + "learning_rate": 0.00033014970730906533, + "loss": 2.1517, + "step": 469710 + }, + { + "epoch": 1.815806157319355, + "grad_norm": 0.14220820367336273, + "learning_rate": 0.0003300299403881517, + "loss": 2.1609, + "step": 469720 + }, + { + "epoch": 1.8158448145227382, + "grad_norm": 0.13633151352405548, + "learning_rate": 0.00032991018205606683, + "loss": 2.1561, + "step": 469730 + }, + { + "epoch": 1.8158834717261214, + "grad_norm": 0.14592762291431427, + "learning_rate": 0.00032979043231096285, + "loss": 2.1697, + "step": 469740 + }, + { + "epoch": 1.8159221289295049, + "grad_norm": 0.13614079356193542, + "learning_rate": 0.0003296706911509935, + "loss": 2.1678, + "step": 469750 + }, + { + "epoch": 1.8159607861328881, + "grad_norm": 0.1432075947523117, + "learning_rate": 0.00032955095857431195, + "loss": 2.1694, + "step": 469760 + }, + { + "epoch": 1.8159994433362714, + "grad_norm": 0.1378885954618454, + "learning_rate": 0.0003294312345790731, + "loss": 2.1932, + "step": 469770 + }, + { + "epoch": 1.8160381005396546, + "grad_norm": 0.14316721260547638, + "learning_rate": 0.00032931151916343196, + "loss": 2.1704, + "step": 469780 + }, + { + "epoch": 1.816076757743038, + "grad_norm": 0.1418437510728836, + "learning_rate": 0.0003291918123255442, + "loss": 2.1502, + "step": 469790 + }, + { + "epoch": 1.8161154149464211, + "grad_norm": 0.14487609267234802, + "learning_rate": 0.0003290721140635662, + "loss": 2.1786, + "step": 469800 + }, + { + "epoch": 1.8161540721498044, + "grad_norm": 0.13837496936321259, + "learning_rate": 0.0003289524243756552, + "loss": 2.1862, + "step": 469810 + }, + { + "epoch": 1.8161927293531877, + "grad_norm": 0.14798998832702637, + "learning_rate": 0.00032883274325996913, + "loss": 2.1732, + "step": 469820 + }, + { + "epoch": 1.816231386556571, + "grad_norm": 0.1464812457561493, + "learning_rate": 0.0003287130707146662, + "loss": 2.173, + "step": 469830 + }, + { + "epoch": 1.8162700437599542, + "grad_norm": 0.1402280181646347, + "learning_rate": 0.00032859340673790574, + "loss": 2.1594, + "step": 469840 + }, + { + "epoch": 1.8163087009633374, + "grad_norm": 0.13324207067489624, + "learning_rate": 0.0003284737513278471, + "loss": 2.1681, + "step": 469850 + }, + { + "epoch": 1.8163473581667207, + "grad_norm": 0.1384410262107849, + "learning_rate": 0.00032835410448265104, + "loss": 2.1794, + "step": 469860 + }, + { + "epoch": 1.816386015370104, + "grad_norm": 0.13632450997829437, + "learning_rate": 0.0003282344662004781, + "loss": 2.1626, + "step": 469870 + }, + { + "epoch": 1.8164246725734874, + "grad_norm": 0.14713577926158905, + "learning_rate": 0.0003281148364794908, + "loss": 2.1591, + "step": 469880 + }, + { + "epoch": 1.8164633297768706, + "grad_norm": 0.13500018417835236, + "learning_rate": 0.00032799521531785093, + "loss": 2.1738, + "step": 469890 + }, + { + "epoch": 1.816501986980254, + "grad_norm": 0.12875080108642578, + "learning_rate": 0.00032787560271372153, + "loss": 2.173, + "step": 469900 + }, + { + "epoch": 1.8165406441836371, + "grad_norm": 0.1497087925672531, + "learning_rate": 0.0003277559986652665, + "loss": 2.1654, + "step": 469910 + }, + { + "epoch": 1.8165793013870206, + "grad_norm": 0.14243261516094208, + "learning_rate": 0.00032763640317065026, + "loss": 2.1692, + "step": 469920 + }, + { + "epoch": 1.8166179585904039, + "grad_norm": 0.14686591923236847, + "learning_rate": 0.00032751681622803755, + "loss": 2.1697, + "step": 469930 + }, + { + "epoch": 1.8166566157937871, + "grad_norm": 0.1393844187259674, + "learning_rate": 0.00032739723783559404, + "loss": 2.1706, + "step": 469940 + }, + { + "epoch": 1.8166952729971704, + "grad_norm": 0.15019193291664124, + "learning_rate": 0.0003272776679914864, + "loss": 2.1591, + "step": 469950 + }, + { + "epoch": 1.8167339302005536, + "grad_norm": 0.15770284831523895, + "learning_rate": 0.00032715810669388135, + "loss": 2.1699, + "step": 469960 + }, + { + "epoch": 1.8167725874039369, + "grad_norm": 0.16133318841457367, + "learning_rate": 0.00032703855394094637, + "loss": 2.1562, + "step": 469970 + }, + { + "epoch": 1.8168112446073201, + "grad_norm": 0.14682245254516602, + "learning_rate": 0.00032691900973084987, + "loss": 2.1715, + "step": 469980 + }, + { + "epoch": 1.8168499018107034, + "grad_norm": 0.1428011804819107, + "learning_rate": 0.00032679947406176105, + "loss": 2.1799, + "step": 469990 + }, + { + "epoch": 1.8168885590140866, + "grad_norm": 0.14194734394550323, + "learning_rate": 0.00032667994693184887, + "loss": 2.1743, + "step": 470000 + }, + { + "epoch": 1.8169272162174699, + "grad_norm": 0.14284037053585052, + "learning_rate": 0.0003265604283392842, + "loss": 2.1708, + "step": 470010 + }, + { + "epoch": 1.8169658734208531, + "grad_norm": 0.16350673139095306, + "learning_rate": 0.00032644091828223763, + "loss": 2.1756, + "step": 470020 + }, + { + "epoch": 1.8170045306242364, + "grad_norm": 0.1494877189397812, + "learning_rate": 0.00032632141675888085, + "loss": 2.1886, + "step": 470030 + }, + { + "epoch": 1.8170431878276196, + "grad_norm": 0.15065732598304749, + "learning_rate": 0.0003262019237673859, + "loss": 2.1627, + "step": 470040 + }, + { + "epoch": 1.8170818450310031, + "grad_norm": 0.15042832493782043, + "learning_rate": 0.0003260824393059256, + "loss": 2.1633, + "step": 470050 + }, + { + "epoch": 1.8171205022343864, + "grad_norm": 0.14380717277526855, + "learning_rate": 0.00032596296337267374, + "loss": 2.1701, + "step": 470060 + }, + { + "epoch": 1.8171591594377696, + "grad_norm": 0.13390575349330902, + "learning_rate": 0.0003258434959658043, + "loss": 2.1673, + "step": 470070 + }, + { + "epoch": 1.8171978166411529, + "grad_norm": 0.14788520336151123, + "learning_rate": 0.000325724037083492, + "loss": 2.191, + "step": 470080 + }, + { + "epoch": 1.8172364738445363, + "grad_norm": 0.14955569803714752, + "learning_rate": 0.00032560458672391257, + "loss": 2.1561, + "step": 470090 + }, + { + "epoch": 1.8172751310479196, + "grad_norm": 0.1507478654384613, + "learning_rate": 0.00032548514488524186, + "loss": 2.1806, + "step": 470100 + }, + { + "epoch": 1.8173137882513029, + "grad_norm": 0.15433341264724731, + "learning_rate": 0.00032536571156565653, + "loss": 2.1841, + "step": 470110 + }, + { + "epoch": 1.817352445454686, + "grad_norm": 0.14390775561332703, + "learning_rate": 0.0003252462867633341, + "loss": 2.1729, + "step": 470120 + }, + { + "epoch": 1.8173911026580694, + "grad_norm": 0.15695568919181824, + "learning_rate": 0.0003251268704764529, + "loss": 2.1697, + "step": 470130 + }, + { + "epoch": 1.8174297598614526, + "grad_norm": 0.13125157356262207, + "learning_rate": 0.0003250074627031916, + "loss": 2.1614, + "step": 470140 + }, + { + "epoch": 1.8174684170648359, + "grad_norm": 0.13682635128498077, + "learning_rate": 0.00032488806344172926, + "loss": 2.1589, + "step": 470150 + }, + { + "epoch": 1.8175070742682191, + "grad_norm": 0.12663866579532623, + "learning_rate": 0.00032476867269024614, + "loss": 2.1648, + "step": 470160 + }, + { + "epoch": 1.8175457314716024, + "grad_norm": 0.13196326792240143, + "learning_rate": 0.00032464929044692247, + "loss": 2.1604, + "step": 470170 + }, + { + "epoch": 1.8175843886749856, + "grad_norm": 0.13578322529792786, + "learning_rate": 0.00032452991670994027, + "loss": 2.1697, + "step": 470180 + }, + { + "epoch": 1.8176230458783689, + "grad_norm": 0.13914400339126587, + "learning_rate": 0.0003244105514774809, + "loss": 2.1602, + "step": 470190 + }, + { + "epoch": 1.8176617030817521, + "grad_norm": 0.13743595778942108, + "learning_rate": 0.00032429119474772716, + "loss": 2.1734, + "step": 470200 + }, + { + "epoch": 1.8177003602851354, + "grad_norm": 0.129056915640831, + "learning_rate": 0.00032417184651886233, + "loss": 2.1721, + "step": 470210 + }, + { + "epoch": 1.8177390174885188, + "grad_norm": 0.14306758344173431, + "learning_rate": 0.00032405250678907004, + "loss": 2.1706, + "step": 470220 + }, + { + "epoch": 1.817777674691902, + "grad_norm": 0.1493530571460724, + "learning_rate": 0.00032393317555653534, + "loss": 2.1721, + "step": 470230 + }, + { + "epoch": 1.8178163318952854, + "grad_norm": 0.13991063833236694, + "learning_rate": 0.000323813852819443, + "loss": 2.1594, + "step": 470240 + }, + { + "epoch": 1.8178549890986686, + "grad_norm": 0.14696717262268066, + "learning_rate": 0.0003236945385759791, + "loss": 2.1738, + "step": 470250 + }, + { + "epoch": 1.817893646302052, + "grad_norm": 0.15113307535648346, + "learning_rate": 0.0003235752328243298, + "loss": 2.1661, + "step": 470260 + }, + { + "epoch": 1.8179323035054353, + "grad_norm": 0.13656733930110931, + "learning_rate": 0.0003234559355626825, + "loss": 2.1601, + "step": 470270 + }, + { + "epoch": 1.8179709607088186, + "grad_norm": 0.15864388644695282, + "learning_rate": 0.0003233366467892251, + "loss": 2.1721, + "step": 470280 + }, + { + "epoch": 1.8180096179122018, + "grad_norm": 0.13841305673122406, + "learning_rate": 0.00032321736650214563, + "loss": 2.1593, + "step": 470290 + }, + { + "epoch": 1.818048275115585, + "grad_norm": 0.15041330456733704, + "learning_rate": 0.00032309809469963335, + "loss": 2.1737, + "step": 470300 + }, + { + "epoch": 1.8180869323189683, + "grad_norm": 0.14067049324512482, + "learning_rate": 0.00032297883137987805, + "loss": 2.1657, + "step": 470310 + }, + { + "epoch": 1.8181255895223516, + "grad_norm": 0.13357412815093994, + "learning_rate": 0.00032285957654106954, + "loss": 2.1665, + "step": 470320 + }, + { + "epoch": 1.8181642467257348, + "grad_norm": 0.14067021012306213, + "learning_rate": 0.0003227403301813996, + "loss": 2.1581, + "step": 470330 + }, + { + "epoch": 1.818202903929118, + "grad_norm": 0.15399391949176788, + "learning_rate": 0.00032262109229905955, + "loss": 2.1756, + "step": 470340 + }, + { + "epoch": 1.8182415611325013, + "grad_norm": 0.13756081461906433, + "learning_rate": 0.0003225018628922416, + "loss": 2.186, + "step": 470350 + }, + { + "epoch": 1.8182802183358846, + "grad_norm": 0.14441081881523132, + "learning_rate": 0.0003223826419591385, + "loss": 2.1658, + "step": 470360 + }, + { + "epoch": 1.8183188755392679, + "grad_norm": 0.13834241032600403, + "learning_rate": 0.0003222634294979441, + "loss": 2.1676, + "step": 470370 + }, + { + "epoch": 1.818357532742651, + "grad_norm": 0.15014494955539703, + "learning_rate": 0.0003221442255068525, + "loss": 2.1694, + "step": 470380 + }, + { + "epoch": 1.8183961899460346, + "grad_norm": 0.14847639203071594, + "learning_rate": 0.00032202502998405857, + "loss": 2.1727, + "step": 470390 + }, + { + "epoch": 1.8184348471494178, + "grad_norm": 0.1385437697172165, + "learning_rate": 0.0003219058429277575, + "loss": 2.1695, + "step": 470400 + }, + { + "epoch": 1.818473504352801, + "grad_norm": 0.14720351994037628, + "learning_rate": 0.000321786664336146, + "loss": 2.1796, + "step": 470410 + }, + { + "epoch": 1.8185121615561843, + "grad_norm": 0.15029869973659515, + "learning_rate": 0.00032166749420742003, + "loss": 2.1467, + "step": 470420 + }, + { + "epoch": 1.8185508187595678, + "grad_norm": 0.14885807037353516, + "learning_rate": 0.0003215483325397779, + "loss": 2.1716, + "step": 470430 + }, + { + "epoch": 1.818589475962951, + "grad_norm": 0.14331059157848358, + "learning_rate": 0.00032142917933141704, + "loss": 2.1625, + "step": 470440 + }, + { + "epoch": 1.8186281331663343, + "grad_norm": 0.14600177109241486, + "learning_rate": 0.0003213100345805364, + "loss": 2.1658, + "step": 470450 + }, + { + "epoch": 1.8186667903697176, + "grad_norm": 0.14313966035842896, + "learning_rate": 0.0003211908982853351, + "loss": 2.1583, + "step": 470460 + }, + { + "epoch": 1.8187054475731008, + "grad_norm": 0.13352936506271362, + "learning_rate": 0.00032107177044401336, + "loss": 2.1777, + "step": 470470 + }, + { + "epoch": 1.818744104776484, + "grad_norm": 0.15081366896629333, + "learning_rate": 0.00032095265105477156, + "loss": 2.1828, + "step": 470480 + }, + { + "epoch": 1.8187827619798673, + "grad_norm": 0.15778934955596924, + "learning_rate": 0.0003208335401158109, + "loss": 2.175, + "step": 470490 + }, + { + "epoch": 1.8188214191832506, + "grad_norm": 0.15886171162128448, + "learning_rate": 0.0003207144376253335, + "loss": 2.1649, + "step": 470500 + }, + { + "epoch": 1.8188600763866338, + "grad_norm": 0.14672130346298218, + "learning_rate": 0.00032059534358154187, + "loss": 2.1552, + "step": 470510 + }, + { + "epoch": 1.818898733590017, + "grad_norm": 0.14366315305233002, + "learning_rate": 0.0003204762579826386, + "loss": 2.166, + "step": 470520 + }, + { + "epoch": 1.8189373907934003, + "grad_norm": 0.1349872350692749, + "learning_rate": 0.0003203571808268282, + "loss": 2.1711, + "step": 470530 + }, + { + "epoch": 1.8189760479967836, + "grad_norm": 0.14480504393577576, + "learning_rate": 0.0003202381121123148, + "loss": 2.1762, + "step": 470540 + }, + { + "epoch": 1.8190147052001668, + "grad_norm": 0.14662893116474152, + "learning_rate": 0.0003201190518373036, + "loss": 2.1742, + "step": 470550 + }, + { + "epoch": 1.8190533624035503, + "grad_norm": 0.1451336145401001, + "learning_rate": 0.0003200000000000001, + "loss": 2.1774, + "step": 470560 + }, + { + "epoch": 1.8190920196069336, + "grad_norm": 0.1518847644329071, + "learning_rate": 0.00031988095659861047, + "loss": 2.1701, + "step": 470570 + }, + { + "epoch": 1.8191306768103168, + "grad_norm": 0.16146059334278107, + "learning_rate": 0.0003197619216313421, + "loss": 2.1677, + "step": 470580 + }, + { + "epoch": 1.8191693340137, + "grad_norm": 0.15619412064552307, + "learning_rate": 0.0003196428950964025, + "loss": 2.1668, + "step": 470590 + }, + { + "epoch": 1.8192079912170835, + "grad_norm": 0.13538570702075958, + "learning_rate": 0.0003195238769919997, + "loss": 2.1662, + "step": 470600 + }, + { + "epoch": 1.8192466484204668, + "grad_norm": 0.15582145750522614, + "learning_rate": 0.0003194048673163428, + "loss": 2.1905, + "step": 470610 + }, + { + "epoch": 1.81928530562385, + "grad_norm": 0.14375025033950806, + "learning_rate": 0.00031928586606764077, + "loss": 2.1657, + "step": 470620 + }, + { + "epoch": 1.8193239628272333, + "grad_norm": 0.14095117151737213, + "learning_rate": 0.00031916687324410464, + "loss": 2.1675, + "step": 470630 + }, + { + "epoch": 1.8193626200306166, + "grad_norm": 0.1542339324951172, + "learning_rate": 0.00031904788884394456, + "loss": 2.1613, + "step": 470640 + }, + { + "epoch": 1.8194012772339998, + "grad_norm": 0.1529356837272644, + "learning_rate": 0.00031892891286537206, + "loss": 2.1627, + "step": 470650 + }, + { + "epoch": 1.819439934437383, + "grad_norm": 0.14034295082092285, + "learning_rate": 0.00031880994530659914, + "loss": 2.1585, + "step": 470660 + }, + { + "epoch": 1.8194785916407663, + "grad_norm": 0.14266914129257202, + "learning_rate": 0.0003186909861658387, + "loss": 2.1699, + "step": 470670 + }, + { + "epoch": 1.8195172488441496, + "grad_norm": 0.14826495945453644, + "learning_rate": 0.00031857203544130374, + "loss": 2.1701, + "step": 470680 + }, + { + "epoch": 1.8195559060475328, + "grad_norm": 0.13871616125106812, + "learning_rate": 0.00031845309313120865, + "loss": 2.1656, + "step": 470690 + }, + { + "epoch": 1.819594563250916, + "grad_norm": 0.14421504735946655, + "learning_rate": 0.0003183341592337674, + "loss": 2.1667, + "step": 470700 + }, + { + "epoch": 1.8196332204542993, + "grad_norm": 0.13885298371315002, + "learning_rate": 0.00031821523374719575, + "loss": 2.143, + "step": 470710 + }, + { + "epoch": 1.8196718776576826, + "grad_norm": 0.16003422439098358, + "learning_rate": 0.0003180963166697088, + "loss": 2.163, + "step": 470720 + }, + { + "epoch": 1.819710534861066, + "grad_norm": 0.15020252764225006, + "learning_rate": 0.0003179774079995239, + "loss": 2.1784, + "step": 470730 + }, + { + "epoch": 1.8197491920644493, + "grad_norm": 0.14752747118473053, + "learning_rate": 0.0003178585077348577, + "loss": 2.16, + "step": 470740 + }, + { + "epoch": 1.8197878492678325, + "grad_norm": 0.14104293286800385, + "learning_rate": 0.00031773961587392785, + "loss": 2.1707, + "step": 470750 + }, + { + "epoch": 1.8198265064712158, + "grad_norm": 0.141326904296875, + "learning_rate": 0.0003176207324149527, + "loss": 2.1736, + "step": 470760 + }, + { + "epoch": 1.8198651636745993, + "grad_norm": 0.14769048988819122, + "learning_rate": 0.00031750185735615144, + "loss": 2.1854, + "step": 470770 + }, + { + "epoch": 1.8199038208779825, + "grad_norm": 0.14957557618618011, + "learning_rate": 0.0003173829906957437, + "loss": 2.182, + "step": 470780 + }, + { + "epoch": 1.8199424780813658, + "grad_norm": 0.1409343183040619, + "learning_rate": 0.0003172641324319494, + "loss": 2.1707, + "step": 470790 + }, + { + "epoch": 1.819981135284749, + "grad_norm": 0.14448200166225433, + "learning_rate": 0.0003171452825629897, + "loss": 2.1599, + "step": 470800 + }, + { + "epoch": 1.8200197924881323, + "grad_norm": 0.13009198009967804, + "learning_rate": 0.000317026441087086, + "loss": 2.1677, + "step": 470810 + }, + { + "epoch": 1.8200584496915155, + "grad_norm": 0.1416071653366089, + "learning_rate": 0.00031690760800246023, + "loss": 2.1593, + "step": 470820 + }, + { + "epoch": 1.8200971068948988, + "grad_norm": 0.1369924396276474, + "learning_rate": 0.00031678878330733553, + "loss": 2.167, + "step": 470830 + }, + { + "epoch": 1.820135764098282, + "grad_norm": 0.13908712565898895, + "learning_rate": 0.0003166699669999349, + "loss": 2.1813, + "step": 470840 + }, + { + "epoch": 1.8201744213016653, + "grad_norm": 0.13555458188056946, + "learning_rate": 0.00031655115907848263, + "loss": 2.1665, + "step": 470850 + }, + { + "epoch": 1.8202130785050485, + "grad_norm": 0.15544474124908447, + "learning_rate": 0.0003164323595412033, + "loss": 2.1582, + "step": 470860 + }, + { + "epoch": 1.8202517357084318, + "grad_norm": 0.13911612331867218, + "learning_rate": 0.0003163135683863221, + "loss": 2.1752, + "step": 470870 + }, + { + "epoch": 1.820290392911815, + "grad_norm": 0.14102821052074432, + "learning_rate": 0.00031619478561206504, + "loss": 2.1728, + "step": 470880 + }, + { + "epoch": 1.8203290501151985, + "grad_norm": 0.1417911797761917, + "learning_rate": 0.0003160760112166583, + "loss": 2.1681, + "step": 470890 + }, + { + "epoch": 1.8203677073185818, + "grad_norm": 0.1564634144306183, + "learning_rate": 0.00031595724519832926, + "loss": 2.1569, + "step": 470900 + }, + { + "epoch": 1.820406364521965, + "grad_norm": 0.15098914504051208, + "learning_rate": 0.00031583848755530573, + "loss": 2.1558, + "step": 470910 + }, + { + "epoch": 1.8204450217253483, + "grad_norm": 0.1503758728504181, + "learning_rate": 0.0003157197382858159, + "loss": 2.1759, + "step": 470920 + }, + { + "epoch": 1.8204836789287315, + "grad_norm": 0.1641359031200409, + "learning_rate": 0.00031560099738808914, + "loss": 2.1705, + "step": 470930 + }, + { + "epoch": 1.820522336132115, + "grad_norm": 0.13702145218849182, + "learning_rate": 0.0003154822648603548, + "loss": 2.176, + "step": 470940 + }, + { + "epoch": 1.8205609933354983, + "grad_norm": 0.1344425529241562, + "learning_rate": 0.0003153635407008431, + "loss": 2.1762, + "step": 470950 + }, + { + "epoch": 1.8205996505388815, + "grad_norm": 0.14092884957790375, + "learning_rate": 0.00031524482490778504, + "loss": 2.1645, + "step": 470960 + }, + { + "epoch": 1.8206383077422648, + "grad_norm": 0.14377425611019135, + "learning_rate": 0.000315126117479412, + "loss": 2.1598, + "step": 470970 + }, + { + "epoch": 1.820676964945648, + "grad_norm": 0.16156238317489624, + "learning_rate": 0.00031500741841395644, + "loss": 2.1744, + "step": 470980 + }, + { + "epoch": 1.8207156221490313, + "grad_norm": 0.14735539257526398, + "learning_rate": 0.0003148887277096506, + "loss": 2.1714, + "step": 470990 + }, + { + "epoch": 1.8207542793524145, + "grad_norm": 0.13479958474636078, + "learning_rate": 0.00031477004536472844, + "loss": 2.1592, + "step": 471000 + }, + { + "epoch": 1.8207929365557978, + "grad_norm": 0.14318402111530304, + "learning_rate": 0.0003146513713774233, + "loss": 2.1753, + "step": 471010 + }, + { + "epoch": 1.820831593759181, + "grad_norm": 0.1418886035680771, + "learning_rate": 0.00031453270574597014, + "loss": 2.1701, + "step": 471020 + }, + { + "epoch": 1.8208702509625643, + "grad_norm": 0.155859112739563, + "learning_rate": 0.00031441404846860446, + "loss": 2.1639, + "step": 471030 + }, + { + "epoch": 1.8209089081659475, + "grad_norm": 0.15436893701553345, + "learning_rate": 0.0003142953995435618, + "loss": 2.1652, + "step": 471040 + }, + { + "epoch": 1.8209475653693308, + "grad_norm": 0.13911384344100952, + "learning_rate": 0.0003141767589690787, + "loss": 2.158, + "step": 471050 + }, + { + "epoch": 1.8209862225727143, + "grad_norm": 0.14979267120361328, + "learning_rate": 0.0003140581267433922, + "loss": 2.1578, + "step": 471060 + }, + { + "epoch": 1.8210248797760975, + "grad_norm": 0.14796827733516693, + "learning_rate": 0.00031393950286474003, + "loss": 2.1664, + "step": 471070 + }, + { + "epoch": 1.8210635369794808, + "grad_norm": 0.1441972851753235, + "learning_rate": 0.0003138208873313606, + "loss": 2.1759, + "step": 471080 + }, + { + "epoch": 1.821102194182864, + "grad_norm": 0.1467493325471878, + "learning_rate": 0.000313702280141493, + "loss": 2.1713, + "step": 471090 + }, + { + "epoch": 1.8211408513862473, + "grad_norm": 0.14196404814720154, + "learning_rate": 0.00031358368129337657, + "loss": 2.1555, + "step": 471100 + }, + { + "epoch": 1.8211795085896307, + "grad_norm": 0.14079461991786957, + "learning_rate": 0.0003134650907852516, + "loss": 2.1676, + "step": 471110 + }, + { + "epoch": 1.821218165793014, + "grad_norm": 0.14870962500572205, + "learning_rate": 0.00031334650861535864, + "loss": 2.1714, + "step": 471120 + }, + { + "epoch": 1.8212568229963972, + "grad_norm": 0.14794059097766876, + "learning_rate": 0.0003132279347819398, + "loss": 2.1574, + "step": 471130 + }, + { + "epoch": 1.8212954801997805, + "grad_norm": 0.15221253037452698, + "learning_rate": 0.00031310936928323653, + "loss": 2.1821, + "step": 471140 + }, + { + "epoch": 1.8213341374031637, + "grad_norm": 0.14122171700000763, + "learning_rate": 0.0003129908121174918, + "loss": 2.1542, + "step": 471150 + }, + { + "epoch": 1.821372794606547, + "grad_norm": 0.14007976651191711, + "learning_rate": 0.0003128722632829486, + "loss": 2.1689, + "step": 471160 + }, + { + "epoch": 1.8214114518099302, + "grad_norm": 0.13979344069957733, + "learning_rate": 0.00031275372277785117, + "loss": 2.1767, + "step": 471170 + }, + { + "epoch": 1.8214501090133135, + "grad_norm": 0.14779804646968842, + "learning_rate": 0.0003126351906004441, + "loss": 2.1651, + "step": 471180 + }, + { + "epoch": 1.8214887662166968, + "grad_norm": 0.14292143285274506, + "learning_rate": 0.00031251666674897205, + "loss": 2.1753, + "step": 471190 + }, + { + "epoch": 1.82152742342008, + "grad_norm": 0.14197584986686707, + "learning_rate": 0.00031239815122168115, + "loss": 2.1574, + "step": 471200 + }, + { + "epoch": 1.8215660806234633, + "grad_norm": 0.13902144134044647, + "learning_rate": 0.0003122796440168176, + "loss": 2.1556, + "step": 471210 + }, + { + "epoch": 1.8216047378268465, + "grad_norm": 0.15372341871261597, + "learning_rate": 0.0003121611451326285, + "loss": 2.1737, + "step": 471220 + }, + { + "epoch": 1.82164339503023, + "grad_norm": 0.14466264843940735, + "learning_rate": 0.00031204265456736136, + "loss": 2.1771, + "step": 471230 + }, + { + "epoch": 1.8216820522336132, + "grad_norm": 0.13506931066513062, + "learning_rate": 0.00031192417231926453, + "loss": 2.1691, + "step": 471240 + }, + { + "epoch": 1.8217207094369965, + "grad_norm": 0.14005401730537415, + "learning_rate": 0.0003118056983865867, + "loss": 2.1636, + "step": 471250 + }, + { + "epoch": 1.8217593666403797, + "grad_norm": 0.16184751689434052, + "learning_rate": 0.0003116872327675775, + "loss": 2.1734, + "step": 471260 + }, + { + "epoch": 1.821798023843763, + "grad_norm": 0.13978254795074463, + "learning_rate": 0.0003115687754604868, + "loss": 2.1618, + "step": 471270 + }, + { + "epoch": 1.8218366810471465, + "grad_norm": 0.13510258495807648, + "learning_rate": 0.0003114503264635653, + "loss": 2.173, + "step": 471280 + }, + { + "epoch": 1.8218753382505297, + "grad_norm": 0.14572641253471375, + "learning_rate": 0.00031133188577506444, + "loss": 2.1782, + "step": 471290 + }, + { + "epoch": 1.821913995453913, + "grad_norm": 0.15020966529846191, + "learning_rate": 0.00031121345339323606, + "loss": 2.164, + "step": 471300 + }, + { + "epoch": 1.8219526526572962, + "grad_norm": 0.15157094597816467, + "learning_rate": 0.0003110950293163326, + "loss": 2.1667, + "step": 471310 + }, + { + "epoch": 1.8219913098606795, + "grad_norm": 0.15115141868591309, + "learning_rate": 0.0003109766135426071, + "loss": 2.1664, + "step": 471320 + }, + { + "epoch": 1.8220299670640627, + "grad_norm": 0.14831262826919556, + "learning_rate": 0.00031085820607031333, + "loss": 2.1798, + "step": 471330 + }, + { + "epoch": 1.822068624267446, + "grad_norm": 0.14008532464504242, + "learning_rate": 0.00031073980689770586, + "loss": 2.1711, + "step": 471340 + }, + { + "epoch": 1.8221072814708292, + "grad_norm": 0.14013555645942688, + "learning_rate": 0.0003106214160230396, + "loss": 2.1672, + "step": 471350 + }, + { + "epoch": 1.8221459386742125, + "grad_norm": 0.14341752231121063, + "learning_rate": 0.00031050303344457, + "loss": 2.1691, + "step": 471360 + }, + { + "epoch": 1.8221845958775957, + "grad_norm": 0.14467793703079224, + "learning_rate": 0.00031038465916055326, + "loss": 2.1627, + "step": 471370 + }, + { + "epoch": 1.822223253080979, + "grad_norm": 0.13100607693195343, + "learning_rate": 0.00031026629316924616, + "loss": 2.1567, + "step": 471380 + }, + { + "epoch": 1.8222619102843622, + "grad_norm": 0.13663366436958313, + "learning_rate": 0.0003101479354689063, + "loss": 2.1653, + "step": 471390 + }, + { + "epoch": 1.8223005674877457, + "grad_norm": 0.13873040676116943, + "learning_rate": 0.0003100295860577913, + "loss": 2.1536, + "step": 471400 + }, + { + "epoch": 1.822339224691129, + "grad_norm": 0.1375930905342102, + "learning_rate": 0.00030991124493416036, + "loss": 2.1583, + "step": 471410 + }, + { + "epoch": 1.8223778818945122, + "grad_norm": 0.14047600328922272, + "learning_rate": 0.00030979291209627213, + "loss": 2.1596, + "step": 471420 + }, + { + "epoch": 1.8224165390978955, + "grad_norm": 0.14657500386238098, + "learning_rate": 0.0003096745875423868, + "loss": 2.182, + "step": 471430 + }, + { + "epoch": 1.822455196301279, + "grad_norm": 0.1562751680612564, + "learning_rate": 0.0003095562712707649, + "loss": 2.1657, + "step": 471440 + }, + { + "epoch": 1.8224938535046622, + "grad_norm": 0.1424303650856018, + "learning_rate": 0.0003094379632796671, + "loss": 2.1518, + "step": 471450 + }, + { + "epoch": 1.8225325107080455, + "grad_norm": 0.15109124779701233, + "learning_rate": 0.0003093196635673554, + "loss": 2.1767, + "step": 471460 + }, + { + "epoch": 1.8225711679114287, + "grad_norm": 0.15089304745197296, + "learning_rate": 0.0003092013721320921, + "loss": 2.1718, + "step": 471470 + }, + { + "epoch": 1.822609825114812, + "grad_norm": 0.13845579326152802, + "learning_rate": 0.00030908308897213986, + "loss": 2.1584, + "step": 471480 + }, + { + "epoch": 1.8226484823181952, + "grad_norm": 0.15574099123477936, + "learning_rate": 0.0003089648140857624, + "loss": 2.163, + "step": 471490 + }, + { + "epoch": 1.8226871395215785, + "grad_norm": 0.1501399129629135, + "learning_rate": 0.0003088465474712237, + "loss": 2.1597, + "step": 471500 + }, + { + "epoch": 1.8227257967249617, + "grad_norm": 0.19525304436683655, + "learning_rate": 0.0003087282891267886, + "loss": 2.1577, + "step": 471510 + }, + { + "epoch": 1.822764453928345, + "grad_norm": 0.14497607946395874, + "learning_rate": 0.00030861003905072204, + "loss": 2.161, + "step": 471520 + }, + { + "epoch": 1.8228031111317282, + "grad_norm": 0.13903570175170898, + "learning_rate": 0.0003084917972412904, + "loss": 2.1695, + "step": 471530 + }, + { + "epoch": 1.8228417683351115, + "grad_norm": 0.1286279857158661, + "learning_rate": 0.00030837356369676, + "loss": 2.176, + "step": 471540 + }, + { + "epoch": 1.8228804255384947, + "grad_norm": 0.7849547266960144, + "learning_rate": 0.0003082553384153979, + "loss": 2.1549, + "step": 471550 + }, + { + "epoch": 1.822919082741878, + "grad_norm": 0.1463264673948288, + "learning_rate": 0.000308137121395472, + "loss": 2.154, + "step": 471560 + }, + { + "epoch": 1.8229577399452614, + "grad_norm": 0.14043350517749786, + "learning_rate": 0.0003080189126352506, + "loss": 2.1559, + "step": 471570 + }, + { + "epoch": 1.8229963971486447, + "grad_norm": 0.13505004346370697, + "learning_rate": 0.00030790071213300264, + "loss": 2.1785, + "step": 471580 + }, + { + "epoch": 1.823035054352028, + "grad_norm": 0.1397033929824829, + "learning_rate": 0.0003077825198869975, + "loss": 2.151, + "step": 471590 + }, + { + "epoch": 1.8230737115554112, + "grad_norm": 0.16114629805088043, + "learning_rate": 0.00030766433589550556, + "loss": 2.1602, + "step": 471600 + }, + { + "epoch": 1.8231123687587947, + "grad_norm": 0.1364261955022812, + "learning_rate": 0.0003075461601567977, + "loss": 2.16, + "step": 471610 + }, + { + "epoch": 1.823151025962178, + "grad_norm": 0.1511559635400772, + "learning_rate": 0.000307427992669145, + "loss": 2.1632, + "step": 471620 + }, + { + "epoch": 1.8231896831655612, + "grad_norm": 0.1392008364200592, + "learning_rate": 0.00030730983343081955, + "loss": 2.1617, + "step": 471630 + }, + { + "epoch": 1.8232283403689444, + "grad_norm": 0.14022274315357208, + "learning_rate": 0.00030719168244009375, + "loss": 2.1701, + "step": 471640 + }, + { + "epoch": 1.8232669975723277, + "grad_norm": 0.14056052267551422, + "learning_rate": 0.0003070735396952411, + "loss": 2.1341, + "step": 471650 + }, + { + "epoch": 1.823305654775711, + "grad_norm": 0.15311531722545624, + "learning_rate": 0.00030695540519453535, + "loss": 2.1791, + "step": 471660 + }, + { + "epoch": 1.8233443119790942, + "grad_norm": 0.14211353659629822, + "learning_rate": 0.0003068372789362506, + "loss": 2.1611, + "step": 471670 + }, + { + "epoch": 1.8233829691824774, + "grad_norm": 0.14266042411327362, + "learning_rate": 0.00030671916091866213, + "loss": 2.1623, + "step": 471680 + }, + { + "epoch": 1.8234216263858607, + "grad_norm": 0.1391574889421463, + "learning_rate": 0.0003066010511400452, + "loss": 2.1728, + "step": 471690 + }, + { + "epoch": 1.823460283589244, + "grad_norm": 0.13404053449630737, + "learning_rate": 0.0003064829495986763, + "loss": 2.1765, + "step": 471700 + }, + { + "epoch": 1.8234989407926272, + "grad_norm": 0.148732990026474, + "learning_rate": 0.00030636485629283183, + "loss": 2.1709, + "step": 471710 + }, + { + "epoch": 1.8235375979960105, + "grad_norm": 0.15154549479484558, + "learning_rate": 0.0003062467712207897, + "loss": 2.1499, + "step": 471720 + }, + { + "epoch": 1.8235762551993937, + "grad_norm": 0.14113524556159973, + "learning_rate": 0.0003061286943808275, + "loss": 2.1826, + "step": 471730 + }, + { + "epoch": 1.8236149124027772, + "grad_norm": 0.15453845262527466, + "learning_rate": 0.00030601062577122407, + "loss": 2.165, + "step": 471740 + }, + { + "epoch": 1.8236535696061604, + "grad_norm": 0.1487610638141632, + "learning_rate": 0.0003058925653902585, + "loss": 2.1573, + "step": 471750 + }, + { + "epoch": 1.8236922268095437, + "grad_norm": 0.14542990922927856, + "learning_rate": 0.0003057745132362104, + "loss": 2.167, + "step": 471760 + }, + { + "epoch": 1.823730884012927, + "grad_norm": 0.14575399458408356, + "learning_rate": 0.00030565646930736024, + "loss": 2.1603, + "step": 471770 + }, + { + "epoch": 1.8237695412163104, + "grad_norm": 0.1531772017478943, + "learning_rate": 0.0003055384336019893, + "loss": 2.1539, + "step": 471780 + }, + { + "epoch": 1.8238081984196937, + "grad_norm": 0.1494482457637787, + "learning_rate": 0.000305420406118379, + "loss": 2.1679, + "step": 471790 + }, + { + "epoch": 1.823846855623077, + "grad_norm": 0.15587127208709717, + "learning_rate": 0.0003053023868548113, + "loss": 2.1581, + "step": 471800 + }, + { + "epoch": 1.8238855128264602, + "grad_norm": 0.1492954045534134, + "learning_rate": 0.00030518437580956913, + "loss": 2.1635, + "step": 471810 + }, + { + "epoch": 1.8239241700298434, + "grad_norm": 0.1406371295452118, + "learning_rate": 0.00030506637298093595, + "loss": 2.1721, + "step": 471820 + }, + { + "epoch": 1.8239628272332267, + "grad_norm": 0.1496431976556778, + "learning_rate": 0.0003049483783671956, + "loss": 2.1673, + "step": 471830 + }, + { + "epoch": 1.82400148443661, + "grad_norm": 0.1430736482143402, + "learning_rate": 0.0003048303919666327, + "loss": 2.1649, + "step": 471840 + }, + { + "epoch": 1.8240401416399932, + "grad_norm": 0.1506548672914505, + "learning_rate": 0.00030471241377753253, + "loss": 2.1603, + "step": 471850 + }, + { + "epoch": 1.8240787988433764, + "grad_norm": 0.14081627130508423, + "learning_rate": 0.0003045944437981807, + "loss": 2.1693, + "step": 471860 + }, + { + "epoch": 1.8241174560467597, + "grad_norm": 0.13509422540664673, + "learning_rate": 0.0003044764820268637, + "loss": 2.1568, + "step": 471870 + }, + { + "epoch": 1.824156113250143, + "grad_norm": 0.14616185426712036, + "learning_rate": 0.00030435852846186863, + "loss": 2.1586, + "step": 471880 + }, + { + "epoch": 1.8241947704535262, + "grad_norm": 0.1500072479248047, + "learning_rate": 0.0003042405831014825, + "loss": 2.1719, + "step": 471890 + }, + { + "epoch": 1.8242334276569094, + "grad_norm": 0.14021477103233337, + "learning_rate": 0.0003041226459439941, + "loss": 2.1569, + "step": 471900 + }, + { + "epoch": 1.824272084860293, + "grad_norm": 0.14883461594581604, + "learning_rate": 0.00030400471698769163, + "loss": 2.1632, + "step": 471910 + }, + { + "epoch": 1.8243107420636762, + "grad_norm": 0.1428908258676529, + "learning_rate": 0.0003038867962308649, + "loss": 2.1644, + "step": 471920 + }, + { + "epoch": 1.8243493992670594, + "grad_norm": 0.1492062509059906, + "learning_rate": 0.0003037688836718033, + "loss": 2.1625, + "step": 471930 + }, + { + "epoch": 1.8243880564704427, + "grad_norm": 0.1407235711812973, + "learning_rate": 0.00030365097930879804, + "loss": 2.1627, + "step": 471940 + }, + { + "epoch": 1.8244267136738261, + "grad_norm": 0.14518225193023682, + "learning_rate": 0.0003035330831401397, + "loss": 2.182, + "step": 471950 + }, + { + "epoch": 1.8244653708772094, + "grad_norm": 0.14669685065746307, + "learning_rate": 0.0003034151951641202, + "loss": 2.167, + "step": 471960 + }, + { + "epoch": 1.8245040280805926, + "grad_norm": 0.14908838272094727, + "learning_rate": 0.0003032973153790319, + "loss": 2.1547, + "step": 471970 + }, + { + "epoch": 1.824542685283976, + "grad_norm": 0.15010426938533783, + "learning_rate": 0.00030317944378316787, + "loss": 2.1593, + "step": 471980 + }, + { + "epoch": 1.8245813424873591, + "grad_norm": 0.15162095427513123, + "learning_rate": 0.0003030615803748211, + "loss": 2.1476, + "step": 471990 + }, + { + "epoch": 1.8246199996907424, + "grad_norm": 0.17372465133666992, + "learning_rate": 0.00030294372515228595, + "loss": 2.1694, + "step": 472000 + }, + { + "epoch": 1.8246586568941257, + "grad_norm": 0.1417848914861679, + "learning_rate": 0.0003028258781138573, + "loss": 2.1725, + "step": 472010 + }, + { + "epoch": 1.824697314097509, + "grad_norm": 0.138796865940094, + "learning_rate": 0.00030270803925783006, + "loss": 2.1666, + "step": 472020 + }, + { + "epoch": 1.8247359713008922, + "grad_norm": 0.13671287894248962, + "learning_rate": 0.0003025902085825003, + "loss": 2.1578, + "step": 472030 + }, + { + "epoch": 1.8247746285042754, + "grad_norm": 0.14566947519779205, + "learning_rate": 0.0003024723860861645, + "loss": 2.1563, + "step": 472040 + }, + { + "epoch": 1.8248132857076587, + "grad_norm": 0.14366553723812103, + "learning_rate": 0.00030235457176711945, + "loss": 2.1647, + "step": 472050 + }, + { + "epoch": 1.824851942911042, + "grad_norm": 0.14583460986614227, + "learning_rate": 0.0003022367656236633, + "loss": 2.1619, + "step": 472060 + }, + { + "epoch": 1.8248906001144252, + "grad_norm": 0.144693523645401, + "learning_rate": 0.0003021189676540939, + "loss": 2.1452, + "step": 472070 + }, + { + "epoch": 1.8249292573178086, + "grad_norm": 0.15990732610225677, + "learning_rate": 0.00030200117785671, + "loss": 2.1751, + "step": 472080 + }, + { + "epoch": 1.824967914521192, + "grad_norm": 0.16300183534622192, + "learning_rate": 0.0003018833962298113, + "loss": 2.1598, + "step": 472090 + }, + { + "epoch": 1.8250065717245751, + "grad_norm": 0.14283479750156403, + "learning_rate": 0.0003017656227716976, + "loss": 2.1696, + "step": 472100 + }, + { + "epoch": 1.8250452289279584, + "grad_norm": 0.13620631396770477, + "learning_rate": 0.0003016478574806696, + "loss": 2.173, + "step": 472110 + }, + { + "epoch": 1.8250838861313419, + "grad_norm": 0.1480875462293625, + "learning_rate": 0.00030153010035502837, + "loss": 2.1708, + "step": 472120 + }, + { + "epoch": 1.8251225433347251, + "grad_norm": 0.13537877798080444, + "learning_rate": 0.00030141235139307575, + "loss": 2.1646, + "step": 472130 + }, + { + "epoch": 1.8251612005381084, + "grad_norm": 0.14668239653110504, + "learning_rate": 0.000301294610593114, + "loss": 2.1873, + "step": 472140 + }, + { + "epoch": 1.8251998577414916, + "grad_norm": 0.14213785529136658, + "learning_rate": 0.0003011768779534463, + "loss": 2.1644, + "step": 472150 + }, + { + "epoch": 1.8252385149448749, + "grad_norm": 0.13207575678825378, + "learning_rate": 0.00030105915347237587, + "loss": 2.1637, + "step": 472160 + }, + { + "epoch": 1.8252771721482581, + "grad_norm": 0.12869346141815186, + "learning_rate": 0.0003009414371482071, + "loss": 2.161, + "step": 472170 + }, + { + "epoch": 1.8253158293516414, + "grad_norm": 0.12761442363262177, + "learning_rate": 0.00030082372897924457, + "loss": 2.1647, + "step": 472180 + }, + { + "epoch": 1.8253544865550246, + "grad_norm": 0.14196434617042542, + "learning_rate": 0.0003007060289637935, + "loss": 2.1608, + "step": 472190 + }, + { + "epoch": 1.8253931437584079, + "grad_norm": 0.15592649579048157, + "learning_rate": 0.00030058833710015987, + "loss": 2.1646, + "step": 472200 + }, + { + "epoch": 1.8254318009617911, + "grad_norm": 0.1407962143421173, + "learning_rate": 0.0003004706533866501, + "loss": 2.1762, + "step": 472210 + }, + { + "epoch": 1.8254704581651744, + "grad_norm": 0.14168062806129456, + "learning_rate": 0.0003003529778215715, + "loss": 2.1481, + "step": 472220 + }, + { + "epoch": 1.8255091153685576, + "grad_norm": 0.13927967846393585, + "learning_rate": 0.00030023531040323116, + "loss": 2.1729, + "step": 472230 + }, + { + "epoch": 1.825547772571941, + "grad_norm": 0.14393939077854156, + "learning_rate": 0.0003001176511299377, + "loss": 2.1701, + "step": 472240 + }, + { + "epoch": 1.8255864297753244, + "grad_norm": 0.14461536705493927, + "learning_rate": 0.00030000000000000003, + "loss": 2.1705, + "step": 472250 + }, + { + "epoch": 1.8256250869787076, + "grad_norm": 0.1421518325805664, + "learning_rate": 0.00029988235701172727, + "loss": 2.1574, + "step": 472260 + }, + { + "epoch": 1.8256637441820909, + "grad_norm": 0.14677967131137848, + "learning_rate": 0.0002997647221634294, + "loss": 2.1507, + "step": 472270 + }, + { + "epoch": 1.8257024013854741, + "grad_norm": 0.14067254960536957, + "learning_rate": 0.0002996470954534174, + "loss": 2.1773, + "step": 472280 + }, + { + "epoch": 1.8257410585888576, + "grad_norm": 0.13654285669326782, + "learning_rate": 0.0002995294768800019, + "loss": 2.1803, + "step": 472290 + }, + { + "epoch": 1.8257797157922409, + "grad_norm": 0.13923801481723785, + "learning_rate": 0.00029941186644149486, + "loss": 2.1612, + "step": 472300 + }, + { + "epoch": 1.825818372995624, + "grad_norm": 0.14260360598564148, + "learning_rate": 0.00029929426413620864, + "loss": 2.1781, + "step": 472310 + }, + { + "epoch": 1.8258570301990074, + "grad_norm": 0.14026546478271484, + "learning_rate": 0.0002991766699624561, + "loss": 2.1452, + "step": 472320 + }, + { + "epoch": 1.8258956874023906, + "grad_norm": 0.14311176538467407, + "learning_rate": 0.0002990590839185505, + "loss": 2.1521, + "step": 472330 + }, + { + "epoch": 1.8259343446057739, + "grad_norm": 0.13906440138816833, + "learning_rate": 0.00029894150600280644, + "loss": 2.1628, + "step": 472340 + }, + { + "epoch": 1.8259730018091571, + "grad_norm": 0.13940848410129547, + "learning_rate": 0.0002988239362135381, + "loss": 2.1673, + "step": 472350 + }, + { + "epoch": 1.8260116590125404, + "grad_norm": 0.15167121589183807, + "learning_rate": 0.000298706374549061, + "loss": 2.166, + "step": 472360 + }, + { + "epoch": 1.8260503162159236, + "grad_norm": 0.14164578914642334, + "learning_rate": 0.00029858882100769057, + "loss": 2.1489, + "step": 472370 + }, + { + "epoch": 1.8260889734193069, + "grad_norm": 0.15085890889167786, + "learning_rate": 0.0002984712755877437, + "loss": 2.1669, + "step": 472380 + }, + { + "epoch": 1.8261276306226901, + "grad_norm": 0.329809308052063, + "learning_rate": 0.00029835373828753696, + "loss": 2.1522, + "step": 472390 + }, + { + "epoch": 1.8261662878260734, + "grad_norm": 0.14183588325977325, + "learning_rate": 0.0002982362091053883, + "loss": 2.1665, + "step": 472400 + }, + { + "epoch": 1.8262049450294566, + "grad_norm": 0.15209904313087463, + "learning_rate": 0.0002981186880396154, + "loss": 2.1621, + "step": 472410 + }, + { + "epoch": 1.82624360223284, + "grad_norm": 0.1430646777153015, + "learning_rate": 0.0002980011750885374, + "loss": 2.1497, + "step": 472420 + }, + { + "epoch": 1.8262822594362234, + "grad_norm": 0.14137636125087738, + "learning_rate": 0.00029788367025047305, + "loss": 2.1704, + "step": 472430 + }, + { + "epoch": 1.8263209166396066, + "grad_norm": 0.14333899319171906, + "learning_rate": 0.00029776617352374294, + "loss": 2.1547, + "step": 472440 + }, + { + "epoch": 1.8263595738429899, + "grad_norm": 0.13744285702705383, + "learning_rate": 0.0002976486849066671, + "loss": 2.1571, + "step": 472450 + }, + { + "epoch": 1.8263982310463733, + "grad_norm": 0.15237878262996674, + "learning_rate": 0.00029753120439756666, + "loss": 2.1533, + "step": 472460 + }, + { + "epoch": 1.8264368882497566, + "grad_norm": 0.14781443774700165, + "learning_rate": 0.0002974137319947632, + "loss": 2.1643, + "step": 472470 + }, + { + "epoch": 1.8264755454531398, + "grad_norm": 0.15223272144794464, + "learning_rate": 0.00029729626769657894, + "loss": 2.1584, + "step": 472480 + }, + { + "epoch": 1.826514202656523, + "grad_norm": 0.1459142416715622, + "learning_rate": 0.00029717881150133677, + "loss": 2.1711, + "step": 472490 + }, + { + "epoch": 1.8265528598599063, + "grad_norm": 0.1432747095823288, + "learning_rate": 0.00029706136340736, + "loss": 2.1697, + "step": 472500 + }, + { + "epoch": 1.8265915170632896, + "grad_norm": 0.14006705582141876, + "learning_rate": 0.00029694392341297226, + "loss": 2.15, + "step": 472510 + }, + { + "epoch": 1.8266301742666728, + "grad_norm": 0.3814711570739746, + "learning_rate": 0.00029682649151649865, + "loss": 2.1609, + "step": 472520 + }, + { + "epoch": 1.826668831470056, + "grad_norm": 0.14432662725448608, + "learning_rate": 0.0002967090677162636, + "loss": 2.1637, + "step": 472530 + }, + { + "epoch": 1.8267074886734394, + "grad_norm": 0.13059882819652557, + "learning_rate": 0.0002965916520105931, + "loss": 2.1574, + "step": 472540 + }, + { + "epoch": 1.8267461458768226, + "grad_norm": 0.1524466872215271, + "learning_rate": 0.0002964742443978137, + "loss": 2.1566, + "step": 472550 + }, + { + "epoch": 1.8267848030802059, + "grad_norm": 0.1440562754869461, + "learning_rate": 0.0002963568448762517, + "loss": 2.1619, + "step": 472560 + }, + { + "epoch": 1.826823460283589, + "grad_norm": 0.14504368603229523, + "learning_rate": 0.00029623945344423475, + "loss": 2.1673, + "step": 472570 + }, + { + "epoch": 1.8268621174869724, + "grad_norm": 0.13357733190059662, + "learning_rate": 0.00029612207010009083, + "loss": 2.1599, + "step": 472580 + }, + { + "epoch": 1.8269007746903558, + "grad_norm": 0.13890889286994934, + "learning_rate": 0.00029600469484214844, + "loss": 2.1698, + "step": 472590 + }, + { + "epoch": 1.826939431893739, + "grad_norm": 0.15237276256084442, + "learning_rate": 0.0002958873276687366, + "loss": 2.1601, + "step": 472600 + }, + { + "epoch": 1.8269780890971223, + "grad_norm": 0.14198938012123108, + "learning_rate": 0.0002957699685781852, + "loss": 2.1589, + "step": 472610 + }, + { + "epoch": 1.8270167463005056, + "grad_norm": 0.13609854876995087, + "learning_rate": 0.00029565261756882435, + "loss": 2.1449, + "step": 472620 + }, + { + "epoch": 1.827055403503889, + "grad_norm": 0.14549599587917328, + "learning_rate": 0.0002955352746389852, + "loss": 2.1497, + "step": 472630 + }, + { + "epoch": 1.8270940607072723, + "grad_norm": 0.14228083193302155, + "learning_rate": 0.00029541793978699873, + "loss": 2.1675, + "step": 472640 + }, + { + "epoch": 1.8271327179106556, + "grad_norm": 0.14949370920658112, + "learning_rate": 0.00029530061301119727, + "loss": 2.1674, + "step": 472650 + }, + { + "epoch": 1.8271713751140388, + "grad_norm": 0.1407162845134735, + "learning_rate": 0.0002951832943099131, + "loss": 2.1596, + "step": 472660 + }, + { + "epoch": 1.827210032317422, + "grad_norm": 0.14520780742168427, + "learning_rate": 0.00029506598368147976, + "loss": 2.1736, + "step": 472670 + }, + { + "epoch": 1.8272486895208053, + "grad_norm": 0.14568758010864258, + "learning_rate": 0.0002949486811242308, + "loss": 2.1858, + "step": 472680 + }, + { + "epoch": 1.8272873467241886, + "grad_norm": 0.14513273537158966, + "learning_rate": 0.00029483138663650044, + "loss": 2.1635, + "step": 472690 + }, + { + "epoch": 1.8273260039275718, + "grad_norm": 0.13524672389030457, + "learning_rate": 0.0002947141002166236, + "loss": 2.1727, + "step": 472700 + }, + { + "epoch": 1.827364661130955, + "grad_norm": 0.14150987565517426, + "learning_rate": 0.00029459682186293556, + "loss": 2.1465, + "step": 472710 + }, + { + "epoch": 1.8274033183343383, + "grad_norm": 0.14861683547496796, + "learning_rate": 0.0002944795515737726, + "loss": 2.1769, + "step": 472720 + }, + { + "epoch": 1.8274419755377216, + "grad_norm": 0.14497342705726624, + "learning_rate": 0.0002943622893474711, + "loss": 2.1703, + "step": 472730 + }, + { + "epoch": 1.8274806327411048, + "grad_norm": 0.14805848896503448, + "learning_rate": 0.00029424503518236825, + "loss": 2.1594, + "step": 472740 + }, + { + "epoch": 1.827519289944488, + "grad_norm": 0.14520274102687836, + "learning_rate": 0.00029412778907680193, + "loss": 2.173, + "step": 472750 + }, + { + "epoch": 1.8275579471478716, + "grad_norm": 0.14054147899150848, + "learning_rate": 0.00029401055102911024, + "loss": 2.1616, + "step": 472760 + }, + { + "epoch": 1.8275966043512548, + "grad_norm": 0.14863263070583344, + "learning_rate": 0.00029389332103763223, + "loss": 2.1564, + "step": 472770 + }, + { + "epoch": 1.827635261554638, + "grad_norm": 0.14384257793426514, + "learning_rate": 0.000293776099100707, + "loss": 2.1452, + "step": 472780 + }, + { + "epoch": 1.8276739187580213, + "grad_norm": 0.13806447386741638, + "learning_rate": 0.00029365888521667505, + "loss": 2.1665, + "step": 472790 + }, + { + "epoch": 1.8277125759614048, + "grad_norm": 0.13954965770244598, + "learning_rate": 0.00029354167938387677, + "loss": 2.1549, + "step": 472800 + }, + { + "epoch": 1.827751233164788, + "grad_norm": 0.13952240347862244, + "learning_rate": 0.00029342448160065305, + "loss": 2.1762, + "step": 472810 + }, + { + "epoch": 1.8277898903681713, + "grad_norm": 0.36426278948783875, + "learning_rate": 0.0002933072918653459, + "loss": 2.1747, + "step": 472820 + }, + { + "epoch": 1.8278285475715546, + "grad_norm": 0.14163519442081451, + "learning_rate": 0.0002931901101762975, + "loss": 2.1594, + "step": 472830 + }, + { + "epoch": 1.8278672047749378, + "grad_norm": 0.1430559605360031, + "learning_rate": 0.0002930729365318505, + "loss": 2.155, + "step": 472840 + }, + { + "epoch": 1.827905861978321, + "grad_norm": 0.14050045609474182, + "learning_rate": 0.00029295577093034876, + "loss": 2.1582, + "step": 472850 + }, + { + "epoch": 1.8279445191817043, + "grad_norm": 0.1458028256893158, + "learning_rate": 0.00029283861337013595, + "loss": 2.1622, + "step": 472860 + }, + { + "epoch": 1.8279831763850876, + "grad_norm": 0.14921119809150696, + "learning_rate": 0.00029272146384955676, + "loss": 2.1689, + "step": 472870 + }, + { + "epoch": 1.8280218335884708, + "grad_norm": 0.15343713760375977, + "learning_rate": 0.00029260432236695634, + "loss": 2.1886, + "step": 472880 + }, + { + "epoch": 1.828060490791854, + "grad_norm": 0.1427895873785019, + "learning_rate": 0.0002924871889206804, + "loss": 2.1567, + "step": 472890 + }, + { + "epoch": 1.8280991479952373, + "grad_norm": 0.14036491513252258, + "learning_rate": 0.00029237006350907514, + "loss": 2.1473, + "step": 472900 + }, + { + "epoch": 1.8281378051986206, + "grad_norm": 0.1467023640871048, + "learning_rate": 0.0002922529461304875, + "loss": 2.1488, + "step": 472910 + }, + { + "epoch": 1.828176462402004, + "grad_norm": 0.1514258086681366, + "learning_rate": 0.0002921358367832645, + "loss": 2.1603, + "step": 472920 + }, + { + "epoch": 1.8282151196053873, + "grad_norm": 0.1391729712486267, + "learning_rate": 0.00029201873546575466, + "loss": 2.1634, + "step": 472930 + }, + { + "epoch": 1.8282537768087705, + "grad_norm": 0.14794260263442993, + "learning_rate": 0.0002919016421763061, + "loss": 2.159, + "step": 472940 + }, + { + "epoch": 1.8282924340121538, + "grad_norm": 0.14104720950126648, + "learning_rate": 0.000291784556913268, + "loss": 2.1398, + "step": 472950 + }, + { + "epoch": 1.828331091215537, + "grad_norm": 0.141220822930336, + "learning_rate": 0.00029166747967499006, + "loss": 2.1565, + "step": 472960 + }, + { + "epoch": 1.8283697484189205, + "grad_norm": 0.1357167363166809, + "learning_rate": 0.00029155041045982277, + "loss": 2.1559, + "step": 472970 + }, + { + "epoch": 1.8284084056223038, + "grad_norm": 0.1552206575870514, + "learning_rate": 0.00029143334926611653, + "loss": 2.1579, + "step": 472980 + }, + { + "epoch": 1.828447062825687, + "grad_norm": 0.1342216283082962, + "learning_rate": 0.00029131629609222286, + "loss": 2.1659, + "step": 472990 + }, + { + "epoch": 1.8284857200290703, + "grad_norm": 0.13896004855632782, + "learning_rate": 0.0002911992509364938, + "loss": 2.1699, + "step": 473000 + }, + { + "epoch": 1.8285243772324535, + "grad_norm": 0.145687997341156, + "learning_rate": 0.0002910822137972817, + "loss": 2.145, + "step": 473010 + }, + { + "epoch": 1.8285630344358368, + "grad_norm": 0.13861437141895294, + "learning_rate": 0.00029096518467293956, + "loss": 2.1561, + "step": 473020 + }, + { + "epoch": 1.82860169163922, + "grad_norm": 0.13986791670322418, + "learning_rate": 0.00029084816356182095, + "loss": 2.1615, + "step": 473030 + }, + { + "epoch": 1.8286403488426033, + "grad_norm": 0.14401181042194366, + "learning_rate": 0.0002907311504622803, + "loss": 2.1583, + "step": 473040 + }, + { + "epoch": 1.8286790060459865, + "grad_norm": 0.13906337320804596, + "learning_rate": 0.00029061414537267204, + "loss": 2.1475, + "step": 473050 + }, + { + "epoch": 1.8287176632493698, + "grad_norm": 0.14993421733379364, + "learning_rate": 0.0002904971482913519, + "loss": 2.1652, + "step": 473060 + }, + { + "epoch": 1.828756320452753, + "grad_norm": 0.13433660566806793, + "learning_rate": 0.0002903801592166755, + "loss": 2.1577, + "step": 473070 + }, + { + "epoch": 1.8287949776561363, + "grad_norm": 0.1434684544801712, + "learning_rate": 0.0002902631781469991, + "loss": 2.15, + "step": 473080 + }, + { + "epoch": 1.8288336348595198, + "grad_norm": 0.13965515792369843, + "learning_rate": 0.00029014620508068, + "loss": 2.1687, + "step": 473090 + }, + { + "epoch": 1.828872292062903, + "grad_norm": 0.15121091902256012, + "learning_rate": 0.00029002924001607556, + "loss": 2.1612, + "step": 473100 + }, + { + "epoch": 1.8289109492662863, + "grad_norm": 0.13976910710334778, + "learning_rate": 0.00028991228295154416, + "loss": 2.1516, + "step": 473110 + }, + { + "epoch": 1.8289496064696695, + "grad_norm": 0.13840851187705994, + "learning_rate": 0.000289795333885444, + "loss": 2.1733, + "step": 473120 + }, + { + "epoch": 1.8289882636730528, + "grad_norm": 0.1366550624370575, + "learning_rate": 0.00028967839281613484, + "loss": 2.1458, + "step": 473130 + }, + { + "epoch": 1.8290269208764363, + "grad_norm": 0.147894486784935, + "learning_rate": 0.000289561459741976, + "loss": 2.1685, + "step": 473140 + }, + { + "epoch": 1.8290655780798195, + "grad_norm": 0.16173036396503448, + "learning_rate": 0.00028944453466132815, + "loss": 2.1608, + "step": 473150 + }, + { + "epoch": 1.8291042352832028, + "grad_norm": 0.1482090801000595, + "learning_rate": 0.0002893276175725523, + "loss": 2.1696, + "step": 473160 + }, + { + "epoch": 1.829142892486586, + "grad_norm": 0.13548068702220917, + "learning_rate": 0.00028921070847400963, + "loss": 2.1496, + "step": 473170 + }, + { + "epoch": 1.8291815496899693, + "grad_norm": 0.18090428411960602, + "learning_rate": 0.0002890938073640623, + "loss": 2.1535, + "step": 473180 + }, + { + "epoch": 1.8292202068933525, + "grad_norm": 0.14325076341629028, + "learning_rate": 0.0002889769142410732, + "loss": 2.1672, + "step": 473190 + }, + { + "epoch": 1.8292588640967358, + "grad_norm": 0.15077604353427887, + "learning_rate": 0.0002888600291034049, + "loss": 2.168, + "step": 473200 + }, + { + "epoch": 1.829297521300119, + "grad_norm": 0.15352006256580353, + "learning_rate": 0.0002887431519494217, + "loss": 2.1481, + "step": 473210 + }, + { + "epoch": 1.8293361785035023, + "grad_norm": 0.13646052777767181, + "learning_rate": 0.00028862628277748773, + "loss": 2.156, + "step": 473220 + }, + { + "epoch": 1.8293748357068855, + "grad_norm": 0.14072224497795105, + "learning_rate": 0.00028850942158596736, + "loss": 2.1543, + "step": 473230 + }, + { + "epoch": 1.8294134929102688, + "grad_norm": 0.13621889054775238, + "learning_rate": 0.00028839256837322647, + "loss": 2.1632, + "step": 473240 + }, + { + "epoch": 1.829452150113652, + "grad_norm": 0.1435336321592331, + "learning_rate": 0.00028827572313763094, + "loss": 2.1498, + "step": 473250 + }, + { + "epoch": 1.8294908073170355, + "grad_norm": 0.14366935193538666, + "learning_rate": 0.00028815888587754726, + "loss": 2.1547, + "step": 473260 + }, + { + "epoch": 1.8295294645204188, + "grad_norm": 0.13832125067710876, + "learning_rate": 0.00028804205659134263, + "loss": 2.1409, + "step": 473270 + }, + { + "epoch": 1.829568121723802, + "grad_norm": 0.1476607769727707, + "learning_rate": 0.00028792523527738424, + "loss": 2.1582, + "step": 473280 + }, + { + "epoch": 1.8296067789271853, + "grad_norm": 0.14731153845787048, + "learning_rate": 0.0002878084219340407, + "loss": 2.1598, + "step": 473290 + }, + { + "epoch": 1.8296454361305687, + "grad_norm": 0.1447271704673767, + "learning_rate": 0.00028769161655968056, + "loss": 2.1614, + "step": 473300 + }, + { + "epoch": 1.829684093333952, + "grad_norm": 0.14205624163150787, + "learning_rate": 0.00028757481915267323, + "loss": 2.1652, + "step": 473310 + }, + { + "epoch": 1.8297227505373352, + "grad_norm": 0.14254549145698547, + "learning_rate": 0.00028745802971138844, + "loss": 2.1605, + "step": 473320 + }, + { + "epoch": 1.8297614077407185, + "grad_norm": 0.15251408517360687, + "learning_rate": 0.0002873412482341966, + "loss": 2.1681, + "step": 473330 + }, + { + "epoch": 1.8298000649441017, + "grad_norm": 0.14379853010177612, + "learning_rate": 0.00028722447471946853, + "loss": 2.1687, + "step": 473340 + }, + { + "epoch": 1.829838722147485, + "grad_norm": 0.1437206268310547, + "learning_rate": 0.0002871077091655763, + "loss": 2.1617, + "step": 473350 + }, + { + "epoch": 1.8298773793508682, + "grad_norm": 0.14462733268737793, + "learning_rate": 0.0002869909515708915, + "loss": 2.1615, + "step": 473360 + }, + { + "epoch": 1.8299160365542515, + "grad_norm": 0.1429223120212555, + "learning_rate": 0.00028687420193378665, + "loss": 2.1718, + "step": 473370 + }, + { + "epoch": 1.8299546937576348, + "grad_norm": 0.4351711571216583, + "learning_rate": 0.0002867574602526355, + "loss": 2.1731, + "step": 473380 + }, + { + "epoch": 1.829993350961018, + "grad_norm": 0.1433011293411255, + "learning_rate": 0.0002866407265258113, + "loss": 2.1601, + "step": 473390 + }, + { + "epoch": 1.8300320081644013, + "grad_norm": 0.1451789289712906, + "learning_rate": 0.0002865240007516885, + "loss": 2.1515, + "step": 473400 + }, + { + "epoch": 1.8300706653677845, + "grad_norm": 0.14553236961364746, + "learning_rate": 0.0002864072829286417, + "loss": 2.1798, + "step": 473410 + }, + { + "epoch": 1.8301093225711678, + "grad_norm": 0.14178434014320374, + "learning_rate": 0.0002862905730550469, + "loss": 2.1588, + "step": 473420 + }, + { + "epoch": 1.8301479797745512, + "grad_norm": 0.1531856209039688, + "learning_rate": 0.0002861738711292794, + "loss": 2.156, + "step": 473430 + }, + { + "epoch": 1.8301866369779345, + "grad_norm": 0.13813577592372894, + "learning_rate": 0.000286057177149716, + "loss": 2.1485, + "step": 473440 + }, + { + "epoch": 1.8302252941813177, + "grad_norm": 0.1400258094072342, + "learning_rate": 0.0002859404911147336, + "loss": 2.1445, + "step": 473450 + }, + { + "epoch": 1.830263951384701, + "grad_norm": 0.17947034537792206, + "learning_rate": 0.00028582381302271024, + "loss": 2.1659, + "step": 473460 + }, + { + "epoch": 1.8303026085880845, + "grad_norm": 0.14751413464546204, + "learning_rate": 0.0002857071428720237, + "loss": 2.1496, + "step": 473470 + }, + { + "epoch": 1.8303412657914677, + "grad_norm": 0.14646713435649872, + "learning_rate": 0.00028559048066105274, + "loss": 2.1789, + "step": 473480 + }, + { + "epoch": 1.830379922994851, + "grad_norm": 0.14492836594581604, + "learning_rate": 0.00028547382638817665, + "loss": 2.1544, + "step": 473490 + }, + { + "epoch": 1.8304185801982342, + "grad_norm": 0.1351613849401474, + "learning_rate": 0.0002853571800517754, + "loss": 2.1534, + "step": 473500 + }, + { + "epoch": 1.8304572374016175, + "grad_norm": 0.13662733137607574, + "learning_rate": 0.00028524054165022904, + "loss": 2.1562, + "step": 473510 + }, + { + "epoch": 1.8304958946050007, + "grad_norm": 0.14270558953285217, + "learning_rate": 0.00028512391118191863, + "loss": 2.1563, + "step": 473520 + }, + { + "epoch": 1.830534551808384, + "grad_norm": 0.14456135034561157, + "learning_rate": 0.0002850072886452257, + "loss": 2.1608, + "step": 473530 + }, + { + "epoch": 1.8305732090117672, + "grad_norm": 0.15161548554897308, + "learning_rate": 0.0002848906740385322, + "loss": 2.1488, + "step": 473540 + }, + { + "epoch": 1.8306118662151505, + "grad_norm": 0.15137378871440887, + "learning_rate": 0.0002847740673602208, + "loss": 2.1451, + "step": 473550 + }, + { + "epoch": 1.8306505234185337, + "grad_norm": 0.1438361555337906, + "learning_rate": 0.0002846574686086747, + "loss": 2.1628, + "step": 473560 + }, + { + "epoch": 1.830689180621917, + "grad_norm": 0.13977812230587006, + "learning_rate": 0.0002845408777822771, + "loss": 2.1502, + "step": 473570 + }, + { + "epoch": 1.8307278378253002, + "grad_norm": 0.14614924788475037, + "learning_rate": 0.0002844242948794127, + "loss": 2.1742, + "step": 473580 + }, + { + "epoch": 1.8307664950286835, + "grad_norm": 0.15732614696025848, + "learning_rate": 0.0002843077198984663, + "loss": 2.183, + "step": 473590 + }, + { + "epoch": 1.830805152232067, + "grad_norm": 0.14953894913196564, + "learning_rate": 0.0002841911528378227, + "loss": 2.16, + "step": 473600 + }, + { + "epoch": 1.8308438094354502, + "grad_norm": 0.1374642252922058, + "learning_rate": 0.0002840745936958682, + "loss": 2.1552, + "step": 473610 + }, + { + "epoch": 1.8308824666388335, + "grad_norm": 0.14025695621967316, + "learning_rate": 0.00028395804247098914, + "loss": 2.1515, + "step": 473620 + }, + { + "epoch": 1.8309211238422167, + "grad_norm": 0.14120370149612427, + "learning_rate": 0.00028384149916157233, + "loss": 2.1632, + "step": 473630 + }, + { + "epoch": 1.8309597810456002, + "grad_norm": 0.14266963303089142, + "learning_rate": 0.0002837249637660051, + "loss": 2.1542, + "step": 473640 + }, + { + "epoch": 1.8309984382489835, + "grad_norm": 0.14860790967941284, + "learning_rate": 0.0002836084362826761, + "loss": 2.1692, + "step": 473650 + }, + { + "epoch": 1.8310370954523667, + "grad_norm": 0.1417301744222641, + "learning_rate": 0.00028349191670997364, + "loss": 2.1607, + "step": 473660 + }, + { + "epoch": 1.83107575265575, + "grad_norm": 0.14226599037647247, + "learning_rate": 0.00028337540504628665, + "loss": 2.1626, + "step": 473670 + }, + { + "epoch": 1.8311144098591332, + "grad_norm": 0.15236929059028625, + "learning_rate": 0.0002832589012900053, + "loss": 2.1684, + "step": 473680 + }, + { + "epoch": 1.8311530670625165, + "grad_norm": 0.13800197839736938, + "learning_rate": 0.0002831424054395193, + "loss": 2.1689, + "step": 473690 + }, + { + "epoch": 1.8311917242658997, + "grad_norm": 0.14995115995407104, + "learning_rate": 0.0002830259174932197, + "loss": 2.1602, + "step": 473700 + }, + { + "epoch": 1.831230381469283, + "grad_norm": 0.13879883289337158, + "learning_rate": 0.0002829094374494978, + "loss": 2.1448, + "step": 473710 + }, + { + "epoch": 1.8312690386726662, + "grad_norm": 0.14781266450881958, + "learning_rate": 0.0002827929653067456, + "loss": 2.1695, + "step": 473720 + }, + { + "epoch": 1.8313076958760495, + "grad_norm": 0.1535464972257614, + "learning_rate": 0.0002826765010633554, + "loss": 2.1707, + "step": 473730 + }, + { + "epoch": 1.8313463530794327, + "grad_norm": 0.14041906595230103, + "learning_rate": 0.0002825600447177199, + "loss": 2.1556, + "step": 473740 + }, + { + "epoch": 1.831385010282816, + "grad_norm": 0.1497502326965332, + "learning_rate": 0.00028244359626823325, + "loss": 2.1657, + "step": 473750 + }, + { + "epoch": 1.8314236674861992, + "grad_norm": 0.13972197473049164, + "learning_rate": 0.0002823271557132889, + "loss": 2.1658, + "step": 473760 + }, + { + "epoch": 1.8314623246895827, + "grad_norm": 0.13600200414657593, + "learning_rate": 0.0002822107230512818, + "loss": 2.1512, + "step": 473770 + }, + { + "epoch": 1.831500981892966, + "grad_norm": 0.14934469759464264, + "learning_rate": 0.00028209429828060696, + "loss": 2.1513, + "step": 473780 + }, + { + "epoch": 1.8315396390963492, + "grad_norm": 0.15474063158035278, + "learning_rate": 0.0002819778813996603, + "loss": 2.1744, + "step": 473790 + }, + { + "epoch": 1.8315782962997325, + "grad_norm": 0.14638902246952057, + "learning_rate": 0.00028186147240683756, + "loss": 2.1552, + "step": 473800 + }, + { + "epoch": 1.831616953503116, + "grad_norm": 0.14121192693710327, + "learning_rate": 0.0002817450713005361, + "loss": 2.1506, + "step": 473810 + }, + { + "epoch": 1.8316556107064992, + "grad_norm": 0.14948327839374542, + "learning_rate": 0.0002816286780791528, + "loss": 2.1494, + "step": 473820 + }, + { + "epoch": 1.8316942679098824, + "grad_norm": 0.14478008449077606, + "learning_rate": 0.0002815122927410858, + "loss": 2.1566, + "step": 473830 + }, + { + "epoch": 1.8317329251132657, + "grad_norm": 0.1355316936969757, + "learning_rate": 0.0002813959152847332, + "loss": 2.1698, + "step": 473840 + }, + { + "epoch": 1.831771582316649, + "grad_norm": 0.14989060163497925, + "learning_rate": 0.00028127954570849424, + "loss": 2.1495, + "step": 473850 + }, + { + "epoch": 1.8318102395200322, + "grad_norm": 0.14782382547855377, + "learning_rate": 0.00028116318401076823, + "loss": 2.161, + "step": 473860 + }, + { + "epoch": 1.8318488967234154, + "grad_norm": 0.14131376147270203, + "learning_rate": 0.0002810468301899554, + "loss": 2.1479, + "step": 473870 + }, + { + "epoch": 1.8318875539267987, + "grad_norm": 0.1483166664838791, + "learning_rate": 0.0002809304842444562, + "loss": 2.1772, + "step": 473880 + }, + { + "epoch": 1.831926211130182, + "grad_norm": 0.14277474582195282, + "learning_rate": 0.0002808141461726716, + "loss": 2.1587, + "step": 473890 + }, + { + "epoch": 1.8319648683335652, + "grad_norm": 0.16562721133232117, + "learning_rate": 0.0002806978159730036, + "loss": 2.1527, + "step": 473900 + }, + { + "epoch": 1.8320035255369485, + "grad_norm": 0.15126599371433258, + "learning_rate": 0.000280581493643854, + "loss": 2.1806, + "step": 473910 + }, + { + "epoch": 1.8320421827403317, + "grad_norm": 0.14657704532146454, + "learning_rate": 0.0002804651791836259, + "loss": 2.158, + "step": 473920 + }, + { + "epoch": 1.832080839943715, + "grad_norm": 0.1424272060394287, + "learning_rate": 0.00028034887259072237, + "loss": 2.1626, + "step": 473930 + }, + { + "epoch": 1.8321194971470984, + "grad_norm": 0.13862474262714386, + "learning_rate": 0.0002802325738635472, + "loss": 2.1462, + "step": 473940 + }, + { + "epoch": 1.8321581543504817, + "grad_norm": 0.1364509016275406, + "learning_rate": 0.0002801162830005046, + "loss": 2.1579, + "step": 473950 + }, + { + "epoch": 1.832196811553865, + "grad_norm": 0.15476275980472565, + "learning_rate": 0.00028000000000000003, + "loss": 2.1475, + "step": 473960 + }, + { + "epoch": 1.8322354687572482, + "grad_norm": 0.15473146736621857, + "learning_rate": 0.00027988372486043847, + "loss": 2.159, + "step": 473970 + }, + { + "epoch": 1.8322741259606317, + "grad_norm": 0.14409764111042023, + "learning_rate": 0.0002797674575802258, + "loss": 2.1618, + "step": 473980 + }, + { + "epoch": 1.832312783164015, + "grad_norm": 0.15211471915245056, + "learning_rate": 0.00027965119815776896, + "loss": 2.1623, + "step": 473990 + }, + { + "epoch": 1.8323514403673982, + "grad_norm": 0.14995014667510986, + "learning_rate": 0.0002795349465914747, + "loss": 2.1575, + "step": 474000 + }, + { + "epoch": 1.8323900975707814, + "grad_norm": 0.1347171515226364, + "learning_rate": 0.0002794187028797506, + "loss": 2.1663, + "step": 474010 + }, + { + "epoch": 1.8324287547741647, + "grad_norm": 0.15486112236976624, + "learning_rate": 0.0002793024670210049, + "loss": 2.1725, + "step": 474020 + }, + { + "epoch": 1.832467411977548, + "grad_norm": 0.14291568100452423, + "learning_rate": 0.0002791862390136461, + "loss": 2.1361, + "step": 474030 + }, + { + "epoch": 1.8325060691809312, + "grad_norm": 0.32501938939094543, + "learning_rate": 0.00027907001885608376, + "loss": 2.161, + "step": 474040 + }, + { + "epoch": 1.8325447263843144, + "grad_norm": 0.15452173352241516, + "learning_rate": 0.0002789538065467272, + "loss": 2.1501, + "step": 474050 + }, + { + "epoch": 1.8325833835876977, + "grad_norm": 0.15415510535240173, + "learning_rate": 0.0002788376020839869, + "loss": 2.1625, + "step": 474060 + }, + { + "epoch": 1.832622040791081, + "grad_norm": 0.1350860446691513, + "learning_rate": 0.0002787214054662737, + "loss": 2.1674, + "step": 474070 + }, + { + "epoch": 1.8326606979944642, + "grad_norm": 0.14305320382118225, + "learning_rate": 0.000278605216691999, + "loss": 2.1698, + "step": 474080 + }, + { + "epoch": 1.8326993551978474, + "grad_norm": 0.14777852594852448, + "learning_rate": 0.00027848903575957443, + "loss": 2.1546, + "step": 474090 + }, + { + "epoch": 1.8327380124012307, + "grad_norm": 0.1491856426000595, + "learning_rate": 0.0002783728626674125, + "loss": 2.1722, + "step": 474100 + }, + { + "epoch": 1.8327766696046142, + "grad_norm": 0.14523689448833466, + "learning_rate": 0.00027825669741392647, + "loss": 2.1536, + "step": 474110 + }, + { + "epoch": 1.8328153268079974, + "grad_norm": 0.1457177847623825, + "learning_rate": 0.0002781405399975294, + "loss": 2.1333, + "step": 474120 + }, + { + "epoch": 1.8328539840113807, + "grad_norm": 0.14650383591651917, + "learning_rate": 0.0002780243904166355, + "loss": 2.1628, + "step": 474130 + }, + { + "epoch": 1.832892641214764, + "grad_norm": 0.15087856352329254, + "learning_rate": 0.00027790824866965935, + "loss": 2.1577, + "step": 474140 + }, + { + "epoch": 1.8329312984181474, + "grad_norm": 0.16552071273326874, + "learning_rate": 0.000277792114755016, + "loss": 2.1607, + "step": 474150 + }, + { + "epoch": 1.8329699556215306, + "grad_norm": 0.15123656392097473, + "learning_rate": 0.0002776759886711211, + "loss": 2.1612, + "step": 474160 + }, + { + "epoch": 1.833008612824914, + "grad_norm": 0.14551308751106262, + "learning_rate": 0.00027755987041639085, + "loss": 2.1521, + "step": 474170 + }, + { + "epoch": 1.8330472700282971, + "grad_norm": 0.15437181293964386, + "learning_rate": 0.0002774437599892421, + "loss": 2.1716, + "step": 474180 + }, + { + "epoch": 1.8330859272316804, + "grad_norm": 0.14891056716442108, + "learning_rate": 0.0002773276573880916, + "loss": 2.1543, + "step": 474190 + }, + { + "epoch": 1.8331245844350637, + "grad_norm": 0.1399906873703003, + "learning_rate": 0.0002772115626113576, + "loss": 2.1685, + "step": 474200 + }, + { + "epoch": 1.833163241638447, + "grad_norm": 0.14022347331047058, + "learning_rate": 0.0002770954756574582, + "loss": 2.16, + "step": 474210 + }, + { + "epoch": 1.8332018988418302, + "grad_norm": 0.1491684913635254, + "learning_rate": 0.0002769793965248124, + "loss": 2.1424, + "step": 474220 + }, + { + "epoch": 1.8332405560452134, + "grad_norm": 0.14810223877429962, + "learning_rate": 0.0002768633252118391, + "loss": 2.1571, + "step": 474230 + }, + { + "epoch": 1.8332792132485967, + "grad_norm": 0.14386752247810364, + "learning_rate": 0.00027674726171695885, + "loss": 2.168, + "step": 474240 + }, + { + "epoch": 1.83331787045198, + "grad_norm": 0.13856728374958038, + "learning_rate": 0.00027663120603859135, + "loss": 2.1629, + "step": 474250 + }, + { + "epoch": 1.8333565276553632, + "grad_norm": 0.14329712092876434, + "learning_rate": 0.00027651515817515816, + "loss": 2.1705, + "step": 474260 + }, + { + "epoch": 1.8333951848587464, + "grad_norm": 0.15207403898239136, + "learning_rate": 0.00027639911812508044, + "loss": 2.1651, + "step": 474270 + }, + { + "epoch": 1.83343384206213, + "grad_norm": 0.142039954662323, + "learning_rate": 0.00027628308588678043, + "loss": 2.1834, + "step": 474280 + }, + { + "epoch": 1.8334724992655131, + "grad_norm": 0.155516117811203, + "learning_rate": 0.00027616706145868066, + "loss": 2.1513, + "step": 474290 + }, + { + "epoch": 1.8335111564688964, + "grad_norm": 0.15602277219295502, + "learning_rate": 0.00027605104483920416, + "loss": 2.1713, + "step": 474300 + }, + { + "epoch": 1.8335498136722796, + "grad_norm": 0.15286743640899658, + "learning_rate": 0.0002759350360267743, + "loss": 2.1688, + "step": 474310 + }, + { + "epoch": 1.8335884708756631, + "grad_norm": 0.157220259308815, + "learning_rate": 0.0002758190350198155, + "loss": 2.1447, + "step": 474320 + }, + { + "epoch": 1.8336271280790464, + "grad_norm": 0.14431683719158173, + "learning_rate": 0.00027570304181675256, + "loss": 2.1547, + "step": 474330 + }, + { + "epoch": 1.8336657852824296, + "grad_norm": 0.1535961776971817, + "learning_rate": 0.00027558705641601036, + "loss": 2.1531, + "step": 474340 + }, + { + "epoch": 1.8337044424858129, + "grad_norm": 0.14194528758525848, + "learning_rate": 0.00027547107881601465, + "loss": 2.1503, + "step": 474350 + }, + { + "epoch": 1.8337430996891961, + "grad_norm": 0.14567318558692932, + "learning_rate": 0.000275355109015192, + "loss": 2.1482, + "step": 474360 + }, + { + "epoch": 1.8337817568925794, + "grad_norm": 0.19744686782360077, + "learning_rate": 0.00027523914701196904, + "loss": 2.1688, + "step": 474370 + }, + { + "epoch": 1.8338204140959626, + "grad_norm": 0.14790453016757965, + "learning_rate": 0.00027512319280477307, + "loss": 2.1721, + "step": 474380 + }, + { + "epoch": 1.8338590712993459, + "grad_norm": 0.14539088308811188, + "learning_rate": 0.0002750072463920319, + "loss": 2.1573, + "step": 474390 + }, + { + "epoch": 1.8338977285027291, + "grad_norm": 0.14718958735466003, + "learning_rate": 0.000274891307772174, + "loss": 2.1648, + "step": 474400 + }, + { + "epoch": 1.8339363857061124, + "grad_norm": 0.13314035534858704, + "learning_rate": 0.0002747753769436283, + "loss": 2.1405, + "step": 474410 + }, + { + "epoch": 1.8339750429094956, + "grad_norm": 0.14325682818889618, + "learning_rate": 0.00027465945390482417, + "loss": 2.1694, + "step": 474420 + }, + { + "epoch": 1.834013700112879, + "grad_norm": 0.15003050863742828, + "learning_rate": 0.0002745435386541915, + "loss": 2.178, + "step": 474430 + }, + { + "epoch": 1.8340523573162621, + "grad_norm": 0.15796884894371033, + "learning_rate": 0.0002744276311901608, + "loss": 2.1636, + "step": 474440 + }, + { + "epoch": 1.8340910145196456, + "grad_norm": 0.15004146099090576, + "learning_rate": 0.00027431173151116296, + "loss": 2.1625, + "step": 474450 + }, + { + "epoch": 1.8341296717230289, + "grad_norm": 0.14282776415348053, + "learning_rate": 0.0002741958396156301, + "loss": 2.1452, + "step": 474460 + }, + { + "epoch": 1.8341683289264121, + "grad_norm": 0.5124521255493164, + "learning_rate": 0.0002740799555019937, + "loss": 2.1463, + "step": 474470 + }, + { + "epoch": 1.8342069861297954, + "grad_norm": 0.1407298743724823, + "learning_rate": 0.0002739640791686866, + "loss": 2.1404, + "step": 474480 + }, + { + "epoch": 1.8342456433331789, + "grad_norm": 0.14269711077213287, + "learning_rate": 0.0002738482106141418, + "loss": 2.1486, + "step": 474490 + }, + { + "epoch": 1.834284300536562, + "grad_norm": 0.13416120409965515, + "learning_rate": 0.00027373234983679316, + "loss": 2.1663, + "step": 474500 + }, + { + "epoch": 1.8343229577399454, + "grad_norm": 0.16566786170005798, + "learning_rate": 0.0002736164968350747, + "loss": 2.1441, + "step": 474510 + }, + { + "epoch": 1.8343616149433286, + "grad_norm": 0.14914767444133759, + "learning_rate": 0.000273500651607421, + "loss": 2.1547, + "step": 474520 + }, + { + "epoch": 1.8344002721467119, + "grad_norm": 0.14149288833141327, + "learning_rate": 0.00027338481415226746, + "loss": 2.1503, + "step": 474530 + }, + { + "epoch": 1.8344389293500951, + "grad_norm": 0.15202617645263672, + "learning_rate": 0.00027326898446804985, + "loss": 2.1671, + "step": 474540 + }, + { + "epoch": 1.8344775865534784, + "grad_norm": 0.14977170526981354, + "learning_rate": 0.00027315316255320423, + "loss": 2.1531, + "step": 474550 + }, + { + "epoch": 1.8345162437568616, + "grad_norm": 0.14531210064888, + "learning_rate": 0.0002730373484061677, + "loss": 2.158, + "step": 474560 + }, + { + "epoch": 1.8345549009602449, + "grad_norm": 0.14642809331417084, + "learning_rate": 0.00027292154202537746, + "loss": 2.1638, + "step": 474570 + }, + { + "epoch": 1.8345935581636281, + "grad_norm": 0.1511840671300888, + "learning_rate": 0.00027280574340927125, + "loss": 2.1561, + "step": 474580 + }, + { + "epoch": 1.8346322153670114, + "grad_norm": 0.1489248126745224, + "learning_rate": 0.0002726899525562876, + "loss": 2.1671, + "step": 474590 + }, + { + "epoch": 1.8346708725703946, + "grad_norm": 0.1497693657875061, + "learning_rate": 0.00027257416946486535, + "loss": 2.1526, + "step": 474600 + }, + { + "epoch": 1.8347095297737779, + "grad_norm": 0.15171314775943756, + "learning_rate": 0.00027245839413344376, + "loss": 2.155, + "step": 474610 + }, + { + "epoch": 1.8347481869771614, + "grad_norm": 0.14315788447856903, + "learning_rate": 0.0002723426265604629, + "loss": 2.1467, + "step": 474620 + }, + { + "epoch": 1.8347868441805446, + "grad_norm": 0.1636415421962738, + "learning_rate": 0.0002722268667443635, + "loss": 2.1516, + "step": 474630 + }, + { + "epoch": 1.8348255013839279, + "grad_norm": 0.147812619805336, + "learning_rate": 0.00027211111468358594, + "loss": 2.1596, + "step": 474640 + }, + { + "epoch": 1.8348641585873111, + "grad_norm": 0.1467977911233902, + "learning_rate": 0.00027199537037657205, + "loss": 2.1551, + "step": 474650 + }, + { + "epoch": 1.8349028157906946, + "grad_norm": 0.1470346301794052, + "learning_rate": 0.00027187963382176416, + "loss": 2.159, + "step": 474660 + }, + { + "epoch": 1.8349414729940778, + "grad_norm": 0.14632827043533325, + "learning_rate": 0.0002717639050176044, + "loss": 2.1762, + "step": 474670 + }, + { + "epoch": 1.834980130197461, + "grad_norm": 0.13886310160160065, + "learning_rate": 0.000271648183962536, + "loss": 2.1751, + "step": 474680 + }, + { + "epoch": 1.8350187874008443, + "grad_norm": 0.14693546295166016, + "learning_rate": 0.00027153247065500244, + "loss": 2.1516, + "step": 474690 + }, + { + "epoch": 1.8350574446042276, + "grad_norm": 0.14294764399528503, + "learning_rate": 0.00027141676509344803, + "loss": 2.1513, + "step": 474700 + }, + { + "epoch": 1.8350961018076108, + "grad_norm": 0.15899953246116638, + "learning_rate": 0.0002713010672763172, + "loss": 2.1759, + "step": 474710 + }, + { + "epoch": 1.835134759010994, + "grad_norm": 0.15766823291778564, + "learning_rate": 0.00027118537720205536, + "loss": 2.1411, + "step": 474720 + }, + { + "epoch": 1.8351734162143774, + "grad_norm": 0.14663904905319214, + "learning_rate": 0.0002710696948691078, + "loss": 2.1547, + "step": 474730 + }, + { + "epoch": 1.8352120734177606, + "grad_norm": 0.14257924258708954, + "learning_rate": 0.0002709540202759211, + "loss": 2.1337, + "step": 474740 + }, + { + "epoch": 1.8352507306211439, + "grad_norm": 0.1540602296590805, + "learning_rate": 0.0002708383534209418, + "loss": 2.1465, + "step": 474750 + }, + { + "epoch": 1.835289387824527, + "grad_norm": 0.14710083603858948, + "learning_rate": 0.00027072269430261707, + "loss": 2.1615, + "step": 474760 + }, + { + "epoch": 1.8353280450279104, + "grad_norm": 0.1486217975616455, + "learning_rate": 0.00027060704291939473, + "loss": 2.1634, + "step": 474770 + }, + { + "epoch": 1.8353667022312938, + "grad_norm": 0.14736615121364594, + "learning_rate": 0.0002704913992697231, + "loss": 2.1638, + "step": 474780 + }, + { + "epoch": 1.835405359434677, + "grad_norm": 0.2241804301738739, + "learning_rate": 0.00027037576335205116, + "loss": 2.1621, + "step": 474790 + }, + { + "epoch": 1.8354440166380603, + "grad_norm": 0.14994165301322937, + "learning_rate": 0.0002702601351648277, + "loss": 2.1686, + "step": 474800 + }, + { + "epoch": 1.8354826738414436, + "grad_norm": 0.15455268323421478, + "learning_rate": 0.00027014451470650294, + "loss": 2.1466, + "step": 474810 + }, + { + "epoch": 1.8355213310448268, + "grad_norm": 0.13080264627933502, + "learning_rate": 0.0002700289019755273, + "loss": 2.158, + "step": 474820 + }, + { + "epoch": 1.8355599882482103, + "grad_norm": 0.1570408046245575, + "learning_rate": 0.0002699132969703513, + "loss": 2.1711, + "step": 474830 + }, + { + "epoch": 1.8355986454515936, + "grad_norm": 0.14790542423725128, + "learning_rate": 0.00026979769968942647, + "loss": 2.166, + "step": 474840 + }, + { + "epoch": 1.8356373026549768, + "grad_norm": 0.1418340802192688, + "learning_rate": 0.0002696821101312048, + "loss": 2.1674, + "step": 474850 + }, + { + "epoch": 1.83567595985836, + "grad_norm": 0.14686687290668488, + "learning_rate": 0.00026956652829413863, + "loss": 2.1502, + "step": 474860 + }, + { + "epoch": 1.8357146170617433, + "grad_norm": 0.14338235557079315, + "learning_rate": 0.000269450954176681, + "loss": 2.1498, + "step": 474870 + }, + { + "epoch": 1.8357532742651266, + "grad_norm": 0.14893607795238495, + "learning_rate": 0.00026933538777728505, + "loss": 2.1666, + "step": 474880 + }, + { + "epoch": 1.8357919314685098, + "grad_norm": 0.13746413588523865, + "learning_rate": 0.00026921982909440523, + "loss": 2.1737, + "step": 474890 + }, + { + "epoch": 1.835830588671893, + "grad_norm": 0.16587093472480774, + "learning_rate": 0.0002691042781264956, + "loss": 2.1414, + "step": 474900 + }, + { + "epoch": 1.8358692458752763, + "grad_norm": 0.16657157242298126, + "learning_rate": 0.00026898873487201123, + "loss": 2.149, + "step": 474910 + }, + { + "epoch": 1.8359079030786596, + "grad_norm": 0.18725548684597015, + "learning_rate": 0.000268873199329408, + "loss": 2.1519, + "step": 474920 + }, + { + "epoch": 1.8359465602820428, + "grad_norm": 0.14422450959682465, + "learning_rate": 0.00026875767149714137, + "loss": 2.1467, + "step": 474930 + }, + { + "epoch": 1.835985217485426, + "grad_norm": 0.14225982129573822, + "learning_rate": 0.0002686421513736683, + "loss": 2.1649, + "step": 474940 + }, + { + "epoch": 1.8360238746888096, + "grad_norm": 0.14490465819835663, + "learning_rate": 0.00026852663895744544, + "loss": 2.1558, + "step": 474950 + }, + { + "epoch": 1.8360625318921928, + "grad_norm": 0.14130234718322754, + "learning_rate": 0.0002684111342469309, + "loss": 2.1651, + "step": 474960 + }, + { + "epoch": 1.836101189095576, + "grad_norm": 0.14048299193382263, + "learning_rate": 0.0002682956372405825, + "loss": 2.1677, + "step": 474970 + }, + { + "epoch": 1.8361398462989593, + "grad_norm": 0.14509908854961395, + "learning_rate": 0.00026818014793685886, + "loss": 2.1716, + "step": 474980 + }, + { + "epoch": 1.8361785035023426, + "grad_norm": 0.14281538128852844, + "learning_rate": 0.000268064666334219, + "loss": 2.1586, + "step": 474990 + }, + { + "epoch": 1.836217160705726, + "grad_norm": 0.14713968336582184, + "learning_rate": 0.0002679491924311228, + "loss": 2.1496, + "step": 475000 + }, + { + "epoch": 1.8362558179091093, + "grad_norm": 0.14357225596904755, + "learning_rate": 0.00026783372622603, + "loss": 2.1569, + "step": 475010 + }, + { + "epoch": 1.8362944751124926, + "grad_norm": 0.1509816199541092, + "learning_rate": 0.0002677182677174017, + "loss": 2.1676, + "step": 475020 + }, + { + "epoch": 1.8363331323158758, + "grad_norm": 0.16111452877521515, + "learning_rate": 0.0002676028169036986, + "loss": 2.155, + "step": 475030 + }, + { + "epoch": 1.836371789519259, + "grad_norm": 0.14163239300251007, + "learning_rate": 0.00026748737378338294, + "loss": 2.1634, + "step": 475040 + }, + { + "epoch": 1.8364104467226423, + "grad_norm": 0.14450347423553467, + "learning_rate": 0.00026737193835491647, + "loss": 2.1466, + "step": 475050 + }, + { + "epoch": 1.8364491039260256, + "grad_norm": 0.15021710097789764, + "learning_rate": 0.0002672565106167617, + "loss": 2.1522, + "step": 475060 + }, + { + "epoch": 1.8364877611294088, + "grad_norm": 0.14771442115306854, + "learning_rate": 0.00026714109056738257, + "loss": 2.1503, + "step": 475070 + }, + { + "epoch": 1.836526418332792, + "grad_norm": 0.1410464346408844, + "learning_rate": 0.0002670256782052425, + "loss": 2.1549, + "step": 475080 + }, + { + "epoch": 1.8365650755361753, + "grad_norm": 0.17191557586193085, + "learning_rate": 0.0002669102735288054, + "loss": 2.1592, + "step": 475090 + }, + { + "epoch": 1.8366037327395586, + "grad_norm": 0.15708066523075104, + "learning_rate": 0.00026679487653653644, + "loss": 2.1602, + "step": 475100 + }, + { + "epoch": 1.8366423899429418, + "grad_norm": 0.15195932984352112, + "learning_rate": 0.0002666794872269007, + "loss": 2.1613, + "step": 475110 + }, + { + "epoch": 1.8366810471463253, + "grad_norm": 0.1657049059867859, + "learning_rate": 0.0002665641055983639, + "loss": 2.147, + "step": 475120 + }, + { + "epoch": 1.8367197043497085, + "grad_norm": 0.15128254890441895, + "learning_rate": 0.0002664487316493924, + "loss": 2.1517, + "step": 475130 + }, + { + "epoch": 1.8367583615530918, + "grad_norm": 0.17023751139640808, + "learning_rate": 0.0002663333653784532, + "loss": 2.1521, + "step": 475140 + }, + { + "epoch": 1.836797018756475, + "grad_norm": 0.14957919716835022, + "learning_rate": 0.0002662180067840132, + "loss": 2.1559, + "step": 475150 + }, + { + "epoch": 1.8368356759598583, + "grad_norm": 0.1486700475215912, + "learning_rate": 0.0002661026558645405, + "loss": 2.1676, + "step": 475160 + }, + { + "epoch": 1.8368743331632418, + "grad_norm": 0.1451827436685562, + "learning_rate": 0.00026598731261850327, + "loss": 2.1485, + "step": 475170 + }, + { + "epoch": 1.836912990366625, + "grad_norm": 0.1410922110080719, + "learning_rate": 0.0002658719770443705, + "loss": 2.1614, + "step": 475180 + }, + { + "epoch": 1.8369516475700083, + "grad_norm": 0.14330267906188965, + "learning_rate": 0.0002657566491406116, + "loss": 2.151, + "step": 475190 + }, + { + "epoch": 1.8369903047733915, + "grad_norm": 0.15906070172786713, + "learning_rate": 0.00026564132890569603, + "loss": 2.1485, + "step": 475200 + }, + { + "epoch": 1.8370289619767748, + "grad_norm": 0.2338830679655075, + "learning_rate": 0.0002655260163380946, + "loss": 2.153, + "step": 475210 + }, + { + "epoch": 1.837067619180158, + "grad_norm": 0.1482182890176773, + "learning_rate": 0.00026541071143627783, + "loss": 2.1493, + "step": 475220 + }, + { + "epoch": 1.8371062763835413, + "grad_norm": 0.14705368876457214, + "learning_rate": 0.0002652954141987172, + "loss": 2.1634, + "step": 475230 + }, + { + "epoch": 1.8371449335869245, + "grad_norm": 0.15626183152198792, + "learning_rate": 0.00026518012462388473, + "loss": 2.1447, + "step": 475240 + }, + { + "epoch": 1.8371835907903078, + "grad_norm": 0.15654116868972778, + "learning_rate": 0.0002650648427102529, + "loss": 2.1463, + "step": 475250 + }, + { + "epoch": 1.837222247993691, + "grad_norm": 0.1517390012741089, + "learning_rate": 0.00026494956845629414, + "loss": 2.1582, + "step": 475260 + }, + { + "epoch": 1.8372609051970743, + "grad_norm": 0.15306983888149261, + "learning_rate": 0.00026483430186048217, + "loss": 2.1513, + "step": 475270 + }, + { + "epoch": 1.8372995624004576, + "grad_norm": 0.14637787640094757, + "learning_rate": 0.0002647190429212911, + "loss": 2.1519, + "step": 475280 + }, + { + "epoch": 1.837338219603841, + "grad_norm": 0.15025612711906433, + "learning_rate": 0.00026460379163719506, + "loss": 2.1491, + "step": 475290 + }, + { + "epoch": 1.8373768768072243, + "grad_norm": 0.14811120927333832, + "learning_rate": 0.000264488548006669, + "loss": 2.1278, + "step": 475300 + }, + { + "epoch": 1.8374155340106075, + "grad_norm": 0.14240843057632446, + "learning_rate": 0.0002643733120281886, + "loss": 2.1479, + "step": 475310 + }, + { + "epoch": 1.8374541912139908, + "grad_norm": 0.14492589235305786, + "learning_rate": 0.00026425808370022933, + "loss": 2.1516, + "step": 475320 + }, + { + "epoch": 1.8374928484173743, + "grad_norm": 0.14916452765464783, + "learning_rate": 0.0002641428630212681, + "loss": 2.1487, + "step": 475330 + }, + { + "epoch": 1.8375315056207575, + "grad_norm": 0.15178216993808746, + "learning_rate": 0.0002640276499897818, + "loss": 2.1499, + "step": 475340 + }, + { + "epoch": 1.8375701628241408, + "grad_norm": 0.13903997838497162, + "learning_rate": 0.00026391244460424756, + "loss": 2.1368, + "step": 475350 + }, + { + "epoch": 1.837608820027524, + "grad_norm": 0.1431676298379898, + "learning_rate": 0.00026379724686314356, + "loss": 2.1494, + "step": 475360 + }, + { + "epoch": 1.8376474772309073, + "grad_norm": 0.1526685506105423, + "learning_rate": 0.00026368205676494827, + "loss": 2.1425, + "step": 475370 + }, + { + "epoch": 1.8376861344342905, + "grad_norm": 0.146949902176857, + "learning_rate": 0.00026356687430814077, + "loss": 2.1652, + "step": 475380 + }, + { + "epoch": 1.8377247916376738, + "grad_norm": 0.14898286759853363, + "learning_rate": 0.0002634516994912004, + "loss": 2.1543, + "step": 475390 + }, + { + "epoch": 1.837763448841057, + "grad_norm": 0.1496376097202301, + "learning_rate": 0.0002633365323126071, + "loss": 2.1489, + "step": 475400 + }, + { + "epoch": 1.8378021060444403, + "grad_norm": 0.15171100199222565, + "learning_rate": 0.0002632213727708417, + "loss": 2.1625, + "step": 475410 + }, + { + "epoch": 1.8378407632478235, + "grad_norm": 0.14547541737556458, + "learning_rate": 0.0002631062208643846, + "loss": 2.1543, + "step": 475420 + }, + { + "epoch": 1.8378794204512068, + "grad_norm": 0.14050796627998352, + "learning_rate": 0.00026299107659171763, + "loss": 2.1584, + "step": 475430 + }, + { + "epoch": 1.83791807765459, + "grad_norm": 0.14410996437072754, + "learning_rate": 0.00026287593995132297, + "loss": 2.1416, + "step": 475440 + }, + { + "epoch": 1.8379567348579733, + "grad_norm": 0.14802516996860504, + "learning_rate": 0.0002627608109416828, + "loss": 2.1649, + "step": 475450 + }, + { + "epoch": 1.8379953920613568, + "grad_norm": 0.14655569195747375, + "learning_rate": 0.0002626456895612801, + "loss": 2.1495, + "step": 475460 + }, + { + "epoch": 1.83803404926474, + "grad_norm": 0.14457468688488007, + "learning_rate": 0.0002625305758085985, + "loss": 2.1561, + "step": 475470 + }, + { + "epoch": 1.8380727064681233, + "grad_norm": 0.15370580554008484, + "learning_rate": 0.00026241546968212213, + "loss": 2.1535, + "step": 475480 + }, + { + "epoch": 1.8381113636715065, + "grad_norm": 0.15351314842700958, + "learning_rate": 0.00026230037118033536, + "loss": 2.146, + "step": 475490 + }, + { + "epoch": 1.83815002087489, + "grad_norm": 0.14985795319080353, + "learning_rate": 0.0002621852803017233, + "loss": 2.1583, + "step": 475500 + }, + { + "epoch": 1.8381886780782732, + "grad_norm": 0.1384783238172531, + "learning_rate": 0.00026207019704477144, + "loss": 2.145, + "step": 475510 + }, + { + "epoch": 1.8382273352816565, + "grad_norm": 0.14273719489574432, + "learning_rate": 0.0002619551214079654, + "loss": 2.1598, + "step": 475520 + }, + { + "epoch": 1.8382659924850397, + "grad_norm": 0.1650056093931198, + "learning_rate": 0.0002618400533897924, + "loss": 2.1493, + "step": 475530 + }, + { + "epoch": 1.838304649688423, + "grad_norm": 0.14638406038284302, + "learning_rate": 0.0002617249929887389, + "loss": 2.1714, + "step": 475540 + }, + { + "epoch": 1.8383433068918063, + "grad_norm": 0.16371501982212067, + "learning_rate": 0.00026160994020329255, + "loss": 2.1756, + "step": 475550 + }, + { + "epoch": 1.8383819640951895, + "grad_norm": 0.1557331085205078, + "learning_rate": 0.00026149489503194155, + "loss": 2.1417, + "step": 475560 + }, + { + "epoch": 1.8384206212985728, + "grad_norm": 0.14200273156166077, + "learning_rate": 0.0002613798574731743, + "loss": 2.1578, + "step": 475570 + }, + { + "epoch": 1.838459278501956, + "grad_norm": 0.1394173502922058, + "learning_rate": 0.00026126482752547966, + "loss": 2.1599, + "step": 475580 + }, + { + "epoch": 1.8384979357053393, + "grad_norm": 0.14956103265285492, + "learning_rate": 0.0002611498051873473, + "loss": 2.1487, + "step": 475590 + }, + { + "epoch": 1.8385365929087225, + "grad_norm": 0.14384682476520538, + "learning_rate": 0.0002610347904572674, + "loss": 2.155, + "step": 475600 + }, + { + "epoch": 1.8385752501121058, + "grad_norm": 0.14592944085597992, + "learning_rate": 0.0002609197833337302, + "loss": 2.1555, + "step": 475610 + }, + { + "epoch": 1.838613907315489, + "grad_norm": 0.14544503390789032, + "learning_rate": 0.00026080478381522676, + "loss": 2.1542, + "step": 475620 + }, + { + "epoch": 1.8386525645188725, + "grad_norm": 0.15256692469120026, + "learning_rate": 0.00026068979190024886, + "loss": 2.1349, + "step": 475630 + }, + { + "epoch": 1.8386912217222557, + "grad_norm": 0.1543371081352234, + "learning_rate": 0.000260574807587288, + "loss": 2.1641, + "step": 475640 + }, + { + "epoch": 1.838729878925639, + "grad_norm": 0.1660279631614685, + "learning_rate": 0.00026045983087483713, + "loss": 2.1526, + "step": 475650 + }, + { + "epoch": 1.8387685361290222, + "grad_norm": 0.15747512876987457, + "learning_rate": 0.00026034486176138927, + "loss": 2.1638, + "step": 475660 + }, + { + "epoch": 1.8388071933324057, + "grad_norm": 0.14996637403964996, + "learning_rate": 0.00026022990024543755, + "loss": 2.1445, + "step": 475670 + }, + { + "epoch": 1.838845850535789, + "grad_norm": 0.16035395860671997, + "learning_rate": 0.0002601149463254764, + "loss": 2.1599, + "step": 475680 + }, + { + "epoch": 1.8388845077391722, + "grad_norm": 0.1412738412618637, + "learning_rate": 0.00026000000000000003, + "loss": 2.1543, + "step": 475690 + }, + { + "epoch": 1.8389231649425555, + "grad_norm": 0.14722250401973724, + "learning_rate": 0.00025988506126750345, + "loss": 2.1565, + "step": 475700 + }, + { + "epoch": 1.8389618221459387, + "grad_norm": 0.14353296160697937, + "learning_rate": 0.00025977013012648233, + "loss": 2.1579, + "step": 475710 + }, + { + "epoch": 1.839000479349322, + "grad_norm": 0.15804100036621094, + "learning_rate": 0.0002596552065754327, + "loss": 2.1548, + "step": 475720 + }, + { + "epoch": 1.8390391365527052, + "grad_norm": 0.14293187856674194, + "learning_rate": 0.0002595402906128508, + "loss": 2.1488, + "step": 475730 + }, + { + "epoch": 1.8390777937560885, + "grad_norm": 0.1571645587682724, + "learning_rate": 0.00025942538223723365, + "loss": 2.1635, + "step": 475740 + }, + { + "epoch": 1.8391164509594717, + "grad_norm": 0.15808333456516266, + "learning_rate": 0.00025931048144707904, + "loss": 2.1539, + "step": 475750 + }, + { + "epoch": 1.839155108162855, + "grad_norm": 0.14674799144268036, + "learning_rate": 0.00025919558824088454, + "loss": 2.1502, + "step": 475760 + }, + { + "epoch": 1.8391937653662382, + "grad_norm": 0.14407147467136383, + "learning_rate": 0.0002590807026171489, + "loss": 2.1639, + "step": 475770 + }, + { + "epoch": 1.8392324225696215, + "grad_norm": 0.14988110959529877, + "learning_rate": 0.0002589658245743709, + "loss": 2.1673, + "step": 475780 + }, + { + "epoch": 1.8392710797730047, + "grad_norm": 0.15999698638916016, + "learning_rate": 0.0002588509541110502, + "loss": 2.1353, + "step": 475790 + }, + { + "epoch": 1.8393097369763882, + "grad_norm": 0.15566125512123108, + "learning_rate": 0.00025873609122568664, + "loss": 2.1478, + "step": 475800 + }, + { + "epoch": 1.8393483941797715, + "grad_norm": 0.14536809921264648, + "learning_rate": 0.0002586212359167808, + "loss": 2.1529, + "step": 475810 + }, + { + "epoch": 1.8393870513831547, + "grad_norm": 0.15134699642658234, + "learning_rate": 0.00025850638818283357, + "loss": 2.1546, + "step": 475820 + }, + { + "epoch": 1.839425708586538, + "grad_norm": 0.1482597142457962, + "learning_rate": 0.0002583915480223462, + "loss": 2.1701, + "step": 475830 + }, + { + "epoch": 1.8394643657899215, + "grad_norm": 0.15543188154697418, + "learning_rate": 0.0002582767154338208, + "loss": 2.1482, + "step": 475840 + }, + { + "epoch": 1.8395030229933047, + "grad_norm": 0.14815393090248108, + "learning_rate": 0.00025816189041575965, + "loss": 2.1483, + "step": 475850 + }, + { + "epoch": 1.839541680196688, + "grad_norm": 0.1632230579853058, + "learning_rate": 0.000258047072966666, + "loss": 2.1672, + "step": 475860 + }, + { + "epoch": 1.8395803374000712, + "grad_norm": 0.1543683409690857, + "learning_rate": 0.0002579322630850429, + "loss": 2.1607, + "step": 475870 + }, + { + "epoch": 1.8396189946034545, + "grad_norm": 0.14933699369430542, + "learning_rate": 0.0002578174607693946, + "loss": 2.1641, + "step": 475880 + }, + { + "epoch": 1.8396576518068377, + "grad_norm": 0.1431177854537964, + "learning_rate": 0.0002577026660182251, + "loss": 2.1481, + "step": 475890 + }, + { + "epoch": 1.839696309010221, + "grad_norm": 0.152797669172287, + "learning_rate": 0.00025758787883003987, + "loss": 2.1494, + "step": 475900 + }, + { + "epoch": 1.8397349662136042, + "grad_norm": 0.15314161777496338, + "learning_rate": 0.0002574730992033438, + "loss": 2.1549, + "step": 475910 + }, + { + "epoch": 1.8397736234169875, + "grad_norm": 0.1449500471353531, + "learning_rate": 0.0002573583271366429, + "loss": 2.1419, + "step": 475920 + }, + { + "epoch": 1.8398122806203707, + "grad_norm": 0.14397546648979187, + "learning_rate": 0.0002572435626284435, + "loss": 2.154, + "step": 475930 + }, + { + "epoch": 1.839850937823754, + "grad_norm": 0.15783585608005524, + "learning_rate": 0.00025712880567725275, + "loss": 2.1552, + "step": 475940 + }, + { + "epoch": 1.8398895950271372, + "grad_norm": 0.15068823099136353, + "learning_rate": 0.0002570140562815775, + "loss": 2.1508, + "step": 475950 + }, + { + "epoch": 1.8399282522305205, + "grad_norm": 0.14194492995738983, + "learning_rate": 0.00025689931443992586, + "loss": 2.1682, + "step": 475960 + }, + { + "epoch": 1.839966909433904, + "grad_norm": 0.14788579940795898, + "learning_rate": 0.00025678458015080644, + "loss": 2.14, + "step": 475970 + }, + { + "epoch": 1.8400055666372872, + "grad_norm": 0.14433981478214264, + "learning_rate": 0.00025666985341272786, + "loss": 2.1504, + "step": 475980 + }, + { + "epoch": 1.8400442238406705, + "grad_norm": 0.14616207778453827, + "learning_rate": 0.0002565551342241992, + "loss": 2.1583, + "step": 475990 + }, + { + "epoch": 1.8400828810440537, + "grad_norm": 0.1440388709306717, + "learning_rate": 0.0002564404225837307, + "loss": 2.133, + "step": 476000 + }, + { + "epoch": 1.8401215382474372, + "grad_norm": 0.14986035227775574, + "learning_rate": 0.0002563257184898322, + "loss": 2.1477, + "step": 476010 + }, + { + "epoch": 1.8401601954508204, + "grad_norm": 0.14334015548229218, + "learning_rate": 0.00025621102194101476, + "loss": 2.1548, + "step": 476020 + }, + { + "epoch": 1.8401988526542037, + "grad_norm": 0.18292531371116638, + "learning_rate": 0.0002560963329357897, + "loss": 2.1629, + "step": 476030 + }, + { + "epoch": 1.840237509857587, + "grad_norm": 0.15646730363368988, + "learning_rate": 0.00025598165147266874, + "loss": 2.1462, + "step": 476040 + }, + { + "epoch": 1.8402761670609702, + "grad_norm": 0.1612592488527298, + "learning_rate": 0.0002558669775501641, + "loss": 2.1633, + "step": 476050 + }, + { + "epoch": 1.8403148242643534, + "grad_norm": 0.16252127289772034, + "learning_rate": 0.00025575231116678834, + "loss": 2.1448, + "step": 476060 + }, + { + "epoch": 1.8403534814677367, + "grad_norm": 0.15415285527706146, + "learning_rate": 0.00025563765232105505, + "loss": 2.1516, + "step": 476070 + }, + { + "epoch": 1.84039213867112, + "grad_norm": 0.1514085829257965, + "learning_rate": 0.00025552300101147797, + "loss": 2.1596, + "step": 476080 + }, + { + "epoch": 1.8404307958745032, + "grad_norm": 0.14109759032726288, + "learning_rate": 0.00025540835723657083, + "loss": 2.1461, + "step": 476090 + }, + { + "epoch": 1.8404694530778865, + "grad_norm": 0.13598160445690155, + "learning_rate": 0.0002552937209948489, + "loss": 2.153, + "step": 476100 + }, + { + "epoch": 1.8405081102812697, + "grad_norm": 0.14688196778297424, + "learning_rate": 0.00025517909228482715, + "loss": 2.1528, + "step": 476110 + }, + { + "epoch": 1.840546767484653, + "grad_norm": 0.14169596135616302, + "learning_rate": 0.00025506447110502093, + "loss": 2.1507, + "step": 476120 + }, + { + "epoch": 1.8405854246880362, + "grad_norm": 0.16858428716659546, + "learning_rate": 0.00025494985745394707, + "loss": 2.1603, + "step": 476130 + }, + { + "epoch": 1.8406240818914197, + "grad_norm": 0.146564781665802, + "learning_rate": 0.0002548352513301215, + "loss": 2.1703, + "step": 476140 + }, + { + "epoch": 1.840662739094803, + "grad_norm": 0.15180619060993195, + "learning_rate": 0.00025472065273206203, + "loss": 2.1593, + "step": 476150 + }, + { + "epoch": 1.8407013962981862, + "grad_norm": 0.15303488075733185, + "learning_rate": 0.0002546060616582857, + "loss": 2.1517, + "step": 476160 + }, + { + "epoch": 1.8407400535015694, + "grad_norm": 0.15305596590042114, + "learning_rate": 0.0002544914781073109, + "loss": 2.1556, + "step": 476170 + }, + { + "epoch": 1.840778710704953, + "grad_norm": 0.14316235482692719, + "learning_rate": 0.0002543769020776563, + "loss": 2.1535, + "step": 476180 + }, + { + "epoch": 1.8408173679083362, + "grad_norm": 0.14286579191684723, + "learning_rate": 0.0002542623335678409, + "loss": 2.1468, + "step": 476190 + }, + { + "epoch": 1.8408560251117194, + "grad_norm": 0.1483672559261322, + "learning_rate": 0.00025414777257638435, + "loss": 2.1534, + "step": 476200 + }, + { + "epoch": 1.8408946823151027, + "grad_norm": 0.153128981590271, + "learning_rate": 0.00025403321910180666, + "loss": 2.1568, + "step": 476210 + }, + { + "epoch": 1.840933339518486, + "grad_norm": 0.16412784159183502, + "learning_rate": 0.0002539186731426282, + "loss": 2.1495, + "step": 476220 + }, + { + "epoch": 1.8409719967218692, + "grad_norm": 0.14767330884933472, + "learning_rate": 0.0002538041346973703, + "loss": 2.1457, + "step": 476230 + }, + { + "epoch": 1.8410106539252524, + "grad_norm": 0.15422873198986053, + "learning_rate": 0.00025368960376455417, + "loss": 2.1722, + "step": 476240 + }, + { + "epoch": 1.8410493111286357, + "grad_norm": 0.14106683433055878, + "learning_rate": 0.0002535750803427019, + "loss": 2.1421, + "step": 476250 + }, + { + "epoch": 1.841087968332019, + "grad_norm": 0.1555287390947342, + "learning_rate": 0.0002534605644303363, + "loss": 2.1435, + "step": 476260 + }, + { + "epoch": 1.8411266255354022, + "grad_norm": 0.1445557177066803, + "learning_rate": 0.0002533460560259797, + "loss": 2.1498, + "step": 476270 + }, + { + "epoch": 1.8411652827387854, + "grad_norm": 0.1519489884376526, + "learning_rate": 0.0002532315551281561, + "loss": 2.1547, + "step": 476280 + }, + { + "epoch": 1.8412039399421687, + "grad_norm": 0.1514035314321518, + "learning_rate": 0.0002531170617353893, + "loss": 2.1446, + "step": 476290 + }, + { + "epoch": 1.841242597145552, + "grad_norm": 0.14062903821468353, + "learning_rate": 0.00025300257584620357, + "loss": 2.1401, + "step": 476300 + }, + { + "epoch": 1.8412812543489354, + "grad_norm": 0.13704952597618103, + "learning_rate": 0.0002528880974591239, + "loss": 2.1568, + "step": 476310 + }, + { + "epoch": 1.8413199115523187, + "grad_norm": 0.17359179258346558, + "learning_rate": 0.00025277362657267566, + "loss": 2.1533, + "step": 476320 + }, + { + "epoch": 1.841358568755702, + "grad_norm": 0.20773059129714966, + "learning_rate": 0.0002526591631853847, + "loss": 2.1696, + "step": 476330 + }, + { + "epoch": 1.8413972259590852, + "grad_norm": 0.15593963861465454, + "learning_rate": 0.0002525447072957776, + "loss": 2.1463, + "step": 476340 + }, + { + "epoch": 1.8414358831624686, + "grad_norm": 0.13709397614002228, + "learning_rate": 0.0002524302589023808, + "loss": 2.1573, + "step": 476350 + }, + { + "epoch": 1.841474540365852, + "grad_norm": 0.14867442846298218, + "learning_rate": 0.00025231581800372197, + "loss": 2.143, + "step": 476360 + }, + { + "epoch": 1.8415131975692351, + "grad_norm": 0.15370690822601318, + "learning_rate": 0.0002522013845983284, + "loss": 2.1557, + "step": 476370 + }, + { + "epoch": 1.8415518547726184, + "grad_norm": 0.15814514458179474, + "learning_rate": 0.000252086958684729, + "loss": 2.163, + "step": 476380 + }, + { + "epoch": 1.8415905119760017, + "grad_norm": 0.15417690575122833, + "learning_rate": 0.0002519725402614523, + "loss": 2.1483, + "step": 476390 + }, + { + "epoch": 1.841629169179385, + "grad_norm": 0.16330553591251373, + "learning_rate": 0.0002518581293270274, + "loss": 2.1412, + "step": 476400 + }, + { + "epoch": 1.8416678263827682, + "grad_norm": 0.15288779139518738, + "learning_rate": 0.0002517437258799842, + "loss": 2.1471, + "step": 476410 + }, + { + "epoch": 1.8417064835861514, + "grad_norm": 0.1717262715101242, + "learning_rate": 0.0002516293299188528, + "loss": 2.1599, + "step": 476420 + }, + { + "epoch": 1.8417451407895347, + "grad_norm": 0.15698036551475525, + "learning_rate": 0.0002515149414421638, + "loss": 2.1335, + "step": 476430 + }, + { + "epoch": 1.841783797992918, + "grad_norm": 0.16390347480773926, + "learning_rate": 0.0002514005604484486, + "loss": 2.1474, + "step": 476440 + }, + { + "epoch": 1.8418224551963012, + "grad_norm": 0.1516304314136505, + "learning_rate": 0.0002512861869362386, + "loss": 2.146, + "step": 476450 + }, + { + "epoch": 1.8418611123996844, + "grad_norm": 0.19226273894309998, + "learning_rate": 0.0002511718209040661, + "loss": 2.149, + "step": 476460 + }, + { + "epoch": 1.8418997696030677, + "grad_norm": 0.1427430957555771, + "learning_rate": 0.0002510574623504636, + "loss": 2.1615, + "step": 476470 + }, + { + "epoch": 1.8419384268064511, + "grad_norm": 0.159335657954216, + "learning_rate": 0.00025094311127396416, + "loss": 2.1554, + "step": 476480 + }, + { + "epoch": 1.8419770840098344, + "grad_norm": 0.14797185361385345, + "learning_rate": 0.0002508287676731016, + "loss": 2.1598, + "step": 476490 + }, + { + "epoch": 1.8420157412132176, + "grad_norm": 0.16486115753650665, + "learning_rate": 0.00025071443154640985, + "loss": 2.1591, + "step": 476500 + }, + { + "epoch": 1.842054398416601, + "grad_norm": 0.14702408015727997, + "learning_rate": 0.00025060010289242317, + "loss": 2.1509, + "step": 476510 + }, + { + "epoch": 1.8420930556199844, + "grad_norm": 0.16612829267978668, + "learning_rate": 0.0002504857817096771, + "loss": 2.1511, + "step": 476520 + }, + { + "epoch": 1.8421317128233676, + "grad_norm": 0.15480519831180573, + "learning_rate": 0.00025037146799670643, + "loss": 2.1573, + "step": 476530 + }, + { + "epoch": 1.8421703700267509, + "grad_norm": 0.15102191269397736, + "learning_rate": 0.0002502571617520477, + "loss": 2.1603, + "step": 476540 + }, + { + "epoch": 1.8422090272301341, + "grad_norm": 0.15874643623828888, + "learning_rate": 0.00025014286297423706, + "loss": 2.1558, + "step": 476550 + }, + { + "epoch": 1.8422476844335174, + "grad_norm": 0.13553157448768616, + "learning_rate": 0.0002500285716618114, + "loss": 2.1539, + "step": 476560 + }, + { + "epoch": 1.8422863416369006, + "grad_norm": 0.822083592414856, + "learning_rate": 0.0002499142878133083, + "loss": 2.1482, + "step": 476570 + }, + { + "epoch": 1.842324998840284, + "grad_norm": 0.16487015783786774, + "learning_rate": 0.0002498000114272654, + "loss": 2.1404, + "step": 476580 + }, + { + "epoch": 1.8423636560436671, + "grad_norm": 0.15109044313430786, + "learning_rate": 0.0002496857425022214, + "loss": 2.1623, + "step": 476590 + }, + { + "epoch": 1.8424023132470504, + "grad_norm": 0.15568329393863678, + "learning_rate": 0.0002495714810367149, + "loss": 2.1414, + "step": 476600 + }, + { + "epoch": 1.8424409704504336, + "grad_norm": 0.14634571969509125, + "learning_rate": 0.000249457227029285, + "loss": 2.1592, + "step": 476610 + }, + { + "epoch": 1.842479627653817, + "grad_norm": 0.15658384561538696, + "learning_rate": 0.0002493429804784719, + "loss": 2.1577, + "step": 476620 + }, + { + "epoch": 1.8425182848572002, + "grad_norm": 0.14044654369354248, + "learning_rate": 0.0002492287413828156, + "loss": 2.147, + "step": 476630 + }, + { + "epoch": 1.8425569420605836, + "grad_norm": 0.13307639956474304, + "learning_rate": 0.00024911450974085693, + "loss": 2.1446, + "step": 476640 + }, + { + "epoch": 1.8425955992639669, + "grad_norm": 0.14166012406349182, + "learning_rate": 0.0002490002855511371, + "loss": 2.1486, + "step": 476650 + }, + { + "epoch": 1.8426342564673501, + "grad_norm": 0.15263959765434265, + "learning_rate": 0.00024888606881219745, + "loss": 2.1615, + "step": 476660 + }, + { + "epoch": 1.8426729136707334, + "grad_norm": 0.15352514386177063, + "learning_rate": 0.0002487718595225805, + "loss": 2.1607, + "step": 476670 + }, + { + "epoch": 1.8427115708741166, + "grad_norm": 0.15273882448673248, + "learning_rate": 0.000248657657680829, + "loss": 2.1578, + "step": 476680 + }, + { + "epoch": 1.8427502280775, + "grad_norm": 0.15000106394290924, + "learning_rate": 0.0002485434632854859, + "loss": 2.1488, + "step": 476690 + }, + { + "epoch": 1.8427888852808834, + "grad_norm": 0.14709743857383728, + "learning_rate": 0.0002484292763350946, + "loss": 2.1539, + "step": 476700 + }, + { + "epoch": 1.8428275424842666, + "grad_norm": 0.14307551085948944, + "learning_rate": 0.0002483150968281995, + "loss": 2.1502, + "step": 476710 + }, + { + "epoch": 1.8428661996876499, + "grad_norm": 0.1431547999382019, + "learning_rate": 0.000248200924763345, + "loss": 2.1427, + "step": 476720 + }, + { + "epoch": 1.8429048568910331, + "grad_norm": 0.1426902562379837, + "learning_rate": 0.00024808676013907593, + "loss": 2.1508, + "step": 476730 + }, + { + "epoch": 1.8429435140944164, + "grad_norm": 0.1503293514251709, + "learning_rate": 0.00024797260295393776, + "loss": 2.1488, + "step": 476740 + }, + { + "epoch": 1.8429821712977996, + "grad_norm": 0.15397608280181885, + "learning_rate": 0.00024785845320647696, + "loss": 2.1451, + "step": 476750 + }, + { + "epoch": 1.8430208285011829, + "grad_norm": 0.14943160116672516, + "learning_rate": 0.0002477443108952393, + "loss": 2.1624, + "step": 476760 + }, + { + "epoch": 1.8430594857045661, + "grad_norm": 0.14495742321014404, + "learning_rate": 0.000247630176018772, + "loss": 2.1507, + "step": 476770 + }, + { + "epoch": 1.8430981429079494, + "grad_norm": 0.15321022272109985, + "learning_rate": 0.0002475160485756225, + "loss": 2.1644, + "step": 476780 + }, + { + "epoch": 1.8431368001113326, + "grad_norm": 0.1492360681295395, + "learning_rate": 0.0002474019285643385, + "loss": 2.1533, + "step": 476790 + }, + { + "epoch": 1.8431754573147159, + "grad_norm": 0.1437566727399826, + "learning_rate": 0.0002472878159834684, + "loss": 2.1566, + "step": 476800 + }, + { + "epoch": 1.8432141145180994, + "grad_norm": 0.14607855677604675, + "learning_rate": 0.00024717371083156104, + "loss": 2.1383, + "step": 476810 + }, + { + "epoch": 1.8432527717214826, + "grad_norm": 0.14503221213817596, + "learning_rate": 0.0002470596131071656, + "loss": 2.1485, + "step": 476820 + }, + { + "epoch": 1.8432914289248659, + "grad_norm": 0.15728530287742615, + "learning_rate": 0.00024694552280883175, + "loss": 2.1448, + "step": 476830 + }, + { + "epoch": 1.8433300861282491, + "grad_norm": 0.1535523235797882, + "learning_rate": 0.0002468314399351099, + "loss": 2.1411, + "step": 476840 + }, + { + "epoch": 1.8433687433316324, + "grad_norm": 0.15958794951438904, + "learning_rate": 0.00024671736448455064, + "loss": 2.1587, + "step": 476850 + }, + { + "epoch": 1.8434074005350158, + "grad_norm": 0.14848648011684418, + "learning_rate": 0.0002466032964557052, + "loss": 2.1426, + "step": 476860 + }, + { + "epoch": 1.843446057738399, + "grad_norm": 0.15634022653102875, + "learning_rate": 0.0002464892358471249, + "loss": 2.1768, + "step": 476870 + }, + { + "epoch": 1.8434847149417823, + "grad_norm": 0.14585278928279877, + "learning_rate": 0.0002463751826573621, + "loss": 2.1488, + "step": 476880 + }, + { + "epoch": 1.8435233721451656, + "grad_norm": 0.1499786376953125, + "learning_rate": 0.0002462611368849694, + "loss": 2.156, + "step": 476890 + }, + { + "epoch": 1.8435620293485488, + "grad_norm": 0.15097768604755402, + "learning_rate": 0.0002461470985284997, + "loss": 2.1508, + "step": 476900 + }, + { + "epoch": 1.843600686551932, + "grad_norm": 0.1578410118818283, + "learning_rate": 0.0002460330675865066, + "loss": 2.1508, + "step": 476910 + }, + { + "epoch": 1.8436393437553154, + "grad_norm": 0.14859865605831146, + "learning_rate": 0.00024591904405754406, + "loss": 2.1512, + "step": 476920 + }, + { + "epoch": 1.8436780009586986, + "grad_norm": 0.1477453112602234, + "learning_rate": 0.00024580502794016644, + "loss": 2.1369, + "step": 476930 + }, + { + "epoch": 1.8437166581620819, + "grad_norm": 0.15519174933433533, + "learning_rate": 0.0002456910192329289, + "loss": 2.1551, + "step": 476940 + }, + { + "epoch": 1.843755315365465, + "grad_norm": 0.1488887071609497, + "learning_rate": 0.0002455770179343866, + "loss": 2.1478, + "step": 476950 + }, + { + "epoch": 1.8437939725688484, + "grad_norm": 0.14880669116973877, + "learning_rate": 0.0002454630240430953, + "loss": 2.1417, + "step": 476960 + }, + { + "epoch": 1.8438326297722316, + "grad_norm": 0.14693772792816162, + "learning_rate": 0.0002453490375576115, + "loss": 2.161, + "step": 476970 + }, + { + "epoch": 1.843871286975615, + "grad_norm": 0.16454418003559113, + "learning_rate": 0.0002452350584764922, + "loss": 2.152, + "step": 476980 + }, + { + "epoch": 1.8439099441789983, + "grad_norm": 0.17832502722740173, + "learning_rate": 0.00024512108679829425, + "loss": 2.1435, + "step": 476990 + }, + { + "epoch": 1.8439486013823816, + "grad_norm": 0.14790251851081848, + "learning_rate": 0.0002450071225215755, + "loss": 2.1578, + "step": 477000 + }, + { + "epoch": 1.8439872585857648, + "grad_norm": 0.142452210187912, + "learning_rate": 0.0002448931656448945, + "loss": 2.1419, + "step": 477010 + }, + { + "epoch": 1.844025915789148, + "grad_norm": 0.14764165878295898, + "learning_rate": 0.0002447792161668094, + "loss": 2.1409, + "step": 477020 + }, + { + "epoch": 1.8440645729925316, + "grad_norm": 0.14111246168613434, + "learning_rate": 0.0002446652740858797, + "loss": 2.1565, + "step": 477030 + }, + { + "epoch": 1.8441032301959148, + "grad_norm": 0.16016855835914612, + "learning_rate": 0.0002445513394006649, + "loss": 2.1595, + "step": 477040 + }, + { + "epoch": 1.844141887399298, + "grad_norm": 0.15253598988056183, + "learning_rate": 0.00024443741210972484, + "loss": 2.1467, + "step": 477050 + }, + { + "epoch": 1.8441805446026813, + "grad_norm": 0.14610204100608826, + "learning_rate": 0.00024432349221162043, + "loss": 2.1543, + "step": 477060 + }, + { + "epoch": 1.8442192018060646, + "grad_norm": 0.14825892448425293, + "learning_rate": 0.00024420957970491244, + "loss": 2.1495, + "step": 477070 + }, + { + "epoch": 1.8442578590094478, + "grad_norm": 0.142495796084404, + "learning_rate": 0.0002440956745881624, + "loss": 2.1487, + "step": 477080 + }, + { + "epoch": 1.844296516212831, + "grad_norm": 0.14968812465667725, + "learning_rate": 0.0002439817768599324, + "loss": 2.1572, + "step": 477090 + }, + { + "epoch": 1.8443351734162143, + "grad_norm": 0.16449612379074097, + "learning_rate": 0.00024386788651878467, + "loss": 2.1353, + "step": 477100 + }, + { + "epoch": 1.8443738306195976, + "grad_norm": 0.14865903556346893, + "learning_rate": 0.00024375400356328215, + "loss": 2.1476, + "step": 477110 + }, + { + "epoch": 1.8444124878229808, + "grad_norm": 0.14438387751579285, + "learning_rate": 0.0002436401279919882, + "loss": 2.1591, + "step": 477120 + }, + { + "epoch": 1.844451145026364, + "grad_norm": 0.13989832997322083, + "learning_rate": 0.00024352625980346643, + "loss": 2.1306, + "step": 477130 + }, + { + "epoch": 1.8444898022297473, + "grad_norm": 0.1448381245136261, + "learning_rate": 0.0002434123989962813, + "loss": 2.1626, + "step": 477140 + }, + { + "epoch": 1.8445284594331308, + "grad_norm": 0.14926588535308838, + "learning_rate": 0.00024329854556899734, + "loss": 2.1455, + "step": 477150 + }, + { + "epoch": 1.844567116636514, + "grad_norm": 0.14922012388706207, + "learning_rate": 0.00024318469952018008, + "loss": 2.16, + "step": 477160 + }, + { + "epoch": 1.8446057738398973, + "grad_norm": 0.1519775688648224, + "learning_rate": 0.00024307086084839492, + "loss": 2.1492, + "step": 477170 + }, + { + "epoch": 1.8446444310432806, + "grad_norm": 0.1431160271167755, + "learning_rate": 0.00024295702955220812, + "loss": 2.1519, + "step": 477180 + }, + { + "epoch": 1.844683088246664, + "grad_norm": 0.15909992158412933, + "learning_rate": 0.00024284320563018613, + "loss": 2.1432, + "step": 477190 + }, + { + "epoch": 1.8447217454500473, + "grad_norm": 0.15186427533626556, + "learning_rate": 0.00024272938908089637, + "loss": 2.1466, + "step": 477200 + }, + { + "epoch": 1.8447604026534306, + "grad_norm": 0.14806345105171204, + "learning_rate": 0.00024261557990290573, + "loss": 2.146, + "step": 477210 + }, + { + "epoch": 1.8447990598568138, + "grad_norm": 0.15865075588226318, + "learning_rate": 0.00024250177809478292, + "loss": 2.1494, + "step": 477220 + }, + { + "epoch": 1.844837717060197, + "grad_norm": 0.1485503762960434, + "learning_rate": 0.00024238798365509574, + "loss": 2.1387, + "step": 477230 + }, + { + "epoch": 1.8448763742635803, + "grad_norm": 0.13728691637516022, + "learning_rate": 0.00024227419658241357, + "loss": 2.1608, + "step": 477240 + }, + { + "epoch": 1.8449150314669636, + "grad_norm": 0.1566370725631714, + "learning_rate": 0.00024216041687530554, + "loss": 2.1623, + "step": 477250 + }, + { + "epoch": 1.8449536886703468, + "grad_norm": 0.13936889171600342, + "learning_rate": 0.0002420466445323415, + "loss": 2.1483, + "step": 477260 + }, + { + "epoch": 1.84499234587373, + "grad_norm": 0.15174809098243713, + "learning_rate": 0.00024193287955209166, + "loss": 2.1379, + "step": 477270 + }, + { + "epoch": 1.8450310030771133, + "grad_norm": 0.1539214551448822, + "learning_rate": 0.00024181912193312717, + "loss": 2.1543, + "step": 477280 + }, + { + "epoch": 1.8450696602804966, + "grad_norm": 0.14683173596858978, + "learning_rate": 0.00024170537167401897, + "loss": 2.1538, + "step": 477290 + }, + { + "epoch": 1.8451083174838798, + "grad_norm": 0.1581607162952423, + "learning_rate": 0.0002415916287733386, + "loss": 2.1513, + "step": 477300 + }, + { + "epoch": 1.845146974687263, + "grad_norm": 0.14941240847110748, + "learning_rate": 0.00024147789322965862, + "loss": 2.1475, + "step": 477310 + }, + { + "epoch": 1.8451856318906465, + "grad_norm": 0.15306486189365387, + "learning_rate": 0.00024136416504155124, + "loss": 2.1584, + "step": 477320 + }, + { + "epoch": 1.8452242890940298, + "grad_norm": 0.15010662376880646, + "learning_rate": 0.0002412504442075898, + "loss": 2.1582, + "step": 477330 + }, + { + "epoch": 1.845262946297413, + "grad_norm": 0.8174676299095154, + "learning_rate": 0.0002411367307263479, + "loss": 2.1466, + "step": 477340 + }, + { + "epoch": 1.8453016035007963, + "grad_norm": 0.1684281975030899, + "learning_rate": 0.00024102302459639912, + "loss": 2.1379, + "step": 477350 + }, + { + "epoch": 1.8453402607041798, + "grad_norm": 0.15354377031326294, + "learning_rate": 0.0002409093258163182, + "loss": 2.1526, + "step": 477360 + }, + { + "epoch": 1.845378917907563, + "grad_norm": 0.15894003212451935, + "learning_rate": 0.00024079563438468, + "loss": 2.1566, + "step": 477370 + }, + { + "epoch": 1.8454175751109463, + "grad_norm": 0.14311885833740234, + "learning_rate": 0.00024068195030006013, + "loss": 2.1597, + "step": 477380 + }, + { + "epoch": 1.8454562323143295, + "grad_norm": 0.13516804575920105, + "learning_rate": 0.00024056827356103415, + "loss": 2.1488, + "step": 477390 + }, + { + "epoch": 1.8454948895177128, + "grad_norm": 0.15137454867362976, + "learning_rate": 0.00024045460416617838, + "loss": 2.1505, + "step": 477400 + }, + { + "epoch": 1.845533546721096, + "grad_norm": 0.15255650877952576, + "learning_rate": 0.00024034094211406964, + "loss": 2.1423, + "step": 477410 + }, + { + "epoch": 1.8455722039244793, + "grad_norm": 0.1438567042350769, + "learning_rate": 0.00024022728740328515, + "loss": 2.1501, + "step": 477420 + }, + { + "epoch": 1.8456108611278625, + "grad_norm": 0.15280750393867493, + "learning_rate": 0.00024011364003240267, + "loss": 2.1642, + "step": 477430 + }, + { + "epoch": 1.8456495183312458, + "grad_norm": 0.14523179829120636, + "learning_rate": 0.00024, + "loss": 2.1465, + "step": 477440 + }, + { + "epoch": 1.845688175534629, + "grad_norm": 0.14758449792861938, + "learning_rate": 0.00023988636730465606, + "loss": 2.151, + "step": 477450 + }, + { + "epoch": 1.8457268327380123, + "grad_norm": 0.13859793543815613, + "learning_rate": 0.00023977274194495002, + "loss": 2.1414, + "step": 477460 + }, + { + "epoch": 1.8457654899413956, + "grad_norm": 0.15651504695415497, + "learning_rate": 0.00023965912391946077, + "loss": 2.1559, + "step": 477470 + }, + { + "epoch": 1.8458041471447788, + "grad_norm": 0.14840462803840637, + "learning_rate": 0.000239545513226769, + "loss": 2.1443, + "step": 477480 + }, + { + "epoch": 1.8458428043481623, + "grad_norm": 0.15093280375003815, + "learning_rate": 0.0002394319098654547, + "loss": 2.1402, + "step": 477490 + }, + { + "epoch": 1.8458814615515455, + "grad_norm": 0.15339349210262299, + "learning_rate": 0.0002393183138340991, + "loss": 2.1493, + "step": 477500 + }, + { + "epoch": 1.8459201187549288, + "grad_norm": 0.1480063647031784, + "learning_rate": 0.00023920472513128322, + "loss": 2.1406, + "step": 477510 + }, + { + "epoch": 1.845958775958312, + "grad_norm": 0.1451997458934784, + "learning_rate": 0.00023909114375558893, + "loss": 2.1465, + "step": 477520 + }, + { + "epoch": 1.8459974331616955, + "grad_norm": 0.15897716581821442, + "learning_rate": 0.00023897756970559848, + "loss": 2.1402, + "step": 477530 + }, + { + "epoch": 1.8460360903650788, + "grad_norm": 0.15453803539276123, + "learning_rate": 0.00023886400297989474, + "loss": 2.1535, + "step": 477540 + }, + { + "epoch": 1.846074747568462, + "grad_norm": 0.15898756682872772, + "learning_rate": 0.00023875044357706088, + "loss": 2.1513, + "step": 477550 + }, + { + "epoch": 1.8461134047718453, + "grad_norm": 0.13958440721035004, + "learning_rate": 0.00023863689149568025, + "loss": 2.1457, + "step": 477560 + }, + { + "epoch": 1.8461520619752285, + "grad_norm": 0.1507442146539688, + "learning_rate": 0.0002385233467343373, + "loss": 2.1382, + "step": 477570 + }, + { + "epoch": 1.8461907191786118, + "grad_norm": 0.14807669818401337, + "learning_rate": 0.00023840980929161625, + "loss": 2.1518, + "step": 477580 + }, + { + "epoch": 1.846229376381995, + "grad_norm": 0.1578161120414734, + "learning_rate": 0.0002382962791661023, + "loss": 2.1465, + "step": 477590 + }, + { + "epoch": 1.8462680335853783, + "grad_norm": 0.13840286433696747, + "learning_rate": 0.00023818275635638077, + "loss": 2.1467, + "step": 477600 + }, + { + "epoch": 1.8463066907887615, + "grad_norm": 0.14774125814437866, + "learning_rate": 0.00023806924086103764, + "loss": 2.1564, + "step": 477610 + }, + { + "epoch": 1.8463453479921448, + "grad_norm": 0.14336572587490082, + "learning_rate": 0.00023795573267865945, + "loss": 2.1463, + "step": 477620 + }, + { + "epoch": 1.846384005195528, + "grad_norm": 0.15975713729858398, + "learning_rate": 0.00023784223180783282, + "loss": 2.1378, + "step": 477630 + }, + { + "epoch": 1.8464226623989113, + "grad_norm": 0.14392030239105225, + "learning_rate": 0.0002377287382471449, + "loss": 2.1564, + "step": 477640 + }, + { + "epoch": 1.8464613196022945, + "grad_norm": 0.15305523574352264, + "learning_rate": 0.00023761525199518375, + "loss": 2.143, + "step": 477650 + }, + { + "epoch": 1.846499976805678, + "grad_norm": 0.1496932953596115, + "learning_rate": 0.00023750177305053734, + "loss": 2.1506, + "step": 477660 + }, + { + "epoch": 1.8465386340090613, + "grad_norm": 0.14829660952091217, + "learning_rate": 0.00023738830141179434, + "loss": 2.1441, + "step": 477670 + }, + { + "epoch": 1.8465772912124445, + "grad_norm": 0.1704322248697281, + "learning_rate": 0.00023727483707754372, + "loss": 2.1422, + "step": 477680 + }, + { + "epoch": 1.8466159484158278, + "grad_norm": 0.14436282217502594, + "learning_rate": 0.00023716138004637523, + "loss": 2.1489, + "step": 477690 + }, + { + "epoch": 1.8466546056192112, + "grad_norm": 0.45129719376564026, + "learning_rate": 0.00023704793031687887, + "loss": 2.1324, + "step": 477700 + }, + { + "epoch": 1.8466932628225945, + "grad_norm": 0.1636328250169754, + "learning_rate": 0.00023693448788764517, + "loss": 2.1626, + "step": 477710 + }, + { + "epoch": 1.8467319200259777, + "grad_norm": 0.14438024163246155, + "learning_rate": 0.00023682105275726474, + "loss": 2.1558, + "step": 477720 + }, + { + "epoch": 1.846770577229361, + "grad_norm": 0.1593109369277954, + "learning_rate": 0.00023670762492432917, + "loss": 2.1418, + "step": 477730 + }, + { + "epoch": 1.8468092344327443, + "grad_norm": 0.14541137218475342, + "learning_rate": 0.00023659420438743028, + "loss": 2.1584, + "step": 477740 + }, + { + "epoch": 1.8468478916361275, + "grad_norm": 0.1478029489517212, + "learning_rate": 0.0002364807911451603, + "loss": 2.1348, + "step": 477750 + }, + { + "epoch": 1.8468865488395108, + "grad_norm": 0.15339234471321106, + "learning_rate": 0.0002363673851961119, + "loss": 2.1536, + "step": 477760 + }, + { + "epoch": 1.846925206042894, + "grad_norm": 0.16422609984874725, + "learning_rate": 0.0002362539865388782, + "loss": 2.1419, + "step": 477770 + }, + { + "epoch": 1.8469638632462773, + "grad_norm": 0.1466737985610962, + "learning_rate": 0.00023614059517205278, + "loss": 2.1402, + "step": 477780 + }, + { + "epoch": 1.8470025204496605, + "grad_norm": 0.1492961049079895, + "learning_rate": 0.00023602721109423008, + "loss": 2.1619, + "step": 477790 + }, + { + "epoch": 1.8470411776530438, + "grad_norm": 0.15063001215457916, + "learning_rate": 0.00023591383430400438, + "loss": 2.1351, + "step": 477800 + }, + { + "epoch": 1.847079834856427, + "grad_norm": 0.1395963877439499, + "learning_rate": 0.00023580046479997052, + "loss": 2.1354, + "step": 477810 + }, + { + "epoch": 1.8471184920598103, + "grad_norm": 0.15143616497516632, + "learning_rate": 0.00023568710258072413, + "loss": 2.1403, + "step": 477820 + }, + { + "epoch": 1.8471571492631937, + "grad_norm": 0.15706034004688263, + "learning_rate": 0.00023557374764486116, + "loss": 2.136, + "step": 477830 + }, + { + "epoch": 1.847195806466577, + "grad_norm": 0.14363554120063782, + "learning_rate": 0.0002354603999909779, + "loss": 2.1397, + "step": 477840 + }, + { + "epoch": 1.8472344636699602, + "grad_norm": 0.13538502156734467, + "learning_rate": 0.000235347059617671, + "loss": 2.1375, + "step": 477850 + }, + { + "epoch": 1.8472731208733435, + "grad_norm": 0.1521271914243698, + "learning_rate": 0.0002352337265235376, + "loss": 2.1448, + "step": 477860 + }, + { + "epoch": 1.847311778076727, + "grad_norm": 0.1534687876701355, + "learning_rate": 0.0002351204007071759, + "loss": 2.1527, + "step": 477870 + }, + { + "epoch": 1.8473504352801102, + "grad_norm": 0.14379604160785675, + "learning_rate": 0.00023500708216718326, + "loss": 2.1533, + "step": 477880 + }, + { + "epoch": 1.8473890924834935, + "grad_norm": 0.1465234011411667, + "learning_rate": 0.00023489377090215902, + "loss": 2.1589, + "step": 477890 + }, + { + "epoch": 1.8474277496868767, + "grad_norm": 0.13903623819351196, + "learning_rate": 0.00023478046691070164, + "loss": 2.1462, + "step": 477900 + }, + { + "epoch": 1.84746640689026, + "grad_norm": 0.15117527544498444, + "learning_rate": 0.00023466717019141116, + "loss": 2.1504, + "step": 477910 + }, + { + "epoch": 1.8475050640936432, + "grad_norm": 0.15345846116542816, + "learning_rate": 0.00023455388074288707, + "loss": 2.1339, + "step": 477920 + }, + { + "epoch": 1.8475437212970265, + "grad_norm": 0.1577267050743103, + "learning_rate": 0.00023444059856373011, + "loss": 2.1497, + "step": 477930 + }, + { + "epoch": 1.8475823785004097, + "grad_norm": 0.15517856180667877, + "learning_rate": 0.00023432732365254072, + "loss": 2.1436, + "step": 477940 + }, + { + "epoch": 1.847621035703793, + "grad_norm": 0.14545536041259766, + "learning_rate": 0.00023421405600792045, + "loss": 2.1409, + "step": 477950 + }, + { + "epoch": 1.8476596929071762, + "grad_norm": 0.14651986956596375, + "learning_rate": 0.00023410079562847087, + "loss": 2.1468, + "step": 477960 + }, + { + "epoch": 1.8476983501105595, + "grad_norm": 0.16297149658203125, + "learning_rate": 0.0002339875425127942, + "loss": 2.1414, + "step": 477970 + }, + { + "epoch": 1.8477370073139427, + "grad_norm": 0.1577177792787552, + "learning_rate": 0.0002338742966594931, + "loss": 2.1626, + "step": 477980 + }, + { + "epoch": 1.847775664517326, + "grad_norm": 0.15041273832321167, + "learning_rate": 0.00023376105806717073, + "loss": 2.1421, + "step": 477990 + }, + { + "epoch": 1.8478143217207095, + "grad_norm": 0.14626824855804443, + "learning_rate": 0.00023364782673443064, + "loss": 2.1448, + "step": 478000 + }, + { + "epoch": 1.8478529789240927, + "grad_norm": 0.14885634183883667, + "learning_rate": 0.00023353460265987657, + "loss": 2.1398, + "step": 478010 + }, + { + "epoch": 1.847891636127476, + "grad_norm": 0.1460237354040146, + "learning_rate": 0.00023342138584211326, + "loss": 2.1466, + "step": 478020 + }, + { + "epoch": 1.8479302933308592, + "grad_norm": 0.15810991823673248, + "learning_rate": 0.0002333081762797451, + "loss": 2.1635, + "step": 478030 + }, + { + "epoch": 1.8479689505342427, + "grad_norm": 0.15092025697231293, + "learning_rate": 0.0002331949739713779, + "loss": 2.1538, + "step": 478040 + }, + { + "epoch": 1.848007607737626, + "grad_norm": 0.15626902878284454, + "learning_rate": 0.00023308177891561723, + "loss": 2.1489, + "step": 478050 + }, + { + "epoch": 1.8480462649410092, + "grad_norm": 0.16233813762664795, + "learning_rate": 0.00023296859111106928, + "loss": 2.1525, + "step": 478060 + }, + { + "epoch": 1.8480849221443925, + "grad_norm": 0.16185061633586884, + "learning_rate": 0.00023285541055634052, + "loss": 2.1612, + "step": 478070 + }, + { + "epoch": 1.8481235793477757, + "grad_norm": 0.16262325644493103, + "learning_rate": 0.0002327422372500383, + "loss": 2.1439, + "step": 478080 + }, + { + "epoch": 1.848162236551159, + "grad_norm": 0.1457241326570511, + "learning_rate": 0.00023262907119077014, + "loss": 2.1635, + "step": 478090 + }, + { + "epoch": 1.8482008937545422, + "grad_norm": 0.14732667803764343, + "learning_rate": 0.00023251591237714387, + "loss": 2.1589, + "step": 478100 + }, + { + "epoch": 1.8482395509579255, + "grad_norm": 0.1515752077102661, + "learning_rate": 0.00023240276080776812, + "loss": 2.1383, + "step": 478110 + }, + { + "epoch": 1.8482782081613087, + "grad_norm": 0.1464495062828064, + "learning_rate": 0.0002322896164812518, + "loss": 2.1357, + "step": 478120 + }, + { + "epoch": 1.848316865364692, + "grad_norm": 0.15452329814434052, + "learning_rate": 0.000232176479396204, + "loss": 2.1674, + "step": 478130 + }, + { + "epoch": 1.8483555225680752, + "grad_norm": 0.15739069879055023, + "learning_rate": 0.00023206334955123476, + "loss": 2.1432, + "step": 478140 + }, + { + "epoch": 1.8483941797714585, + "grad_norm": 0.15375475585460663, + "learning_rate": 0.00023195022694495404, + "loss": 2.1417, + "step": 478150 + }, + { + "epoch": 1.8484328369748417, + "grad_norm": 0.1497449278831482, + "learning_rate": 0.00023183711157597254, + "loss": 2.1378, + "step": 478160 + }, + { + "epoch": 1.8484714941782252, + "grad_norm": 0.14955320954322815, + "learning_rate": 0.00023172400344290156, + "loss": 2.1485, + "step": 478170 + }, + { + "epoch": 1.8485101513816085, + "grad_norm": 0.15689685940742493, + "learning_rate": 0.00023161090254435223, + "loss": 2.1523, + "step": 478180 + }, + { + "epoch": 1.8485488085849917, + "grad_norm": 0.16159173846244812, + "learning_rate": 0.00023149780887893724, + "loss": 2.1467, + "step": 478190 + }, + { + "epoch": 1.848587465788375, + "grad_norm": 0.1486079841852188, + "learning_rate": 0.00023138472244526854, + "loss": 2.1461, + "step": 478200 + }, + { + "epoch": 1.8486261229917584, + "grad_norm": 0.1565522402524948, + "learning_rate": 0.00023127164324195903, + "loss": 2.1311, + "step": 478210 + }, + { + "epoch": 1.8486647801951417, + "grad_norm": 0.13985569775104523, + "learning_rate": 0.00023115857126762207, + "loss": 2.1518, + "step": 478220 + }, + { + "epoch": 1.848703437398525, + "grad_norm": 0.14702102541923523, + "learning_rate": 0.00023104550652087164, + "loss": 2.1481, + "step": 478230 + }, + { + "epoch": 1.8487420946019082, + "grad_norm": 0.14980337023735046, + "learning_rate": 0.0002309324490003215, + "loss": 2.1523, + "step": 478240 + }, + { + "epoch": 1.8487807518052914, + "grad_norm": 0.1474117487668991, + "learning_rate": 0.0002308193987045868, + "loss": 2.146, + "step": 478250 + }, + { + "epoch": 1.8488194090086747, + "grad_norm": 0.14778390526771545, + "learning_rate": 0.00023070635563228237, + "loss": 2.1527, + "step": 478260 + }, + { + "epoch": 1.848858066212058, + "grad_norm": 0.14890675246715546, + "learning_rate": 0.0002305933197820238, + "loss": 2.1535, + "step": 478270 + }, + { + "epoch": 1.8488967234154412, + "grad_norm": 0.14470405876636505, + "learning_rate": 0.00023048029115242685, + "loss": 2.1449, + "step": 478280 + }, + { + "epoch": 1.8489353806188245, + "grad_norm": 0.15589329600334167, + "learning_rate": 0.0002303672697421082, + "loss": 2.1545, + "step": 478290 + }, + { + "epoch": 1.8489740378222077, + "grad_norm": 0.15257033705711365, + "learning_rate": 0.00023025425554968492, + "loss": 2.1412, + "step": 478300 + }, + { + "epoch": 1.849012695025591, + "grad_norm": 0.16834686696529388, + "learning_rate": 0.00023014124857377395, + "loss": 2.1404, + "step": 478310 + }, + { + "epoch": 1.8490513522289742, + "grad_norm": 0.15387968719005585, + "learning_rate": 0.000230028248812993, + "loss": 2.1511, + "step": 478320 + }, + { + "epoch": 1.8490900094323575, + "grad_norm": 0.1577366292476654, + "learning_rate": 0.00022991525626596054, + "loss": 2.1405, + "step": 478330 + }, + { + "epoch": 1.849128666635741, + "grad_norm": 0.15258517861366272, + "learning_rate": 0.000229802270931295, + "loss": 2.1339, + "step": 478340 + }, + { + "epoch": 1.8491673238391242, + "grad_norm": 0.16167719662189484, + "learning_rate": 0.00022968929280761574, + "loss": 2.1293, + "step": 478350 + }, + { + "epoch": 1.8492059810425074, + "grad_norm": 0.17660821974277496, + "learning_rate": 0.00022957632189354182, + "loss": 2.1588, + "step": 478360 + }, + { + "epoch": 1.8492446382458907, + "grad_norm": 0.15384353697299957, + "learning_rate": 0.0002294633581876935, + "loss": 2.1441, + "step": 478370 + }, + { + "epoch": 1.8492832954492742, + "grad_norm": 0.15233395993709564, + "learning_rate": 0.00022935040168869093, + "loss": 2.1485, + "step": 478380 + }, + { + "epoch": 1.8493219526526574, + "grad_norm": 0.15865083038806915, + "learning_rate": 0.00022923745239515525, + "loss": 2.1522, + "step": 478390 + }, + { + "epoch": 1.8493606098560407, + "grad_norm": 0.14701934158802032, + "learning_rate": 0.00022912451030570759, + "loss": 2.1356, + "step": 478400 + }, + { + "epoch": 1.849399267059424, + "grad_norm": 0.15493252873420715, + "learning_rate": 0.00022901157541896965, + "loss": 2.1524, + "step": 478410 + }, + { + "epoch": 1.8494379242628072, + "grad_norm": 0.1526908129453659, + "learning_rate": 0.0002288986477335635, + "loss": 2.1404, + "step": 478420 + }, + { + "epoch": 1.8494765814661904, + "grad_norm": 0.15958495438098907, + "learning_rate": 0.00022878572724811198, + "loss": 2.1437, + "step": 478430 + }, + { + "epoch": 1.8495152386695737, + "grad_norm": 0.15282100439071655, + "learning_rate": 0.00022867281396123773, + "loss": 2.1558, + "step": 478440 + }, + { + "epoch": 1.849553895872957, + "grad_norm": 0.19952651858329773, + "learning_rate": 0.00022855990787156455, + "loss": 2.1479, + "step": 478450 + }, + { + "epoch": 1.8495925530763402, + "grad_norm": 0.15583156049251556, + "learning_rate": 0.0002284470089777162, + "loss": 2.1595, + "step": 478460 + }, + { + "epoch": 1.8496312102797234, + "grad_norm": 0.14816786348819733, + "learning_rate": 0.0002283341172783171, + "loss": 2.1473, + "step": 478470 + }, + { + "epoch": 1.8496698674831067, + "grad_norm": 0.14721480011940002, + "learning_rate": 0.00022822123277199192, + "loss": 2.1432, + "step": 478480 + }, + { + "epoch": 1.84970852468649, + "grad_norm": 0.15829156339168549, + "learning_rate": 0.00022810835545736597, + "loss": 2.1543, + "step": 478490 + }, + { + "epoch": 1.8497471818898732, + "grad_norm": 0.14759312570095062, + "learning_rate": 0.000227995485333065, + "loss": 2.1546, + "step": 478500 + }, + { + "epoch": 1.8497858390932567, + "grad_norm": 0.14392654597759247, + "learning_rate": 0.00022788262239771485, + "loss": 2.1652, + "step": 478510 + }, + { + "epoch": 1.84982449629664, + "grad_norm": 0.1517169177532196, + "learning_rate": 0.00022776976664994232, + "loss": 2.1496, + "step": 478520 + }, + { + "epoch": 1.8498631535000232, + "grad_norm": 0.14544710516929626, + "learning_rate": 0.00022765691808837409, + "loss": 2.1537, + "step": 478530 + }, + { + "epoch": 1.8499018107034064, + "grad_norm": 0.15497958660125732, + "learning_rate": 0.0002275440767116379, + "loss": 2.1317, + "step": 478540 + }, + { + "epoch": 1.84994046790679, + "grad_norm": 0.15696972608566284, + "learning_rate": 0.00022743124251836157, + "loss": 2.1529, + "step": 478550 + }, + { + "epoch": 1.8499791251101732, + "grad_norm": 0.1588466912508011, + "learning_rate": 0.00022731841550717303, + "loss": 2.1546, + "step": 478560 + }, + { + "epoch": 1.8500177823135564, + "grad_norm": 0.15451645851135254, + "learning_rate": 0.00022720559567670118, + "loss": 2.1448, + "step": 478570 + }, + { + "epoch": 1.8500564395169397, + "grad_norm": 0.15711510181427002, + "learning_rate": 0.00022709278302557513, + "loss": 2.1333, + "step": 478580 + }, + { + "epoch": 1.850095096720323, + "grad_norm": 0.14721792936325073, + "learning_rate": 0.00022697997755242484, + "loss": 2.151, + "step": 478590 + }, + { + "epoch": 1.8501337539237062, + "grad_norm": 0.1567068099975586, + "learning_rate": 0.00022686717925587984, + "loss": 2.1518, + "step": 478600 + }, + { + "epoch": 1.8501724111270894, + "grad_norm": 0.14560212194919586, + "learning_rate": 0.00022675438813457083, + "loss": 2.1303, + "step": 478610 + }, + { + "epoch": 1.8502110683304727, + "grad_norm": 0.15781927108764648, + "learning_rate": 0.00022664160418712888, + "loss": 2.1512, + "step": 478620 + }, + { + "epoch": 1.850249725533856, + "grad_norm": 0.14911913871765137, + "learning_rate": 0.00022652882741218484, + "loss": 2.1596, + "step": 478630 + }, + { + "epoch": 1.8502883827372392, + "grad_norm": 0.17982690036296844, + "learning_rate": 0.00022641605780837095, + "loss": 2.1453, + "step": 478640 + }, + { + "epoch": 1.8503270399406224, + "grad_norm": 0.31263604760169983, + "learning_rate": 0.00022630329537431915, + "loss": 2.1547, + "step": 478650 + }, + { + "epoch": 1.8503656971440057, + "grad_norm": 0.14549441635608673, + "learning_rate": 0.00022619054010866214, + "loss": 2.1579, + "step": 478660 + }, + { + "epoch": 1.8504043543473891, + "grad_norm": 0.1487957090139389, + "learning_rate": 0.00022607779201003298, + "loss": 2.1394, + "step": 478670 + }, + { + "epoch": 1.8504430115507724, + "grad_norm": 0.14924855530261993, + "learning_rate": 0.00022596505107706522, + "loss": 2.1412, + "step": 478680 + }, + { + "epoch": 1.8504816687541557, + "grad_norm": 0.14954110980033875, + "learning_rate": 0.0002258523173083926, + "loss": 2.1458, + "step": 478690 + }, + { + "epoch": 1.850520325957539, + "grad_norm": 0.16220064461231232, + "learning_rate": 0.00022573959070265004, + "loss": 2.1553, + "step": 478700 + }, + { + "epoch": 1.8505589831609222, + "grad_norm": 0.14964133501052856, + "learning_rate": 0.0002256268712584717, + "loss": 2.149, + "step": 478710 + }, + { + "epoch": 1.8505976403643056, + "grad_norm": 0.1590881198644638, + "learning_rate": 0.00022551415897449336, + "loss": 2.1485, + "step": 478720 + }, + { + "epoch": 1.8506362975676889, + "grad_norm": 0.1411176323890686, + "learning_rate": 0.0002254014538493503, + "loss": 2.1484, + "step": 478730 + }, + { + "epoch": 1.8506749547710721, + "grad_norm": 0.15405212342739105, + "learning_rate": 0.00022528875588167873, + "loss": 2.1365, + "step": 478740 + }, + { + "epoch": 1.8507136119744554, + "grad_norm": 0.14678388833999634, + "learning_rate": 0.00022517606507011512, + "loss": 2.172, + "step": 478750 + }, + { + "epoch": 1.8507522691778386, + "grad_norm": 0.14999248087406158, + "learning_rate": 0.00022506338141329674, + "loss": 2.1521, + "step": 478760 + }, + { + "epoch": 1.850790926381222, + "grad_norm": 0.14902278780937195, + "learning_rate": 0.00022495070490986092, + "loss": 2.1409, + "step": 478770 + }, + { + "epoch": 1.8508295835846051, + "grad_norm": 0.14862895011901855, + "learning_rate": 0.00022483803555844518, + "loss": 2.1455, + "step": 478780 + }, + { + "epoch": 1.8508682407879884, + "grad_norm": 0.1663714349269867, + "learning_rate": 0.00022472537335768795, + "loss": 2.1408, + "step": 478790 + }, + { + "epoch": 1.8509068979913716, + "grad_norm": 0.16073864698410034, + "learning_rate": 0.0002246127183062283, + "loss": 2.1484, + "step": 478800 + }, + { + "epoch": 1.850945555194755, + "grad_norm": 0.1747768223285675, + "learning_rate": 0.00022450007040270491, + "loss": 2.1543, + "step": 478810 + }, + { + "epoch": 1.8509842123981382, + "grad_norm": 0.1444845199584961, + "learning_rate": 0.00022438742964575755, + "loss": 2.1471, + "step": 478820 + }, + { + "epoch": 1.8510228696015214, + "grad_norm": 0.14794333279132843, + "learning_rate": 0.0002242747960340259, + "loss": 2.1572, + "step": 478830 + }, + { + "epoch": 1.8510615268049049, + "grad_norm": 0.16190718114376068, + "learning_rate": 0.0002241621695661509, + "loss": 2.1573, + "step": 478840 + }, + { + "epoch": 1.8511001840082881, + "grad_norm": 0.17460119724273682, + "learning_rate": 0.00022404955024077312, + "loss": 2.1372, + "step": 478850 + }, + { + "epoch": 1.8511388412116714, + "grad_norm": 0.1538202315568924, + "learning_rate": 0.00022393693805653392, + "loss": 2.1665, + "step": 478860 + }, + { + "epoch": 1.8511774984150546, + "grad_norm": 0.14792491495609283, + "learning_rate": 0.00022382433301207505, + "loss": 2.1484, + "step": 478870 + }, + { + "epoch": 1.8512161556184379, + "grad_norm": 0.14176231622695923, + "learning_rate": 0.0002237117351060387, + "loss": 2.1504, + "step": 478880 + }, + { + "epoch": 1.8512548128218214, + "grad_norm": 0.166255384683609, + "learning_rate": 0.00022359914433706706, + "loss": 2.1418, + "step": 478890 + }, + { + "epoch": 1.8512934700252046, + "grad_norm": 0.15698324143886566, + "learning_rate": 0.00022348656070380368, + "loss": 2.131, + "step": 478900 + }, + { + "epoch": 1.8513321272285879, + "grad_norm": 0.15468591451644897, + "learning_rate": 0.00022337398420489187, + "loss": 2.1589, + "step": 478910 + }, + { + "epoch": 1.8513707844319711, + "grad_norm": 0.14415162801742554, + "learning_rate": 0.00022326141483897533, + "loss": 2.1583, + "step": 478920 + }, + { + "epoch": 1.8514094416353544, + "grad_norm": 0.1514313519001007, + "learning_rate": 0.0002231488526046983, + "loss": 2.1489, + "step": 478930 + }, + { + "epoch": 1.8514480988387376, + "grad_norm": 0.14063210785388947, + "learning_rate": 0.00022303629750070587, + "loss": 2.1363, + "step": 478940 + }, + { + "epoch": 1.8514867560421209, + "grad_norm": 0.1474284678697586, + "learning_rate": 0.00022292374952564287, + "loss": 2.1437, + "step": 478950 + }, + { + "epoch": 1.8515254132455041, + "grad_norm": 0.15619979798793793, + "learning_rate": 0.00022281120867815508, + "loss": 2.1506, + "step": 478960 + }, + { + "epoch": 1.8515640704488874, + "grad_norm": 0.14546486735343933, + "learning_rate": 0.00022269867495688844, + "loss": 2.1491, + "step": 478970 + }, + { + "epoch": 1.8516027276522706, + "grad_norm": 0.15706144273281097, + "learning_rate": 0.00022258614836048962, + "loss": 2.1321, + "step": 478980 + }, + { + "epoch": 1.8516413848556539, + "grad_norm": 0.1643100529909134, + "learning_rate": 0.0002224736288876048, + "loss": 2.1422, + "step": 478990 + }, + { + "epoch": 1.8516800420590371, + "grad_norm": 0.15362891554832458, + "learning_rate": 0.0002223611165368822, + "loss": 2.1395, + "step": 479000 + }, + { + "epoch": 1.8517186992624206, + "grad_norm": 0.13917003571987152, + "learning_rate": 0.00022224861130696905, + "loss": 2.17, + "step": 479010 + }, + { + "epoch": 1.8517573564658039, + "grad_norm": 0.15380679070949554, + "learning_rate": 0.0002221361131965136, + "loss": 2.148, + "step": 479020 + }, + { + "epoch": 1.8517960136691871, + "grad_norm": 0.15027354657649994, + "learning_rate": 0.0002220236222041643, + "loss": 2.1452, + "step": 479030 + }, + { + "epoch": 1.8518346708725704, + "grad_norm": 0.1432102769613266, + "learning_rate": 0.00022191113832857056, + "loss": 2.142, + "step": 479040 + }, + { + "epoch": 1.8518733280759538, + "grad_norm": 0.1524096429347992, + "learning_rate": 0.00022179866156838134, + "loss": 2.1489, + "step": 479050 + }, + { + "epoch": 1.851911985279337, + "grad_norm": 0.15654858946800232, + "learning_rate": 0.0002216861919222468, + "loss": 2.1513, + "step": 479060 + }, + { + "epoch": 1.8519506424827203, + "grad_norm": 0.14918921887874603, + "learning_rate": 0.00022157372938881714, + "loss": 2.1352, + "step": 479070 + }, + { + "epoch": 1.8519892996861036, + "grad_norm": 0.1569262593984604, + "learning_rate": 0.0002214612739667432, + "loss": 2.1396, + "step": 479080 + }, + { + "epoch": 1.8520279568894868, + "grad_norm": 0.15429821610450745, + "learning_rate": 0.00022134882565467607, + "loss": 2.1583, + "step": 479090 + }, + { + "epoch": 1.85206661409287, + "grad_norm": 0.15140283107757568, + "learning_rate": 0.00022123638445126727, + "loss": 2.1419, + "step": 479100 + }, + { + "epoch": 1.8521052712962534, + "grad_norm": 0.15845070779323578, + "learning_rate": 0.00022112395035516874, + "loss": 2.1437, + "step": 479110 + }, + { + "epoch": 1.8521439284996366, + "grad_norm": 0.15516333281993866, + "learning_rate": 0.00022101152336503316, + "loss": 2.1388, + "step": 479120 + }, + { + "epoch": 1.8521825857030199, + "grad_norm": 0.14900943636894226, + "learning_rate": 0.00022089910347951314, + "loss": 2.1492, + "step": 479130 + }, + { + "epoch": 1.852221242906403, + "grad_norm": 0.15713250637054443, + "learning_rate": 0.0002207866906972622, + "loss": 2.153, + "step": 479140 + }, + { + "epoch": 1.8522599001097864, + "grad_norm": 0.15991628170013428, + "learning_rate": 0.00022067428501693366, + "loss": 2.1466, + "step": 479150 + }, + { + "epoch": 1.8522985573131696, + "grad_norm": 0.14617495238780975, + "learning_rate": 0.00022056188643718213, + "loss": 2.1565, + "step": 479160 + }, + { + "epoch": 1.8523372145165529, + "grad_norm": 0.14892008900642395, + "learning_rate": 0.0002204494949566618, + "loss": 2.128, + "step": 479170 + }, + { + "epoch": 1.8523758717199363, + "grad_norm": 0.15686427056789398, + "learning_rate": 0.00022033711057402794, + "loss": 2.1391, + "step": 479180 + }, + { + "epoch": 1.8524145289233196, + "grad_norm": 0.1545068472623825, + "learning_rate": 0.00022022473328793546, + "loss": 2.1442, + "step": 479190 + }, + { + "epoch": 1.8524531861267028, + "grad_norm": 0.14783914387226105, + "learning_rate": 0.00022011236309704075, + "loss": 2.1486, + "step": 479200 + }, + { + "epoch": 1.852491843330086, + "grad_norm": 0.15659219026565552, + "learning_rate": 0.00021999999999999998, + "loss": 2.14, + "step": 479210 + }, + { + "epoch": 1.8525305005334696, + "grad_norm": 0.14546005427837372, + "learning_rate": 0.00021988764399546957, + "loss": 2.1278, + "step": 479220 + }, + { + "epoch": 1.8525691577368528, + "grad_norm": 0.15138238668441772, + "learning_rate": 0.0002197752950821068, + "loss": 2.1388, + "step": 479230 + }, + { + "epoch": 1.852607814940236, + "grad_norm": 0.14894860982894897, + "learning_rate": 0.000219662953258569, + "loss": 2.1373, + "step": 479240 + }, + { + "epoch": 1.8526464721436193, + "grad_norm": 0.1465761661529541, + "learning_rate": 0.00021955061852351455, + "loss": 2.1502, + "step": 479250 + }, + { + "epoch": 1.8526851293470026, + "grad_norm": 0.14742115139961243, + "learning_rate": 0.0002194382908756012, + "loss": 2.1348, + "step": 479260 + }, + { + "epoch": 1.8527237865503858, + "grad_norm": 0.1882600635290146, + "learning_rate": 0.00021932597031348844, + "loss": 2.1405, + "step": 479270 + }, + { + "epoch": 1.852762443753769, + "grad_norm": 0.1500731259584427, + "learning_rate": 0.00021921365683583494, + "loss": 2.1383, + "step": 479280 + }, + { + "epoch": 1.8528011009571523, + "grad_norm": 0.14603668451309204, + "learning_rate": 0.0002191013504413004, + "loss": 2.1354, + "step": 479290 + }, + { + "epoch": 1.8528397581605356, + "grad_norm": 0.14398865401744843, + "learning_rate": 0.00021898905112854506, + "loss": 2.1507, + "step": 479300 + }, + { + "epoch": 1.8528784153639188, + "grad_norm": 0.28502047061920166, + "learning_rate": 0.00021887675889622948, + "loss": 2.1256, + "step": 479310 + }, + { + "epoch": 1.852917072567302, + "grad_norm": 0.1565953642129898, + "learning_rate": 0.00021876447374301455, + "loss": 2.1344, + "step": 479320 + }, + { + "epoch": 1.8529557297706853, + "grad_norm": 0.15478111803531647, + "learning_rate": 0.00021865219566756134, + "loss": 2.1355, + "step": 479330 + }, + { + "epoch": 1.8529943869740686, + "grad_norm": 0.15464521944522858, + "learning_rate": 0.000218539924668532, + "loss": 2.1406, + "step": 479340 + }, + { + "epoch": 1.853033044177452, + "grad_norm": 0.15871332585811615, + "learning_rate": 0.00021842766074458832, + "loss": 2.1531, + "step": 479350 + }, + { + "epoch": 1.8530717013808353, + "grad_norm": 0.14946506917476654, + "learning_rate": 0.0002183154038943931, + "loss": 2.1345, + "step": 479360 + }, + { + "epoch": 1.8531103585842186, + "grad_norm": 0.1543097198009491, + "learning_rate": 0.0002182031541166092, + "loss": 2.1536, + "step": 479370 + }, + { + "epoch": 1.8531490157876018, + "grad_norm": 0.15099318325519562, + "learning_rate": 0.00021809091140990034, + "loss": 2.1269, + "step": 479380 + }, + { + "epoch": 1.8531876729909853, + "grad_norm": 0.15252339839935303, + "learning_rate": 0.00021797867577293007, + "loss": 2.1417, + "step": 479390 + }, + { + "epoch": 1.8532263301943686, + "grad_norm": 0.16234450042247772, + "learning_rate": 0.00021786644720436278, + "loss": 2.1313, + "step": 479400 + }, + { + "epoch": 1.8532649873977518, + "grad_norm": 0.1449269950389862, + "learning_rate": 0.0002177542257028633, + "loss": 2.1406, + "step": 479410 + }, + { + "epoch": 1.853303644601135, + "grad_norm": 0.15894246101379395, + "learning_rate": 0.0002176420112670967, + "loss": 2.1507, + "step": 479420 + }, + { + "epoch": 1.8533423018045183, + "grad_norm": 0.15004245936870575, + "learning_rate": 0.00021752980389572852, + "loss": 2.1223, + "step": 479430 + }, + { + "epoch": 1.8533809590079016, + "grad_norm": 0.17616984248161316, + "learning_rate": 0.0002174176035874247, + "loss": 2.1495, + "step": 479440 + }, + { + "epoch": 1.8534196162112848, + "grad_norm": 0.14092867076396942, + "learning_rate": 0.00021730541034085138, + "loss": 2.1402, + "step": 479450 + }, + { + "epoch": 1.853458273414668, + "grad_norm": 0.15265598893165588, + "learning_rate": 0.0002171932241546759, + "loss": 2.1377, + "step": 479460 + }, + { + "epoch": 1.8534969306180513, + "grad_norm": 0.1434614360332489, + "learning_rate": 0.00021708104502756486, + "loss": 2.1522, + "step": 479470 + }, + { + "epoch": 1.8535355878214346, + "grad_norm": 0.14725279808044434, + "learning_rate": 0.00021696887295818645, + "loss": 2.1534, + "step": 479480 + }, + { + "epoch": 1.8535742450248178, + "grad_norm": 0.15050674974918365, + "learning_rate": 0.0002168567079452084, + "loss": 2.1455, + "step": 479490 + }, + { + "epoch": 1.853612902228201, + "grad_norm": 0.14720620214939117, + "learning_rate": 0.00021674454998729908, + "loss": 2.1453, + "step": 479500 + }, + { + "epoch": 1.8536515594315843, + "grad_norm": 0.1538514643907547, + "learning_rate": 0.00021663239908312781, + "loss": 2.1589, + "step": 479510 + }, + { + "epoch": 1.8536902166349678, + "grad_norm": 0.15909264981746674, + "learning_rate": 0.00021652025523136365, + "loss": 2.1354, + "step": 479520 + }, + { + "epoch": 1.853728873838351, + "grad_norm": 0.15823258459568024, + "learning_rate": 0.0002164081184306763, + "loss": 2.148, + "step": 479530 + }, + { + "epoch": 1.8537675310417343, + "grad_norm": 0.15360794961452484, + "learning_rate": 0.0002162959886797362, + "loss": 2.1573, + "step": 479540 + }, + { + "epoch": 1.8538061882451176, + "grad_norm": 0.15805944800376892, + "learning_rate": 0.00021618386597721352, + "loss": 2.149, + "step": 479550 + }, + { + "epoch": 1.853844845448501, + "grad_norm": 0.164720356464386, + "learning_rate": 0.0002160717503217793, + "loss": 2.1375, + "step": 479560 + }, + { + "epoch": 1.8538835026518843, + "grad_norm": 0.15219347178936005, + "learning_rate": 0.00021595964171210524, + "loss": 2.1444, + "step": 479570 + }, + { + "epoch": 1.8539221598552675, + "grad_norm": 0.15331251919269562, + "learning_rate": 0.00021584754014686292, + "loss": 2.1411, + "step": 479580 + }, + { + "epoch": 1.8539608170586508, + "grad_norm": 0.15167184174060822, + "learning_rate": 0.00021573544562472448, + "loss": 2.1376, + "step": 479590 + }, + { + "epoch": 1.853999474262034, + "grad_norm": 0.16140606999397278, + "learning_rate": 0.00021562335814436295, + "loss": 2.122, + "step": 479600 + }, + { + "epoch": 1.8540381314654173, + "grad_norm": 0.15602107346057892, + "learning_rate": 0.00021551127770445122, + "loss": 2.1376, + "step": 479610 + }, + { + "epoch": 1.8540767886688005, + "grad_norm": 0.15653519332408905, + "learning_rate": 0.0002153992043036628, + "loss": 2.1518, + "step": 479620 + }, + { + "epoch": 1.8541154458721838, + "grad_norm": 0.15672744810581207, + "learning_rate": 0.00021528713794067135, + "loss": 2.1379, + "step": 479630 + }, + { + "epoch": 1.854154103075567, + "grad_norm": 0.15469038486480713, + "learning_rate": 0.00021517507861415152, + "loss": 2.146, + "step": 479640 + }, + { + "epoch": 1.8541927602789503, + "grad_norm": 0.15438590943813324, + "learning_rate": 0.00021506302632277795, + "loss": 2.1518, + "step": 479650 + }, + { + "epoch": 1.8542314174823336, + "grad_norm": 0.1710808128118515, + "learning_rate": 0.00021495098106522571, + "loss": 2.1442, + "step": 479660 + }, + { + "epoch": 1.8542700746857168, + "grad_norm": 0.14917579293251038, + "learning_rate": 0.00021483894284017046, + "loss": 2.1487, + "step": 479670 + }, + { + "epoch": 1.8543087318891, + "grad_norm": 0.9882127046585083, + "learning_rate": 0.0002147269116462882, + "loss": 2.1286, + "step": 479680 + }, + { + "epoch": 1.8543473890924835, + "grad_norm": 0.17200686037540436, + "learning_rate": 0.00021461488748225532, + "loss": 2.1462, + "step": 479690 + }, + { + "epoch": 1.8543860462958668, + "grad_norm": 0.15341956913471222, + "learning_rate": 0.00021450287034674843, + "loss": 2.1443, + "step": 479700 + }, + { + "epoch": 1.85442470349925, + "grad_norm": 0.15414680540561676, + "learning_rate": 0.000214390860238445, + "loss": 2.1424, + "step": 479710 + }, + { + "epoch": 1.8544633607026333, + "grad_norm": 0.15845142304897308, + "learning_rate": 0.0002142788571560228, + "loss": 2.1278, + "step": 479720 + }, + { + "epoch": 1.8545020179060168, + "grad_norm": 0.14842304587364197, + "learning_rate": 0.00021416686109815953, + "loss": 2.1378, + "step": 479730 + }, + { + "epoch": 1.8545406751094, + "grad_norm": 0.1431240290403366, + "learning_rate": 0.00021405487206353402, + "loss": 2.129, + "step": 479740 + }, + { + "epoch": 1.8545793323127833, + "grad_norm": 0.1469450742006302, + "learning_rate": 0.0002139428900508249, + "loss": 2.1358, + "step": 479750 + }, + { + "epoch": 1.8546179895161665, + "grad_norm": 0.1524389535188675, + "learning_rate": 0.00021383091505871145, + "loss": 2.1401, + "step": 479760 + }, + { + "epoch": 1.8546566467195498, + "grad_norm": 0.14780215919017792, + "learning_rate": 0.0002137189470858736, + "loss": 2.1581, + "step": 479770 + }, + { + "epoch": 1.854695303922933, + "grad_norm": 0.16302737593650818, + "learning_rate": 0.0002136069861309915, + "loss": 2.1486, + "step": 479780 + }, + { + "epoch": 1.8547339611263163, + "grad_norm": 0.15350155532360077, + "learning_rate": 0.00021349503219274536, + "loss": 2.151, + "step": 479790 + }, + { + "epoch": 1.8547726183296995, + "grad_norm": 0.14787808060646057, + "learning_rate": 0.00021338308526981642, + "loss": 2.1374, + "step": 479800 + }, + { + "epoch": 1.8548112755330828, + "grad_norm": 0.1476603001356125, + "learning_rate": 0.000213271145360886, + "loss": 2.1433, + "step": 479810 + }, + { + "epoch": 1.854849932736466, + "grad_norm": 0.1481381356716156, + "learning_rate": 0.00021315921246463598, + "loss": 2.1312, + "step": 479820 + }, + { + "epoch": 1.8548885899398493, + "grad_norm": 0.1528703272342682, + "learning_rate": 0.00021304728657974836, + "loss": 2.1444, + "step": 479830 + }, + { + "epoch": 1.8549272471432325, + "grad_norm": 0.14263570308685303, + "learning_rate": 0.00021293536770490595, + "loss": 2.1385, + "step": 479840 + }, + { + "epoch": 1.8549659043466158, + "grad_norm": 0.15099699795246124, + "learning_rate": 0.00021282345583879158, + "loss": 2.1437, + "step": 479850 + }, + { + "epoch": 1.8550045615499993, + "grad_norm": 0.1449032425880432, + "learning_rate": 0.00021271155098008876, + "loss": 2.1295, + "step": 479860 + }, + { + "epoch": 1.8550432187533825, + "grad_norm": 0.15079474449157715, + "learning_rate": 0.0002125996531274814, + "loss": 2.1551, + "step": 479870 + }, + { + "epoch": 1.8550818759567658, + "grad_norm": 0.16255615651607513, + "learning_rate": 0.0002124877622796537, + "loss": 2.1425, + "step": 479880 + }, + { + "epoch": 1.855120533160149, + "grad_norm": 0.16530774533748627, + "learning_rate": 0.00021237587843529027, + "loss": 2.142, + "step": 479890 + }, + { + "epoch": 1.8551591903635325, + "grad_norm": 0.15740294754505157, + "learning_rate": 0.00021226400159307635, + "loss": 2.1331, + "step": 479900 + }, + { + "epoch": 1.8551978475669157, + "grad_norm": 0.1708087921142578, + "learning_rate": 0.00021215213175169745, + "loss": 2.1379, + "step": 479910 + }, + { + "epoch": 1.855236504770299, + "grad_norm": 0.16706566512584686, + "learning_rate": 0.00021204026890983928, + "loss": 2.1447, + "step": 479920 + }, + { + "epoch": 1.8552751619736823, + "grad_norm": 0.14972372353076935, + "learning_rate": 0.00021192841306618828, + "loss": 2.1391, + "step": 479930 + }, + { + "epoch": 1.8553138191770655, + "grad_norm": 0.14616578817367554, + "learning_rate": 0.0002118165642194312, + "loss": 2.1469, + "step": 479940 + }, + { + "epoch": 1.8553524763804488, + "grad_norm": 0.15004681050777435, + "learning_rate": 0.0002117047223682549, + "loss": 2.1505, + "step": 479950 + }, + { + "epoch": 1.855391133583832, + "grad_norm": 0.15200355648994446, + "learning_rate": 0.00021159288751134754, + "loss": 2.1349, + "step": 479960 + }, + { + "epoch": 1.8554297907872153, + "grad_norm": 0.15411987900733948, + "learning_rate": 0.0002114810596473964, + "loss": 2.1361, + "step": 479970 + }, + { + "epoch": 1.8554684479905985, + "grad_norm": 0.1675427407026291, + "learning_rate": 0.0002113692387750903, + "loss": 2.1356, + "step": 479980 + }, + { + "epoch": 1.8555071051939818, + "grad_norm": 0.15278521180152893, + "learning_rate": 0.00021125742489311783, + "loss": 2.1384, + "step": 479990 + }, + { + "epoch": 1.855545762397365, + "grad_norm": 0.14866431057453156, + "learning_rate": 0.0002111456180001683, + "loss": 2.1452, + "step": 480000 + }, + { + "epoch": 1.8555844196007483, + "grad_norm": 0.15397635102272034, + "learning_rate": 0.00021103381809493116, + "loss": 2.1495, + "step": 480010 + }, + { + "epoch": 1.8556230768041315, + "grad_norm": 0.15433016419410706, + "learning_rate": 0.00021092202517609637, + "loss": 2.1346, + "step": 480020 + }, + { + "epoch": 1.855661734007515, + "grad_norm": 0.1563122272491455, + "learning_rate": 0.00021081023924235476, + "loss": 2.1481, + "step": 480030 + }, + { + "epoch": 1.8557003912108982, + "grad_norm": 0.1498311460018158, + "learning_rate": 0.0002106984602923967, + "loss": 2.1397, + "step": 480040 + }, + { + "epoch": 1.8557390484142815, + "grad_norm": 0.15273213386535645, + "learning_rate": 0.0002105866883249137, + "loss": 2.1441, + "step": 480050 + }, + { + "epoch": 1.8557777056176648, + "grad_norm": 0.1529931277036667, + "learning_rate": 0.0002104749233385972, + "loss": 2.1401, + "step": 480060 + }, + { + "epoch": 1.8558163628210482, + "grad_norm": 0.155166894197464, + "learning_rate": 0.00021036316533213918, + "loss": 2.1409, + "step": 480070 + }, + { + "epoch": 1.8558550200244315, + "grad_norm": 0.14800570905208588, + "learning_rate": 0.00021025141430423266, + "loss": 2.1492, + "step": 480080 + }, + { + "epoch": 1.8558936772278147, + "grad_norm": 0.14278216660022736, + "learning_rate": 0.00021013967025356982, + "loss": 2.1379, + "step": 480090 + }, + { + "epoch": 1.855932334431198, + "grad_norm": 0.1603323519229889, + "learning_rate": 0.00021002793317884418, + "loss": 2.1262, + "step": 480100 + }, + { + "epoch": 1.8559709916345812, + "grad_norm": 0.16060198843479156, + "learning_rate": 0.00020991620307874958, + "loss": 2.1357, + "step": 480110 + }, + { + "epoch": 1.8560096488379645, + "grad_norm": 0.14317700266838074, + "learning_rate": 0.00020980447995198027, + "loss": 2.1536, + "step": 480120 + }, + { + "epoch": 1.8560483060413477, + "grad_norm": 0.22249913215637207, + "learning_rate": 0.00020969276379723034, + "loss": 2.1612, + "step": 480130 + }, + { + "epoch": 1.856086963244731, + "grad_norm": 0.16858729720115662, + "learning_rate": 0.00020958105461319466, + "loss": 2.1437, + "step": 480140 + }, + { + "epoch": 1.8561256204481142, + "grad_norm": 0.14766691625118256, + "learning_rate": 0.0002094693523985689, + "loss": 2.145, + "step": 480150 + }, + { + "epoch": 1.8561642776514975, + "grad_norm": 0.15140976011753082, + "learning_rate": 0.0002093576571520488, + "loss": 2.1551, + "step": 480160 + }, + { + "epoch": 1.8562029348548807, + "grad_norm": 0.1595422774553299, + "learning_rate": 0.00020924596887233004, + "loss": 2.139, + "step": 480170 + }, + { + "epoch": 1.856241592058264, + "grad_norm": 0.14984270930290222, + "learning_rate": 0.0002091342875581095, + "loss": 2.1372, + "step": 480180 + }, + { + "epoch": 1.8562802492616473, + "grad_norm": 0.1510619968175888, + "learning_rate": 0.00020902261320808414, + "loss": 2.1542, + "step": 480190 + }, + { + "epoch": 1.8563189064650307, + "grad_norm": 0.1541844755411148, + "learning_rate": 0.00020891094582095105, + "loss": 2.1303, + "step": 480200 + }, + { + "epoch": 1.856357563668414, + "grad_norm": 0.16086439788341522, + "learning_rate": 0.00020879928539540814, + "loss": 2.1366, + "step": 480210 + }, + { + "epoch": 1.8563962208717972, + "grad_norm": 0.14684398472309113, + "learning_rate": 0.00020868763193015384, + "loss": 2.1436, + "step": 480220 + }, + { + "epoch": 1.8564348780751805, + "grad_norm": 0.15152642130851746, + "learning_rate": 0.00020857598542388622, + "loss": 2.1465, + "step": 480230 + }, + { + "epoch": 1.856473535278564, + "grad_norm": 0.156929150223732, + "learning_rate": 0.0002084643458753046, + "loss": 2.1514, + "step": 480240 + }, + { + "epoch": 1.8565121924819472, + "grad_norm": 0.1541062295436859, + "learning_rate": 0.00020835271328310822, + "loss": 2.1311, + "step": 480250 + }, + { + "epoch": 1.8565508496853305, + "grad_norm": 0.17336539924144745, + "learning_rate": 0.00020824108764599703, + "loss": 2.1542, + "step": 480260 + }, + { + "epoch": 1.8565895068887137, + "grad_norm": 0.16037580370903015, + "learning_rate": 0.00020812946896267093, + "loss": 2.158, + "step": 480270 + }, + { + "epoch": 1.856628164092097, + "grad_norm": 0.15069545805454254, + "learning_rate": 0.00020801785723183053, + "loss": 2.1451, + "step": 480280 + }, + { + "epoch": 1.8566668212954802, + "grad_norm": 0.147901713848114, + "learning_rate": 0.00020790625245217708, + "loss": 2.1389, + "step": 480290 + }, + { + "epoch": 1.8567054784988635, + "grad_norm": 0.14777696132659912, + "learning_rate": 0.00020779465462241188, + "loss": 2.1386, + "step": 480300 + }, + { + "epoch": 1.8567441357022467, + "grad_norm": 0.1483861654996872, + "learning_rate": 0.00020768306374123658, + "loss": 2.127, + "step": 480310 + }, + { + "epoch": 1.85678279290563, + "grad_norm": 0.1482858806848526, + "learning_rate": 0.00020757147980735358, + "loss": 2.1259, + "step": 480320 + }, + { + "epoch": 1.8568214501090132, + "grad_norm": 0.14977093040943146, + "learning_rate": 0.00020745990281946546, + "loss": 2.1416, + "step": 480330 + }, + { + "epoch": 1.8568601073123965, + "grad_norm": 0.1569179892539978, + "learning_rate": 0.0002073483327762753, + "loss": 2.1518, + "step": 480340 + }, + { + "epoch": 1.8568987645157797, + "grad_norm": 0.15524955093860626, + "learning_rate": 0.0002072367696764863, + "loss": 2.1476, + "step": 480350 + }, + { + "epoch": 1.856937421719163, + "grad_norm": 0.14991699159145355, + "learning_rate": 0.00020712521351880242, + "loss": 2.1576, + "step": 480360 + }, + { + "epoch": 1.8569760789225465, + "grad_norm": 0.13605119287967682, + "learning_rate": 0.00020701366430192782, + "loss": 2.1511, + "step": 480370 + }, + { + "epoch": 1.8570147361259297, + "grad_norm": 0.14542028307914734, + "learning_rate": 0.00020690212202456725, + "loss": 2.1367, + "step": 480380 + }, + { + "epoch": 1.857053393329313, + "grad_norm": 0.15281639993190765, + "learning_rate": 0.0002067905866854256, + "loss": 2.1535, + "step": 480390 + }, + { + "epoch": 1.8570920505326962, + "grad_norm": 0.15086878836154938, + "learning_rate": 0.00020667905828320832, + "loss": 2.1308, + "step": 480400 + }, + { + "epoch": 1.8571307077360797, + "grad_norm": 0.15849731862545013, + "learning_rate": 0.00020656753681662154, + "loss": 2.1441, + "step": 480410 + }, + { + "epoch": 1.857169364939463, + "grad_norm": 0.17104901373386383, + "learning_rate": 0.00020645602228437122, + "loss": 2.1438, + "step": 480420 + }, + { + "epoch": 1.8572080221428462, + "grad_norm": 0.14817239344120026, + "learning_rate": 0.00020634451468516392, + "loss": 2.1279, + "step": 480430 + }, + { + "epoch": 1.8572466793462294, + "grad_norm": 0.1596451699733734, + "learning_rate": 0.0002062330140177069, + "loss": 2.1289, + "step": 480440 + }, + { + "epoch": 1.8572853365496127, + "grad_norm": 0.16045700013637543, + "learning_rate": 0.00020612152028070762, + "loss": 2.1445, + "step": 480450 + }, + { + "epoch": 1.857323993752996, + "grad_norm": 0.1484665721654892, + "learning_rate": 0.0002060100334728736, + "loss": 2.1343, + "step": 480460 + }, + { + "epoch": 1.8573626509563792, + "grad_norm": 0.15261681377887726, + "learning_rate": 0.00020589855359291344, + "loss": 2.1466, + "step": 480470 + }, + { + "epoch": 1.8574013081597625, + "grad_norm": 0.14725467562675476, + "learning_rate": 0.00020578708063953566, + "loss": 2.1402, + "step": 480480 + }, + { + "epoch": 1.8574399653631457, + "grad_norm": 0.14944353699684143, + "learning_rate": 0.00020567561461144935, + "loss": 2.145, + "step": 480490 + }, + { + "epoch": 1.857478622566529, + "grad_norm": 0.15037965774536133, + "learning_rate": 0.00020556415550736397, + "loss": 2.1352, + "step": 480500 + }, + { + "epoch": 1.8575172797699122, + "grad_norm": 0.1716088354587555, + "learning_rate": 0.0002054527033259892, + "loss": 2.1406, + "step": 480510 + }, + { + "epoch": 1.8575559369732955, + "grad_norm": 0.1456490159034729, + "learning_rate": 0.00020534125806603542, + "loss": 2.1293, + "step": 480520 + }, + { + "epoch": 1.857594594176679, + "grad_norm": 0.1489306390285492, + "learning_rate": 0.00020522981972621347, + "loss": 2.1434, + "step": 480530 + }, + { + "epoch": 1.8576332513800622, + "grad_norm": 0.1532936990261078, + "learning_rate": 0.00020511838830523411, + "loss": 2.1411, + "step": 480540 + }, + { + "epoch": 1.8576719085834454, + "grad_norm": 0.14928655326366425, + "learning_rate": 0.00020500696380180904, + "loss": 2.1499, + "step": 480550 + }, + { + "epoch": 1.8577105657868287, + "grad_norm": 0.1661611646413803, + "learning_rate": 0.00020489554621465, + "loss": 2.1282, + "step": 480560 + }, + { + "epoch": 1.857749222990212, + "grad_norm": 0.1443454474210739, + "learning_rate": 0.0002047841355424691, + "loss": 2.1462, + "step": 480570 + }, + { + "epoch": 1.8577878801935954, + "grad_norm": 0.15595643222332, + "learning_rate": 0.00020467273178397915, + "loss": 2.1332, + "step": 480580 + }, + { + "epoch": 1.8578265373969787, + "grad_norm": 0.15329024195671082, + "learning_rate": 0.0002045613349378932, + "loss": 2.1271, + "step": 480590 + }, + { + "epoch": 1.857865194600362, + "grad_norm": 0.1506737768650055, + "learning_rate": 0.00020444994500292467, + "loss": 2.1498, + "step": 480600 + }, + { + "epoch": 1.8579038518037452, + "grad_norm": 0.15003903210163116, + "learning_rate": 0.00020433856197778756, + "loss": 2.1247, + "step": 480610 + }, + { + "epoch": 1.8579425090071284, + "grad_norm": 0.15266703069210052, + "learning_rate": 0.00020422718586119592, + "loss": 2.1401, + "step": 480620 + }, + { + "epoch": 1.8579811662105117, + "grad_norm": 0.1718612164258957, + "learning_rate": 0.0002041158166518644, + "loss": 2.1377, + "step": 480630 + }, + { + "epoch": 1.858019823413895, + "grad_norm": 0.15592265129089355, + "learning_rate": 0.00020400445434850823, + "loss": 2.127, + "step": 480640 + }, + { + "epoch": 1.8580584806172782, + "grad_norm": 0.1493699699640274, + "learning_rate": 0.00020389309894984264, + "loss": 2.1341, + "step": 480650 + }, + { + "epoch": 1.8580971378206614, + "grad_norm": 0.17006298899650574, + "learning_rate": 0.0002037817504545836, + "loss": 2.1658, + "step": 480660 + }, + { + "epoch": 1.8581357950240447, + "grad_norm": 0.1559170037508011, + "learning_rate": 0.0002036704088614474, + "loss": 2.14, + "step": 480670 + }, + { + "epoch": 1.858174452227428, + "grad_norm": 0.2897215485572815, + "learning_rate": 0.00020355907416915042, + "loss": 2.1544, + "step": 480680 + }, + { + "epoch": 1.8582131094308112, + "grad_norm": 0.15754903852939606, + "learning_rate": 0.00020344774637641016, + "loss": 2.1396, + "step": 480690 + }, + { + "epoch": 1.8582517666341947, + "grad_norm": 0.16809256374835968, + "learning_rate": 0.0002033364254819434, + "loss": 2.1651, + "step": 480700 + }, + { + "epoch": 1.858290423837578, + "grad_norm": 0.16307581961154938, + "learning_rate": 0.00020322511148446854, + "loss": 2.1433, + "step": 480710 + }, + { + "epoch": 1.8583290810409612, + "grad_norm": 0.16375359892845154, + "learning_rate": 0.00020311380438270388, + "loss": 2.143, + "step": 480720 + }, + { + "epoch": 1.8583677382443444, + "grad_norm": 0.14777526259422302, + "learning_rate": 0.00020300250417536758, + "loss": 2.1462, + "step": 480730 + }, + { + "epoch": 1.8584063954477277, + "grad_norm": 0.15680406987667084, + "learning_rate": 0.0002028912108611789, + "loss": 2.1271, + "step": 480740 + }, + { + "epoch": 1.8584450526511112, + "grad_norm": 0.1567578762769699, + "learning_rate": 0.00020277992443885708, + "loss": 2.1343, + "step": 480750 + }, + { + "epoch": 1.8584837098544944, + "grad_norm": 0.15893380343914032, + "learning_rate": 0.00020266864490712223, + "loss": 2.1406, + "step": 480760 + }, + { + "epoch": 1.8585223670578777, + "grad_norm": 0.14979642629623413, + "learning_rate": 0.0002025573722646945, + "loss": 2.1323, + "step": 480770 + }, + { + "epoch": 1.858561024261261, + "grad_norm": 0.15911221504211426, + "learning_rate": 0.0002024461065102945, + "loss": 2.1474, + "step": 480780 + }, + { + "epoch": 1.8585996814646442, + "grad_norm": 0.1472899615764618, + "learning_rate": 0.00020233484764264298, + "loss": 2.1277, + "step": 480790 + }, + { + "epoch": 1.8586383386680274, + "grad_norm": 0.5327991843223572, + "learning_rate": 0.00020222359566046146, + "loss": 2.1463, + "step": 480800 + }, + { + "epoch": 1.8586769958714107, + "grad_norm": 0.15671391785144806, + "learning_rate": 0.0002021123505624718, + "loss": 2.1622, + "step": 480810 + }, + { + "epoch": 1.858715653074794, + "grad_norm": 0.15824978053569794, + "learning_rate": 0.00020200111234739638, + "loss": 2.1395, + "step": 480820 + }, + { + "epoch": 1.8587543102781772, + "grad_norm": 0.15751320123672485, + "learning_rate": 0.00020188988101395755, + "loss": 2.1479, + "step": 480830 + }, + { + "epoch": 1.8587929674815604, + "grad_norm": 0.14931145310401917, + "learning_rate": 0.00020177865656087813, + "loss": 2.1417, + "step": 480840 + }, + { + "epoch": 1.8588316246849437, + "grad_norm": 0.15421828627586365, + "learning_rate": 0.000201667438986882, + "loss": 2.1507, + "step": 480850 + }, + { + "epoch": 1.858870281888327, + "grad_norm": 0.14969541132450104, + "learning_rate": 0.00020155622829069243, + "loss": 2.1333, + "step": 480860 + }, + { + "epoch": 1.8589089390917104, + "grad_norm": 0.1477525681257248, + "learning_rate": 0.00020144502447103397, + "loss": 2.1481, + "step": 480870 + }, + { + "epoch": 1.8589475962950937, + "grad_norm": 0.17129698395729065, + "learning_rate": 0.00020133382752663076, + "loss": 2.1463, + "step": 480880 + }, + { + "epoch": 1.858986253498477, + "grad_norm": 0.15016396343708038, + "learning_rate": 0.00020122263745620827, + "loss": 2.1285, + "step": 480890 + }, + { + "epoch": 1.8590249107018602, + "grad_norm": 0.1594838947057724, + "learning_rate": 0.0002011114542584913, + "loss": 2.1379, + "step": 480900 + }, + { + "epoch": 1.8590635679052434, + "grad_norm": 0.16034048795700073, + "learning_rate": 0.00020100027793220598, + "loss": 2.1364, + "step": 480910 + }, + { + "epoch": 1.8591022251086269, + "grad_norm": 0.15773534774780273, + "learning_rate": 0.00020088910847607843, + "loss": 2.125, + "step": 480920 + }, + { + "epoch": 1.8591408823120101, + "grad_norm": 0.15313485264778137, + "learning_rate": 0.00020077794588883502, + "loss": 2.133, + "step": 480930 + }, + { + "epoch": 1.8591795395153934, + "grad_norm": 0.1616487205028534, + "learning_rate": 0.00020066679016920275, + "loss": 2.1419, + "step": 480940 + }, + { + "epoch": 1.8592181967187766, + "grad_norm": 0.22435157001018524, + "learning_rate": 0.00020055564131590887, + "loss": 2.1246, + "step": 480950 + }, + { + "epoch": 1.85925685392216, + "grad_norm": 0.15276563167572021, + "learning_rate": 0.00020044449932768126, + "loss": 2.138, + "step": 480960 + }, + { + "epoch": 1.8592955111255431, + "grad_norm": 0.14432063698768616, + "learning_rate": 0.00020033336420324788, + "loss": 2.14, + "step": 480970 + }, + { + "epoch": 1.8593341683289264, + "grad_norm": 0.14203238487243652, + "learning_rate": 0.00020022223594133725, + "loss": 2.1249, + "step": 480980 + }, + { + "epoch": 1.8593728255323096, + "grad_norm": 0.15544167160987854, + "learning_rate": 0.0002001111145406782, + "loss": 2.1445, + "step": 480990 + }, + { + "epoch": 1.859411482735693, + "grad_norm": 0.16453050076961517, + "learning_rate": 0.00019999999999999996, + "loss": 2.1366, + "step": 481000 + }, + { + "epoch": 1.8594501399390762, + "grad_norm": 0.15054762363433838, + "learning_rate": 0.00019988889231803243, + "loss": 2.1366, + "step": 481010 + }, + { + "epoch": 1.8594887971424594, + "grad_norm": 0.15232422947883606, + "learning_rate": 0.00019977779149350571, + "loss": 2.1546, + "step": 481020 + }, + { + "epoch": 1.8595274543458427, + "grad_norm": 0.15228308737277985, + "learning_rate": 0.00019966669752514998, + "loss": 2.1237, + "step": 481030 + }, + { + "epoch": 1.8595661115492261, + "grad_norm": 0.16331073641777039, + "learning_rate": 0.00019955561041169623, + "loss": 2.1459, + "step": 481040 + }, + { + "epoch": 1.8596047687526094, + "grad_norm": 0.16497546434402466, + "learning_rate": 0.0001994445301518757, + "loss": 2.1263, + "step": 481050 + }, + { + "epoch": 1.8596434259559926, + "grad_norm": 0.1411234438419342, + "learning_rate": 0.00019933345674441982, + "loss": 2.1437, + "step": 481060 + }, + { + "epoch": 1.8596820831593759, + "grad_norm": 0.15147987008094788, + "learning_rate": 0.000199222390188061, + "loss": 2.1351, + "step": 481070 + }, + { + "epoch": 1.8597207403627594, + "grad_norm": 0.16037048399448395, + "learning_rate": 0.00019911133048153152, + "loss": 2.1301, + "step": 481080 + }, + { + "epoch": 1.8597593975661426, + "grad_norm": 0.15522290766239166, + "learning_rate": 0.000199000277623564, + "loss": 2.1439, + "step": 481090 + }, + { + "epoch": 1.8597980547695259, + "grad_norm": 0.14487279951572418, + "learning_rate": 0.00019888923161289163, + "loss": 2.1494, + "step": 481100 + }, + { + "epoch": 1.8598367119729091, + "grad_norm": 0.14845119416713715, + "learning_rate": 0.00019877819244824813, + "loss": 2.1363, + "step": 481110 + }, + { + "epoch": 1.8598753691762924, + "grad_norm": 0.16259387135505676, + "learning_rate": 0.0001986671601283676, + "loss": 2.1332, + "step": 481120 + }, + { + "epoch": 1.8599140263796756, + "grad_norm": 0.16065792739391327, + "learning_rate": 0.00019855613465198418, + "loss": 2.1547, + "step": 481130 + }, + { + "epoch": 1.8599526835830589, + "grad_norm": 0.1511327475309372, + "learning_rate": 0.00019844511601783266, + "loss": 2.1412, + "step": 481140 + }, + { + "epoch": 1.8599913407864421, + "grad_norm": 0.14757372438907623, + "learning_rate": 0.00019833410422464849, + "loss": 2.1191, + "step": 481150 + }, + { + "epoch": 1.8600299979898254, + "grad_norm": 0.14953935146331787, + "learning_rate": 0.00019822309927116667, + "loss": 2.1367, + "step": 481160 + }, + { + "epoch": 1.8600686551932086, + "grad_norm": 0.14788267016410828, + "learning_rate": 0.00019811210115612356, + "loss": 2.146, + "step": 481170 + }, + { + "epoch": 1.8601073123965919, + "grad_norm": 0.14757679402828217, + "learning_rate": 0.00019800110987825526, + "loss": 2.1438, + "step": 481180 + }, + { + "epoch": 1.8601459695999751, + "grad_norm": 0.1558070331811905, + "learning_rate": 0.00019789012543629857, + "loss": 2.1433, + "step": 481190 + }, + { + "epoch": 1.8601846268033584, + "grad_norm": 0.16387951374053955, + "learning_rate": 0.00019777914782899032, + "loss": 2.1311, + "step": 481200 + }, + { + "epoch": 1.8602232840067419, + "grad_norm": 0.26343652606010437, + "learning_rate": 0.00019766817705506835, + "loss": 2.147, + "step": 481210 + }, + { + "epoch": 1.8602619412101251, + "grad_norm": 0.17362500727176666, + "learning_rate": 0.00019755721311327058, + "loss": 2.1419, + "step": 481220 + }, + { + "epoch": 1.8603005984135084, + "grad_norm": 0.16361446678638458, + "learning_rate": 0.0001974462560023349, + "loss": 2.1311, + "step": 481230 + }, + { + "epoch": 1.8603392556168916, + "grad_norm": 0.14905600249767303, + "learning_rate": 0.00019733530572100012, + "loss": 2.1439, + "step": 481240 + }, + { + "epoch": 1.860377912820275, + "grad_norm": 0.15537752211093903, + "learning_rate": 0.00019722436226800544, + "loss": 2.1553, + "step": 481250 + }, + { + "epoch": 1.8604165700236583, + "grad_norm": 0.16758646070957184, + "learning_rate": 0.0001971134256420899, + "loss": 2.135, + "step": 481260 + }, + { + "epoch": 1.8604552272270416, + "grad_norm": 0.15464277565479279, + "learning_rate": 0.00019700249584199382, + "loss": 2.1376, + "step": 481270 + }, + { + "epoch": 1.8604938844304248, + "grad_norm": 0.1499345600605011, + "learning_rate": 0.00019689157286645686, + "loss": 2.1457, + "step": 481280 + }, + { + "epoch": 1.860532541633808, + "grad_norm": 0.15644638240337372, + "learning_rate": 0.00019678065671422008, + "loss": 2.1401, + "step": 481290 + }, + { + "epoch": 1.8605711988371914, + "grad_norm": 0.15277767181396484, + "learning_rate": 0.00019666974738402398, + "loss": 2.142, + "step": 481300 + }, + { + "epoch": 1.8606098560405746, + "grad_norm": 0.14947441220283508, + "learning_rate": 0.0001965588448746103, + "loss": 2.1266, + "step": 481310 + }, + { + "epoch": 1.8606485132439579, + "grad_norm": 0.14855043590068817, + "learning_rate": 0.00019644794918472063, + "loss": 2.1375, + "step": 481320 + }, + { + "epoch": 1.860687170447341, + "grad_norm": 0.1475961059331894, + "learning_rate": 0.00019633706031309717, + "loss": 2.14, + "step": 481330 + }, + { + "epoch": 1.8607258276507244, + "grad_norm": 0.15531527996063232, + "learning_rate": 0.00019622617825848244, + "loss": 2.14, + "step": 481340 + }, + { + "epoch": 1.8607644848541076, + "grad_norm": 0.15467391908168793, + "learning_rate": 0.0001961153030196192, + "loss": 2.1358, + "step": 481350 + }, + { + "epoch": 1.8608031420574909, + "grad_norm": 0.15794637799263, + "learning_rate": 0.00019600443459525096, + "loss": 2.1215, + "step": 481360 + }, + { + "epoch": 1.8608417992608741, + "grad_norm": 0.15984663367271423, + "learning_rate": 0.00019589357298412135, + "loss": 2.1246, + "step": 481370 + }, + { + "epoch": 1.8608804564642576, + "grad_norm": 0.15682539343833923, + "learning_rate": 0.00019578271818497428, + "loss": 2.1351, + "step": 481380 + }, + { + "epoch": 1.8609191136676408, + "grad_norm": 0.16392043232917786, + "learning_rate": 0.0001956718701965543, + "loss": 2.1516, + "step": 481390 + }, + { + "epoch": 1.860957770871024, + "grad_norm": 0.16104425489902496, + "learning_rate": 0.00019556102901760643, + "loss": 2.1396, + "step": 481400 + }, + { + "epoch": 1.8609964280744073, + "grad_norm": 0.17536848783493042, + "learning_rate": 0.00019545019464687542, + "loss": 2.1361, + "step": 481410 + }, + { + "epoch": 1.8610350852777908, + "grad_norm": 0.16459685564041138, + "learning_rate": 0.00019533936708310718, + "loss": 2.1469, + "step": 481420 + }, + { + "epoch": 1.861073742481174, + "grad_norm": 0.15408708155155182, + "learning_rate": 0.00019522854632504782, + "loss": 2.1438, + "step": 481430 + }, + { + "epoch": 1.8611123996845573, + "grad_norm": 0.16289669275283813, + "learning_rate": 0.0001951177323714435, + "loss": 2.1419, + "step": 481440 + }, + { + "epoch": 1.8611510568879406, + "grad_norm": 0.15478117763996124, + "learning_rate": 0.00019500692522104112, + "loss": 2.132, + "step": 481450 + }, + { + "epoch": 1.8611897140913238, + "grad_norm": 0.14389732480049133, + "learning_rate": 0.0001948961248725878, + "loss": 2.1376, + "step": 481460 + }, + { + "epoch": 1.861228371294707, + "grad_norm": 0.17540700733661652, + "learning_rate": 0.00019478533132483112, + "loss": 2.1362, + "step": 481470 + }, + { + "epoch": 1.8612670284980903, + "grad_norm": 0.14640533924102783, + "learning_rate": 0.0001946745445765188, + "loss": 2.1416, + "step": 481480 + }, + { + "epoch": 1.8613056857014736, + "grad_norm": 0.15137052536010742, + "learning_rate": 0.00019456376462639936, + "loss": 2.141, + "step": 481490 + }, + { + "epoch": 1.8613443429048568, + "grad_norm": 0.14634105563163757, + "learning_rate": 0.00019445299147322117, + "loss": 2.1271, + "step": 481500 + }, + { + "epoch": 1.86138300010824, + "grad_norm": 0.17296701669692993, + "learning_rate": 0.00019434222511573363, + "loss": 2.1336, + "step": 481510 + }, + { + "epoch": 1.8614216573116233, + "grad_norm": 0.1500747799873352, + "learning_rate": 0.000194231465552686, + "loss": 2.1351, + "step": 481520 + }, + { + "epoch": 1.8614603145150066, + "grad_norm": 0.15063370764255524, + "learning_rate": 0.00019412071278282838, + "loss": 2.1338, + "step": 481530 + }, + { + "epoch": 1.8614989717183898, + "grad_norm": 0.15135610103607178, + "learning_rate": 0.00019400996680491068, + "loss": 2.1433, + "step": 481540 + }, + { + "epoch": 1.8615376289217733, + "grad_norm": 0.15572667121887207, + "learning_rate": 0.00019389922761768363, + "loss": 2.1276, + "step": 481550 + }, + { + "epoch": 1.8615762861251566, + "grad_norm": 0.14076967537403107, + "learning_rate": 0.00019378849521989804, + "loss": 2.1277, + "step": 481560 + }, + { + "epoch": 1.8616149433285398, + "grad_norm": 0.15520797669887543, + "learning_rate": 0.0001936777696103056, + "loss": 2.1461, + "step": 481570 + }, + { + "epoch": 1.861653600531923, + "grad_norm": 0.14745649695396423, + "learning_rate": 0.00019356705078765792, + "loss": 2.1442, + "step": 481580 + }, + { + "epoch": 1.8616922577353066, + "grad_norm": 0.14961227774620056, + "learning_rate": 0.00019345633875070712, + "loss": 2.1143, + "step": 481590 + }, + { + "epoch": 1.8617309149386898, + "grad_norm": 0.20407278835773468, + "learning_rate": 0.00019334563349820578, + "loss": 2.1338, + "step": 481600 + }, + { + "epoch": 1.861769572142073, + "grad_norm": 0.17995503544807434, + "learning_rate": 0.00019323493502890643, + "loss": 2.1313, + "step": 481610 + }, + { + "epoch": 1.8618082293454563, + "grad_norm": 0.161366268992424, + "learning_rate": 0.00019312424334156275, + "loss": 2.133, + "step": 481620 + }, + { + "epoch": 1.8618468865488396, + "grad_norm": 0.151927188038826, + "learning_rate": 0.00019301355843492842, + "loss": 2.131, + "step": 481630 + }, + { + "epoch": 1.8618855437522228, + "grad_norm": 0.1566513329744339, + "learning_rate": 0.00019290288030775728, + "loss": 2.1357, + "step": 481640 + }, + { + "epoch": 1.861924200955606, + "grad_norm": 0.15415827929973602, + "learning_rate": 0.00019279220895880366, + "loss": 2.1348, + "step": 481650 + }, + { + "epoch": 1.8619628581589893, + "grad_norm": 0.1509864628314972, + "learning_rate": 0.00019268154438682283, + "loss": 2.1459, + "step": 481660 + }, + { + "epoch": 1.8620015153623726, + "grad_norm": 0.16013085842132568, + "learning_rate": 0.00019257088659056955, + "loss": 2.1362, + "step": 481670 + }, + { + "epoch": 1.8620401725657558, + "grad_norm": 0.16000637412071228, + "learning_rate": 0.00019246023556879942, + "loss": 2.1395, + "step": 481680 + }, + { + "epoch": 1.862078829769139, + "grad_norm": 0.1579359471797943, + "learning_rate": 0.00019234959132026863, + "loss": 2.152, + "step": 481690 + }, + { + "epoch": 1.8621174869725223, + "grad_norm": 0.17409813404083252, + "learning_rate": 0.00019223895384373323, + "loss": 2.1409, + "step": 481700 + }, + { + "epoch": 1.8621561441759056, + "grad_norm": 0.15212517976760864, + "learning_rate": 0.00019212832313795003, + "loss": 2.1281, + "step": 481710 + }, + { + "epoch": 1.862194801379289, + "grad_norm": 0.15253092348575592, + "learning_rate": 0.00019201769920167621, + "loss": 2.1231, + "step": 481720 + }, + { + "epoch": 1.8622334585826723, + "grad_norm": 0.16168487071990967, + "learning_rate": 0.00019190708203366925, + "loss": 2.136, + "step": 481730 + }, + { + "epoch": 1.8622721157860556, + "grad_norm": 0.16396203637123108, + "learning_rate": 0.000191796471632687, + "loss": 2.1247, + "step": 481740 + }, + { + "epoch": 1.8623107729894388, + "grad_norm": 0.1663297414779663, + "learning_rate": 0.00019168586799748756, + "loss": 2.1506, + "step": 481750 + }, + { + "epoch": 1.8623494301928223, + "grad_norm": 0.16218312084674835, + "learning_rate": 0.0001915752711268295, + "loss": 2.1432, + "step": 481760 + }, + { + "epoch": 1.8623880873962055, + "grad_norm": 0.1498250812292099, + "learning_rate": 0.000191464681019472, + "loss": 2.113, + "step": 481770 + }, + { + "epoch": 1.8624267445995888, + "grad_norm": 0.15257258713245392, + "learning_rate": 0.00019135409767417433, + "loss": 2.1467, + "step": 481780 + }, + { + "epoch": 1.862465401802972, + "grad_norm": 0.16256940364837646, + "learning_rate": 0.00019124352108969633, + "loss": 2.1341, + "step": 481790 + }, + { + "epoch": 1.8625040590063553, + "grad_norm": 0.17142386734485626, + "learning_rate": 0.00019113295126479792, + "loss": 2.1369, + "step": 481800 + }, + { + "epoch": 1.8625427162097385, + "grad_norm": 0.1537153124809265, + "learning_rate": 0.0001910223881982398, + "loss": 2.1435, + "step": 481810 + }, + { + "epoch": 1.8625813734131218, + "grad_norm": 0.36844369769096375, + "learning_rate": 0.0001909118318887826, + "loss": 2.1321, + "step": 481820 + }, + { + "epoch": 1.862620030616505, + "grad_norm": 0.146562859416008, + "learning_rate": 0.0001908012823351879, + "loss": 2.145, + "step": 481830 + }, + { + "epoch": 1.8626586878198883, + "grad_norm": 0.16262121498584747, + "learning_rate": 0.0001906907395362174, + "loss": 2.133, + "step": 481840 + }, + { + "epoch": 1.8626973450232716, + "grad_norm": 0.15142381191253662, + "learning_rate": 0.00019058020349063277, + "loss": 2.1399, + "step": 481850 + }, + { + "epoch": 1.8627360022266548, + "grad_norm": 0.16121938824653625, + "learning_rate": 0.00019046967419719652, + "loss": 2.1323, + "step": 481860 + }, + { + "epoch": 1.862774659430038, + "grad_norm": 0.14631405472755432, + "learning_rate": 0.00019035915165467165, + "loss": 2.1298, + "step": 481870 + }, + { + "epoch": 1.8628133166334213, + "grad_norm": 0.16886377334594727, + "learning_rate": 0.00019024863586182095, + "loss": 2.1239, + "step": 481880 + }, + { + "epoch": 1.8628519738368048, + "grad_norm": 0.1621556133031845, + "learning_rate": 0.0001901381268174083, + "loss": 2.1285, + "step": 481890 + }, + { + "epoch": 1.862890631040188, + "grad_norm": 0.16947074234485626, + "learning_rate": 0.00019002762452019728, + "loss": 2.139, + "step": 481900 + }, + { + "epoch": 1.8629292882435713, + "grad_norm": 0.14935055375099182, + "learning_rate": 0.0001899171289689523, + "loss": 2.1452, + "step": 481910 + }, + { + "epoch": 1.8629679454469545, + "grad_norm": 0.15513372421264648, + "learning_rate": 0.00018980664016243832, + "loss": 2.1468, + "step": 481920 + }, + { + "epoch": 1.863006602650338, + "grad_norm": 0.16348998248577118, + "learning_rate": 0.00018969615809942008, + "loss": 2.1398, + "step": 481930 + }, + { + "epoch": 1.8630452598537213, + "grad_norm": 0.1574879288673401, + "learning_rate": 0.00018958568277866283, + "loss": 2.1301, + "step": 481940 + }, + { + "epoch": 1.8630839170571045, + "grad_norm": 0.15516617894172668, + "learning_rate": 0.00018947521419893288, + "loss": 2.1426, + "step": 481950 + }, + { + "epoch": 1.8631225742604878, + "grad_norm": 0.15340623259544373, + "learning_rate": 0.0001893647523589961, + "loss": 2.1258, + "step": 481960 + }, + { + "epoch": 1.863161231463871, + "grad_norm": 0.16473540663719177, + "learning_rate": 0.00018925429725761878, + "loss": 2.1442, + "step": 481970 + }, + { + "epoch": 1.8631998886672543, + "grad_norm": 0.15988117456436157, + "learning_rate": 0.00018914384889356817, + "loss": 2.1216, + "step": 481980 + }, + { + "epoch": 1.8632385458706375, + "grad_norm": 0.14998415112495422, + "learning_rate": 0.0001890334072656117, + "loss": 2.1168, + "step": 481990 + }, + { + "epoch": 1.8632772030740208, + "grad_norm": 0.1660185307264328, + "learning_rate": 0.00018892297237251677, + "loss": 2.137, + "step": 482000 + }, + { + "epoch": 1.863315860277404, + "grad_norm": 0.14230751991271973, + "learning_rate": 0.00018881254421305129, + "loss": 2.1246, + "step": 482010 + }, + { + "epoch": 1.8633545174807873, + "grad_norm": 0.15070542693138123, + "learning_rate": 0.0001887021227859842, + "loss": 2.1282, + "step": 482020 + }, + { + "epoch": 1.8633931746841705, + "grad_norm": 0.1525546759366989, + "learning_rate": 0.00018859170809008385, + "loss": 2.1445, + "step": 482030 + }, + { + "epoch": 1.8634318318875538, + "grad_norm": 0.20553191006183624, + "learning_rate": 0.00018848130012411968, + "loss": 2.1378, + "step": 482040 + }, + { + "epoch": 1.863470489090937, + "grad_norm": 0.15216852724552155, + "learning_rate": 0.0001883708988868611, + "loss": 2.1275, + "step": 482050 + }, + { + "epoch": 1.8635091462943205, + "grad_norm": 0.15563897788524628, + "learning_rate": 0.000188260504377078, + "loss": 2.1381, + "step": 482060 + }, + { + "epoch": 1.8635478034977038, + "grad_norm": 0.1471768468618393, + "learning_rate": 0.00018815011659354086, + "loss": 2.1353, + "step": 482070 + }, + { + "epoch": 1.863586460701087, + "grad_norm": 0.16196849942207336, + "learning_rate": 0.00018803973553502008, + "loss": 2.1457, + "step": 482080 + }, + { + "epoch": 1.8636251179044703, + "grad_norm": 0.15377961099147797, + "learning_rate": 0.00018792936120028703, + "loss": 2.1387, + "step": 482090 + }, + { + "epoch": 1.8636637751078537, + "grad_norm": 0.1568869948387146, + "learning_rate": 0.00018781899358811293, + "loss": 2.1265, + "step": 482100 + }, + { + "epoch": 1.863702432311237, + "grad_norm": 0.1527637392282486, + "learning_rate": 0.00018770863269726968, + "loss": 2.1321, + "step": 482110 + }, + { + "epoch": 1.8637410895146203, + "grad_norm": 0.1602158546447754, + "learning_rate": 0.00018759827852652934, + "loss": 2.1186, + "step": 482120 + }, + { + "epoch": 1.8637797467180035, + "grad_norm": 0.1528051197528839, + "learning_rate": 0.00018748793107466443, + "loss": 2.1397, + "step": 482130 + }, + { + "epoch": 1.8638184039213868, + "grad_norm": 0.15877701342105865, + "learning_rate": 0.0001873775903404482, + "loss": 2.1318, + "step": 482140 + }, + { + "epoch": 1.86385706112477, + "grad_norm": 0.14863698184490204, + "learning_rate": 0.00018726725632265363, + "loss": 2.1431, + "step": 482150 + }, + { + "epoch": 1.8638957183281533, + "grad_norm": 0.15921323001384735, + "learning_rate": 0.00018715692902005433, + "loss": 2.1409, + "step": 482160 + }, + { + "epoch": 1.8639343755315365, + "grad_norm": 0.15810279548168182, + "learning_rate": 0.00018704660843142464, + "loss": 2.1297, + "step": 482170 + }, + { + "epoch": 1.8639730327349198, + "grad_norm": 0.16936606168746948, + "learning_rate": 0.00018693629455553886, + "loss": 2.1365, + "step": 482180 + }, + { + "epoch": 1.864011689938303, + "grad_norm": 0.15929560363292694, + "learning_rate": 0.0001868259873911715, + "loss": 2.12, + "step": 482190 + }, + { + "epoch": 1.8640503471416863, + "grad_norm": 0.15395177900791168, + "learning_rate": 0.00018671568693709806, + "loss": 2.1392, + "step": 482200 + }, + { + "epoch": 1.8640890043450695, + "grad_norm": 0.15042053163051605, + "learning_rate": 0.0001866053931920939, + "loss": 2.1473, + "step": 482210 + }, + { + "epoch": 1.8641276615484528, + "grad_norm": 0.14917847514152527, + "learning_rate": 0.00018649510615493514, + "loss": 2.1431, + "step": 482220 + }, + { + "epoch": 1.8641663187518362, + "grad_norm": 0.15905718505382538, + "learning_rate": 0.00018638482582439763, + "loss": 2.1237, + "step": 482230 + }, + { + "epoch": 1.8642049759552195, + "grad_norm": 0.14712104201316833, + "learning_rate": 0.0001862745521992586, + "loss": 2.1291, + "step": 482240 + }, + { + "epoch": 1.8642436331586028, + "grad_norm": 0.15073151886463165, + "learning_rate": 0.00018616428527829453, + "loss": 2.1296, + "step": 482250 + }, + { + "epoch": 1.864282290361986, + "grad_norm": 0.1549181491136551, + "learning_rate": 0.00018605402506028312, + "loss": 2.132, + "step": 482260 + }, + { + "epoch": 1.8643209475653695, + "grad_norm": 0.15898975729942322, + "learning_rate": 0.000185943771544002, + "loss": 2.1344, + "step": 482270 + }, + { + "epoch": 1.8643596047687527, + "grad_norm": 0.18511076271533966, + "learning_rate": 0.00018583352472822945, + "loss": 2.1376, + "step": 482280 + }, + { + "epoch": 1.864398261972136, + "grad_norm": 0.14872094988822937, + "learning_rate": 0.0001857232846117438, + "loss": 2.1394, + "step": 482290 + }, + { + "epoch": 1.8644369191755192, + "grad_norm": 0.15533463656902313, + "learning_rate": 0.00018561305119332383, + "loss": 2.1179, + "step": 482300 + }, + { + "epoch": 1.8644755763789025, + "grad_norm": 0.14731545746326447, + "learning_rate": 0.00018550282447174916, + "loss": 2.1482, + "step": 482310 + }, + { + "epoch": 1.8645142335822857, + "grad_norm": 0.17277966439723969, + "learning_rate": 0.0001853926044457992, + "loss": 2.1355, + "step": 482320 + }, + { + "epoch": 1.864552890785669, + "grad_norm": 0.1634880006313324, + "learning_rate": 0.00018528239111425382, + "loss": 2.137, + "step": 482330 + }, + { + "epoch": 1.8645915479890522, + "grad_norm": 0.15768516063690186, + "learning_rate": 0.00018517218447589357, + "loss": 2.1376, + "step": 482340 + }, + { + "epoch": 1.8646302051924355, + "grad_norm": 0.15683893859386444, + "learning_rate": 0.0001850619845294992, + "loss": 2.1432, + "step": 482350 + }, + { + "epoch": 1.8646688623958187, + "grad_norm": 0.1512300670146942, + "learning_rate": 0.00018495179127385166, + "loss": 2.1321, + "step": 482360 + }, + { + "epoch": 1.864707519599202, + "grad_norm": 0.15822456777095795, + "learning_rate": 0.00018484160470773236, + "loss": 2.1379, + "step": 482370 + }, + { + "epoch": 1.8647461768025853, + "grad_norm": 0.17078734934329987, + "learning_rate": 0.00018473142482992345, + "loss": 2.117, + "step": 482380 + }, + { + "epoch": 1.8647848340059685, + "grad_norm": 0.15186278522014618, + "learning_rate": 0.00018462125163920694, + "loss": 2.1434, + "step": 482390 + }, + { + "epoch": 1.864823491209352, + "grad_norm": 0.1607029289007187, + "learning_rate": 0.0001845110851343652, + "loss": 2.1275, + "step": 482400 + }, + { + "epoch": 1.8648621484127352, + "grad_norm": 0.1595838963985443, + "learning_rate": 0.00018440092531418162, + "loss": 2.1437, + "step": 482410 + }, + { + "epoch": 1.8649008056161185, + "grad_norm": 0.17397913336753845, + "learning_rate": 0.00018429077217743916, + "loss": 2.1392, + "step": 482420 + }, + { + "epoch": 1.8649394628195017, + "grad_norm": 0.15208283066749573, + "learning_rate": 0.0001841806257229217, + "loss": 2.1398, + "step": 482430 + }, + { + "epoch": 1.8649781200228852, + "grad_norm": 0.149830162525177, + "learning_rate": 0.00018407048594941332, + "loss": 2.1394, + "step": 482440 + }, + { + "epoch": 1.8650167772262685, + "grad_norm": 0.14657558500766754, + "learning_rate": 0.00018396035285569835, + "loss": 2.1315, + "step": 482450 + }, + { + "epoch": 1.8650554344296517, + "grad_norm": 0.15106050670146942, + "learning_rate": 0.00018385022644056147, + "loss": 2.128, + "step": 482460 + }, + { + "epoch": 1.865094091633035, + "grad_norm": 0.15019604563713074, + "learning_rate": 0.00018374010670278796, + "loss": 2.1319, + "step": 482470 + }, + { + "epoch": 1.8651327488364182, + "grad_norm": 0.15791895985603333, + "learning_rate": 0.0001836299936411634, + "loss": 2.1241, + "step": 482480 + }, + { + "epoch": 1.8651714060398015, + "grad_norm": 0.1745760440826416, + "learning_rate": 0.00018351988725447366, + "loss": 2.133, + "step": 482490 + }, + { + "epoch": 1.8652100632431847, + "grad_norm": 0.1689693182706833, + "learning_rate": 0.00018340978754150506, + "loss": 2.1328, + "step": 482500 + }, + { + "epoch": 1.865248720446568, + "grad_norm": 0.14829005300998688, + "learning_rate": 0.00018329969450104413, + "loss": 2.1249, + "step": 482510 + }, + { + "epoch": 1.8652873776499512, + "grad_norm": 0.15087290108203888, + "learning_rate": 0.0001831896081318778, + "loss": 2.1148, + "step": 482520 + }, + { + "epoch": 1.8653260348533345, + "grad_norm": 0.14755970239639282, + "learning_rate": 0.0001830795284327935, + "loss": 2.1343, + "step": 482530 + }, + { + "epoch": 1.8653646920567177, + "grad_norm": 0.15620015561580658, + "learning_rate": 0.00018296945540257915, + "loss": 2.1269, + "step": 482540 + }, + { + "epoch": 1.865403349260101, + "grad_norm": 0.16158314049243927, + "learning_rate": 0.00018285938904002253, + "loss": 2.1348, + "step": 482550 + }, + { + "epoch": 1.8654420064634845, + "grad_norm": 0.1560443490743637, + "learning_rate": 0.00018274932934391242, + "loss": 2.1357, + "step": 482560 + }, + { + "epoch": 1.8654806636668677, + "grad_norm": 0.15698698163032532, + "learning_rate": 0.00018263927631303757, + "loss": 2.1244, + "step": 482570 + }, + { + "epoch": 1.865519320870251, + "grad_norm": 0.1533084660768509, + "learning_rate": 0.00018252922994618692, + "loss": 2.1444, + "step": 482580 + }, + { + "epoch": 1.8655579780736342, + "grad_norm": 0.168225958943367, + "learning_rate": 0.00018241919024215058, + "loss": 2.1364, + "step": 482590 + }, + { + "epoch": 1.8655966352770175, + "grad_norm": 0.15649709105491638, + "learning_rate": 0.00018230915719971797, + "loss": 2.1379, + "step": 482600 + }, + { + "epoch": 1.865635292480401, + "grad_norm": 0.15371111035346985, + "learning_rate": 0.00018219913081767957, + "loss": 2.1213, + "step": 482610 + }, + { + "epoch": 1.8656739496837842, + "grad_norm": 0.1561652272939682, + "learning_rate": 0.00018208911109482596, + "loss": 2.1355, + "step": 482620 + }, + { + "epoch": 1.8657126068871674, + "grad_norm": 0.15515419840812683, + "learning_rate": 0.00018197909802994828, + "loss": 2.1387, + "step": 482630 + }, + { + "epoch": 1.8657512640905507, + "grad_norm": 0.16376587748527527, + "learning_rate": 0.00018186909162183818, + "loss": 2.1385, + "step": 482640 + }, + { + "epoch": 1.865789921293934, + "grad_norm": 0.15734699368476868, + "learning_rate": 0.00018175909186928709, + "loss": 2.1345, + "step": 482650 + }, + { + "epoch": 1.8658285784973172, + "grad_norm": 0.15997950732707977, + "learning_rate": 0.00018164909877108705, + "loss": 2.1414, + "step": 482660 + }, + { + "epoch": 1.8658672357007005, + "grad_norm": 0.15235339105129242, + "learning_rate": 0.0001815391123260308, + "loss": 2.1272, + "step": 482670 + }, + { + "epoch": 1.8659058929040837, + "grad_norm": 0.15580111742019653, + "learning_rate": 0.00018142913253291115, + "loss": 2.1336, + "step": 482680 + }, + { + "epoch": 1.865944550107467, + "grad_norm": 0.1720065474510193, + "learning_rate": 0.00018131915939052147, + "loss": 2.1288, + "step": 482690 + }, + { + "epoch": 1.8659832073108502, + "grad_norm": 0.17029879987239838, + "learning_rate": 0.00018120919289765493, + "loss": 2.1259, + "step": 482700 + }, + { + "epoch": 1.8660218645142335, + "grad_norm": 0.1510152667760849, + "learning_rate": 0.00018109923305310583, + "loss": 2.1421, + "step": 482710 + }, + { + "epoch": 1.8660605217176167, + "grad_norm": 0.19208844006061554, + "learning_rate": 0.00018098927985566826, + "loss": 2.1118, + "step": 482720 + }, + { + "epoch": 1.8660991789210002, + "grad_norm": 0.1666448712348938, + "learning_rate": 0.0001808793333041372, + "loss": 2.1489, + "step": 482730 + }, + { + "epoch": 1.8661378361243834, + "grad_norm": 0.16235436499118805, + "learning_rate": 0.00018076939339730757, + "loss": 2.1442, + "step": 482740 + }, + { + "epoch": 1.8661764933277667, + "grad_norm": 0.14573925733566284, + "learning_rate": 0.00018065946013397482, + "loss": 2.1479, + "step": 482750 + }, + { + "epoch": 1.86621515053115, + "grad_norm": 0.1504787802696228, + "learning_rate": 0.00018054953351293458, + "loss": 2.1493, + "step": 482760 + }, + { + "epoch": 1.8662538077345332, + "grad_norm": 0.15526950359344482, + "learning_rate": 0.0001804396135329831, + "loss": 2.1348, + "step": 482770 + }, + { + "epoch": 1.8662924649379167, + "grad_norm": 0.15732213854789734, + "learning_rate": 0.00018032970019291695, + "loss": 2.1192, + "step": 482780 + }, + { + "epoch": 1.8663311221413, + "grad_norm": 0.16246534883975983, + "learning_rate": 0.00018021979349153262, + "loss": 2.1264, + "step": 482790 + }, + { + "epoch": 1.8663697793446832, + "grad_norm": 0.15312378108501434, + "learning_rate": 0.00018010989342762796, + "loss": 2.1406, + "step": 482800 + }, + { + "epoch": 1.8664084365480664, + "grad_norm": 0.17585042119026184, + "learning_rate": 0.00018000000000000017, + "loss": 2.15, + "step": 482810 + }, + { + "epoch": 1.8664470937514497, + "grad_norm": 0.15485194325447083, + "learning_rate": 0.0001798901132074471, + "loss": 2.1439, + "step": 482820 + }, + { + "epoch": 1.866485750954833, + "grad_norm": 0.19659042358398438, + "learning_rate": 0.00017978023304876723, + "loss": 2.1181, + "step": 482830 + }, + { + "epoch": 1.8665244081582162, + "grad_norm": 0.14731131494045258, + "learning_rate": 0.00017967035952275935, + "loss": 2.1382, + "step": 482840 + }, + { + "epoch": 1.8665630653615994, + "grad_norm": 0.15818408131599426, + "learning_rate": 0.0001795604926282224, + "loss": 2.1297, + "step": 482850 + }, + { + "epoch": 1.8666017225649827, + "grad_norm": 0.17280857264995575, + "learning_rate": 0.00017945063236395598, + "loss": 2.1388, + "step": 482860 + }, + { + "epoch": 1.866640379768366, + "grad_norm": 0.15899436175823212, + "learning_rate": 0.00017934077872875954, + "loss": 2.1233, + "step": 482870 + }, + { + "epoch": 1.8666790369717492, + "grad_norm": 0.1656186729669571, + "learning_rate": 0.00017923093172143335, + "loss": 2.1503, + "step": 482880 + }, + { + "epoch": 1.8667176941751324, + "grad_norm": 0.1577361524105072, + "learning_rate": 0.00017912109134077793, + "loss": 2.1376, + "step": 482890 + }, + { + "epoch": 1.866756351378516, + "grad_norm": 0.17275647819042206, + "learning_rate": 0.000179011257585594, + "loss": 2.1421, + "step": 482900 + }, + { + "epoch": 1.8667950085818992, + "grad_norm": 0.16903147101402283, + "learning_rate": 0.000178901430454683, + "loss": 2.1134, + "step": 482910 + }, + { + "epoch": 1.8668336657852824, + "grad_norm": 0.14542759954929352, + "learning_rate": 0.0001787916099468463, + "loss": 2.1384, + "step": 482920 + }, + { + "epoch": 1.8668723229886657, + "grad_norm": 0.1691199541091919, + "learning_rate": 0.00017868179606088597, + "loss": 2.1316, + "step": 482930 + }, + { + "epoch": 1.8669109801920492, + "grad_norm": 0.15983478724956512, + "learning_rate": 0.00017857198879560432, + "loss": 2.1406, + "step": 482940 + }, + { + "epoch": 1.8669496373954324, + "grad_norm": 0.15318359434604645, + "learning_rate": 0.00017846218814980409, + "loss": 2.1402, + "step": 482950 + }, + { + "epoch": 1.8669882945988157, + "grad_norm": 0.16408123075962067, + "learning_rate": 0.00017835239412228798, + "loss": 2.1361, + "step": 482960 + }, + { + "epoch": 1.867026951802199, + "grad_norm": 0.15106911957263947, + "learning_rate": 0.00017824260671185966, + "loss": 2.1272, + "step": 482970 + }, + { + "epoch": 1.8670656090055822, + "grad_norm": 0.16219568252563477, + "learning_rate": 0.00017813282591732293, + "loss": 2.1442, + "step": 482980 + }, + { + "epoch": 1.8671042662089654, + "grad_norm": 0.16399811208248138, + "learning_rate": 0.00017802305173748146, + "loss": 2.1214, + "step": 482990 + }, + { + "epoch": 1.8671429234123487, + "grad_norm": 0.15034039318561554, + "learning_rate": 0.00017791328417114015, + "loss": 2.127, + "step": 483000 + }, + { + "epoch": 1.867181580615732, + "grad_norm": 0.15108898282051086, + "learning_rate": 0.0001778035232171038, + "loss": 2.129, + "step": 483010 + }, + { + "epoch": 1.8672202378191152, + "grad_norm": 0.1576581746339798, + "learning_rate": 0.00017769376887417733, + "loss": 2.141, + "step": 483020 + }, + { + "epoch": 1.8672588950224984, + "grad_norm": 0.14400140941143036, + "learning_rate": 0.00017758402114116656, + "loss": 2.1375, + "step": 483030 + }, + { + "epoch": 1.8672975522258817, + "grad_norm": 0.16434095799922943, + "learning_rate": 0.00017747428001687715, + "loss": 2.1233, + "step": 483040 + }, + { + "epoch": 1.867336209429265, + "grad_norm": 0.16243363916873932, + "learning_rate": 0.0001773645455001156, + "loss": 2.1175, + "step": 483050 + }, + { + "epoch": 1.8673748666326482, + "grad_norm": 0.1503385305404663, + "learning_rate": 0.00017725481758968842, + "loss": 2.1224, + "step": 483060 + }, + { + "epoch": 1.8674135238360317, + "grad_norm": 0.1493840366601944, + "learning_rate": 0.00017714509628440257, + "loss": 2.1218, + "step": 483070 + }, + { + "epoch": 1.867452181039415, + "grad_norm": 0.15031930804252625, + "learning_rate": 0.0001770353815830654, + "loss": 2.1392, + "step": 483080 + }, + { + "epoch": 1.8674908382427982, + "grad_norm": 0.1668921858072281, + "learning_rate": 0.00017692567348448463, + "loss": 2.1309, + "step": 483090 + }, + { + "epoch": 1.8675294954461814, + "grad_norm": 0.16633087396621704, + "learning_rate": 0.00017681597198746825, + "loss": 2.1331, + "step": 483100 + }, + { + "epoch": 1.8675681526495649, + "grad_norm": 0.17308935523033142, + "learning_rate": 0.00017670627709082497, + "loss": 2.1349, + "step": 483110 + }, + { + "epoch": 1.8676068098529481, + "grad_norm": 0.15083816647529602, + "learning_rate": 0.00017659658879336294, + "loss": 2.1346, + "step": 483120 + }, + { + "epoch": 1.8676454670563314, + "grad_norm": 0.14988155663013458, + "learning_rate": 0.00017648690709389192, + "loss": 2.1369, + "step": 483130 + }, + { + "epoch": 1.8676841242597146, + "grad_norm": 0.14918486773967743, + "learning_rate": 0.00017637723199122137, + "loss": 2.1328, + "step": 483140 + }, + { + "epoch": 1.867722781463098, + "grad_norm": 0.1509108543395996, + "learning_rate": 0.00017626756348416062, + "loss": 2.1115, + "step": 483150 + }, + { + "epoch": 1.8677614386664811, + "grad_norm": 0.16556978225708008, + "learning_rate": 0.00017615790157152046, + "loss": 2.1411, + "step": 483160 + }, + { + "epoch": 1.8678000958698644, + "grad_norm": 0.15764233469963074, + "learning_rate": 0.0001760482462521109, + "loss": 2.1285, + "step": 483170 + }, + { + "epoch": 1.8678387530732476, + "grad_norm": 0.1732935756444931, + "learning_rate": 0.00017593859752474338, + "loss": 2.1386, + "step": 483180 + }, + { + "epoch": 1.867877410276631, + "grad_norm": 0.14939621090888977, + "learning_rate": 0.00017582895538822908, + "loss": 2.1402, + "step": 483190 + }, + { + "epoch": 1.8679160674800142, + "grad_norm": 0.16969427466392517, + "learning_rate": 0.00017571931984137934, + "loss": 2.1324, + "step": 483200 + }, + { + "epoch": 1.8679547246833974, + "grad_norm": 0.1577702909708023, + "learning_rate": 0.00017560969088300626, + "loss": 2.127, + "step": 483210 + }, + { + "epoch": 1.8679933818867807, + "grad_norm": 0.15833325684070587, + "learning_rate": 0.00017550006851192236, + "loss": 2.1255, + "step": 483220 + }, + { + "epoch": 1.868032039090164, + "grad_norm": 0.15044455230236053, + "learning_rate": 0.00017539045272694031, + "loss": 2.1314, + "step": 483230 + }, + { + "epoch": 1.8680706962935474, + "grad_norm": 0.15325351059436798, + "learning_rate": 0.0001752808435268729, + "loss": 2.1401, + "step": 483240 + }, + { + "epoch": 1.8681093534969306, + "grad_norm": 0.15859206020832062, + "learning_rate": 0.00017517124091053415, + "loss": 2.1297, + "step": 483250 + }, + { + "epoch": 1.8681480107003139, + "grad_norm": 0.1479908525943756, + "learning_rate": 0.00017506164487673725, + "loss": 2.1147, + "step": 483260 + }, + { + "epoch": 1.8681866679036971, + "grad_norm": 0.15474596619606018, + "learning_rate": 0.00017495205542429647, + "loss": 2.1334, + "step": 483270 + }, + { + "epoch": 1.8682253251070806, + "grad_norm": 0.14719825983047485, + "learning_rate": 0.00017484247255202657, + "loss": 2.1269, + "step": 483280 + }, + { + "epoch": 1.8682639823104639, + "grad_norm": 0.15452922880649567, + "learning_rate": 0.000174732896258742, + "loss": 2.1139, + "step": 483290 + }, + { + "epoch": 1.8683026395138471, + "grad_norm": 0.160796120762825, + "learning_rate": 0.00017462332654325841, + "loss": 2.133, + "step": 483300 + }, + { + "epoch": 1.8683412967172304, + "grad_norm": 0.14414744079113007, + "learning_rate": 0.00017451376340439095, + "loss": 2.1281, + "step": 483310 + }, + { + "epoch": 1.8683799539206136, + "grad_norm": 0.15399828553199768, + "learning_rate": 0.0001744042068409557, + "loss": 2.1407, + "step": 483320 + }, + { + "epoch": 1.8684186111239969, + "grad_norm": 0.15108740329742432, + "learning_rate": 0.00017429465685176894, + "loss": 2.1304, + "step": 483330 + }, + { + "epoch": 1.8684572683273801, + "grad_norm": 0.14604243636131287, + "learning_rate": 0.00017418511343564736, + "loss": 2.1389, + "step": 483340 + }, + { + "epoch": 1.8684959255307634, + "grad_norm": 0.1571522355079651, + "learning_rate": 0.00017407557659140772, + "loss": 2.139, + "step": 483350 + }, + { + "epoch": 1.8685345827341466, + "grad_norm": 0.15916845202445984, + "learning_rate": 0.0001739660463178676, + "loss": 2.1364, + "step": 483360 + }, + { + "epoch": 1.8685732399375299, + "grad_norm": 0.1621374934911728, + "learning_rate": 0.00017385652261384465, + "loss": 2.1282, + "step": 483370 + }, + { + "epoch": 1.8686118971409131, + "grad_norm": 0.15086057782173157, + "learning_rate": 0.0001737470054781567, + "loss": 2.1474, + "step": 483380 + }, + { + "epoch": 1.8686505543442964, + "grad_norm": 0.15980279445648193, + "learning_rate": 0.0001736374949096222, + "loss": 2.1328, + "step": 483390 + }, + { + "epoch": 1.8686892115476796, + "grad_norm": 0.16501730680465698, + "learning_rate": 0.00017352799090706016, + "loss": 2.1457, + "step": 483400 + }, + { + "epoch": 1.8687278687510631, + "grad_norm": 0.14960528910160065, + "learning_rate": 0.00017341849346928952, + "loss": 2.129, + "step": 483410 + }, + { + "epoch": 1.8687665259544464, + "grad_norm": 0.1552843600511551, + "learning_rate": 0.0001733090025951296, + "loss": 2.143, + "step": 483420 + }, + { + "epoch": 1.8688051831578296, + "grad_norm": 0.14932730793952942, + "learning_rate": 0.00017319951828340054, + "loss": 2.1336, + "step": 483430 + }, + { + "epoch": 1.8688438403612129, + "grad_norm": 0.15046526491641998, + "learning_rate": 0.0001730900405329221, + "loss": 2.132, + "step": 483440 + }, + { + "epoch": 1.8688824975645963, + "grad_norm": 0.17451153695583344, + "learning_rate": 0.00017298056934251504, + "loss": 2.1268, + "step": 483450 + }, + { + "epoch": 1.8689211547679796, + "grad_norm": 0.151564359664917, + "learning_rate": 0.00017287110471100032, + "loss": 2.127, + "step": 483460 + }, + { + "epoch": 1.8689598119713628, + "grad_norm": 0.15798701345920563, + "learning_rate": 0.00017276164663719906, + "loss": 2.1415, + "step": 483470 + }, + { + "epoch": 1.868998469174746, + "grad_norm": 0.16805607080459595, + "learning_rate": 0.0001726521951199329, + "loss": 2.1163, + "step": 483480 + }, + { + "epoch": 1.8690371263781294, + "grad_norm": 0.15856941044330597, + "learning_rate": 0.00017254275015802346, + "loss": 2.125, + "step": 483490 + }, + { + "epoch": 1.8690757835815126, + "grad_norm": 0.1525784581899643, + "learning_rate": 0.0001724333117502934, + "loss": 2.1211, + "step": 483500 + }, + { + "epoch": 1.8691144407848959, + "grad_norm": 0.16771401464939117, + "learning_rate": 0.00017232387989556531, + "loss": 2.1177, + "step": 483510 + }, + { + "epoch": 1.869153097988279, + "grad_norm": 0.15492326021194458, + "learning_rate": 0.00017221445459266206, + "loss": 2.1077, + "step": 483520 + }, + { + "epoch": 1.8691917551916624, + "grad_norm": 0.15133053064346313, + "learning_rate": 0.00017210503584040682, + "loss": 2.1411, + "step": 483530 + }, + { + "epoch": 1.8692304123950456, + "grad_norm": 0.16135741770267487, + "learning_rate": 0.00017199562363762368, + "loss": 2.1339, + "step": 483540 + }, + { + "epoch": 1.8692690695984289, + "grad_norm": 0.15097425878047943, + "learning_rate": 0.00017188621798313664, + "loss": 2.1395, + "step": 483550 + }, + { + "epoch": 1.8693077268018121, + "grad_norm": 0.1603614091873169, + "learning_rate": 0.0001717768188757698, + "loss": 2.1365, + "step": 483560 + }, + { + "epoch": 1.8693463840051954, + "grad_norm": 0.14693760871887207, + "learning_rate": 0.00017166742631434808, + "loss": 2.1465, + "step": 483570 + }, + { + "epoch": 1.8693850412085788, + "grad_norm": 0.150224968791008, + "learning_rate": 0.00017155804029769662, + "loss": 2.1161, + "step": 483580 + }, + { + "epoch": 1.869423698411962, + "grad_norm": 1.3554350137710571, + "learning_rate": 0.0001714486608246406, + "loss": 2.1413, + "step": 483590 + }, + { + "epoch": 1.8694623556153454, + "grad_norm": 0.18808268010616302, + "learning_rate": 0.00017133928789400633, + "loss": 2.1193, + "step": 483600 + }, + { + "epoch": 1.8695010128187286, + "grad_norm": 0.15480872988700867, + "learning_rate": 0.00017122992150461958, + "loss": 2.1132, + "step": 483610 + }, + { + "epoch": 1.869539670022112, + "grad_norm": 0.15553432703018188, + "learning_rate": 0.00017112056165530687, + "loss": 2.1279, + "step": 483620 + }, + { + "epoch": 1.8695783272254953, + "grad_norm": 0.15232661366462708, + "learning_rate": 0.00017101120834489537, + "loss": 2.1363, + "step": 483630 + }, + { + "epoch": 1.8696169844288786, + "grad_norm": 0.15363068878650665, + "learning_rate": 0.00017090186157221176, + "loss": 2.1229, + "step": 483640 + }, + { + "epoch": 1.8696556416322618, + "grad_norm": 0.15250281989574432, + "learning_rate": 0.00017079252133608413, + "loss": 2.1319, + "step": 483650 + }, + { + "epoch": 1.869694298835645, + "grad_norm": 0.15061458945274353, + "learning_rate": 0.00017068318763534008, + "loss": 2.125, + "step": 483660 + }, + { + "epoch": 1.8697329560390283, + "grad_norm": 0.14238150417804718, + "learning_rate": 0.00017057386046880808, + "loss": 2.1277, + "step": 483670 + }, + { + "epoch": 1.8697716132424116, + "grad_norm": 0.16396325826644897, + "learning_rate": 0.0001704645398353166, + "loss": 2.1408, + "step": 483680 + }, + { + "epoch": 1.8698102704457948, + "grad_norm": 0.16763406991958618, + "learning_rate": 0.00017035522573369445, + "loss": 2.1272, + "step": 483690 + }, + { + "epoch": 1.869848927649178, + "grad_norm": 0.16632625460624695, + "learning_rate": 0.00017024591816277134, + "loss": 2.1199, + "step": 483700 + }, + { + "epoch": 1.8698875848525613, + "grad_norm": 0.15391650795936584, + "learning_rate": 0.0001701366171213765, + "loss": 2.1323, + "step": 483710 + }, + { + "epoch": 1.8699262420559446, + "grad_norm": 0.16232500970363617, + "learning_rate": 0.0001700273226083402, + "loss": 2.1286, + "step": 483720 + }, + { + "epoch": 1.8699648992593279, + "grad_norm": 0.16050100326538086, + "learning_rate": 0.00016991803462249267, + "loss": 2.1351, + "step": 483730 + }, + { + "epoch": 1.870003556462711, + "grad_norm": 0.15308740735054016, + "learning_rate": 0.00016980875316266464, + "loss": 2.1319, + "step": 483740 + }, + { + "epoch": 1.8700422136660946, + "grad_norm": 0.1580066829919815, + "learning_rate": 0.00016969947822768728, + "loss": 2.1328, + "step": 483750 + }, + { + "epoch": 1.8700808708694778, + "grad_norm": 0.1556428223848343, + "learning_rate": 0.00016959020981639194, + "loss": 2.1202, + "step": 483760 + }, + { + "epoch": 1.870119528072861, + "grad_norm": 0.1538323163986206, + "learning_rate": 0.00016948094792761025, + "loss": 2.1229, + "step": 483770 + }, + { + "epoch": 1.8701581852762443, + "grad_norm": 0.16740500926971436, + "learning_rate": 0.0001693716925601745, + "loss": 2.1457, + "step": 483780 + }, + { + "epoch": 1.8701968424796278, + "grad_norm": 0.15987759828567505, + "learning_rate": 0.0001692624437129171, + "loss": 2.14, + "step": 483790 + }, + { + "epoch": 1.870235499683011, + "grad_norm": 0.1688595712184906, + "learning_rate": 0.00016915320138467083, + "loss": 2.1329, + "step": 483800 + }, + { + "epoch": 1.8702741568863943, + "grad_norm": 0.19591599702835083, + "learning_rate": 0.00016904396557426859, + "loss": 2.1394, + "step": 483810 + }, + { + "epoch": 1.8703128140897776, + "grad_norm": 0.17649315297603607, + "learning_rate": 0.00016893473628054424, + "loss": 2.1205, + "step": 483820 + }, + { + "epoch": 1.8703514712931608, + "grad_norm": 0.16197006404399872, + "learning_rate": 0.00016882551350233134, + "loss": 2.1503, + "step": 483830 + }, + { + "epoch": 1.870390128496544, + "grad_norm": 0.15415135025978088, + "learning_rate": 0.00016871629723846438, + "loss": 2.1454, + "step": 483840 + }, + { + "epoch": 1.8704287856999273, + "grad_norm": 0.16244566440582275, + "learning_rate": 0.00016860708748777785, + "loss": 2.1232, + "step": 483850 + }, + { + "epoch": 1.8704674429033106, + "grad_norm": 0.1580551117658615, + "learning_rate": 0.0001684978842491063, + "loss": 2.1333, + "step": 483860 + }, + { + "epoch": 1.8705061001066938, + "grad_norm": 0.16132232546806335, + "learning_rate": 0.0001683886875212852, + "loss": 2.121, + "step": 483870 + }, + { + "epoch": 1.870544757310077, + "grad_norm": 0.1563108265399933, + "learning_rate": 0.00016827949730315029, + "loss": 2.1359, + "step": 483880 + }, + { + "epoch": 1.8705834145134603, + "grad_norm": 0.16171641647815704, + "learning_rate": 0.0001681703135935373, + "loss": 2.1247, + "step": 483890 + }, + { + "epoch": 1.8706220717168436, + "grad_norm": 0.1576695740222931, + "learning_rate": 0.00016806113639128228, + "loss": 2.1313, + "step": 483900 + }, + { + "epoch": 1.8706607289202268, + "grad_norm": 0.15826772153377533, + "learning_rate": 0.00016795196569522242, + "loss": 2.1241, + "step": 483910 + }, + { + "epoch": 1.8706993861236103, + "grad_norm": 0.16920413076877594, + "learning_rate": 0.00016784280150419417, + "loss": 2.1433, + "step": 483920 + }, + { + "epoch": 1.8707380433269936, + "grad_norm": 0.15005506575107574, + "learning_rate": 0.00016773364381703494, + "loss": 2.138, + "step": 483930 + }, + { + "epoch": 1.8707767005303768, + "grad_norm": 0.16162391006946564, + "learning_rate": 0.00016762449263258274, + "loss": 2.1464, + "step": 483940 + }, + { + "epoch": 1.87081535773376, + "grad_norm": 0.15866687893867493, + "learning_rate": 0.00016751534794967515, + "loss": 2.1179, + "step": 483950 + }, + { + "epoch": 1.8708540149371435, + "grad_norm": 0.15613588690757751, + "learning_rate": 0.00016740620976715071, + "loss": 2.1181, + "step": 483960 + }, + { + "epoch": 1.8708926721405268, + "grad_norm": 0.15326961874961853, + "learning_rate": 0.00016729707808384831, + "loss": 2.1349, + "step": 483970 + }, + { + "epoch": 1.87093132934391, + "grad_norm": 0.16684074699878693, + "learning_rate": 0.00016718795289860666, + "loss": 2.1445, + "step": 483980 + }, + { + "epoch": 1.8709699865472933, + "grad_norm": 0.17308743298053741, + "learning_rate": 0.00016707883421026538, + "loss": 2.1417, + "step": 483990 + }, + { + "epoch": 1.8710086437506765, + "grad_norm": 0.16576573252677917, + "learning_rate": 0.00016696972201766402, + "loss": 2.1312, + "step": 484000 + }, + { + "epoch": 1.8710473009540598, + "grad_norm": 0.15447202324867249, + "learning_rate": 0.00016686061631964288, + "loss": 2.1339, + "step": 484010 + }, + { + "epoch": 1.871085958157443, + "grad_norm": 0.14893104135990143, + "learning_rate": 0.000166751517115042, + "loss": 2.1237, + "step": 484020 + }, + { + "epoch": 1.8711246153608263, + "grad_norm": 0.1605764925479889, + "learning_rate": 0.0001666424244027025, + "loss": 2.1284, + "step": 484030 + }, + { + "epoch": 1.8711632725642096, + "grad_norm": 0.15590053796768188, + "learning_rate": 0.00016653333818146554, + "loss": 2.124, + "step": 484040 + }, + { + "epoch": 1.8712019297675928, + "grad_norm": 0.17661923170089722, + "learning_rate": 0.0001664242584501725, + "loss": 2.1364, + "step": 484050 + }, + { + "epoch": 1.871240586970976, + "grad_norm": 0.16185982525348663, + "learning_rate": 0.00016631518520766498, + "loss": 2.1356, + "step": 484060 + }, + { + "epoch": 1.8712792441743593, + "grad_norm": 0.16207298636436462, + "learning_rate": 0.00016620611845278544, + "loss": 2.1341, + "step": 484070 + }, + { + "epoch": 1.8713179013777426, + "grad_norm": 0.17160029709339142, + "learning_rate": 0.00016609705818437592, + "loss": 2.1346, + "step": 484080 + }, + { + "epoch": 1.871356558581126, + "grad_norm": 0.16804343461990356, + "learning_rate": 0.00016598800440127982, + "loss": 2.1449, + "step": 484090 + }, + { + "epoch": 1.8713952157845093, + "grad_norm": 0.14551687240600586, + "learning_rate": 0.00016587895710234, + "loss": 2.1222, + "step": 484100 + }, + { + "epoch": 1.8714338729878925, + "grad_norm": 0.1561463177204132, + "learning_rate": 0.00016576991628640015, + "loss": 2.1394, + "step": 484110 + }, + { + "epoch": 1.8714725301912758, + "grad_norm": 0.1606985479593277, + "learning_rate": 0.0001656608819523038, + "loss": 2.1411, + "step": 484120 + }, + { + "epoch": 1.8715111873946593, + "grad_norm": 0.1565343588590622, + "learning_rate": 0.0001655518540988954, + "loss": 2.1393, + "step": 484130 + }, + { + "epoch": 1.8715498445980425, + "grad_norm": 0.15693065524101257, + "learning_rate": 0.0001654428327250197, + "loss": 2.1288, + "step": 484140 + }, + { + "epoch": 1.8715885018014258, + "grad_norm": 0.1524079442024231, + "learning_rate": 0.0001653338178295214, + "loss": 2.131, + "step": 484150 + }, + { + "epoch": 1.871627159004809, + "grad_norm": 0.15368495881557465, + "learning_rate": 0.00016522480941124563, + "loss": 2.1196, + "step": 484160 + }, + { + "epoch": 1.8716658162081923, + "grad_norm": 0.15699228644371033, + "learning_rate": 0.00016511580746903821, + "loss": 2.1361, + "step": 484170 + }, + { + "epoch": 1.8717044734115755, + "grad_norm": 0.15115079283714294, + "learning_rate": 0.00016500681200174472, + "loss": 2.1361, + "step": 484180 + }, + { + "epoch": 1.8717431306149588, + "grad_norm": 0.1602882593870163, + "learning_rate": 0.00016489782300821188, + "loss": 2.1227, + "step": 484190 + }, + { + "epoch": 1.871781787818342, + "grad_norm": 0.1549602597951889, + "learning_rate": 0.00016478884048728616, + "loss": 2.1352, + "step": 484200 + }, + { + "epoch": 1.8718204450217253, + "grad_norm": 0.15959705412387848, + "learning_rate": 0.00016467986443781423, + "loss": 2.1341, + "step": 484210 + }, + { + "epoch": 1.8718591022251085, + "grad_norm": 0.16147272288799286, + "learning_rate": 0.00016457089485864373, + "loss": 2.1306, + "step": 484220 + }, + { + "epoch": 1.8718977594284918, + "grad_norm": 0.2113785594701767, + "learning_rate": 0.00016446193174862202, + "loss": 2.1235, + "step": 484230 + }, + { + "epoch": 1.871936416631875, + "grad_norm": 0.16358286142349243, + "learning_rate": 0.00016435297510659753, + "loss": 2.1388, + "step": 484240 + }, + { + "epoch": 1.8719750738352583, + "grad_norm": 0.15772676467895508, + "learning_rate": 0.0001642440249314181, + "loss": 2.1265, + "step": 484250 + }, + { + "epoch": 1.8720137310386418, + "grad_norm": 0.14948904514312744, + "learning_rate": 0.00016413508122193265, + "loss": 2.1163, + "step": 484260 + }, + { + "epoch": 1.872052388242025, + "grad_norm": 0.154491126537323, + "learning_rate": 0.0001640261439769901, + "loss": 2.1356, + "step": 484270 + }, + { + "epoch": 1.8720910454454083, + "grad_norm": 0.18150001764297485, + "learning_rate": 0.00016391721319543984, + "loss": 2.1355, + "step": 484280 + }, + { + "epoch": 1.8721297026487915, + "grad_norm": 0.16020137071609497, + "learning_rate": 0.00016380828887613163, + "loss": 2.1295, + "step": 484290 + }, + { + "epoch": 1.872168359852175, + "grad_norm": 0.14903488755226135, + "learning_rate": 0.00016369937101791532, + "loss": 2.1428, + "step": 484300 + }, + { + "epoch": 1.8722070170555583, + "grad_norm": 0.15942107141017914, + "learning_rate": 0.00016359045961964136, + "loss": 2.1186, + "step": 484310 + }, + { + "epoch": 1.8722456742589415, + "grad_norm": 0.15722288191318512, + "learning_rate": 0.00016348155468016068, + "loss": 2.124, + "step": 484320 + }, + { + "epoch": 1.8722843314623248, + "grad_norm": 0.15270255506038666, + "learning_rate": 0.00016337265619832398, + "loss": 2.1225, + "step": 484330 + }, + { + "epoch": 1.872322988665708, + "grad_norm": 0.1539790779352188, + "learning_rate": 0.00016326376417298284, + "loss": 2.1341, + "step": 484340 + }, + { + "epoch": 1.8723616458690913, + "grad_norm": 0.16061554849147797, + "learning_rate": 0.00016315487860298905, + "loss": 2.1399, + "step": 484350 + }, + { + "epoch": 1.8724003030724745, + "grad_norm": 0.33266738057136536, + "learning_rate": 0.00016304599948719446, + "loss": 2.1191, + "step": 484360 + }, + { + "epoch": 1.8724389602758578, + "grad_norm": 0.18792271614074707, + "learning_rate": 0.00016293712682445194, + "loss": 2.1264, + "step": 484370 + }, + { + "epoch": 1.872477617479241, + "grad_norm": 0.21823568642139435, + "learning_rate": 0.00016282826061361379, + "loss": 2.1211, + "step": 484380 + }, + { + "epoch": 1.8725162746826243, + "grad_norm": 0.14920276403427124, + "learning_rate": 0.0001627194008535333, + "loss": 2.1313, + "step": 484390 + }, + { + "epoch": 1.8725549318860075, + "grad_norm": 0.16141065955162048, + "learning_rate": 0.0001626105475430637, + "loss": 2.1345, + "step": 484400 + }, + { + "epoch": 1.8725935890893908, + "grad_norm": 0.15054553747177124, + "learning_rate": 0.0001625017006810592, + "loss": 2.1196, + "step": 484410 + }, + { + "epoch": 1.8726322462927742, + "grad_norm": 0.16641885042190552, + "learning_rate": 0.00016239286026637357, + "loss": 2.1425, + "step": 484420 + }, + { + "epoch": 1.8726709034961575, + "grad_norm": 0.17120453715324402, + "learning_rate": 0.00016228402629786109, + "loss": 2.1292, + "step": 484430 + }, + { + "epoch": 1.8727095606995408, + "grad_norm": 0.16046109795570374, + "learning_rate": 0.00016217519877437714, + "loss": 2.1344, + "step": 484440 + }, + { + "epoch": 1.872748217902924, + "grad_norm": 0.15624983608722687, + "learning_rate": 0.00016206637769477638, + "loss": 2.1161, + "step": 484450 + }, + { + "epoch": 1.8727868751063073, + "grad_norm": 0.15509746968746185, + "learning_rate": 0.00016195756305791465, + "loss": 2.1338, + "step": 484460 + }, + { + "epoch": 1.8728255323096907, + "grad_norm": 0.16712747514247894, + "learning_rate": 0.00016184875486264727, + "loss": 2.1228, + "step": 484470 + }, + { + "epoch": 1.872864189513074, + "grad_norm": 0.18799036741256714, + "learning_rate": 0.00016173995310783074, + "loss": 2.1281, + "step": 484480 + }, + { + "epoch": 1.8729028467164572, + "grad_norm": 0.15011072158813477, + "learning_rate": 0.00016163115779232152, + "loss": 2.1337, + "step": 484490 + }, + { + "epoch": 1.8729415039198405, + "grad_norm": 0.15797634422779083, + "learning_rate": 0.00016152236891497652, + "loss": 2.1344, + "step": 484500 + }, + { + "epoch": 1.8729801611232237, + "grad_norm": 0.1545293778181076, + "learning_rate": 0.00016141358647465265, + "loss": 2.1312, + "step": 484510 + }, + { + "epoch": 1.873018818326607, + "grad_norm": 0.16722118854522705, + "learning_rate": 0.00016130481047020752, + "loss": 2.1385, + "step": 484520 + }, + { + "epoch": 1.8730574755299902, + "grad_norm": 0.15909487009048462, + "learning_rate": 0.00016119604090049865, + "loss": 2.1127, + "step": 484530 + }, + { + "epoch": 1.8730961327333735, + "grad_norm": 0.16210758686065674, + "learning_rate": 0.00016108727776438503, + "loss": 2.1274, + "step": 484540 + }, + { + "epoch": 1.8731347899367567, + "grad_norm": 0.16574212908744812, + "learning_rate": 0.00016097852106072442, + "loss": 2.133, + "step": 484550 + }, + { + "epoch": 1.87317344714014, + "grad_norm": 0.15301044285297394, + "learning_rate": 0.00016086977078837617, + "loss": 2.1248, + "step": 484560 + }, + { + "epoch": 1.8732121043435233, + "grad_norm": 0.1614498645067215, + "learning_rate": 0.00016076102694619922, + "loss": 2.1401, + "step": 484570 + }, + { + "epoch": 1.8732507615469065, + "grad_norm": 0.15972541272640228, + "learning_rate": 0.0001606522895330529, + "loss": 2.1208, + "step": 484580 + }, + { + "epoch": 1.87328941875029, + "grad_norm": 0.17323851585388184, + "learning_rate": 0.00016054355854779745, + "loss": 2.1266, + "step": 484590 + }, + { + "epoch": 1.8733280759536732, + "grad_norm": 0.14773207902908325, + "learning_rate": 0.00016043483398929292, + "loss": 2.1314, + "step": 484600 + }, + { + "epoch": 1.8733667331570565, + "grad_norm": 0.15913040935993195, + "learning_rate": 0.0001603261158564, + "loss": 2.1275, + "step": 484610 + }, + { + "epoch": 1.8734053903604397, + "grad_norm": 0.15760380029678345, + "learning_rate": 0.00016021740414797937, + "loss": 2.13, + "step": 484620 + }, + { + "epoch": 1.873444047563823, + "grad_norm": 0.15176202356815338, + "learning_rate": 0.00016010869886289193, + "loss": 2.1198, + "step": 484630 + }, + { + "epoch": 1.8734827047672065, + "grad_norm": 0.1578439623117447, + "learning_rate": 0.00015999999999999993, + "loss": 2.126, + "step": 484640 + }, + { + "epoch": 1.8735213619705897, + "grad_norm": 0.15178169310092926, + "learning_rate": 0.00015989130755816495, + "loss": 2.1295, + "step": 484650 + }, + { + "epoch": 1.873560019173973, + "grad_norm": 0.1513904482126236, + "learning_rate": 0.000159782621536249, + "loss": 2.1362, + "step": 484660 + }, + { + "epoch": 1.8735986763773562, + "grad_norm": 0.15508714318275452, + "learning_rate": 0.00015967394193311502, + "loss": 2.1361, + "step": 484670 + }, + { + "epoch": 1.8736373335807395, + "grad_norm": 0.1674482524394989, + "learning_rate": 0.00015956526874762544, + "loss": 2.1354, + "step": 484680 + }, + { + "epoch": 1.8736759907841227, + "grad_norm": 0.15818580985069275, + "learning_rate": 0.00015945660197864387, + "loss": 2.1283, + "step": 484690 + }, + { + "epoch": 1.873714647987506, + "grad_norm": 0.15632577240467072, + "learning_rate": 0.00015934794162503386, + "loss": 2.1207, + "step": 484700 + }, + { + "epoch": 1.8737533051908892, + "grad_norm": 0.1655329465866089, + "learning_rate": 0.000159239287685659, + "loss": 2.1354, + "step": 484710 + }, + { + "epoch": 1.8737919623942725, + "grad_norm": 0.1501639187335968, + "learning_rate": 0.00015913064015938395, + "loss": 2.1357, + "step": 484720 + }, + { + "epoch": 1.8738306195976557, + "grad_norm": 0.16501399874687195, + "learning_rate": 0.00015902199904507276, + "loss": 2.1292, + "step": 484730 + }, + { + "epoch": 1.873869276801039, + "grad_norm": 0.15810200572013855, + "learning_rate": 0.00015891336434159075, + "loss": 2.1133, + "step": 484740 + }, + { + "epoch": 1.8739079340044222, + "grad_norm": 0.15270966291427612, + "learning_rate": 0.0001588047360478031, + "loss": 2.1288, + "step": 484750 + }, + { + "epoch": 1.8739465912078057, + "grad_norm": 0.16167369484901428, + "learning_rate": 0.00015869611416257533, + "loss": 2.1416, + "step": 484760 + }, + { + "epoch": 1.873985248411189, + "grad_norm": 0.1594432145357132, + "learning_rate": 0.0001585874986847733, + "loss": 2.1347, + "step": 484770 + }, + { + "epoch": 1.8740239056145722, + "grad_norm": 0.1732734590768814, + "learning_rate": 0.00015847888961326316, + "loss": 2.1336, + "step": 484780 + }, + { + "epoch": 1.8740625628179555, + "grad_norm": 0.15716975927352905, + "learning_rate": 0.0001583702869469119, + "loss": 2.128, + "step": 484790 + }, + { + "epoch": 1.874101220021339, + "grad_norm": 0.14907538890838623, + "learning_rate": 0.00015826169068458595, + "loss": 2.1324, + "step": 484800 + }, + { + "epoch": 1.8741398772247222, + "grad_norm": 0.1508820801973343, + "learning_rate": 0.00015815310082515266, + "loss": 2.14, + "step": 484810 + }, + { + "epoch": 1.8741785344281054, + "grad_norm": 0.16098782420158386, + "learning_rate": 0.00015804451736748005, + "loss": 2.1336, + "step": 484820 + }, + { + "epoch": 1.8742171916314887, + "grad_norm": 0.19443218410015106, + "learning_rate": 0.00015793594031043523, + "loss": 2.119, + "step": 484830 + }, + { + "epoch": 1.874255848834872, + "grad_norm": 0.1581234484910965, + "learning_rate": 0.00015782736965288714, + "loss": 2.1249, + "step": 484840 + }, + { + "epoch": 1.8742945060382552, + "grad_norm": 0.14153549075126648, + "learning_rate": 0.0001577188053937042, + "loss": 2.1368, + "step": 484850 + }, + { + "epoch": 1.8743331632416385, + "grad_norm": 0.1574030965566635, + "learning_rate": 0.00015761024753175535, + "loss": 2.1379, + "step": 484860 + }, + { + "epoch": 1.8743718204450217, + "grad_norm": 0.17852358520030975, + "learning_rate": 0.00015750169606590947, + "loss": 2.1255, + "step": 484870 + }, + { + "epoch": 1.874410477648405, + "grad_norm": 0.15847718715667725, + "learning_rate": 0.0001573931509950366, + "loss": 2.1324, + "step": 484880 + }, + { + "epoch": 1.8744491348517882, + "grad_norm": 0.1581220179796219, + "learning_rate": 0.00015728461231800651, + "loss": 2.1266, + "step": 484890 + }, + { + "epoch": 1.8744877920551715, + "grad_norm": 0.1563778966665268, + "learning_rate": 0.0001571760800336892, + "loss": 2.1276, + "step": 484900 + }, + { + "epoch": 1.8745264492585547, + "grad_norm": 0.15693436563014984, + "learning_rate": 0.00015706755414095563, + "loss": 2.1275, + "step": 484910 + }, + { + "epoch": 1.874565106461938, + "grad_norm": 0.16170282661914825, + "learning_rate": 0.00015695903463867645, + "loss": 2.1097, + "step": 484920 + }, + { + "epoch": 1.8746037636653214, + "grad_norm": 0.1586095094680786, + "learning_rate": 0.00015685052152572277, + "loss": 2.114, + "step": 484930 + }, + { + "epoch": 1.8746424208687047, + "grad_norm": 0.16190335154533386, + "learning_rate": 0.0001567420148009666, + "loss": 2.1227, + "step": 484940 + }, + { + "epoch": 1.874681078072088, + "grad_norm": 0.15662255883216858, + "learning_rate": 0.00015663351446327956, + "loss": 2.1219, + "step": 484950 + }, + { + "epoch": 1.8747197352754712, + "grad_norm": 0.1621389091014862, + "learning_rate": 0.00015652502051153406, + "loss": 2.1306, + "step": 484960 + }, + { + "epoch": 1.8747583924788547, + "grad_norm": 0.1581934094429016, + "learning_rate": 0.00015641653294460255, + "loss": 2.1447, + "step": 484970 + }, + { + "epoch": 1.874797049682238, + "grad_norm": 0.15592055022716522, + "learning_rate": 0.00015630805176135775, + "loss": 2.1279, + "step": 484980 + }, + { + "epoch": 1.8748357068856212, + "grad_norm": 0.167986199259758, + "learning_rate": 0.00015619957696067345, + "loss": 2.1249, + "step": 484990 + }, + { + "epoch": 1.8748743640890044, + "grad_norm": 0.16340020298957825, + "learning_rate": 0.0001560911085414225, + "loss": 2.1389, + "step": 485000 + }, + { + "epoch": 1.8749130212923877, + "grad_norm": 0.15751740336418152, + "learning_rate": 0.00015598264650247938, + "loss": 2.1055, + "step": 485010 + }, + { + "epoch": 1.874951678495771, + "grad_norm": 0.16410748660564423, + "learning_rate": 0.00015587419084271814, + "loss": 2.1328, + "step": 485020 + }, + { + "epoch": 1.8749903356991542, + "grad_norm": 0.16635240614414215, + "learning_rate": 0.0001557657415610132, + "loss": 2.1311, + "step": 485030 + }, + { + "epoch": 1.8750289929025374, + "grad_norm": 0.15139134228229523, + "learning_rate": 0.00015565729865623946, + "loss": 2.1283, + "step": 485040 + }, + { + "epoch": 1.8750676501059207, + "grad_norm": 0.16576196253299713, + "learning_rate": 0.00015554886212727225, + "loss": 2.124, + "step": 485050 + }, + { + "epoch": 1.875106307309304, + "grad_norm": 0.1514219492673874, + "learning_rate": 0.00015544043197298718, + "loss": 2.1285, + "step": 485060 + }, + { + "epoch": 1.8751449645126872, + "grad_norm": 0.18162968754768372, + "learning_rate": 0.00015533200819226002, + "loss": 2.1296, + "step": 485070 + }, + { + "epoch": 1.8751836217160704, + "grad_norm": 0.1785215139389038, + "learning_rate": 0.0001552235907839672, + "loss": 2.1444, + "step": 485080 + }, + { + "epoch": 1.8752222789194537, + "grad_norm": 0.15658053755760193, + "learning_rate": 0.00015511517974698498, + "loss": 2.1308, + "step": 485090 + }, + { + "epoch": 1.8752609361228372, + "grad_norm": 0.15919972956180573, + "learning_rate": 0.00015500677508019023, + "loss": 2.1163, + "step": 485100 + }, + { + "epoch": 1.8752995933262204, + "grad_norm": 1.2751634120941162, + "learning_rate": 0.00015489837678246031, + "loss": 2.1182, + "step": 485110 + }, + { + "epoch": 1.8753382505296037, + "grad_norm": 0.15868133306503296, + "learning_rate": 0.00015478998485267281, + "loss": 2.1322, + "step": 485120 + }, + { + "epoch": 1.875376907732987, + "grad_norm": 0.1583377718925476, + "learning_rate": 0.00015468159928970525, + "loss": 2.1199, + "step": 485130 + }, + { + "epoch": 1.8754155649363704, + "grad_norm": 0.1486121267080307, + "learning_rate": 0.00015457322009243614, + "loss": 2.1289, + "step": 485140 + }, + { + "epoch": 1.8754542221397537, + "grad_norm": 0.15714432299137115, + "learning_rate": 0.00015446484725974407, + "loss": 2.1399, + "step": 485150 + }, + { + "epoch": 1.875492879343137, + "grad_norm": 0.1527232676744461, + "learning_rate": 0.00015435648079050758, + "loss": 2.1301, + "step": 485160 + }, + { + "epoch": 1.8755315365465202, + "grad_norm": 0.15908005833625793, + "learning_rate": 0.00015424812068360615, + "loss": 2.1265, + "step": 485170 + }, + { + "epoch": 1.8755701937499034, + "grad_norm": 0.16998544335365295, + "learning_rate": 0.00015413976693791898, + "loss": 2.1127, + "step": 485180 + }, + { + "epoch": 1.8756088509532867, + "grad_norm": 0.15736450254917145, + "learning_rate": 0.00015403141955232623, + "loss": 2.1282, + "step": 485190 + }, + { + "epoch": 1.87564750815667, + "grad_norm": 0.16208943724632263, + "learning_rate": 0.00015392307852570776, + "loss": 2.1342, + "step": 485200 + }, + { + "epoch": 1.8756861653600532, + "grad_norm": 0.17129312455654144, + "learning_rate": 0.00015381474385694439, + "loss": 2.1387, + "step": 485210 + }, + { + "epoch": 1.8757248225634364, + "grad_norm": 0.17631563544273376, + "learning_rate": 0.0001537064155449166, + "loss": 2.1307, + "step": 485220 + }, + { + "epoch": 1.8757634797668197, + "grad_norm": 0.16245117783546448, + "learning_rate": 0.00015359809358850597, + "loss": 2.1263, + "step": 485230 + }, + { + "epoch": 1.875802136970203, + "grad_norm": 0.1504349410533905, + "learning_rate": 0.00015348977798659337, + "loss": 2.1237, + "step": 485240 + }, + { + "epoch": 1.8758407941735862, + "grad_norm": 0.17673885822296143, + "learning_rate": 0.00015338146873806124, + "loss": 2.1216, + "step": 485250 + }, + { + "epoch": 1.8758794513769694, + "grad_norm": 0.17802615463733673, + "learning_rate": 0.0001532731658417912, + "loss": 2.1193, + "step": 485260 + }, + { + "epoch": 1.875918108580353, + "grad_norm": 0.6284236311912537, + "learning_rate": 0.0001531648692966663, + "loss": 2.1408, + "step": 485270 + }, + { + "epoch": 1.8759567657837362, + "grad_norm": 0.1544286012649536, + "learning_rate": 0.00015305657910156877, + "loss": 2.0998, + "step": 485280 + }, + { + "epoch": 1.8759954229871194, + "grad_norm": 0.16721487045288086, + "learning_rate": 0.00015294829525538178, + "loss": 2.1216, + "step": 485290 + }, + { + "epoch": 1.8760340801905027, + "grad_norm": 0.16701914370059967, + "learning_rate": 0.0001528400177569893, + "loss": 2.1176, + "step": 485300 + }, + { + "epoch": 1.8760727373938861, + "grad_norm": 0.14934012293815613, + "learning_rate": 0.00015273174660527446, + "loss": 2.1364, + "step": 485310 + }, + { + "epoch": 1.8761113945972694, + "grad_norm": 0.15575703978538513, + "learning_rate": 0.00015262348179912165, + "loss": 2.1406, + "step": 485320 + }, + { + "epoch": 1.8761500518006526, + "grad_norm": 0.20635825395584106, + "learning_rate": 0.00015251522333741541, + "loss": 2.124, + "step": 485330 + }, + { + "epoch": 1.876188709004036, + "grad_norm": 0.15215198695659637, + "learning_rate": 0.00015240697121904034, + "loss": 2.1229, + "step": 485340 + }, + { + "epoch": 1.8762273662074191, + "grad_norm": 0.16519252955913544, + "learning_rate": 0.00015229872544288159, + "loss": 2.128, + "step": 485350 + }, + { + "epoch": 1.8762660234108024, + "grad_norm": 0.17716598510742188, + "learning_rate": 0.00015219048600782447, + "loss": 2.1367, + "step": 485360 + }, + { + "epoch": 1.8763046806141856, + "grad_norm": 0.1608271300792694, + "learning_rate": 0.00015208225291275478, + "loss": 2.1294, + "step": 485370 + }, + { + "epoch": 1.876343337817569, + "grad_norm": 0.15679559111595154, + "learning_rate": 0.00015197402615655853, + "loss": 2.1329, + "step": 485380 + }, + { + "epoch": 1.8763819950209522, + "grad_norm": 0.14653101563453674, + "learning_rate": 0.00015186580573812235, + "loss": 2.1109, + "step": 485390 + }, + { + "epoch": 1.8764206522243354, + "grad_norm": 0.16167862713336945, + "learning_rate": 0.00015175759165633252, + "loss": 2.1208, + "step": 485400 + }, + { + "epoch": 1.8764593094277187, + "grad_norm": 0.15243785083293915, + "learning_rate": 0.00015164938391007654, + "loss": 2.124, + "step": 485410 + }, + { + "epoch": 1.876497966631102, + "grad_norm": 0.17141318321228027, + "learning_rate": 0.00015154118249824134, + "loss": 2.1336, + "step": 485420 + }, + { + "epoch": 1.8765366238344852, + "grad_norm": 0.15932010114192963, + "learning_rate": 0.0001514329874197149, + "loss": 2.1272, + "step": 485430 + }, + { + "epoch": 1.8765752810378686, + "grad_norm": 0.1552756428718567, + "learning_rate": 0.000151324798673385, + "loss": 2.1196, + "step": 485440 + }, + { + "epoch": 1.8766139382412519, + "grad_norm": 0.16936630010604858, + "learning_rate": 0.00015121661625814033, + "loss": 2.1271, + "step": 485450 + }, + { + "epoch": 1.8766525954446351, + "grad_norm": 0.15281683206558228, + "learning_rate": 0.0001511084401728693, + "loss": 2.129, + "step": 485460 + }, + { + "epoch": 1.8766912526480184, + "grad_norm": 0.15728114545345306, + "learning_rate": 0.00015100027041646102, + "loss": 2.1303, + "step": 485470 + }, + { + "epoch": 1.8767299098514019, + "grad_norm": 0.1558963656425476, + "learning_rate": 0.00015089210698780486, + "loss": 2.137, + "step": 485480 + }, + { + "epoch": 1.8767685670547851, + "grad_norm": 0.15373340249061584, + "learning_rate": 0.00015078394988579015, + "loss": 2.1224, + "step": 485490 + }, + { + "epoch": 1.8768072242581684, + "grad_norm": 0.15859030187129974, + "learning_rate": 0.00015067579910930706, + "loss": 2.1145, + "step": 485500 + }, + { + "epoch": 1.8768458814615516, + "grad_norm": 0.17240332067012787, + "learning_rate": 0.00015056765465724586, + "loss": 2.1175, + "step": 485510 + }, + { + "epoch": 1.8768845386649349, + "grad_norm": 0.16093796491622925, + "learning_rate": 0.00015045951652849744, + "loss": 2.1186, + "step": 485520 + }, + { + "epoch": 1.8769231958683181, + "grad_norm": 0.15308769047260284, + "learning_rate": 0.0001503513847219522, + "loss": 2.1228, + "step": 485530 + }, + { + "epoch": 1.8769618530717014, + "grad_norm": 0.15490011870861053, + "learning_rate": 0.0001502432592365015, + "loss": 2.1218, + "step": 485540 + }, + { + "epoch": 1.8770005102750846, + "grad_norm": 0.15395839512348175, + "learning_rate": 0.00015013514007103756, + "loss": 2.1373, + "step": 485550 + }, + { + "epoch": 1.8770391674784679, + "grad_norm": 0.15664257109165192, + "learning_rate": 0.00015002702722445149, + "loss": 2.1299, + "step": 485560 + }, + { + "epoch": 1.8770778246818511, + "grad_norm": 0.15513335168361664, + "learning_rate": 0.00014991892069563618, + "loss": 2.1108, + "step": 485570 + }, + { + "epoch": 1.8771164818852344, + "grad_norm": 0.15297462046146393, + "learning_rate": 0.0001498108204834836, + "loss": 2.1336, + "step": 485580 + }, + { + "epoch": 1.8771551390886176, + "grad_norm": 0.16893671452999115, + "learning_rate": 0.0001497027265868871, + "loss": 2.1354, + "step": 485590 + }, + { + "epoch": 1.877193796292001, + "grad_norm": 0.1585252583026886, + "learning_rate": 0.00014959463900473958, + "loss": 2.1261, + "step": 485600 + }, + { + "epoch": 1.8772324534953844, + "grad_norm": 0.1679084599018097, + "learning_rate": 0.00014948655773593477, + "loss": 2.1334, + "step": 485610 + }, + { + "epoch": 1.8772711106987676, + "grad_norm": 0.15539462864398956, + "learning_rate": 0.0001493784827793665, + "loss": 2.1448, + "step": 485620 + }, + { + "epoch": 1.8773097679021509, + "grad_norm": 0.15404072403907776, + "learning_rate": 0.00014927041413392871, + "loss": 2.132, + "step": 485630 + }, + { + "epoch": 1.8773484251055341, + "grad_norm": 0.17238987982273102, + "learning_rate": 0.00014916235179851612, + "loss": 2.1136, + "step": 485640 + }, + { + "epoch": 1.8773870823089176, + "grad_norm": 0.15702857077121735, + "learning_rate": 0.00014905429577202333, + "loss": 2.1272, + "step": 485650 + }, + { + "epoch": 1.8774257395123009, + "grad_norm": 0.1531582623720169, + "learning_rate": 0.00014894624605334594, + "loss": 2.1256, + "step": 485660 + }, + { + "epoch": 1.877464396715684, + "grad_norm": 0.16453592479228973, + "learning_rate": 0.00014883820264137903, + "loss": 2.1306, + "step": 485670 + }, + { + "epoch": 1.8775030539190674, + "grad_norm": 0.1602281630039215, + "learning_rate": 0.00014873016553501839, + "loss": 2.1412, + "step": 485680 + }, + { + "epoch": 1.8775417111224506, + "grad_norm": 0.15847299993038177, + "learning_rate": 0.00014862213473316045, + "loss": 2.1222, + "step": 485690 + }, + { + "epoch": 1.8775803683258339, + "grad_norm": 0.15805181860923767, + "learning_rate": 0.00014851411023470118, + "loss": 2.1125, + "step": 485700 + }, + { + "epoch": 1.8776190255292171, + "grad_norm": 0.1721765697002411, + "learning_rate": 0.00014840609203853772, + "loss": 2.1191, + "step": 485710 + }, + { + "epoch": 1.8776576827326004, + "grad_norm": 0.16002759337425232, + "learning_rate": 0.00014829808014356694, + "loss": 2.1325, + "step": 485720 + }, + { + "epoch": 1.8776963399359836, + "grad_norm": 0.1535620242357254, + "learning_rate": 0.00014819007454868637, + "loss": 2.1066, + "step": 485730 + }, + { + "epoch": 1.8777349971393669, + "grad_norm": 0.1585405021905899, + "learning_rate": 0.00014808207525279337, + "loss": 2.1305, + "step": 485740 + }, + { + "epoch": 1.8777736543427501, + "grad_norm": 0.15157096087932587, + "learning_rate": 0.0001479740822547866, + "loss": 2.1197, + "step": 485750 + }, + { + "epoch": 1.8778123115461334, + "grad_norm": 0.15094926953315735, + "learning_rate": 0.000147866095553564, + "loss": 2.1225, + "step": 485760 + }, + { + "epoch": 1.8778509687495166, + "grad_norm": 0.1576872318983078, + "learning_rate": 0.00014775811514802428, + "loss": 2.1347, + "step": 485770 + }, + { + "epoch": 1.8778896259529, + "grad_norm": 0.16082343459129333, + "learning_rate": 0.00014765014103706631, + "loss": 2.1338, + "step": 485780 + }, + { + "epoch": 1.8779282831562834, + "grad_norm": 0.1641700267791748, + "learning_rate": 0.00014754217321958984, + "loss": 2.1274, + "step": 485790 + }, + { + "epoch": 1.8779669403596666, + "grad_norm": 0.15711650252342224, + "learning_rate": 0.00014743421169449424, + "loss": 2.1242, + "step": 485800 + }, + { + "epoch": 1.8780055975630499, + "grad_norm": 0.16527198255062103, + "learning_rate": 0.00014732625646067944, + "loss": 2.1207, + "step": 485810 + }, + { + "epoch": 1.8780442547664333, + "grad_norm": 0.1620732545852661, + "learning_rate": 0.0001472183075170459, + "loss": 2.1259, + "step": 485820 + }, + { + "epoch": 1.8780829119698166, + "grad_norm": 0.15371723473072052, + "learning_rate": 0.00014711036486249408, + "loss": 2.1207, + "step": 485830 + }, + { + "epoch": 1.8781215691731998, + "grad_norm": 0.15590305626392365, + "learning_rate": 0.00014700242849592483, + "loss": 2.1269, + "step": 485840 + }, + { + "epoch": 1.878160226376583, + "grad_norm": 0.18628154695034027, + "learning_rate": 0.00014689449841623968, + "loss": 2.1157, + "step": 485850 + }, + { + "epoch": 1.8781988835799663, + "grad_norm": 0.16660727560520172, + "learning_rate": 0.00014678657462234, + "loss": 2.1185, + "step": 485860 + }, + { + "epoch": 1.8782375407833496, + "grad_norm": 0.16076427698135376, + "learning_rate": 0.00014667865711312755, + "loss": 2.1309, + "step": 485870 + }, + { + "epoch": 1.8782761979867328, + "grad_norm": 0.16650605201721191, + "learning_rate": 0.00014657074588750497, + "loss": 2.1267, + "step": 485880 + }, + { + "epoch": 1.878314855190116, + "grad_norm": 0.15219688415527344, + "learning_rate": 0.00014646284094437423, + "loss": 2.1206, + "step": 485890 + }, + { + "epoch": 1.8783535123934993, + "grad_norm": 0.15262344479560852, + "learning_rate": 0.00014635494228263868, + "loss": 2.1256, + "step": 485900 + }, + { + "epoch": 1.8783921695968826, + "grad_norm": 0.15145814418792725, + "learning_rate": 0.00014624704990120118, + "loss": 2.1177, + "step": 485910 + }, + { + "epoch": 1.8784308268002659, + "grad_norm": 0.16400784254074097, + "learning_rate": 0.0001461391637989653, + "loss": 2.1161, + "step": 485920 + }, + { + "epoch": 1.878469484003649, + "grad_norm": 0.15878716111183167, + "learning_rate": 0.00014603128397483477, + "loss": 2.1112, + "step": 485930 + }, + { + "epoch": 1.8785081412070324, + "grad_norm": 0.16059978306293488, + "learning_rate": 0.00014592341042771363, + "loss": 2.119, + "step": 485940 + }, + { + "epoch": 1.8785467984104158, + "grad_norm": 0.16229532659053802, + "learning_rate": 0.00014581554315650668, + "loss": 2.1209, + "step": 485950 + }, + { + "epoch": 1.878585455613799, + "grad_norm": 0.1519196480512619, + "learning_rate": 0.00014570768216011842, + "loss": 2.1122, + "step": 485960 + }, + { + "epoch": 1.8786241128171823, + "grad_norm": 0.1595258116722107, + "learning_rate": 0.00014559982743745414, + "loss": 2.1316, + "step": 485970 + }, + { + "epoch": 1.8786627700205656, + "grad_norm": 0.17319795489311218, + "learning_rate": 0.00014549197898741894, + "loss": 2.1407, + "step": 485980 + }, + { + "epoch": 1.878701427223949, + "grad_norm": 0.15375776588916779, + "learning_rate": 0.0001453841368089186, + "loss": 2.1252, + "step": 485990 + }, + { + "epoch": 1.8787400844273323, + "grad_norm": 0.1607261598110199, + "learning_rate": 0.0001452763009008593, + "loss": 2.1285, + "step": 486000 + }, + { + "epoch": 1.8787787416307156, + "grad_norm": 0.15706588327884674, + "learning_rate": 0.00014516847126214727, + "loss": 2.114, + "step": 486010 + }, + { + "epoch": 1.8788173988340988, + "grad_norm": 0.17170752584934235, + "learning_rate": 0.00014506064789168916, + "loss": 2.1228, + "step": 486020 + }, + { + "epoch": 1.878856056037482, + "grad_norm": 0.1548837274312973, + "learning_rate": 0.00014495283078839205, + "loss": 2.1338, + "step": 486030 + }, + { + "epoch": 1.8788947132408653, + "grad_norm": 0.15781624615192413, + "learning_rate": 0.00014484501995116306, + "loss": 2.1202, + "step": 486040 + }, + { + "epoch": 1.8789333704442486, + "grad_norm": 0.16095009446144104, + "learning_rate": 0.00014473721537891016, + "loss": 2.125, + "step": 486050 + }, + { + "epoch": 1.8789720276476318, + "grad_norm": 0.14927291870117188, + "learning_rate": 0.00014462941707054112, + "loss": 2.1126, + "step": 486060 + }, + { + "epoch": 1.879010684851015, + "grad_norm": 0.15893907845020294, + "learning_rate": 0.00014452162502496412, + "loss": 2.1177, + "step": 486070 + }, + { + "epoch": 1.8790493420543983, + "grad_norm": 0.16693106293678284, + "learning_rate": 0.00014441383924108765, + "loss": 2.1403, + "step": 486080 + }, + { + "epoch": 1.8790879992577816, + "grad_norm": 0.15528330206871033, + "learning_rate": 0.00014430605971782075, + "loss": 2.1151, + "step": 486090 + }, + { + "epoch": 1.8791266564611648, + "grad_norm": 0.16108617186546326, + "learning_rate": 0.00014419828645407273, + "loss": 2.1045, + "step": 486100 + }, + { + "epoch": 1.879165313664548, + "grad_norm": 0.16097860038280487, + "learning_rate": 0.00014409051944875296, + "loss": 2.1268, + "step": 486110 + }, + { + "epoch": 1.8792039708679316, + "grad_norm": 0.16463223099708557, + "learning_rate": 0.00014398275870077115, + "loss": 2.1292, + "step": 486120 + }, + { + "epoch": 1.8792426280713148, + "grad_norm": 0.14849969744682312, + "learning_rate": 0.00014387500420903775, + "loss": 2.1199, + "step": 486130 + }, + { + "epoch": 1.879281285274698, + "grad_norm": 0.1572045236825943, + "learning_rate": 0.00014376725597246276, + "loss": 2.1243, + "step": 486140 + }, + { + "epoch": 1.8793199424780813, + "grad_norm": 0.15840677917003632, + "learning_rate": 0.0001436595139899577, + "loss": 2.1358, + "step": 486150 + }, + { + "epoch": 1.8793585996814648, + "grad_norm": 0.18019139766693115, + "learning_rate": 0.00014355177826043318, + "loss": 2.1158, + "step": 486160 + }, + { + "epoch": 1.879397256884848, + "grad_norm": 0.15955372154712677, + "learning_rate": 0.00014344404878280058, + "loss": 2.1315, + "step": 486170 + }, + { + "epoch": 1.8794359140882313, + "grad_norm": 0.16374975442886353, + "learning_rate": 0.00014333632555597187, + "loss": 2.103, + "step": 486180 + }, + { + "epoch": 1.8794745712916145, + "grad_norm": 0.1583992838859558, + "learning_rate": 0.000143228608578859, + "loss": 2.1356, + "step": 486190 + }, + { + "epoch": 1.8795132284949978, + "grad_norm": 0.16933327913284302, + "learning_rate": 0.0001431208978503744, + "loss": 2.1388, + "step": 486200 + }, + { + "epoch": 1.879551885698381, + "grad_norm": 0.15624232590198517, + "learning_rate": 0.00014301319336943052, + "loss": 2.1248, + "step": 486210 + }, + { + "epoch": 1.8795905429017643, + "grad_norm": 0.16327272355556488, + "learning_rate": 0.00014290549513494067, + "loss": 2.1302, + "step": 486220 + }, + { + "epoch": 1.8796292001051476, + "grad_norm": 0.15281060338020325, + "learning_rate": 0.00014279780314581793, + "loss": 2.1258, + "step": 486230 + }, + { + "epoch": 1.8796678573085308, + "grad_norm": 0.1542748659849167, + "learning_rate": 0.00014269011740097604, + "loss": 2.1019, + "step": 486240 + }, + { + "epoch": 1.879706514511914, + "grad_norm": 0.15855109691619873, + "learning_rate": 0.000142582437899329, + "loss": 2.1122, + "step": 486250 + }, + { + "epoch": 1.8797451717152973, + "grad_norm": 0.16607624292373657, + "learning_rate": 0.00014247476463979104, + "loss": 2.1334, + "step": 486260 + }, + { + "epoch": 1.8797838289186806, + "grad_norm": 0.1596800535917282, + "learning_rate": 0.00014236709762127653, + "loss": 2.1187, + "step": 486270 + }, + { + "epoch": 1.879822486122064, + "grad_norm": 0.1803329885005951, + "learning_rate": 0.0001422594368427006, + "loss": 2.1246, + "step": 486280 + }, + { + "epoch": 1.8798611433254473, + "grad_norm": 0.1732296645641327, + "learning_rate": 0.0001421517823029783, + "loss": 2.1277, + "step": 486290 + }, + { + "epoch": 1.8798998005288305, + "grad_norm": 0.159649059176445, + "learning_rate": 0.00014204413400102523, + "loss": 2.1386, + "step": 486300 + }, + { + "epoch": 1.8799384577322138, + "grad_norm": 0.15590937435626984, + "learning_rate": 0.0001419364919357573, + "loss": 2.1168, + "step": 486310 + }, + { + "epoch": 1.879977114935597, + "grad_norm": 0.16680322587490082, + "learning_rate": 0.00014182885610609054, + "loss": 2.1175, + "step": 486320 + }, + { + "epoch": 1.8800157721389805, + "grad_norm": 0.19824039936065674, + "learning_rate": 0.00014172122651094155, + "loss": 2.1226, + "step": 486330 + }, + { + "epoch": 1.8800544293423638, + "grad_norm": 0.16763262450695038, + "learning_rate": 0.00014161360314922679, + "loss": 2.122, + "step": 486340 + }, + { + "epoch": 1.880093086545747, + "grad_norm": 0.15511512756347656, + "learning_rate": 0.0001415059860198633, + "loss": 2.1234, + "step": 486350 + }, + { + "epoch": 1.8801317437491303, + "grad_norm": 0.155037060379982, + "learning_rate": 0.00014139837512176911, + "loss": 2.1265, + "step": 486360 + }, + { + "epoch": 1.8801704009525135, + "grad_norm": 0.16194379329681396, + "learning_rate": 0.00014129077045386151, + "loss": 2.1276, + "step": 486370 + }, + { + "epoch": 1.8802090581558968, + "grad_norm": 0.1541333943605423, + "learning_rate": 0.00014118317201505847, + "loss": 2.1315, + "step": 486380 + }, + { + "epoch": 1.88024771535928, + "grad_norm": 0.16888245940208435, + "learning_rate": 0.00014107557980427843, + "loss": 2.1098, + "step": 486390 + }, + { + "epoch": 1.8802863725626633, + "grad_norm": 0.15296173095703125, + "learning_rate": 0.00014096799382044, + "loss": 2.1297, + "step": 486400 + }, + { + "epoch": 1.8803250297660465, + "grad_norm": 0.1709115356206894, + "learning_rate": 0.00014086041406246208, + "loss": 2.1034, + "step": 486410 + }, + { + "epoch": 1.8803636869694298, + "grad_norm": 0.15439197421073914, + "learning_rate": 0.00014075284052926417, + "loss": 2.1248, + "step": 486420 + }, + { + "epoch": 1.880402344172813, + "grad_norm": 0.16216754913330078, + "learning_rate": 0.0001406452732197656, + "loss": 2.1247, + "step": 486430 + }, + { + "epoch": 1.8804410013761963, + "grad_norm": 0.16694015264511108, + "learning_rate": 0.00014053771213288658, + "loss": 2.1192, + "step": 486440 + }, + { + "epoch": 1.8804796585795798, + "grad_norm": 0.16613997519016266, + "learning_rate": 0.00014043015726754703, + "loss": 2.1379, + "step": 486450 + }, + { + "epoch": 1.880518315782963, + "grad_norm": 0.16087213158607483, + "learning_rate": 0.00014032260862266766, + "loss": 2.1101, + "step": 486460 + }, + { + "epoch": 1.8805569729863463, + "grad_norm": 0.1598331481218338, + "learning_rate": 0.00014021506619716907, + "loss": 2.1167, + "step": 486470 + }, + { + "epoch": 1.8805956301897295, + "grad_norm": 0.15491698682308197, + "learning_rate": 0.00014010752998997277, + "loss": 2.111, + "step": 486480 + }, + { + "epoch": 1.8806342873931128, + "grad_norm": 0.16934970021247864, + "learning_rate": 0.0001399999999999999, + "loss": 2.1386, + "step": 486490 + }, + { + "epoch": 1.8806729445964963, + "grad_norm": 0.15546073019504547, + "learning_rate": 0.00013989247622617261, + "loss": 2.1342, + "step": 486500 + }, + { + "epoch": 1.8807116017998795, + "grad_norm": 0.15205851197242737, + "learning_rate": 0.00013978495866741246, + "loss": 2.1296, + "step": 486510 + }, + { + "epoch": 1.8807502590032628, + "grad_norm": 0.16739751398563385, + "learning_rate": 0.0001396774473226423, + "loss": 2.112, + "step": 486520 + }, + { + "epoch": 1.880788916206646, + "grad_norm": 0.15638157725334167, + "learning_rate": 0.0001395699421907848, + "loss": 2.1197, + "step": 486530 + }, + { + "epoch": 1.8808275734100293, + "grad_norm": 0.16130010783672333, + "learning_rate": 0.00013946244327076273, + "loss": 2.132, + "step": 486540 + }, + { + "epoch": 1.8808662306134125, + "grad_norm": 0.1580820232629776, + "learning_rate": 0.0001393549505614995, + "loss": 2.1167, + "step": 486550 + }, + { + "epoch": 1.8809048878167958, + "grad_norm": 0.15250080823898315, + "learning_rate": 0.0001392474640619188, + "loss": 2.1264, + "step": 486560 + }, + { + "epoch": 1.880943545020179, + "grad_norm": 0.1518588364124298, + "learning_rate": 0.0001391399837709446, + "loss": 2.1198, + "step": 486570 + }, + { + "epoch": 1.8809822022235623, + "grad_norm": 0.161702960729599, + "learning_rate": 0.00013903250968750136, + "loss": 2.1373, + "step": 486580 + }, + { + "epoch": 1.8810208594269455, + "grad_norm": 0.16377724707126617, + "learning_rate": 0.00013892504181051325, + "loss": 2.1427, + "step": 486590 + }, + { + "epoch": 1.8810595166303288, + "grad_norm": 0.15726323425769806, + "learning_rate": 0.00013881758013890532, + "loss": 2.1385, + "step": 486600 + }, + { + "epoch": 1.881098173833712, + "grad_norm": 0.16078706085681915, + "learning_rate": 0.00013871012467160293, + "loss": 2.1095, + "step": 486610 + }, + { + "epoch": 1.8811368310370955, + "grad_norm": 0.16197741031646729, + "learning_rate": 0.00013860267540753135, + "loss": 2.1102, + "step": 486620 + }, + { + "epoch": 1.8811754882404788, + "grad_norm": 0.15010589361190796, + "learning_rate": 0.00013849523234561655, + "loss": 2.1181, + "step": 486630 + }, + { + "epoch": 1.881214145443862, + "grad_norm": 0.168479323387146, + "learning_rate": 0.00013838779548478475, + "loss": 2.1193, + "step": 486640 + }, + { + "epoch": 1.8812528026472453, + "grad_norm": 0.3779033422470093, + "learning_rate": 0.00013828036482396188, + "loss": 2.1389, + "step": 486650 + }, + { + "epoch": 1.8812914598506285, + "grad_norm": 0.1739717423915863, + "learning_rate": 0.00013817294036207528, + "loss": 2.1246, + "step": 486660 + }, + { + "epoch": 1.881330117054012, + "grad_norm": 0.1617521196603775, + "learning_rate": 0.0001380655220980518, + "loss": 2.1083, + "step": 486670 + }, + { + "epoch": 1.8813687742573952, + "grad_norm": 0.17464694380760193, + "learning_rate": 0.00013795811003081894, + "loss": 2.1222, + "step": 486680 + }, + { + "epoch": 1.8814074314607785, + "grad_norm": 0.17022061347961426, + "learning_rate": 0.00013785070415930402, + "loss": 2.129, + "step": 486690 + }, + { + "epoch": 1.8814460886641617, + "grad_norm": 0.16917157173156738, + "learning_rate": 0.0001377433044824352, + "loss": 2.1218, + "step": 486700 + }, + { + "epoch": 1.881484745867545, + "grad_norm": 0.15667511522769928, + "learning_rate": 0.00013763591099914097, + "loss": 2.1158, + "step": 486710 + }, + { + "epoch": 1.8815234030709282, + "grad_norm": 0.16120067238807678, + "learning_rate": 0.00013752852370834946, + "loss": 2.1243, + "step": 486720 + }, + { + "epoch": 1.8815620602743115, + "grad_norm": 0.17816998064517975, + "learning_rate": 0.0001374211426089902, + "loss": 2.1227, + "step": 486730 + }, + { + "epoch": 1.8816007174776948, + "grad_norm": 0.1660146415233612, + "learning_rate": 0.0001373137676999918, + "loss": 2.1195, + "step": 486740 + }, + { + "epoch": 1.881639374681078, + "grad_norm": 0.15490961074829102, + "learning_rate": 0.00013720639898028407, + "loss": 2.123, + "step": 486750 + }, + { + "epoch": 1.8816780318844613, + "grad_norm": 0.1545129418373108, + "learning_rate": 0.00013709903644879717, + "loss": 2.1264, + "step": 486760 + }, + { + "epoch": 1.8817166890878445, + "grad_norm": 0.1632499098777771, + "learning_rate": 0.0001369916801044606, + "loss": 2.1117, + "step": 486770 + }, + { + "epoch": 1.8817553462912278, + "grad_norm": 0.1726263165473938, + "learning_rate": 0.0001368843299462055, + "loss": 2.1142, + "step": 486780 + }, + { + "epoch": 1.8817940034946112, + "grad_norm": 0.16340069472789764, + "learning_rate": 0.00013677698597296196, + "loss": 2.1167, + "step": 486790 + }, + { + "epoch": 1.8818326606979945, + "grad_norm": 0.1490408033132553, + "learning_rate": 0.00013666964818366157, + "loss": 2.1218, + "step": 486800 + }, + { + "epoch": 1.8818713179013777, + "grad_norm": 0.16850118339061737, + "learning_rate": 0.00013656231657723517, + "loss": 2.1352, + "step": 486810 + }, + { + "epoch": 1.881909975104761, + "grad_norm": 0.16097033023834229, + "learning_rate": 0.00013645499115261516, + "loss": 2.1231, + "step": 486820 + }, + { + "epoch": 1.8819486323081445, + "grad_norm": 0.163710355758667, + "learning_rate": 0.00013634767190873288, + "loss": 2.1355, + "step": 486830 + }, + { + "epoch": 1.8819872895115277, + "grad_norm": 0.15214866399765015, + "learning_rate": 0.0001362403588445209, + "loss": 2.1244, + "step": 486840 + }, + { + "epoch": 1.882025946714911, + "grad_norm": 0.15033996105194092, + "learning_rate": 0.0001361330519589119, + "loss": 2.1294, + "step": 486850 + }, + { + "epoch": 1.8820646039182942, + "grad_norm": 0.1694653034210205, + "learning_rate": 0.0001360257512508387, + "loss": 2.1317, + "step": 486860 + }, + { + "epoch": 1.8821032611216775, + "grad_norm": 0.1655498743057251, + "learning_rate": 0.00013591845671923465, + "loss": 2.1338, + "step": 486870 + }, + { + "epoch": 1.8821419183250607, + "grad_norm": 0.17144210636615753, + "learning_rate": 0.00013581116836303297, + "loss": 2.1327, + "step": 486880 + }, + { + "epoch": 1.882180575528444, + "grad_norm": 0.17053934931755066, + "learning_rate": 0.0001357038861811679, + "loss": 2.1095, + "step": 486890 + }, + { + "epoch": 1.8822192327318272, + "grad_norm": 0.1776016503572464, + "learning_rate": 0.00013559661017257317, + "loss": 2.1428, + "step": 486900 + }, + { + "epoch": 1.8822578899352105, + "grad_norm": 0.1560843139886856, + "learning_rate": 0.00013548934033618366, + "loss": 2.1265, + "step": 486910 + }, + { + "epoch": 1.8822965471385937, + "grad_norm": 0.16443794965744019, + "learning_rate": 0.0001353820766709337, + "loss": 2.1147, + "step": 486920 + }, + { + "epoch": 1.882335204341977, + "grad_norm": 0.15588612854480743, + "learning_rate": 0.00013527481917575867, + "loss": 2.1185, + "step": 486930 + }, + { + "epoch": 1.8823738615453602, + "grad_norm": 0.16323307156562805, + "learning_rate": 0.0001351675678495936, + "loss": 2.1171, + "step": 486940 + }, + { + "epoch": 1.8824125187487435, + "grad_norm": 0.1513441652059555, + "learning_rate": 0.00013506032269137446, + "loss": 2.1351, + "step": 486950 + }, + { + "epoch": 1.882451175952127, + "grad_norm": 0.15196464955806732, + "learning_rate": 0.00013495308370003724, + "loss": 2.1327, + "step": 486960 + }, + { + "epoch": 1.8824898331555102, + "grad_norm": 0.1596112847328186, + "learning_rate": 0.00013484585087451828, + "loss": 2.1268, + "step": 486970 + }, + { + "epoch": 1.8825284903588935, + "grad_norm": 0.16076111793518066, + "learning_rate": 0.00013473862421375382, + "loss": 2.1334, + "step": 486980 + }, + { + "epoch": 1.8825671475622767, + "grad_norm": 0.15090005099773407, + "learning_rate": 0.00013463140371668093, + "loss": 2.121, + "step": 486990 + }, + { + "epoch": 1.8826058047656602, + "grad_norm": 0.15267503261566162, + "learning_rate": 0.0001345241893822371, + "loss": 2.1141, + "step": 487000 + }, + { + "epoch": 1.8826444619690434, + "grad_norm": 0.1471244841814041, + "learning_rate": 0.00013441698120935942, + "loss": 2.1237, + "step": 487010 + }, + { + "epoch": 1.8826831191724267, + "grad_norm": 0.16960223019123077, + "learning_rate": 0.00013430977919698584, + "loss": 2.1114, + "step": 487020 + }, + { + "epoch": 1.88272177637581, + "grad_norm": 0.16533735394477844, + "learning_rate": 0.00013420258334405455, + "loss": 2.1116, + "step": 487030 + }, + { + "epoch": 1.8827604335791932, + "grad_norm": 0.1614953726530075, + "learning_rate": 0.00013409539364950395, + "loss": 2.1298, + "step": 487040 + }, + { + "epoch": 1.8827990907825765, + "grad_norm": 0.15685488283634186, + "learning_rate": 0.00013398821011227248, + "loss": 2.1154, + "step": 487050 + }, + { + "epoch": 1.8828377479859597, + "grad_norm": 0.14576545357704163, + "learning_rate": 0.00013388103273129982, + "loss": 2.1209, + "step": 487060 + }, + { + "epoch": 1.882876405189343, + "grad_norm": 0.158021941781044, + "learning_rate": 0.00013377386150552483, + "loss": 2.1167, + "step": 487070 + }, + { + "epoch": 1.8829150623927262, + "grad_norm": 0.16273944079875946, + "learning_rate": 0.0001336666964338873, + "loss": 2.1141, + "step": 487080 + }, + { + "epoch": 1.8829537195961095, + "grad_norm": 0.16825802624225616, + "learning_rate": 0.0001335595375153269, + "loss": 2.1288, + "step": 487090 + }, + { + "epoch": 1.8829923767994927, + "grad_norm": 0.16017933189868927, + "learning_rate": 0.0001334523847487843, + "loss": 2.1084, + "step": 487100 + }, + { + "epoch": 1.883031034002876, + "grad_norm": 0.16350767016410828, + "learning_rate": 0.00013334523813319986, + "loss": 2.1136, + "step": 487110 + }, + { + "epoch": 1.8830696912062592, + "grad_norm": 0.17448604106903076, + "learning_rate": 0.0001332380976675145, + "loss": 2.1318, + "step": 487120 + }, + { + "epoch": 1.8831083484096427, + "grad_norm": 0.16670966148376465, + "learning_rate": 0.00013313096335066944, + "loss": 2.1139, + "step": 487130 + }, + { + "epoch": 1.883147005613026, + "grad_norm": 0.1731451004743576, + "learning_rate": 0.00013302383518160578, + "loss": 2.1107, + "step": 487140 + }, + { + "epoch": 1.8831856628164092, + "grad_norm": 0.1635160744190216, + "learning_rate": 0.00013291671315926568, + "loss": 2.1325, + "step": 487150 + }, + { + "epoch": 1.8832243200197925, + "grad_norm": 0.15141884982585907, + "learning_rate": 0.0001328095972825909, + "loss": 2.1392, + "step": 487160 + }, + { + "epoch": 1.883262977223176, + "grad_norm": 0.15572725236415863, + "learning_rate": 0.00013270248755052426, + "loss": 2.1259, + "step": 487170 + }, + { + "epoch": 1.8833016344265592, + "grad_norm": 0.15225672721862793, + "learning_rate": 0.000132595383962008, + "loss": 2.1211, + "step": 487180 + }, + { + "epoch": 1.8833402916299424, + "grad_norm": 0.40289080142974854, + "learning_rate": 0.00013248828651598555, + "loss": 2.1192, + "step": 487190 + }, + { + "epoch": 1.8833789488333257, + "grad_norm": 0.20261716842651367, + "learning_rate": 0.00013238119521139958, + "loss": 2.1045, + "step": 487200 + }, + { + "epoch": 1.883417606036709, + "grad_norm": 0.1588941514492035, + "learning_rate": 0.00013227411004719425, + "loss": 2.1217, + "step": 487210 + }, + { + "epoch": 1.8834562632400922, + "grad_norm": 0.16519004106521606, + "learning_rate": 0.0001321670310223133, + "loss": 2.1242, + "step": 487220 + }, + { + "epoch": 1.8834949204434754, + "grad_norm": 0.15652592480182648, + "learning_rate": 0.0001320599581357007, + "loss": 2.1223, + "step": 487230 + }, + { + "epoch": 1.8835335776468587, + "grad_norm": 0.18390634655952454, + "learning_rate": 0.0001319528913863013, + "loss": 2.121, + "step": 487240 + }, + { + "epoch": 1.883572234850242, + "grad_norm": 0.15679307281970978, + "learning_rate": 0.00013184583077305946, + "loss": 2.1028, + "step": 487250 + }, + { + "epoch": 1.8836108920536252, + "grad_norm": 0.17130286991596222, + "learning_rate": 0.0001317387762949207, + "loss": 2.1373, + "step": 487260 + }, + { + "epoch": 1.8836495492570084, + "grad_norm": 0.15090951323509216, + "learning_rate": 0.00013163172795083034, + "loss": 2.1299, + "step": 487270 + }, + { + "epoch": 1.8836882064603917, + "grad_norm": 0.15132005512714386, + "learning_rate": 0.00013152468573973387, + "loss": 2.143, + "step": 487280 + }, + { + "epoch": 1.883726863663775, + "grad_norm": 0.15275931358337402, + "learning_rate": 0.00013141764966057769, + "loss": 2.1084, + "step": 487290 + }, + { + "epoch": 1.8837655208671584, + "grad_norm": 0.1515328586101532, + "learning_rate": 0.0001313106197123075, + "loss": 2.1221, + "step": 487300 + }, + { + "epoch": 1.8838041780705417, + "grad_norm": 0.153660848736763, + "learning_rate": 0.00013120359589387042, + "loss": 2.1173, + "step": 487310 + }, + { + "epoch": 1.883842835273925, + "grad_norm": 0.15133818984031677, + "learning_rate": 0.00013109657820421327, + "loss": 2.1224, + "step": 487320 + }, + { + "epoch": 1.8838814924773082, + "grad_norm": 0.16636015474796295, + "learning_rate": 0.00013098956664228334, + "loss": 2.1232, + "step": 487330 + }, + { + "epoch": 1.8839201496806917, + "grad_norm": 0.1621353030204773, + "learning_rate": 0.0001308825612070279, + "loss": 2.1188, + "step": 487340 + }, + { + "epoch": 1.883958806884075, + "grad_norm": 0.15565310418605804, + "learning_rate": 0.0001307755618973947, + "loss": 2.1248, + "step": 487350 + }, + { + "epoch": 1.8839974640874582, + "grad_norm": 0.15307669341564178, + "learning_rate": 0.00013066856871233234, + "loss": 2.1392, + "step": 487360 + }, + { + "epoch": 1.8840361212908414, + "grad_norm": 0.16902467608451843, + "learning_rate": 0.0001305615816507888, + "loss": 2.1208, + "step": 487370 + }, + { + "epoch": 1.8840747784942247, + "grad_norm": 0.1794981211423874, + "learning_rate": 0.0001304546007117131, + "loss": 2.1186, + "step": 487380 + }, + { + "epoch": 1.884113435697608, + "grad_norm": 0.16142766177654266, + "learning_rate": 0.00013034762589405415, + "loss": 2.139, + "step": 487390 + }, + { + "epoch": 1.8841520929009912, + "grad_norm": 0.1620454490184784, + "learning_rate": 0.00013024065719676115, + "loss": 2.131, + "step": 487400 + }, + { + "epoch": 1.8841907501043744, + "grad_norm": 0.17644056677818298, + "learning_rate": 0.00013013369461878366, + "loss": 2.1261, + "step": 487410 + }, + { + "epoch": 1.8842294073077577, + "grad_norm": 0.15617609024047852, + "learning_rate": 0.00013002673815907208, + "loss": 2.1163, + "step": 487420 + }, + { + "epoch": 1.884268064511141, + "grad_norm": 0.15311667323112488, + "learning_rate": 0.0001299197878165761, + "loss": 2.1354, + "step": 487430 + }, + { + "epoch": 1.8843067217145242, + "grad_norm": 0.15431946516036987, + "learning_rate": 0.00012981284359024658, + "loss": 2.1246, + "step": 487440 + }, + { + "epoch": 1.8843453789179074, + "grad_norm": 0.15608011186122894, + "learning_rate": 0.00012970590547903393, + "loss": 2.1194, + "step": 487450 + }, + { + "epoch": 1.8843840361212907, + "grad_norm": 0.16052739322185516, + "learning_rate": 0.00012959897348188988, + "loss": 2.1343, + "step": 487460 + }, + { + "epoch": 1.8844226933246742, + "grad_norm": 0.15716208517551422, + "learning_rate": 0.00012949204759776545, + "loss": 2.1354, + "step": 487470 + }, + { + "epoch": 1.8844613505280574, + "grad_norm": 0.16176652908325195, + "learning_rate": 0.0001293851278256124, + "loss": 2.1224, + "step": 487480 + }, + { + "epoch": 1.8845000077314407, + "grad_norm": 0.1501695066690445, + "learning_rate": 0.0001292782141643829, + "loss": 2.1173, + "step": 487490 + }, + { + "epoch": 1.884538664934824, + "grad_norm": 0.1616787314414978, + "learning_rate": 0.00012917130661302935, + "loss": 2.1343, + "step": 487500 + }, + { + "epoch": 1.8845773221382074, + "grad_norm": 0.15648533403873444, + "learning_rate": 0.0001290644051705041, + "loss": 2.1268, + "step": 487510 + }, + { + "epoch": 1.8846159793415906, + "grad_norm": 0.17075617611408234, + "learning_rate": 0.00012895750983576005, + "loss": 2.1178, + "step": 487520 + }, + { + "epoch": 1.884654636544974, + "grad_norm": 0.15663281083106995, + "learning_rate": 0.00012885062060775067, + "loss": 2.1132, + "step": 487530 + }, + { + "epoch": 1.8846932937483571, + "grad_norm": 0.1682283878326416, + "learning_rate": 0.00012874373748542924, + "loss": 2.1176, + "step": 487540 + }, + { + "epoch": 1.8847319509517404, + "grad_norm": 0.1556764394044876, + "learning_rate": 0.00012863686046774993, + "loss": 2.1112, + "step": 487550 + }, + { + "epoch": 1.8847706081551237, + "grad_norm": 0.1526879519224167, + "learning_rate": 0.00012852998955366647, + "loss": 2.1326, + "step": 487560 + }, + { + "epoch": 1.884809265358507, + "grad_norm": 0.17758522927761078, + "learning_rate": 0.00012842312474213325, + "loss": 2.1105, + "step": 487570 + }, + { + "epoch": 1.8848479225618902, + "grad_norm": 0.16019943356513977, + "learning_rate": 0.00012831626603210556, + "loss": 2.1063, + "step": 487580 + }, + { + "epoch": 1.8848865797652734, + "grad_norm": 0.16548269987106323, + "learning_rate": 0.00012820941342253777, + "loss": 2.1245, + "step": 487590 + }, + { + "epoch": 1.8849252369686567, + "grad_norm": 0.16766045987606049, + "learning_rate": 0.00012810256691238563, + "loss": 2.1243, + "step": 487600 + }, + { + "epoch": 1.88496389417204, + "grad_norm": 0.16872750222682953, + "learning_rate": 0.00012799572650060444, + "loss": 2.1228, + "step": 487610 + }, + { + "epoch": 1.8850025513754232, + "grad_norm": 0.15807810425758362, + "learning_rate": 0.0001278888921861503, + "loss": 2.1236, + "step": 487620 + }, + { + "epoch": 1.8850412085788064, + "grad_norm": 0.27638283371925354, + "learning_rate": 0.0001277820639679792, + "loss": 2.1273, + "step": 487630 + }, + { + "epoch": 1.88507986578219, + "grad_norm": 0.17451395094394684, + "learning_rate": 0.00012767524184504798, + "loss": 2.1293, + "step": 487640 + }, + { + "epoch": 1.8851185229855731, + "grad_norm": 0.16688884794712067, + "learning_rate": 0.000127568425816313, + "loss": 2.1357, + "step": 487650 + }, + { + "epoch": 1.8851571801889564, + "grad_norm": 0.15373960137367249, + "learning_rate": 0.0001274616158807318, + "loss": 2.1121, + "step": 487660 + }, + { + "epoch": 1.8851958373923396, + "grad_norm": 0.15518179535865784, + "learning_rate": 0.0001273548120372616, + "loss": 2.1217, + "step": 487670 + }, + { + "epoch": 1.8852344945957231, + "grad_norm": 0.15639948844909668, + "learning_rate": 0.00012724801428486, + "loss": 2.1219, + "step": 487680 + }, + { + "epoch": 1.8852731517991064, + "grad_norm": 0.1561547964811325, + "learning_rate": 0.00012714122262248506, + "loss": 2.1041, + "step": 487690 + }, + { + "epoch": 1.8853118090024896, + "grad_norm": 0.15862923860549927, + "learning_rate": 0.00012703443704909523, + "loss": 2.1182, + "step": 487700 + }, + { + "epoch": 1.8853504662058729, + "grad_norm": 0.1520431488752365, + "learning_rate": 0.00012692765756364865, + "loss": 2.1168, + "step": 487710 + }, + { + "epoch": 1.8853891234092561, + "grad_norm": 0.1558041274547577, + "learning_rate": 0.00012682088416510485, + "loss": 2.1045, + "step": 487720 + }, + { + "epoch": 1.8854277806126394, + "grad_norm": 0.15232814848423004, + "learning_rate": 0.0001267141168524224, + "loss": 2.1187, + "step": 487730 + }, + { + "epoch": 1.8854664378160226, + "grad_norm": 0.15680523216724396, + "learning_rate": 0.00012660735562456104, + "loss": 2.1226, + "step": 487740 + }, + { + "epoch": 1.8855050950194059, + "grad_norm": 0.15346527099609375, + "learning_rate": 0.0001265006004804805, + "loss": 2.1253, + "step": 487750 + }, + { + "epoch": 1.8855437522227891, + "grad_norm": 0.16200987994670868, + "learning_rate": 0.00012639385141914095, + "loss": 2.1183, + "step": 487760 + }, + { + "epoch": 1.8855824094261724, + "grad_norm": 0.159101665019989, + "learning_rate": 0.00012628710843950276, + "loss": 2.1051, + "step": 487770 + }, + { + "epoch": 1.8856210666295556, + "grad_norm": 0.16451914608478546, + "learning_rate": 0.0001261803715405263, + "loss": 2.112, + "step": 487780 + }, + { + "epoch": 1.885659723832939, + "grad_norm": 0.15994106233119965, + "learning_rate": 0.00012607364072117288, + "loss": 2.13, + "step": 487790 + }, + { + "epoch": 1.8856983810363221, + "grad_norm": 0.16974908113479614, + "learning_rate": 0.00012596691598040354, + "loss": 2.1299, + "step": 487800 + }, + { + "epoch": 1.8857370382397056, + "grad_norm": 0.1618840992450714, + "learning_rate": 0.00012586019731717978, + "loss": 2.1022, + "step": 487810 + }, + { + "epoch": 1.8857756954430889, + "grad_norm": 0.15879476070404053, + "learning_rate": 0.00012575348473046376, + "loss": 2.1283, + "step": 487820 + }, + { + "epoch": 1.8858143526464721, + "grad_norm": 0.16188471019268036, + "learning_rate": 0.00012564677821921743, + "loss": 2.1182, + "step": 487830 + }, + { + "epoch": 1.8858530098498554, + "grad_norm": 0.16069914400577545, + "learning_rate": 0.00012554007778240296, + "loss": 2.12, + "step": 487840 + }, + { + "epoch": 1.8858916670532389, + "grad_norm": 0.1658526360988617, + "learning_rate": 0.0001254333834189836, + "loss": 2.1085, + "step": 487850 + }, + { + "epoch": 1.885930324256622, + "grad_norm": 0.1735226958990097, + "learning_rate": 0.0001253266951279217, + "loss": 2.1164, + "step": 487860 + }, + { + "epoch": 1.8859689814600054, + "grad_norm": 0.16433432698249817, + "learning_rate": 0.0001252200129081813, + "loss": 2.1188, + "step": 487870 + }, + { + "epoch": 1.8860076386633886, + "grad_norm": 0.15564902126789093, + "learning_rate": 0.0001251133367587256, + "loss": 2.1229, + "step": 487880 + }, + { + "epoch": 1.8860462958667719, + "grad_norm": 0.16110435128211975, + "learning_rate": 0.00012500666667851856, + "loss": 2.126, + "step": 487890 + }, + { + "epoch": 1.8860849530701551, + "grad_norm": 0.14982998371124268, + "learning_rate": 0.00012490000266652434, + "loss": 2.121, + "step": 487900 + }, + { + "epoch": 1.8861236102735384, + "grad_norm": 0.17189742624759674, + "learning_rate": 0.00012479334472170777, + "loss": 2.118, + "step": 487910 + }, + { + "epoch": 1.8861622674769216, + "grad_norm": 0.16747061908245087, + "learning_rate": 0.00012468669284303324, + "loss": 2.1028, + "step": 487920 + }, + { + "epoch": 1.8862009246803049, + "grad_norm": 0.1881030797958374, + "learning_rate": 0.00012458004702946602, + "loss": 2.108, + "step": 487930 + }, + { + "epoch": 1.8862395818836881, + "grad_norm": 0.16692771017551422, + "learning_rate": 0.00012447340727997136, + "loss": 2.1305, + "step": 487940 + }, + { + "epoch": 1.8862782390870714, + "grad_norm": 0.15471774339675903, + "learning_rate": 0.00012436677359351522, + "loss": 2.1135, + "step": 487950 + }, + { + "epoch": 1.8863168962904546, + "grad_norm": 0.1992231011390686, + "learning_rate": 0.0001242601459690631, + "loss": 2.125, + "step": 487960 + }, + { + "epoch": 1.8863555534938379, + "grad_norm": 0.17142538726329803, + "learning_rate": 0.0001241535244055818, + "loss": 2.1121, + "step": 487970 + }, + { + "epoch": 1.8863942106972214, + "grad_norm": 0.17059116065502167, + "learning_rate": 0.0001240469089020375, + "loss": 2.0921, + "step": 487980 + }, + { + "epoch": 1.8864328679006046, + "grad_norm": 0.15251795947551727, + "learning_rate": 0.00012394029945739726, + "loss": 2.128, + "step": 487990 + }, + { + "epoch": 1.8864715251039879, + "grad_norm": 0.14697898924350739, + "learning_rate": 0.00012383369607062812, + "loss": 2.1197, + "step": 488000 + }, + { + "epoch": 1.886510182307371, + "grad_norm": 0.15622594952583313, + "learning_rate": 0.00012372709874069755, + "loss": 2.117, + "step": 488010 + }, + { + "epoch": 1.8865488395107546, + "grad_norm": 0.15641766786575317, + "learning_rate": 0.00012362050746657328, + "loss": 2.127, + "step": 488020 + }, + { + "epoch": 1.8865874967141378, + "grad_norm": 0.16108526289463043, + "learning_rate": 0.00012351392224722323, + "loss": 2.1187, + "step": 488030 + }, + { + "epoch": 1.886626153917521, + "grad_norm": 0.16521552205085754, + "learning_rate": 0.00012340734308161606, + "loss": 2.1193, + "step": 488040 + }, + { + "epoch": 1.8866648111209043, + "grad_norm": 0.15821205079555511, + "learning_rate": 0.00012330076996871987, + "loss": 2.1193, + "step": 488050 + }, + { + "epoch": 1.8867034683242876, + "grad_norm": 0.16402612626552582, + "learning_rate": 0.0001231942029075037, + "loss": 2.104, + "step": 488060 + }, + { + "epoch": 1.8867421255276708, + "grad_norm": 0.16501112282276154, + "learning_rate": 0.0001230876418969371, + "loss": 2.1198, + "step": 488070 + }, + { + "epoch": 1.886780782731054, + "grad_norm": 0.16025055944919586, + "learning_rate": 0.0001229810869359893, + "loss": 2.1342, + "step": 488080 + }, + { + "epoch": 1.8868194399344373, + "grad_norm": 0.15322361886501312, + "learning_rate": 0.00012287453802363002, + "loss": 2.1132, + "step": 488090 + }, + { + "epoch": 1.8868580971378206, + "grad_norm": 0.15714363753795624, + "learning_rate": 0.00012276799515882964, + "loss": 2.116, + "step": 488100 + }, + { + "epoch": 1.8868967543412039, + "grad_norm": 0.15222831070423126, + "learning_rate": 0.0001226614583405581, + "loss": 2.1186, + "step": 488110 + }, + { + "epoch": 1.886935411544587, + "grad_norm": 0.1677834838628769, + "learning_rate": 0.00012255492756778642, + "loss": 2.1143, + "step": 488120 + }, + { + "epoch": 1.8869740687479704, + "grad_norm": 0.15755745768547058, + "learning_rate": 0.00012244840283948523, + "loss": 2.1114, + "step": 488130 + }, + { + "epoch": 1.8870127259513536, + "grad_norm": 0.17090429365634918, + "learning_rate": 0.000122341884154626, + "loss": 2.1224, + "step": 488140 + }, + { + "epoch": 1.887051383154737, + "grad_norm": 0.17305462062358856, + "learning_rate": 0.00012223537151218023, + "loss": 2.1288, + "step": 488150 + }, + { + "epoch": 1.8870900403581203, + "grad_norm": 0.17158415913581848, + "learning_rate": 0.00012212886491111963, + "loss": 2.1184, + "step": 488160 + }, + { + "epoch": 1.8871286975615036, + "grad_norm": 0.17280597984790802, + "learning_rate": 0.00012202236435041637, + "loss": 2.1083, + "step": 488170 + }, + { + "epoch": 1.8871673547648868, + "grad_norm": 0.16423514485359192, + "learning_rate": 0.00012191586982904301, + "loss": 2.1219, + "step": 488180 + }, + { + "epoch": 1.8872060119682703, + "grad_norm": 0.16097402572631836, + "learning_rate": 0.00012180938134597219, + "loss": 2.1161, + "step": 488190 + }, + { + "epoch": 1.8872446691716536, + "grad_norm": 0.15495575964450836, + "learning_rate": 0.00012170289890017671, + "loss": 2.1266, + "step": 488200 + }, + { + "epoch": 1.8872833263750368, + "grad_norm": 0.15620481967926025, + "learning_rate": 0.00012159642249062985, + "loss": 2.1178, + "step": 488210 + }, + { + "epoch": 1.88732198357842, + "grad_norm": 0.15706951916217804, + "learning_rate": 0.00012148995211630553, + "loss": 2.1418, + "step": 488220 + }, + { + "epoch": 1.8873606407818033, + "grad_norm": 0.16063298285007477, + "learning_rate": 0.00012138348777617747, + "loss": 2.112, + "step": 488230 + }, + { + "epoch": 1.8873992979851866, + "grad_norm": 0.16350902616977692, + "learning_rate": 0.0001212770294692196, + "loss": 2.108, + "step": 488240 + }, + { + "epoch": 1.8874379551885698, + "grad_norm": 0.17477919161319733, + "learning_rate": 0.0001211705771944065, + "loss": 2.1117, + "step": 488250 + }, + { + "epoch": 1.887476612391953, + "grad_norm": 0.16410206258296967, + "learning_rate": 0.00012106413095071278, + "loss": 2.1094, + "step": 488260 + }, + { + "epoch": 1.8875152695953363, + "grad_norm": 0.1584874391555786, + "learning_rate": 0.00012095769073711371, + "loss": 2.1284, + "step": 488270 + }, + { + "epoch": 1.8875539267987196, + "grad_norm": 0.1609467715024948, + "learning_rate": 0.00012085125655258455, + "loss": 2.1154, + "step": 488280 + }, + { + "epoch": 1.8875925840021028, + "grad_norm": 0.15845824778079987, + "learning_rate": 0.00012074482839610102, + "loss": 2.1077, + "step": 488290 + }, + { + "epoch": 1.887631241205486, + "grad_norm": 0.15312768518924713, + "learning_rate": 0.00012063840626663858, + "loss": 2.1206, + "step": 488300 + }, + { + "epoch": 1.8876698984088696, + "grad_norm": 0.40977486968040466, + "learning_rate": 0.00012053199016317384, + "loss": 2.116, + "step": 488310 + }, + { + "epoch": 1.8877085556122528, + "grad_norm": 0.1777162402868271, + "learning_rate": 0.00012042558008468318, + "loss": 2.115, + "step": 488320 + }, + { + "epoch": 1.887747212815636, + "grad_norm": 0.1626431792974472, + "learning_rate": 0.00012031917603014319, + "loss": 2.108, + "step": 488330 + }, + { + "epoch": 1.8877858700190193, + "grad_norm": 0.16696617007255554, + "learning_rate": 0.00012021277799853114, + "loss": 2.1241, + "step": 488340 + }, + { + "epoch": 1.8878245272224026, + "grad_norm": 0.15991130471229553, + "learning_rate": 0.00012010638598882407, + "loss": 2.105, + "step": 488350 + }, + { + "epoch": 1.887863184425786, + "grad_norm": 0.16908201575279236, + "learning_rate": 0.00011999999999999988, + "loss": 2.099, + "step": 488360 + }, + { + "epoch": 1.8879018416291693, + "grad_norm": 0.15702217817306519, + "learning_rate": 0.00011989362003103655, + "loss": 2.1133, + "step": 488370 + }, + { + "epoch": 1.8879404988325525, + "grad_norm": 0.15526017546653748, + "learning_rate": 0.00011978724608091218, + "loss": 2.118, + "step": 488380 + }, + { + "epoch": 1.8879791560359358, + "grad_norm": 0.1556904911994934, + "learning_rate": 0.00011968087814860539, + "loss": 2.1328, + "step": 488390 + }, + { + "epoch": 1.888017813239319, + "grad_norm": 0.15495745837688446, + "learning_rate": 0.00011957451623309457, + "loss": 2.1317, + "step": 488400 + }, + { + "epoch": 1.8880564704427023, + "grad_norm": 0.16102412343025208, + "learning_rate": 0.00011946816033335938, + "loss": 2.1058, + "step": 488410 + }, + { + "epoch": 1.8880951276460856, + "grad_norm": 0.16249670088291168, + "learning_rate": 0.00011936181044837868, + "loss": 2.1146, + "step": 488420 + }, + { + "epoch": 1.8881337848494688, + "grad_norm": 0.165382981300354, + "learning_rate": 0.00011925546657713238, + "loss": 2.1091, + "step": 488430 + }, + { + "epoch": 1.888172442052852, + "grad_norm": 0.16189248859882355, + "learning_rate": 0.0001191491287186004, + "loss": 2.1206, + "step": 488440 + }, + { + "epoch": 1.8882110992562353, + "grad_norm": 0.1849163919687271, + "learning_rate": 0.00011904279687176312, + "loss": 2.1092, + "step": 488450 + }, + { + "epoch": 1.8882497564596186, + "grad_norm": 0.17459073662757874, + "learning_rate": 0.00011893647103560046, + "loss": 2.1263, + "step": 488460 + }, + { + "epoch": 1.8882884136630018, + "grad_norm": 0.1747400015592575, + "learning_rate": 0.00011883015120909391, + "loss": 2.1259, + "step": 488470 + }, + { + "epoch": 1.8883270708663853, + "grad_norm": 0.16780216991901398, + "learning_rate": 0.0001187238373912245, + "loss": 2.123, + "step": 488480 + }, + { + "epoch": 1.8883657280697685, + "grad_norm": 0.17317448556423187, + "learning_rate": 0.00011861752958097328, + "loss": 2.101, + "step": 488490 + }, + { + "epoch": 1.8884043852731518, + "grad_norm": 0.1796831637620926, + "learning_rate": 0.00011851122777732215, + "loss": 2.1058, + "step": 488500 + }, + { + "epoch": 1.888443042476535, + "grad_norm": 0.18265679478645325, + "learning_rate": 0.00011840493197925285, + "loss": 2.1085, + "step": 488510 + }, + { + "epoch": 1.8884816996799183, + "grad_norm": 0.15292461216449738, + "learning_rate": 0.00011829864218574792, + "loss": 2.1173, + "step": 488520 + }, + { + "epoch": 1.8885203568833018, + "grad_norm": 0.15771964192390442, + "learning_rate": 0.0001181923583957898, + "loss": 2.1086, + "step": 488530 + }, + { + "epoch": 1.888559014086685, + "grad_norm": 0.16015884280204773, + "learning_rate": 0.00011808608060836123, + "loss": 2.1265, + "step": 488540 + }, + { + "epoch": 1.8885976712900683, + "grad_norm": 0.1658799648284912, + "learning_rate": 0.0001179798088224453, + "loss": 2.1318, + "step": 488550 + }, + { + "epoch": 1.8886363284934515, + "grad_norm": 0.23712924122810364, + "learning_rate": 0.00011787354303702525, + "loss": 2.127, + "step": 488560 + }, + { + "epoch": 1.8886749856968348, + "grad_norm": 0.17107848823070526, + "learning_rate": 0.00011776728325108521, + "loss": 2.1213, + "step": 488570 + }, + { + "epoch": 1.888713642900218, + "grad_norm": 0.16752935945987701, + "learning_rate": 0.00011766102946360912, + "loss": 2.1112, + "step": 488580 + }, + { + "epoch": 1.8887523001036013, + "grad_norm": 0.1616651713848114, + "learning_rate": 0.00011755478167358069, + "loss": 2.1214, + "step": 488590 + }, + { + "epoch": 1.8887909573069845, + "grad_norm": 0.15499405562877655, + "learning_rate": 0.00011744853987998516, + "loss": 2.1187, + "step": 488600 + }, + { + "epoch": 1.8888296145103678, + "grad_norm": 0.16054315865039825, + "learning_rate": 0.00011734230408180691, + "loss": 2.1052, + "step": 488610 + }, + { + "epoch": 1.888868271713751, + "grad_norm": 0.15929390490055084, + "learning_rate": 0.00011723607427803095, + "loss": 2.0946, + "step": 488620 + }, + { + "epoch": 1.8889069289171343, + "grad_norm": 0.15963678061962128, + "learning_rate": 0.00011712985046764325, + "loss": 2.1204, + "step": 488630 + }, + { + "epoch": 1.8889455861205176, + "grad_norm": 0.17510229349136353, + "learning_rate": 0.00011702363264962879, + "loss": 2.1243, + "step": 488640 + }, + { + "epoch": 1.888984243323901, + "grad_norm": 0.15754762291908264, + "learning_rate": 0.00011691742082297418, + "loss": 2.1106, + "step": 488650 + }, + { + "epoch": 1.8890229005272843, + "grad_norm": 0.1671770066022873, + "learning_rate": 0.00011681121498666514, + "loss": 2.1041, + "step": 488660 + }, + { + "epoch": 1.8890615577306675, + "grad_norm": 0.17678804695606232, + "learning_rate": 0.00011670501513968867, + "loss": 2.1089, + "step": 488670 + }, + { + "epoch": 1.8891002149340508, + "grad_norm": 0.16856589913368225, + "learning_rate": 0.00011659882128103139, + "loss": 2.1023, + "step": 488680 + }, + { + "epoch": 1.8891388721374343, + "grad_norm": 0.17763420939445496, + "learning_rate": 0.00011649263340968052, + "loss": 2.1089, + "step": 488690 + }, + { + "epoch": 1.8891775293408175, + "grad_norm": 0.15996591746807098, + "learning_rate": 0.00011638645152462335, + "loss": 2.1165, + "step": 488700 + }, + { + "epoch": 1.8892161865442008, + "grad_norm": 0.16765794157981873, + "learning_rate": 0.00011628027562484755, + "loss": 2.1063, + "step": 488710 + }, + { + "epoch": 1.889254843747584, + "grad_norm": 0.16311784088611603, + "learning_rate": 0.00011617410570934128, + "loss": 2.1214, + "step": 488720 + }, + { + "epoch": 1.8892935009509673, + "grad_norm": 0.16353066265583038, + "learning_rate": 0.00011606794177709268, + "loss": 2.1171, + "step": 488730 + }, + { + "epoch": 1.8893321581543505, + "grad_norm": 0.15941888093948364, + "learning_rate": 0.00011596178382709033, + "loss": 2.0929, + "step": 488740 + }, + { + "epoch": 1.8893708153577338, + "grad_norm": 0.15811793506145477, + "learning_rate": 0.00011585563185832282, + "loss": 2.1072, + "step": 488750 + }, + { + "epoch": 1.889409472561117, + "grad_norm": 0.16331081092357635, + "learning_rate": 0.00011574948586977962, + "loss": 2.1119, + "step": 488760 + }, + { + "epoch": 1.8894481297645003, + "grad_norm": 0.1618635654449463, + "learning_rate": 0.00011564334586044978, + "loss": 2.1161, + "step": 488770 + }, + { + "epoch": 1.8894867869678835, + "grad_norm": 0.17881222069263458, + "learning_rate": 0.00011553721182932343, + "loss": 2.1156, + "step": 488780 + }, + { + "epoch": 1.8895254441712668, + "grad_norm": 0.18215249478816986, + "learning_rate": 0.00011543108377539024, + "loss": 2.1133, + "step": 488790 + }, + { + "epoch": 1.88956410137465, + "grad_norm": 0.17007042467594147, + "learning_rate": 0.00011532496169764061, + "loss": 2.1209, + "step": 488800 + }, + { + "epoch": 1.8896027585780333, + "grad_norm": 0.16164295375347137, + "learning_rate": 0.0001152188455950649, + "loss": 2.1067, + "step": 488810 + }, + { + "epoch": 1.8896414157814168, + "grad_norm": 0.17763203382492065, + "learning_rate": 0.0001151127354666539, + "loss": 2.1166, + "step": 488820 + }, + { + "epoch": 1.8896800729848, + "grad_norm": 0.1676315814256668, + "learning_rate": 0.0001150066313113991, + "loss": 2.1156, + "step": 488830 + }, + { + "epoch": 1.8897187301881833, + "grad_norm": 0.15571266412734985, + "learning_rate": 0.00011490053312829151, + "loss": 2.1094, + "step": 488840 + }, + { + "epoch": 1.8897573873915665, + "grad_norm": 0.17012977600097656, + "learning_rate": 0.00011479444091632285, + "loss": 2.1101, + "step": 488850 + }, + { + "epoch": 1.88979604459495, + "grad_norm": 0.16374844312667847, + "learning_rate": 0.00011468835467448524, + "loss": 2.1116, + "step": 488860 + }, + { + "epoch": 1.8898347017983332, + "grad_norm": 0.15436828136444092, + "learning_rate": 0.0001145822744017706, + "loss": 2.1159, + "step": 488870 + }, + { + "epoch": 1.8898733590017165, + "grad_norm": 0.15185752511024475, + "learning_rate": 0.00011447620009717197, + "loss": 2.1062, + "step": 488880 + }, + { + "epoch": 1.8899120162050997, + "grad_norm": 0.16675709187984467, + "learning_rate": 0.0001143701317596817, + "loss": 2.0991, + "step": 488890 + }, + { + "epoch": 1.889950673408483, + "grad_norm": 0.15582484006881714, + "learning_rate": 0.00011426406938829326, + "loss": 2.1184, + "step": 488900 + }, + { + "epoch": 1.8899893306118662, + "grad_norm": 0.1695159524679184, + "learning_rate": 0.00011415801298199969, + "loss": 2.1161, + "step": 488910 + }, + { + "epoch": 1.8900279878152495, + "grad_norm": 0.1637667566537857, + "learning_rate": 0.00011405196253979466, + "loss": 2.1202, + "step": 488920 + }, + { + "epoch": 1.8900666450186328, + "grad_norm": 0.16691024601459503, + "learning_rate": 0.00011394591806067233, + "loss": 2.1157, + "step": 488930 + }, + { + "epoch": 1.890105302222016, + "grad_norm": 0.1548158973455429, + "learning_rate": 0.00011383987954362685, + "loss": 2.1125, + "step": 488940 + }, + { + "epoch": 1.8901439594253993, + "grad_norm": 0.17079788446426392, + "learning_rate": 0.00011373384698765276, + "loss": 2.1317, + "step": 488950 + }, + { + "epoch": 1.8901826166287825, + "grad_norm": 0.1615532487630844, + "learning_rate": 0.00011362782039174468, + "loss": 2.1065, + "step": 488960 + }, + { + "epoch": 1.8902212738321658, + "grad_norm": 0.1652381718158722, + "learning_rate": 0.00011352179975489763, + "loss": 2.1217, + "step": 488970 + }, + { + "epoch": 1.890259931035549, + "grad_norm": 0.15695932507514954, + "learning_rate": 0.0001134157850761075, + "loss": 2.1181, + "step": 488980 + }, + { + "epoch": 1.8902985882389325, + "grad_norm": 0.15450656414031982, + "learning_rate": 0.00011330977635436934, + "loss": 2.0999, + "step": 488990 + }, + { + "epoch": 1.8903372454423157, + "grad_norm": 0.16795946657657623, + "learning_rate": 0.00011320377358867929, + "loss": 2.1087, + "step": 489000 + }, + { + "epoch": 1.890375902645699, + "grad_norm": 0.18700319528579712, + "learning_rate": 0.00011309777677803346, + "loss": 2.0991, + "step": 489010 + }, + { + "epoch": 1.8904145598490822, + "grad_norm": 0.15968748927116394, + "learning_rate": 0.00011299178592142844, + "loss": 2.1284, + "step": 489020 + }, + { + "epoch": 1.8904532170524657, + "grad_norm": 0.1658526510000229, + "learning_rate": 0.00011288580101786105, + "loss": 2.1193, + "step": 489030 + }, + { + "epoch": 1.890491874255849, + "grad_norm": 0.1720876395702362, + "learning_rate": 0.00011277982206632808, + "loss": 2.1051, + "step": 489040 + }, + { + "epoch": 1.8905305314592322, + "grad_norm": 0.17556461691856384, + "learning_rate": 0.00011267384906582723, + "loss": 2.107, + "step": 489050 + }, + { + "epoch": 1.8905691886626155, + "grad_norm": 0.16108305752277374, + "learning_rate": 0.00011256788201535573, + "loss": 2.1214, + "step": 489060 + }, + { + "epoch": 1.8906078458659987, + "grad_norm": 0.169228196144104, + "learning_rate": 0.00011246192091391172, + "loss": 2.1144, + "step": 489070 + }, + { + "epoch": 1.890646503069382, + "grad_norm": 0.16416077315807343, + "learning_rate": 0.00011235596576049334, + "loss": 2.0981, + "step": 489080 + }, + { + "epoch": 1.8906851602727652, + "grad_norm": 0.15246742963790894, + "learning_rate": 0.00011225001655409894, + "loss": 2.1026, + "step": 489090 + }, + { + "epoch": 1.8907238174761485, + "grad_norm": 0.1551797091960907, + "learning_rate": 0.00011214407329372711, + "loss": 2.1206, + "step": 489100 + }, + { + "epoch": 1.8907624746795317, + "grad_norm": 0.16857977211475372, + "learning_rate": 0.00011203813597837731, + "loss": 2.1246, + "step": 489110 + }, + { + "epoch": 1.890801131882915, + "grad_norm": 0.1534964144229889, + "learning_rate": 0.00011193220460704856, + "loss": 2.0995, + "step": 489120 + }, + { + "epoch": 1.8908397890862982, + "grad_norm": 0.16405372321605682, + "learning_rate": 0.00011182627917874033, + "loss": 2.0984, + "step": 489130 + }, + { + "epoch": 1.8908784462896815, + "grad_norm": 0.16919684410095215, + "learning_rate": 0.00011172035969245275, + "loss": 2.1236, + "step": 489140 + }, + { + "epoch": 1.8909171034930647, + "grad_norm": 0.17017854750156403, + "learning_rate": 0.00011161444614718574, + "loss": 2.1196, + "step": 489150 + }, + { + "epoch": 1.8909557606964482, + "grad_norm": 0.17277006804943085, + "learning_rate": 0.00011150853854193987, + "loss": 2.1239, + "step": 489160 + }, + { + "epoch": 1.8909944178998315, + "grad_norm": 0.17439673840999603, + "learning_rate": 0.0001114026368757155, + "loss": 2.0974, + "step": 489170 + }, + { + "epoch": 1.8910330751032147, + "grad_norm": 0.17001686990261078, + "learning_rate": 0.00011129674114751409, + "loss": 2.1178, + "step": 489180 + }, + { + "epoch": 1.891071732306598, + "grad_norm": 0.17020079493522644, + "learning_rate": 0.00011119085135633666, + "loss": 2.1116, + "step": 489190 + }, + { + "epoch": 1.8911103895099814, + "grad_norm": 0.1608460545539856, + "learning_rate": 0.0001110849675011849, + "loss": 2.1123, + "step": 489200 + }, + { + "epoch": 1.8911490467133647, + "grad_norm": 0.16076086461544037, + "learning_rate": 0.00011097908958106028, + "loss": 2.1265, + "step": 489210 + }, + { + "epoch": 1.891187703916748, + "grad_norm": 0.17049437761306763, + "learning_rate": 0.00011087321759496517, + "loss": 2.1173, + "step": 489220 + }, + { + "epoch": 1.8912263611201312, + "grad_norm": 0.1605737954378128, + "learning_rate": 0.00011076735154190188, + "loss": 2.1261, + "step": 489230 + }, + { + "epoch": 1.8912650183235145, + "grad_norm": 0.16839636862277985, + "learning_rate": 0.000110661491420873, + "loss": 2.1043, + "step": 489240 + }, + { + "epoch": 1.8913036755268977, + "grad_norm": 0.16663509607315063, + "learning_rate": 0.00011055563723088157, + "loss": 2.1162, + "step": 489250 + }, + { + "epoch": 1.891342332730281, + "grad_norm": 0.17225117981433868, + "learning_rate": 0.00011044978897093084, + "loss": 2.1134, + "step": 489260 + }, + { + "epoch": 1.8913809899336642, + "grad_norm": 0.1735590398311615, + "learning_rate": 0.000110343946640024, + "loss": 2.1098, + "step": 489270 + }, + { + "epoch": 1.8914196471370475, + "grad_norm": 0.16959644854068756, + "learning_rate": 0.00011023811023716523, + "loss": 2.1175, + "step": 489280 + }, + { + "epoch": 1.8914583043404307, + "grad_norm": 0.17331750690937042, + "learning_rate": 0.00011013227976135842, + "loss": 2.1147, + "step": 489290 + }, + { + "epoch": 1.891496961543814, + "grad_norm": 0.18145471811294556, + "learning_rate": 0.00011002645521160792, + "loss": 2.1365, + "step": 489300 + }, + { + "epoch": 1.8915356187471972, + "grad_norm": 0.159023255109787, + "learning_rate": 0.00010992063658691832, + "loss": 2.0984, + "step": 489310 + }, + { + "epoch": 1.8915742759505805, + "grad_norm": 0.21775251626968384, + "learning_rate": 0.00010981482388629438, + "loss": 2.1036, + "step": 489320 + }, + { + "epoch": 1.891612933153964, + "grad_norm": 0.16009607911109924, + "learning_rate": 0.00010970901710874159, + "loss": 2.1108, + "step": 489330 + }, + { + "epoch": 1.8916515903573472, + "grad_norm": 0.17309287190437317, + "learning_rate": 0.00010960321625326497, + "loss": 2.1237, + "step": 489340 + }, + { + "epoch": 1.8916902475607305, + "grad_norm": 0.15677177906036377, + "learning_rate": 0.00010949742131887063, + "loss": 2.1011, + "step": 489350 + }, + { + "epoch": 1.8917289047641137, + "grad_norm": 0.16687935590744019, + "learning_rate": 0.00010939163230456428, + "loss": 2.1016, + "step": 489360 + }, + { + "epoch": 1.8917675619674972, + "grad_norm": 0.17260977625846863, + "learning_rate": 0.00010928584920935225, + "loss": 2.1076, + "step": 489370 + }, + { + "epoch": 1.8918062191708804, + "grad_norm": 0.17035962641239166, + "learning_rate": 0.00010918007203224135, + "loss": 2.1092, + "step": 489380 + }, + { + "epoch": 1.8918448763742637, + "grad_norm": 0.17869456112384796, + "learning_rate": 0.00010907430077223812, + "loss": 2.1203, + "step": 489390 + }, + { + "epoch": 1.891883533577647, + "grad_norm": 0.17196069657802582, + "learning_rate": 0.00010896853542834984, + "loss": 2.1163, + "step": 489400 + }, + { + "epoch": 1.8919221907810302, + "grad_norm": 0.17762623727321625, + "learning_rate": 0.00010886277599958394, + "loss": 2.1064, + "step": 489410 + }, + { + "epoch": 1.8919608479844134, + "grad_norm": 0.16974352300167084, + "learning_rate": 0.00010875702248494789, + "loss": 2.1059, + "step": 489420 + }, + { + "epoch": 1.8919995051877967, + "grad_norm": 0.16500453650951385, + "learning_rate": 0.00010865127488344984, + "loss": 2.1321, + "step": 489430 + }, + { + "epoch": 1.89203816239118, + "grad_norm": 0.1621197909116745, + "learning_rate": 0.00010854553319409788, + "loss": 2.1056, + "step": 489440 + }, + { + "epoch": 1.8920768195945632, + "grad_norm": 0.178715780377388, + "learning_rate": 0.00010843979741590037, + "loss": 2.0964, + "step": 489450 + }, + { + "epoch": 1.8921154767979464, + "grad_norm": 0.17246749997138977, + "learning_rate": 0.00010833406754786656, + "loss": 2.1143, + "step": 489460 + }, + { + "epoch": 1.8921541340013297, + "grad_norm": 0.1624779850244522, + "learning_rate": 0.00010822834358900479, + "loss": 2.1088, + "step": 489470 + }, + { + "epoch": 1.892192791204713, + "grad_norm": 0.16980381309986115, + "learning_rate": 0.00010812262553832519, + "loss": 2.129, + "step": 489480 + }, + { + "epoch": 1.8922314484080962, + "grad_norm": 0.1774771809577942, + "learning_rate": 0.000108016913394837, + "loss": 2.1068, + "step": 489490 + }, + { + "epoch": 1.8922701056114797, + "grad_norm": 0.182191401720047, + "learning_rate": 0.00010791120715754987, + "loss": 2.1016, + "step": 489500 + }, + { + "epoch": 1.892308762814863, + "grad_norm": 0.17115147411823273, + "learning_rate": 0.00010780550682547441, + "loss": 2.1183, + "step": 489510 + }, + { + "epoch": 1.8923474200182462, + "grad_norm": 0.1588856279850006, + "learning_rate": 0.00010769981239762072, + "loss": 2.1241, + "step": 489520 + }, + { + "epoch": 1.8923860772216294, + "grad_norm": 0.15776441991329193, + "learning_rate": 0.0001075941238729996, + "loss": 2.1039, + "step": 489530 + }, + { + "epoch": 1.892424734425013, + "grad_norm": 0.15810024738311768, + "learning_rate": 0.00010748844125062207, + "loss": 2.1067, + "step": 489540 + }, + { + "epoch": 1.8924633916283962, + "grad_norm": 0.15623559057712555, + "learning_rate": 0.00010738276452949957, + "loss": 2.1129, + "step": 489550 + }, + { + "epoch": 1.8925020488317794, + "grad_norm": 0.16129539906978607, + "learning_rate": 0.00010727709370864335, + "loss": 2.1111, + "step": 489560 + }, + { + "epoch": 1.8925407060351627, + "grad_norm": 0.1808207482099533, + "learning_rate": 0.0001071714287870651, + "loss": 2.1164, + "step": 489570 + }, + { + "epoch": 1.892579363238546, + "grad_norm": 0.16985240578651428, + "learning_rate": 0.00010706576976377757, + "loss": 2.1112, + "step": 489580 + }, + { + "epoch": 1.8926180204419292, + "grad_norm": 0.16655011475086212, + "learning_rate": 0.00010696011663779248, + "loss": 2.1204, + "step": 489590 + }, + { + "epoch": 1.8926566776453124, + "grad_norm": 0.16127444803714752, + "learning_rate": 0.00010685446940812282, + "loss": 2.1041, + "step": 489600 + }, + { + "epoch": 1.8926953348486957, + "grad_norm": 0.17058631777763367, + "learning_rate": 0.0001067488280737814, + "loss": 2.1279, + "step": 489610 + }, + { + "epoch": 1.892733992052079, + "grad_norm": 0.1634225845336914, + "learning_rate": 0.00010664319263378142, + "loss": 2.1183, + "step": 489620 + }, + { + "epoch": 1.8927726492554622, + "grad_norm": 0.16177918016910553, + "learning_rate": 0.00010653756308713635, + "loss": 2.1147, + "step": 489630 + }, + { + "epoch": 1.8928113064588454, + "grad_norm": 0.16086721420288086, + "learning_rate": 0.00010643193943286011, + "loss": 2.1138, + "step": 489640 + }, + { + "epoch": 1.8928499636622287, + "grad_norm": 0.1561380922794342, + "learning_rate": 0.00010632632166996636, + "loss": 2.1159, + "step": 489650 + }, + { + "epoch": 1.892888620865612, + "grad_norm": 0.17296133935451508, + "learning_rate": 0.00010622070979746967, + "loss": 2.1123, + "step": 489660 + }, + { + "epoch": 1.8929272780689954, + "grad_norm": 0.16407518088817596, + "learning_rate": 0.00010611510381438439, + "loss": 2.1127, + "step": 489670 + }, + { + "epoch": 1.8929659352723787, + "grad_norm": 0.16518047451972961, + "learning_rate": 0.00010600950371972551, + "loss": 2.12, + "step": 489680 + }, + { + "epoch": 1.893004592475762, + "grad_norm": 0.1828446090221405, + "learning_rate": 0.00010590390951250828, + "loss": 2.1121, + "step": 489690 + }, + { + "epoch": 1.8930432496791452, + "grad_norm": 0.17029117047786713, + "learning_rate": 0.00010579832119174814, + "loss": 2.1005, + "step": 489700 + }, + { + "epoch": 1.8930819068825286, + "grad_norm": 0.15997372567653656, + "learning_rate": 0.00010569273875646035, + "loss": 2.1075, + "step": 489710 + }, + { + "epoch": 1.893120564085912, + "grad_norm": 0.1557878702878952, + "learning_rate": 0.00010558716220566122, + "loss": 2.1279, + "step": 489720 + }, + { + "epoch": 1.8931592212892951, + "grad_norm": 0.18081702291965485, + "learning_rate": 0.00010548159153836667, + "loss": 2.1233, + "step": 489730 + }, + { + "epoch": 1.8931978784926784, + "grad_norm": 0.16168901324272156, + "learning_rate": 0.00010537602675359348, + "loss": 2.1285, + "step": 489740 + }, + { + "epoch": 1.8932365356960617, + "grad_norm": 0.16792845726013184, + "learning_rate": 0.00010527046785035843, + "loss": 2.1283, + "step": 489750 + }, + { + "epoch": 1.893275192899445, + "grad_norm": 0.16445757448673248, + "learning_rate": 0.00010516491482767832, + "loss": 2.1007, + "step": 489760 + }, + { + "epoch": 1.8933138501028282, + "grad_norm": 0.17205293476581573, + "learning_rate": 0.00010505936768457036, + "loss": 2.1099, + "step": 489770 + }, + { + "epoch": 1.8933525073062114, + "grad_norm": 0.15777753293514252, + "learning_rate": 0.0001049538264200527, + "loss": 2.1143, + "step": 489780 + }, + { + "epoch": 1.8933911645095947, + "grad_norm": 0.17058062553405762, + "learning_rate": 0.00010484829103314253, + "loss": 2.121, + "step": 489790 + }, + { + "epoch": 1.893429821712978, + "grad_norm": 0.16718408465385437, + "learning_rate": 0.00010474276152285867, + "loss": 2.1058, + "step": 489800 + }, + { + "epoch": 1.8934684789163612, + "grad_norm": 0.16895322501659393, + "learning_rate": 0.00010463723788821899, + "loss": 2.125, + "step": 489810 + }, + { + "epoch": 1.8935071361197444, + "grad_norm": 0.1603923887014389, + "learning_rate": 0.00010453172012824231, + "loss": 2.1064, + "step": 489820 + }, + { + "epoch": 1.8935457933231277, + "grad_norm": 0.16751614212989807, + "learning_rate": 0.00010442620824194759, + "loss": 2.0994, + "step": 489830 + }, + { + "epoch": 1.8935844505265111, + "grad_norm": 0.16878701746463776, + "learning_rate": 0.00010432070222835433, + "loss": 2.1189, + "step": 489840 + }, + { + "epoch": 1.8936231077298944, + "grad_norm": 0.1680416315793991, + "learning_rate": 0.00010421520208648149, + "loss": 2.1102, + "step": 489850 + }, + { + "epoch": 1.8936617649332776, + "grad_norm": 0.18519482016563416, + "learning_rate": 0.00010410970781534945, + "loss": 2.1132, + "step": 489860 + }, + { + "epoch": 1.893700422136661, + "grad_norm": 0.15986217558383942, + "learning_rate": 0.00010400421941397764, + "loss": 2.1003, + "step": 489870 + }, + { + "epoch": 1.8937390793400444, + "grad_norm": 0.1842077672481537, + "learning_rate": 0.00010389873688138684, + "loss": 2.1031, + "step": 489880 + }, + { + "epoch": 1.8937777365434276, + "grad_norm": 0.17824490368366241, + "learning_rate": 0.00010379326021659763, + "loss": 2.1104, + "step": 489890 + }, + { + "epoch": 1.8938163937468109, + "grad_norm": 0.1671292632818222, + "learning_rate": 0.00010368778941863055, + "loss": 2.0957, + "step": 489900 + }, + { + "epoch": 1.8938550509501941, + "grad_norm": 0.169941246509552, + "learning_rate": 0.00010358232448650707, + "loss": 2.1178, + "step": 489910 + }, + { + "epoch": 1.8938937081535774, + "grad_norm": 0.16140219569206238, + "learning_rate": 0.00010347686541924839, + "loss": 2.1135, + "step": 489920 + }, + { + "epoch": 1.8939323653569606, + "grad_norm": 0.15384657680988312, + "learning_rate": 0.0001033714122158762, + "loss": 2.1182, + "step": 489930 + }, + { + "epoch": 1.8939710225603439, + "grad_norm": 0.1640327274799347, + "learning_rate": 0.0001032659648754124, + "loss": 2.108, + "step": 489940 + }, + { + "epoch": 1.8940096797637271, + "grad_norm": 0.17281986773014069, + "learning_rate": 0.00010316052339687953, + "loss": 2.1032, + "step": 489950 + }, + { + "epoch": 1.8940483369671104, + "grad_norm": 0.16244745254516602, + "learning_rate": 0.00010305508777929995, + "loss": 2.108, + "step": 489960 + }, + { + "epoch": 1.8940869941704936, + "grad_norm": 0.17009977996349335, + "learning_rate": 0.00010294965802169598, + "loss": 2.1132, + "step": 489970 + }, + { + "epoch": 1.894125651373877, + "grad_norm": 0.1579209715127945, + "learning_rate": 0.0001028442341230913, + "loss": 2.1107, + "step": 489980 + }, + { + "epoch": 1.8941643085772601, + "grad_norm": 0.16433437168598175, + "learning_rate": 0.00010273881608250891, + "loss": 2.122, + "step": 489990 + }, + { + "epoch": 1.8942029657806434, + "grad_norm": 0.1659168004989624, + "learning_rate": 0.00010263340389897247, + "loss": 2.1132, + "step": 490000 + }, + { + "epoch": 1.8942416229840269, + "grad_norm": 0.16217505931854248, + "learning_rate": 0.00010252799757150566, + "loss": 2.1159, + "step": 490010 + }, + { + "epoch": 1.8942802801874101, + "grad_norm": 0.15367664396762848, + "learning_rate": 0.00010242259709913282, + "loss": 2.1236, + "step": 490020 + }, + { + "epoch": 1.8943189373907934, + "grad_norm": 0.1673082560300827, + "learning_rate": 0.00010231720248087829, + "loss": 2.1194, + "step": 490030 + }, + { + "epoch": 1.8943575945941766, + "grad_norm": 0.16598571836948395, + "learning_rate": 0.0001022118137157666, + "loss": 2.1027, + "step": 490040 + }, + { + "epoch": 1.89439625179756, + "grad_norm": 0.8749808669090271, + "learning_rate": 0.00010210643080282301, + "loss": 2.1101, + "step": 490050 + }, + { + "epoch": 1.8944349090009434, + "grad_norm": 0.16333632171154022, + "learning_rate": 0.00010200105374107228, + "loss": 2.1011, + "step": 490060 + }, + { + "epoch": 1.8944735662043266, + "grad_norm": 0.17187359929084778, + "learning_rate": 0.00010189568252954029, + "loss": 2.09, + "step": 490070 + }, + { + "epoch": 1.8945122234077099, + "grad_norm": 0.16790197789669037, + "learning_rate": 0.00010179031716725252, + "loss": 2.1102, + "step": 490080 + }, + { + "epoch": 1.8945508806110931, + "grad_norm": 0.16747283935546875, + "learning_rate": 0.00010168495765323504, + "loss": 2.0999, + "step": 490090 + }, + { + "epoch": 1.8945895378144764, + "grad_norm": 0.16838395595550537, + "learning_rate": 0.00010157960398651423, + "loss": 2.1348, + "step": 490100 + }, + { + "epoch": 1.8946281950178596, + "grad_norm": 0.16698074340820312, + "learning_rate": 0.00010147425616611661, + "loss": 2.0991, + "step": 490110 + }, + { + "epoch": 1.8946668522212429, + "grad_norm": 0.15873688459396362, + "learning_rate": 0.00010136891419106898, + "loss": 2.1061, + "step": 490120 + }, + { + "epoch": 1.8947055094246261, + "grad_norm": 0.17689259350299835, + "learning_rate": 0.00010126357806039855, + "loss": 2.1158, + "step": 490130 + }, + { + "epoch": 1.8947441666280094, + "grad_norm": 0.15214355289936066, + "learning_rate": 0.00010115824777313253, + "loss": 2.1043, + "step": 490140 + }, + { + "epoch": 1.8947828238313926, + "grad_norm": 0.15863355994224548, + "learning_rate": 0.00010105292332829885, + "loss": 2.1053, + "step": 490150 + }, + { + "epoch": 1.8948214810347759, + "grad_norm": 0.1705903857946396, + "learning_rate": 0.00010094760472492493, + "loss": 2.1247, + "step": 490160 + }, + { + "epoch": 1.8948601382381594, + "grad_norm": 0.16898466646671295, + "learning_rate": 0.00010084229196203931, + "loss": 2.1054, + "step": 490170 + }, + { + "epoch": 1.8948987954415426, + "grad_norm": 0.16580814123153687, + "learning_rate": 0.00010073698503867035, + "loss": 2.1213, + "step": 490180 + }, + { + "epoch": 1.8949374526449259, + "grad_norm": 0.15544813871383667, + "learning_rate": 0.00010063168395384658, + "loss": 2.1152, + "step": 490190 + }, + { + "epoch": 1.894976109848309, + "grad_norm": 0.15902158617973328, + "learning_rate": 0.00010052638870659747, + "loss": 2.0942, + "step": 490200 + }, + { + "epoch": 1.8950147670516924, + "grad_norm": 0.1620321124792099, + "learning_rate": 0.00010042109929595178, + "loss": 2.0955, + "step": 490210 + }, + { + "epoch": 1.8950534242550758, + "grad_norm": 0.17381994426250458, + "learning_rate": 0.0001003158157209394, + "loss": 2.1169, + "step": 490220 + }, + { + "epoch": 1.895092081458459, + "grad_norm": 0.16555550694465637, + "learning_rate": 0.00010021053798058977, + "loss": 2.0989, + "step": 490230 + }, + { + "epoch": 1.8951307386618423, + "grad_norm": 0.1549934446811676, + "learning_rate": 0.00010010526607393322, + "loss": 2.1271, + "step": 490240 + }, + { + "epoch": 1.8951693958652256, + "grad_norm": 0.16366319358348846, + "learning_rate": 0.00010000000000000009, + "loss": 2.1254, + "step": 490250 + }, + { + "epoch": 1.8952080530686088, + "grad_norm": 0.1647455394268036, + "learning_rate": 9.98947397578207e-05, + "loss": 2.1154, + "step": 490260 + }, + { + "epoch": 1.895246710271992, + "grad_norm": 0.16468089818954468, + "learning_rate": 9.978948534642629e-05, + "loss": 2.1167, + "step": 490270 + }, + { + "epoch": 1.8952853674753753, + "grad_norm": 0.1661677360534668, + "learning_rate": 9.96842367648474e-05, + "loss": 2.111, + "step": 490280 + }, + { + "epoch": 1.8953240246787586, + "grad_norm": 0.1536356657743454, + "learning_rate": 9.957899401211634e-05, + "loss": 2.1241, + "step": 490290 + }, + { + "epoch": 1.8953626818821419, + "grad_norm": 0.16869594156742096, + "learning_rate": 9.947375708726392e-05, + "loss": 2.1159, + "step": 490300 + }, + { + "epoch": 1.895401339085525, + "grad_norm": 0.16110627353191376, + "learning_rate": 9.936852598932267e-05, + "loss": 2.1121, + "step": 490310 + }, + { + "epoch": 1.8954399962889084, + "grad_norm": 0.1599881798028946, + "learning_rate": 9.926330071732448e-05, + "loss": 2.117, + "step": 490320 + }, + { + "epoch": 1.8954786534922916, + "grad_norm": 0.18143880367279053, + "learning_rate": 9.91580812703019e-05, + "loss": 2.1017, + "step": 490330 + }, + { + "epoch": 1.895517310695675, + "grad_norm": 0.16994333267211914, + "learning_rate": 9.905286764728772e-05, + "loss": 2.1139, + "step": 490340 + }, + { + "epoch": 1.8955559678990583, + "grad_norm": 0.1837039440870285, + "learning_rate": 9.894765984731513e-05, + "loss": 2.1103, + "step": 490350 + }, + { + "epoch": 1.8955946251024416, + "grad_norm": 0.1612176150083542, + "learning_rate": 9.884245786941693e-05, + "loss": 2.1163, + "step": 490360 + }, + { + "epoch": 1.8956332823058248, + "grad_norm": 0.19064176082611084, + "learning_rate": 9.873726171262698e-05, + "loss": 2.099, + "step": 490370 + }, + { + "epoch": 1.895671939509208, + "grad_norm": 0.16493946313858032, + "learning_rate": 9.863207137597897e-05, + "loss": 2.1121, + "step": 490380 + }, + { + "epoch": 1.8957105967125916, + "grad_norm": 0.18231101334095, + "learning_rate": 9.852688685850719e-05, + "loss": 2.1142, + "step": 490390 + }, + { + "epoch": 1.8957492539159748, + "grad_norm": 0.15567000210285187, + "learning_rate": 9.8421708159246e-05, + "loss": 2.1014, + "step": 490400 + }, + { + "epoch": 1.895787911119358, + "grad_norm": 0.1578628122806549, + "learning_rate": 9.831653527722973e-05, + "loss": 2.098, + "step": 490410 + }, + { + "epoch": 1.8958265683227413, + "grad_norm": 0.1570051908493042, + "learning_rate": 9.821136821149334e-05, + "loss": 2.1152, + "step": 490420 + }, + { + "epoch": 1.8958652255261246, + "grad_norm": 0.1664205938577652, + "learning_rate": 9.810620696107209e-05, + "loss": 2.0934, + "step": 490430 + }, + { + "epoch": 1.8959038827295078, + "grad_norm": 0.15952792763710022, + "learning_rate": 9.80010515250016e-05, + "loss": 2.1018, + "step": 490440 + }, + { + "epoch": 1.895942539932891, + "grad_norm": 0.17305637896060944, + "learning_rate": 9.789590190231712e-05, + "loss": 2.1097, + "step": 490450 + }, + { + "epoch": 1.8959811971362743, + "grad_norm": 0.17136140167713165, + "learning_rate": 9.779075809205473e-05, + "loss": 2.1068, + "step": 490460 + }, + { + "epoch": 1.8960198543396576, + "grad_norm": 0.16530878841876984, + "learning_rate": 9.768562009325077e-05, + "loss": 2.1179, + "step": 490470 + }, + { + "epoch": 1.8960585115430408, + "grad_norm": 0.16368193924427032, + "learning_rate": 9.758048790494156e-05, + "loss": 2.1151, + "step": 490480 + }, + { + "epoch": 1.896097168746424, + "grad_norm": 0.18392740190029144, + "learning_rate": 9.747536152616409e-05, + "loss": 2.1116, + "step": 490490 + }, + { + "epoch": 1.8961358259498073, + "grad_norm": 0.17140942811965942, + "learning_rate": 9.737024095595515e-05, + "loss": 2.1141, + "step": 490500 + }, + { + "epoch": 1.8961744831531908, + "grad_norm": 0.17512132227420807, + "learning_rate": 9.726512619335215e-05, + "loss": 2.1088, + "step": 490510 + }, + { + "epoch": 1.896213140356574, + "grad_norm": 0.16664306819438934, + "learning_rate": 9.716001723739254e-05, + "loss": 2.099, + "step": 490520 + }, + { + "epoch": 1.8962517975599573, + "grad_norm": 0.46299320459365845, + "learning_rate": 9.70549140871142e-05, + "loss": 2.0923, + "step": 490530 + }, + { + "epoch": 1.8962904547633406, + "grad_norm": 0.17620672285556793, + "learning_rate": 9.694981674155523e-05, + "loss": 2.1093, + "step": 490540 + }, + { + "epoch": 1.896329111966724, + "grad_norm": 0.16797108948230743, + "learning_rate": 9.684472519975396e-05, + "loss": 2.1117, + "step": 490550 + }, + { + "epoch": 1.8963677691701073, + "grad_norm": 0.16022421419620514, + "learning_rate": 9.673963946074893e-05, + "loss": 2.1063, + "step": 490560 + }, + { + "epoch": 1.8964064263734906, + "grad_norm": 0.16269107162952423, + "learning_rate": 9.663455952357913e-05, + "loss": 2.1209, + "step": 490570 + }, + { + "epoch": 1.8964450835768738, + "grad_norm": 0.1669422686100006, + "learning_rate": 9.652948538728334e-05, + "loss": 2.1159, + "step": 490580 + }, + { + "epoch": 1.896483740780257, + "grad_norm": 0.16729198396205902, + "learning_rate": 9.642441705090143e-05, + "loss": 2.1197, + "step": 490590 + }, + { + "epoch": 1.8965223979836403, + "grad_norm": 0.1610100418329239, + "learning_rate": 9.631935451347307e-05, + "loss": 2.1063, + "step": 490600 + }, + { + "epoch": 1.8965610551870236, + "grad_norm": 0.15145553648471832, + "learning_rate": 9.62142977740379e-05, + "loss": 2.1107, + "step": 490610 + }, + { + "epoch": 1.8965997123904068, + "grad_norm": 0.16301782429218292, + "learning_rate": 9.610924683163602e-05, + "loss": 2.1093, + "step": 490620 + }, + { + "epoch": 1.89663836959379, + "grad_norm": 0.1682334691286087, + "learning_rate": 9.600420168530844e-05, + "loss": 2.1127, + "step": 490630 + }, + { + "epoch": 1.8966770267971733, + "grad_norm": 0.16489244997501373, + "learning_rate": 9.589916233409523e-05, + "loss": 2.0874, + "step": 490640 + }, + { + "epoch": 1.8967156840005566, + "grad_norm": 0.15822367370128632, + "learning_rate": 9.579412877703786e-05, + "loss": 2.095, + "step": 490650 + }, + { + "epoch": 1.8967543412039398, + "grad_norm": 0.15861819684505463, + "learning_rate": 9.568910101317752e-05, + "loss": 2.0985, + "step": 490660 + }, + { + "epoch": 1.896792998407323, + "grad_norm": 0.16408208012580872, + "learning_rate": 9.558407904155563e-05, + "loss": 2.1079, + "step": 490670 + }, + { + "epoch": 1.8968316556107065, + "grad_norm": 0.1616327166557312, + "learning_rate": 9.547906286121389e-05, + "loss": 2.0993, + "step": 490680 + }, + { + "epoch": 1.8968703128140898, + "grad_norm": 0.15668755769729614, + "learning_rate": 9.537405247119457e-05, + "loss": 2.1131, + "step": 490690 + }, + { + "epoch": 1.896908970017473, + "grad_norm": 0.17546828091144562, + "learning_rate": 9.52690478705398e-05, + "loss": 2.1081, + "step": 490700 + }, + { + "epoch": 1.8969476272208563, + "grad_norm": 0.17848749458789825, + "learning_rate": 9.516404905829234e-05, + "loss": 2.1102, + "step": 490710 + }, + { + "epoch": 1.8969862844242398, + "grad_norm": 0.15275801718235016, + "learning_rate": 9.505905603349474e-05, + "loss": 2.1089, + "step": 490720 + }, + { + "epoch": 1.897024941627623, + "grad_norm": 0.17563579976558685, + "learning_rate": 9.495406879519042e-05, + "loss": 2.1168, + "step": 490730 + }, + { + "epoch": 1.8970635988310063, + "grad_norm": 0.17075873911380768, + "learning_rate": 9.484908734242259e-05, + "loss": 2.1148, + "step": 490740 + }, + { + "epoch": 1.8971022560343895, + "grad_norm": 0.168623149394989, + "learning_rate": 9.474411167423491e-05, + "loss": 2.1064, + "step": 490750 + }, + { + "epoch": 1.8971409132377728, + "grad_norm": 0.16039146482944489, + "learning_rate": 9.463914178967147e-05, + "loss": 2.1172, + "step": 490760 + }, + { + "epoch": 1.897179570441156, + "grad_norm": 0.16803433001041412, + "learning_rate": 9.453417768777595e-05, + "loss": 2.107, + "step": 490770 + }, + { + "epoch": 1.8972182276445393, + "grad_norm": 0.18396279215812683, + "learning_rate": 9.44292193675933e-05, + "loss": 2.1161, + "step": 490780 + }, + { + "epoch": 1.8972568848479225, + "grad_norm": 0.15813076496124268, + "learning_rate": 9.432426682816786e-05, + "loss": 2.0962, + "step": 490790 + }, + { + "epoch": 1.8972955420513058, + "grad_norm": 0.15674901008605957, + "learning_rate": 9.421932006854461e-05, + "loss": 2.1119, + "step": 490800 + }, + { + "epoch": 1.897334199254689, + "grad_norm": 0.169746994972229, + "learning_rate": 9.411437908776899e-05, + "loss": 2.108, + "step": 490810 + }, + { + "epoch": 1.8973728564580723, + "grad_norm": 0.16584500670433044, + "learning_rate": 9.400944388488641e-05, + "loss": 2.1231, + "step": 490820 + }, + { + "epoch": 1.8974115136614556, + "grad_norm": 0.1614682823419571, + "learning_rate": 9.390451445894254e-05, + "loss": 2.0992, + "step": 490830 + }, + { + "epoch": 1.8974501708648388, + "grad_norm": 0.16342851519584656, + "learning_rate": 9.379959080898325e-05, + "loss": 2.0946, + "step": 490840 + }, + { + "epoch": 1.8974888280682223, + "grad_norm": 0.16105535626411438, + "learning_rate": 9.369467293405488e-05, + "loss": 2.1039, + "step": 490850 + }, + { + "epoch": 1.8975274852716055, + "grad_norm": 0.170151948928833, + "learning_rate": 9.358976083320414e-05, + "loss": 2.1133, + "step": 490860 + }, + { + "epoch": 1.8975661424749888, + "grad_norm": 0.1685088872909546, + "learning_rate": 9.348485450547761e-05, + "loss": 2.1215, + "step": 490870 + }, + { + "epoch": 1.897604799678372, + "grad_norm": 0.1650388389825821, + "learning_rate": 9.337995394992227e-05, + "loss": 2.1133, + "step": 490880 + }, + { + "epoch": 1.8976434568817555, + "grad_norm": 0.43443021178245544, + "learning_rate": 9.327505916558576e-05, + "loss": 2.1133, + "step": 490890 + }, + { + "epoch": 1.8976821140851388, + "grad_norm": 0.1729881912469864, + "learning_rate": 9.317017015151552e-05, + "loss": 2.1096, + "step": 490900 + }, + { + "epoch": 1.897720771288522, + "grad_norm": 0.15954531729221344, + "learning_rate": 9.306528690675942e-05, + "loss": 2.1196, + "step": 490910 + }, + { + "epoch": 1.8977594284919053, + "grad_norm": 0.16167645156383514, + "learning_rate": 9.296040943036532e-05, + "loss": 2.1126, + "step": 490920 + }, + { + "epoch": 1.8977980856952885, + "grad_norm": 0.16091011464595795, + "learning_rate": 9.285553772138178e-05, + "loss": 2.1164, + "step": 490930 + }, + { + "epoch": 1.8978367428986718, + "grad_norm": 0.16552941501140594, + "learning_rate": 9.275067177885732e-05, + "loss": 2.1084, + "step": 490940 + }, + { + "epoch": 1.897875400102055, + "grad_norm": 0.16158799827098846, + "learning_rate": 9.264581160184094e-05, + "loss": 2.1159, + "step": 490950 + }, + { + "epoch": 1.8979140573054383, + "grad_norm": 0.16633044183254242, + "learning_rate": 9.254095718938182e-05, + "loss": 2.102, + "step": 490960 + }, + { + "epoch": 1.8979527145088215, + "grad_norm": 0.1752433031797409, + "learning_rate": 9.243610854052919e-05, + "loss": 2.1225, + "step": 490970 + }, + { + "epoch": 1.8979913717122048, + "grad_norm": 0.17548854649066925, + "learning_rate": 9.23312656543327e-05, + "loss": 2.1005, + "step": 490980 + }, + { + "epoch": 1.898030028915588, + "grad_norm": 0.15903107821941376, + "learning_rate": 9.222642852984242e-05, + "loss": 2.1208, + "step": 490990 + }, + { + "epoch": 1.8980686861189713, + "grad_norm": 0.1658545732498169, + "learning_rate": 9.212159716610868e-05, + "loss": 2.1086, + "step": 491000 + }, + { + "epoch": 1.8981073433223545, + "grad_norm": 0.1633501648902893, + "learning_rate": 9.201677156218158e-05, + "loss": 2.1149, + "step": 491010 + }, + { + "epoch": 1.898146000525738, + "grad_norm": 0.19065724313259125, + "learning_rate": 9.191195171711208e-05, + "loss": 2.1066, + "step": 491020 + }, + { + "epoch": 1.8981846577291213, + "grad_norm": 0.16385003924369812, + "learning_rate": 9.180713762995119e-05, + "loss": 2.1046, + "step": 491030 + }, + { + "epoch": 1.8982233149325045, + "grad_norm": 0.1566624492406845, + "learning_rate": 9.170232929974986e-05, + "loss": 2.1091, + "step": 491040 + }, + { + "epoch": 1.8982619721358878, + "grad_norm": 0.16979220509529114, + "learning_rate": 9.159752672555999e-05, + "loss": 2.1047, + "step": 491050 + }, + { + "epoch": 1.8983006293392712, + "grad_norm": 0.15770958364009857, + "learning_rate": 9.149272990643298e-05, + "loss": 2.0884, + "step": 491060 + }, + { + "epoch": 1.8983392865426545, + "grad_norm": 0.18209083378314972, + "learning_rate": 9.138793884142093e-05, + "loss": 2.1155, + "step": 491070 + }, + { + "epoch": 1.8983779437460377, + "grad_norm": 0.17145898938179016, + "learning_rate": 9.128315352957595e-05, + "loss": 2.1227, + "step": 491080 + }, + { + "epoch": 1.898416600949421, + "grad_norm": 0.16826361417770386, + "learning_rate": 9.1178373969951e-05, + "loss": 2.0928, + "step": 491090 + }, + { + "epoch": 1.8984552581528042, + "grad_norm": 0.17281651496887207, + "learning_rate": 9.10736001615986e-05, + "loss": 2.1212, + "step": 491100 + }, + { + "epoch": 1.8984939153561875, + "grad_norm": 0.1669350415468216, + "learning_rate": 9.096883210357199e-05, + "loss": 2.1081, + "step": 491110 + }, + { + "epoch": 1.8985325725595708, + "grad_norm": 0.16174283623695374, + "learning_rate": 9.086406979492412e-05, + "loss": 2.1074, + "step": 491120 + }, + { + "epoch": 1.898571229762954, + "grad_norm": 0.18184956908226013, + "learning_rate": 9.07593132347091e-05, + "loss": 2.108, + "step": 491130 + }, + { + "epoch": 1.8986098869663373, + "grad_norm": 0.1940072625875473, + "learning_rate": 9.065456242198011e-05, + "loss": 2.1051, + "step": 491140 + }, + { + "epoch": 1.8986485441697205, + "grad_norm": 0.17223842442035675, + "learning_rate": 9.054981735579193e-05, + "loss": 2.1127, + "step": 491150 + }, + { + "epoch": 1.8986872013731038, + "grad_norm": 0.1719387173652649, + "learning_rate": 9.044507803519841e-05, + "loss": 2.1026, + "step": 491160 + }, + { + "epoch": 1.898725858576487, + "grad_norm": 0.16838513314723969, + "learning_rate": 9.034034445925433e-05, + "loss": 2.1093, + "step": 491170 + }, + { + "epoch": 1.8987645157798703, + "grad_norm": 0.165732741355896, + "learning_rate": 9.02356166270144e-05, + "loss": 2.1217, + "step": 491180 + }, + { + "epoch": 1.8988031729832537, + "grad_norm": 0.17605352401733398, + "learning_rate": 9.013089453753387e-05, + "loss": 2.0982, + "step": 491190 + }, + { + "epoch": 1.898841830186637, + "grad_norm": 0.16870661079883575, + "learning_rate": 9.002617818986836e-05, + "loss": 2.111, + "step": 491200 + }, + { + "epoch": 1.8988804873900202, + "grad_norm": 0.1646031141281128, + "learning_rate": 8.992146758307352e-05, + "loss": 2.1055, + "step": 491210 + }, + { + "epoch": 1.8989191445934035, + "grad_norm": 0.18385101854801178, + "learning_rate": 8.981676271620476e-05, + "loss": 2.1093, + "step": 491220 + }, + { + "epoch": 1.898957801796787, + "grad_norm": 0.1688939929008484, + "learning_rate": 8.971206358831862e-05, + "loss": 2.1119, + "step": 491230 + }, + { + "epoch": 1.8989964590001702, + "grad_norm": 0.18524891138076782, + "learning_rate": 8.960737019847143e-05, + "loss": 2.1087, + "step": 491240 + }, + { + "epoch": 1.8990351162035535, + "grad_norm": 0.16723163425922394, + "learning_rate": 8.950268254571992e-05, + "loss": 2.1083, + "step": 491250 + }, + { + "epoch": 1.8990737734069367, + "grad_norm": 0.17184731364250183, + "learning_rate": 8.939800062912106e-05, + "loss": 2.1076, + "step": 491260 + }, + { + "epoch": 1.89911243061032, + "grad_norm": 0.1555802822113037, + "learning_rate": 8.929332444773208e-05, + "loss": 2.1193, + "step": 491270 + }, + { + "epoch": 1.8991510878137032, + "grad_norm": 0.1600703001022339, + "learning_rate": 8.918865400061016e-05, + "loss": 2.1156, + "step": 491280 + }, + { + "epoch": 1.8991897450170865, + "grad_norm": 0.15900921821594238, + "learning_rate": 8.908398928681315e-05, + "loss": 2.1085, + "step": 491290 + }, + { + "epoch": 1.8992284022204697, + "grad_norm": 0.1766301989555359, + "learning_rate": 8.89793303053994e-05, + "loss": 2.1065, + "step": 491300 + }, + { + "epoch": 1.899267059423853, + "grad_norm": 0.18237997591495514, + "learning_rate": 8.887467705542651e-05, + "loss": 2.1187, + "step": 491310 + }, + { + "epoch": 1.8993057166272362, + "grad_norm": 0.17178525030612946, + "learning_rate": 8.877002953595348e-05, + "loss": 2.1063, + "step": 491320 + }, + { + "epoch": 1.8993443738306195, + "grad_norm": 0.1610889583826065, + "learning_rate": 8.866538774603883e-05, + "loss": 2.1138, + "step": 491330 + }, + { + "epoch": 1.8993830310340027, + "grad_norm": 0.17128078639507294, + "learning_rate": 8.856075168474153e-05, + "loss": 2.1002, + "step": 491340 + }, + { + "epoch": 1.899421688237386, + "grad_norm": 0.20281025767326355, + "learning_rate": 8.845612135112103e-05, + "loss": 2.1107, + "step": 491350 + }, + { + "epoch": 1.8994603454407695, + "grad_norm": 0.18568745255470276, + "learning_rate": 8.835149674423671e-05, + "loss": 2.1036, + "step": 491360 + }, + { + "epoch": 1.8994990026441527, + "grad_norm": 0.16455897688865662, + "learning_rate": 8.824687786314845e-05, + "loss": 2.1048, + "step": 491370 + }, + { + "epoch": 1.899537659847536, + "grad_norm": 0.16593965888023376, + "learning_rate": 8.81422647069161e-05, + "loss": 2.1175, + "step": 491380 + }, + { + "epoch": 1.8995763170509192, + "grad_norm": 0.1637643575668335, + "learning_rate": 8.80376572746e-05, + "loss": 2.1058, + "step": 491390 + }, + { + "epoch": 1.8996149742543027, + "grad_norm": 0.17839862406253815, + "learning_rate": 8.793305556526087e-05, + "loss": 2.1113, + "step": 491400 + }, + { + "epoch": 1.899653631457686, + "grad_norm": 0.15830892324447632, + "learning_rate": 8.782845957795927e-05, + "loss": 2.1175, + "step": 491410 + }, + { + "epoch": 1.8996922886610692, + "grad_norm": 0.1570470631122589, + "learning_rate": 8.772386931175657e-05, + "loss": 2.1049, + "step": 491420 + }, + { + "epoch": 1.8997309458644525, + "grad_norm": 0.16065655648708344, + "learning_rate": 8.761928476571379e-05, + "loss": 2.107, + "step": 491430 + }, + { + "epoch": 1.8997696030678357, + "grad_norm": 0.16688545048236847, + "learning_rate": 8.751470593889277e-05, + "loss": 2.1079, + "step": 491440 + }, + { + "epoch": 1.899808260271219, + "grad_norm": 0.15952154994010925, + "learning_rate": 8.741013283035515e-05, + "loss": 2.1246, + "step": 491450 + }, + { + "epoch": 1.8998469174746022, + "grad_norm": 0.1672104001045227, + "learning_rate": 8.730556543916301e-05, + "loss": 2.1136, + "step": 491460 + }, + { + "epoch": 1.8998855746779855, + "grad_norm": 0.17419083416461945, + "learning_rate": 8.72010037643789e-05, + "loss": 2.1179, + "step": 491470 + }, + { + "epoch": 1.8999242318813687, + "grad_norm": 0.15057621896266937, + "learning_rate": 8.709644780506509e-05, + "loss": 2.0982, + "step": 491480 + }, + { + "epoch": 1.899962889084752, + "grad_norm": 0.15921354293823242, + "learning_rate": 8.699189756028481e-05, + "loss": 2.1102, + "step": 491490 + }, + { + "epoch": 1.9000015462881352, + "grad_norm": 0.1713653802871704, + "learning_rate": 8.688735302910078e-05, + "loss": 2.1014, + "step": 491500 + }, + { + "epoch": 1.9000402034915185, + "grad_norm": 0.16376575827598572, + "learning_rate": 8.678281421057687e-05, + "loss": 2.1036, + "step": 491510 + }, + { + "epoch": 1.9000788606949017, + "grad_norm": 0.17783987522125244, + "learning_rate": 8.66782811037763e-05, + "loss": 2.1069, + "step": 491520 + }, + { + "epoch": 1.9001175178982852, + "grad_norm": 0.1579967737197876, + "learning_rate": 8.65737537077631e-05, + "loss": 2.1106, + "step": 491530 + }, + { + "epoch": 1.9001561751016685, + "grad_norm": 0.19459132850170135, + "learning_rate": 8.64692320216014e-05, + "loss": 2.1063, + "step": 491540 + }, + { + "epoch": 1.9001948323050517, + "grad_norm": 0.1886938214302063, + "learning_rate": 8.636471604435547e-05, + "loss": 2.1083, + "step": 491550 + }, + { + "epoch": 1.900233489508435, + "grad_norm": 0.18700428307056427, + "learning_rate": 8.62602057750903e-05, + "loss": 2.1215, + "step": 491560 + }, + { + "epoch": 1.9002721467118184, + "grad_norm": 0.1653238981962204, + "learning_rate": 8.615570121287042e-05, + "loss": 2.1021, + "step": 491570 + }, + { + "epoch": 1.9003108039152017, + "grad_norm": 0.15879376232624054, + "learning_rate": 8.605120235676122e-05, + "loss": 2.1079, + "step": 491580 + }, + { + "epoch": 1.900349461118585, + "grad_norm": 0.15012842416763306, + "learning_rate": 8.59467092058277e-05, + "loss": 2.1037, + "step": 491590 + }, + { + "epoch": 1.9003881183219682, + "grad_norm": 0.15867432951927185, + "learning_rate": 8.584222175913614e-05, + "loss": 2.1041, + "step": 491600 + }, + { + "epoch": 1.9004267755253514, + "grad_norm": 0.16058456897735596, + "learning_rate": 8.573774001575219e-05, + "loss": 2.0914, + "step": 491610 + }, + { + "epoch": 1.9004654327287347, + "grad_norm": 0.17358066141605377, + "learning_rate": 8.563326397474191e-05, + "loss": 2.0977, + "step": 491620 + }, + { + "epoch": 1.900504089932118, + "grad_norm": 0.16065537929534912, + "learning_rate": 8.552879363517185e-05, + "loss": 2.1127, + "step": 491630 + }, + { + "epoch": 1.9005427471355012, + "grad_norm": 0.16244420409202576, + "learning_rate": 8.542432899610898e-05, + "loss": 2.1001, + "step": 491640 + }, + { + "epoch": 1.9005814043388845, + "grad_norm": 0.1713636815547943, + "learning_rate": 8.531987005661957e-05, + "loss": 2.1211, + "step": 491650 + }, + { + "epoch": 1.9006200615422677, + "grad_norm": 0.15860594809055328, + "learning_rate": 8.521541681577149e-05, + "loss": 2.1229, + "step": 491660 + }, + { + "epoch": 1.900658718745651, + "grad_norm": 0.16405101120471954, + "learning_rate": 8.511096927263174e-05, + "loss": 2.118, + "step": 491670 + }, + { + "epoch": 1.9006973759490342, + "grad_norm": 0.153397798538208, + "learning_rate": 8.500652742626836e-05, + "loss": 2.108, + "step": 491680 + }, + { + "epoch": 1.9007360331524175, + "grad_norm": 0.15481743216514587, + "learning_rate": 8.490209127574877e-05, + "loss": 2.1038, + "step": 491690 + }, + { + "epoch": 1.900774690355801, + "grad_norm": 0.16705960035324097, + "learning_rate": 8.479766082014196e-05, + "loss": 2.1018, + "step": 491700 + }, + { + "epoch": 1.9008133475591842, + "grad_norm": 0.16170643270015717, + "learning_rate": 8.469323605851575e-05, + "loss": 2.0994, + "step": 491710 + }, + { + "epoch": 1.9008520047625674, + "grad_norm": 0.16278567910194397, + "learning_rate": 8.458881698993936e-05, + "loss": 2.1103, + "step": 491720 + }, + { + "epoch": 1.9008906619659507, + "grad_norm": 0.16915789246559143, + "learning_rate": 8.448440361348131e-05, + "loss": 2.0999, + "step": 491730 + }, + { + "epoch": 1.9009293191693342, + "grad_norm": 0.1592896431684494, + "learning_rate": 8.437999592821121e-05, + "loss": 2.0839, + "step": 491740 + }, + { + "epoch": 1.9009679763727174, + "grad_norm": 0.16387787461280823, + "learning_rate": 8.427559393319828e-05, + "loss": 2.0966, + "step": 491750 + }, + { + "epoch": 1.9010066335761007, + "grad_norm": 0.16786013543605804, + "learning_rate": 8.417119762751257e-05, + "loss": 2.1064, + "step": 491760 + }, + { + "epoch": 1.901045290779484, + "grad_norm": 0.1598120778799057, + "learning_rate": 8.406680701022352e-05, + "loss": 2.1104, + "step": 491770 + }, + { + "epoch": 1.9010839479828672, + "grad_norm": 0.1629783660173416, + "learning_rate": 8.396242208040183e-05, + "loss": 2.1139, + "step": 491780 + }, + { + "epoch": 1.9011226051862504, + "grad_norm": 0.1871945708990097, + "learning_rate": 8.385804283711784e-05, + "loss": 2.1141, + "step": 491790 + }, + { + "epoch": 1.9011612623896337, + "grad_norm": 0.16119788587093353, + "learning_rate": 8.37536692794425e-05, + "loss": 2.1084, + "step": 491800 + }, + { + "epoch": 1.901199919593017, + "grad_norm": 0.83812016248703, + "learning_rate": 8.364930140644655e-05, + "loss": 2.0879, + "step": 491810 + }, + { + "epoch": 1.9012385767964002, + "grad_norm": 0.17770658433437347, + "learning_rate": 8.35449392172014e-05, + "loss": 2.1137, + "step": 491820 + }, + { + "epoch": 1.9012772339997834, + "grad_norm": 0.16469278931617737, + "learning_rate": 8.344058271077847e-05, + "loss": 2.1256, + "step": 491830 + }, + { + "epoch": 1.9013158912031667, + "grad_norm": 0.15335942804813385, + "learning_rate": 8.333623188624961e-05, + "loss": 2.0877, + "step": 491840 + }, + { + "epoch": 1.90135454840655, + "grad_norm": 0.17696666717529297, + "learning_rate": 8.323188674268689e-05, + "loss": 2.1227, + "step": 491850 + }, + { + "epoch": 1.9013932056099332, + "grad_norm": 0.16875462234020233, + "learning_rate": 8.312754727916238e-05, + "loss": 2.1048, + "step": 491860 + }, + { + "epoch": 1.9014318628133167, + "grad_norm": 0.16450636088848114, + "learning_rate": 8.302321349474862e-05, + "loss": 2.1136, + "step": 491870 + }, + { + "epoch": 1.9014705200167, + "grad_norm": 0.17607928812503815, + "learning_rate": 8.291888538851854e-05, + "loss": 2.1152, + "step": 491880 + }, + { + "epoch": 1.9015091772200832, + "grad_norm": 0.18208394944667816, + "learning_rate": 8.281456295954515e-05, + "loss": 2.1085, + "step": 491890 + }, + { + "epoch": 1.9015478344234664, + "grad_norm": 0.16099147498607635, + "learning_rate": 8.27102462069016e-05, + "loss": 2.1094, + "step": 491900 + }, + { + "epoch": 1.90158649162685, + "grad_norm": 0.1623336374759674, + "learning_rate": 8.260593512966153e-05, + "loss": 2.0994, + "step": 491910 + }, + { + "epoch": 1.9016251488302331, + "grad_norm": 0.16568103432655334, + "learning_rate": 8.250162972689856e-05, + "loss": 2.0917, + "step": 491920 + }, + { + "epoch": 1.9016638060336164, + "grad_norm": 0.16497556865215302, + "learning_rate": 8.2397329997687e-05, + "loss": 2.1014, + "step": 491930 + }, + { + "epoch": 1.9017024632369997, + "grad_norm": 0.17084918916225433, + "learning_rate": 8.229303594110093e-05, + "loss": 2.1148, + "step": 491940 + }, + { + "epoch": 1.901741120440383, + "grad_norm": 0.15589167177677155, + "learning_rate": 8.218874755621485e-05, + "loss": 2.0954, + "step": 491950 + }, + { + "epoch": 1.9017797776437662, + "grad_norm": 0.16014504432678223, + "learning_rate": 8.208446484210374e-05, + "loss": 2.1057, + "step": 491960 + }, + { + "epoch": 1.9018184348471494, + "grad_norm": 0.1599462628364563, + "learning_rate": 8.198018779784234e-05, + "loss": 2.1057, + "step": 491970 + }, + { + "epoch": 1.9018570920505327, + "grad_norm": 0.16896983981132507, + "learning_rate": 8.187591642250647e-05, + "loss": 2.1032, + "step": 491980 + }, + { + "epoch": 1.901895749253916, + "grad_norm": 0.1815507858991623, + "learning_rate": 8.17716507151709e-05, + "loss": 2.1013, + "step": 491990 + }, + { + "epoch": 1.9019344064572992, + "grad_norm": 0.178026482462883, + "learning_rate": 8.166739067491213e-05, + "loss": 2.1065, + "step": 492000 + }, + { + "epoch": 1.9019730636606824, + "grad_norm": 0.15985998511314392, + "learning_rate": 8.156313630080603e-05, + "loss": 2.1001, + "step": 492010 + }, + { + "epoch": 1.9020117208640657, + "grad_norm": 0.1622258871793747, + "learning_rate": 8.145888759192866e-05, + "loss": 2.0882, + "step": 492020 + }, + { + "epoch": 1.9020503780674491, + "grad_norm": 0.1622309535741806, + "learning_rate": 8.135464454735675e-05, + "loss": 2.1052, + "step": 492030 + }, + { + "epoch": 1.9020890352708324, + "grad_norm": 0.16055290400981903, + "learning_rate": 8.125040716616706e-05, + "loss": 2.1015, + "step": 492040 + }, + { + "epoch": 1.9021276924742156, + "grad_norm": 0.17899253964424133, + "learning_rate": 8.114617544743653e-05, + "loss": 2.1045, + "step": 492050 + }, + { + "epoch": 1.902166349677599, + "grad_norm": 0.1712222844362259, + "learning_rate": 8.104194939024278e-05, + "loss": 2.104, + "step": 492060 + }, + { + "epoch": 1.9022050068809822, + "grad_norm": 0.16439427435398102, + "learning_rate": 8.093772899366303e-05, + "loss": 2.1121, + "step": 492070 + }, + { + "epoch": 1.9022436640843656, + "grad_norm": 0.16032835841178894, + "learning_rate": 8.083351425677509e-05, + "loss": 2.1072, + "step": 492080 + }, + { + "epoch": 1.9022823212877489, + "grad_norm": 0.15671013295650482, + "learning_rate": 8.072930517865706e-05, + "loss": 2.0875, + "step": 492090 + }, + { + "epoch": 1.9023209784911321, + "grad_norm": 0.16478151082992554, + "learning_rate": 8.062510175838744e-05, + "loss": 2.0919, + "step": 492100 + }, + { + "epoch": 1.9023596356945154, + "grad_norm": 0.1695231795310974, + "learning_rate": 8.052090399504475e-05, + "loss": 2.1059, + "step": 492110 + }, + { + "epoch": 1.9023982928978986, + "grad_norm": 0.16111020743846893, + "learning_rate": 8.041671188770772e-05, + "loss": 2.0965, + "step": 492120 + }, + { + "epoch": 1.9024369501012819, + "grad_norm": 0.16548483073711395, + "learning_rate": 8.031252543545509e-05, + "loss": 2.0972, + "step": 492130 + }, + { + "epoch": 1.9024756073046651, + "grad_norm": 0.1690864861011505, + "learning_rate": 8.02083446373667e-05, + "loss": 2.1142, + "step": 492140 + }, + { + "epoch": 1.9025142645080484, + "grad_norm": 0.1572396159172058, + "learning_rate": 8.010416949252175e-05, + "loss": 2.094, + "step": 492150 + }, + { + "epoch": 1.9025529217114316, + "grad_norm": 0.17758668959140778, + "learning_rate": 8.000000000000007e-05, + "loss": 2.1138, + "step": 492160 + }, + { + "epoch": 1.902591578914815, + "grad_norm": 0.16696658730506897, + "learning_rate": 7.989583615888174e-05, + "loss": 2.1092, + "step": 492170 + }, + { + "epoch": 1.9026302361181981, + "grad_norm": 0.17181764543056488, + "learning_rate": 7.979167796824727e-05, + "loss": 2.1121, + "step": 492180 + }, + { + "epoch": 1.9026688933215814, + "grad_norm": 0.21159546077251434, + "learning_rate": 7.968752542717673e-05, + "loss": 2.1018, + "step": 492190 + }, + { + "epoch": 1.9027075505249649, + "grad_norm": 0.16343888640403748, + "learning_rate": 7.958337853475129e-05, + "loss": 2.1122, + "step": 492200 + }, + { + "epoch": 1.9027462077283481, + "grad_norm": 0.16192626953125, + "learning_rate": 7.947923729005213e-05, + "loss": 2.1112, + "step": 492210 + }, + { + "epoch": 1.9027848649317314, + "grad_norm": 0.15736247599124908, + "learning_rate": 7.937510169216022e-05, + "loss": 2.1062, + "step": 492220 + }, + { + "epoch": 1.9028235221351146, + "grad_norm": 0.15779156982898712, + "learning_rate": 7.927097174015718e-05, + "loss": 2.1118, + "step": 492230 + }, + { + "epoch": 1.9028621793384979, + "grad_norm": 0.158307746052742, + "learning_rate": 7.916684743312486e-05, + "loss": 2.1033, + "step": 492240 + }, + { + "epoch": 1.9029008365418814, + "grad_norm": 0.1720828264951706, + "learning_rate": 7.906272877014531e-05, + "loss": 2.0993, + "step": 492250 + }, + { + "epoch": 1.9029394937452646, + "grad_norm": 0.17028604447841644, + "learning_rate": 7.895861575030106e-05, + "loss": 2.0985, + "step": 492260 + }, + { + "epoch": 1.9029781509486479, + "grad_norm": 0.16534186899662018, + "learning_rate": 7.885450837267416e-05, + "loss": 2.1087, + "step": 492270 + }, + { + "epoch": 1.9030168081520311, + "grad_norm": 0.17664501070976257, + "learning_rate": 7.875040663634758e-05, + "loss": 2.1143, + "step": 492280 + }, + { + "epoch": 1.9030554653554144, + "grad_norm": 0.1606430858373642, + "learning_rate": 7.864631054040427e-05, + "loss": 2.1024, + "step": 492290 + }, + { + "epoch": 1.9030941225587976, + "grad_norm": 0.16679500043392181, + "learning_rate": 7.854222008392809e-05, + "loss": 2.0931, + "step": 492300 + }, + { + "epoch": 1.9031327797621809, + "grad_norm": 0.17149166762828827, + "learning_rate": 7.843813526600197e-05, + "loss": 2.1047, + "step": 492310 + }, + { + "epoch": 1.9031714369655641, + "grad_norm": 0.17418958246707916, + "learning_rate": 7.833405608570977e-05, + "loss": 2.1088, + "step": 492320 + }, + { + "epoch": 1.9032100941689474, + "grad_norm": 0.15961408615112305, + "learning_rate": 7.822998254213576e-05, + "loss": 2.1078, + "step": 492330 + }, + { + "epoch": 1.9032487513723306, + "grad_norm": 0.1657746434211731, + "learning_rate": 7.812591463436403e-05, + "loss": 2.0973, + "step": 492340 + }, + { + "epoch": 1.9032874085757139, + "grad_norm": 0.1687639206647873, + "learning_rate": 7.802185236147908e-05, + "loss": 2.1027, + "step": 492350 + }, + { + "epoch": 1.9033260657790971, + "grad_norm": 0.15681514143943787, + "learning_rate": 7.791779572256585e-05, + "loss": 2.1186, + "step": 492360 + }, + { + "epoch": 1.9033647229824806, + "grad_norm": 0.16729243099689484, + "learning_rate": 7.781374471670933e-05, + "loss": 2.0935, + "step": 492370 + }, + { + "epoch": 1.9034033801858639, + "grad_norm": 0.16020803153514862, + "learning_rate": 7.770969934299466e-05, + "loss": 2.101, + "step": 492380 + }, + { + "epoch": 1.903442037389247, + "grad_norm": 0.16960710287094116, + "learning_rate": 7.760565960050747e-05, + "loss": 2.0957, + "step": 492390 + }, + { + "epoch": 1.9034806945926304, + "grad_norm": 0.16362504661083221, + "learning_rate": 7.75016254883334e-05, + "loss": 2.097, + "step": 492400 + }, + { + "epoch": 1.9035193517960136, + "grad_norm": 0.3407345712184906, + "learning_rate": 7.739759700555871e-05, + "loss": 2.0968, + "step": 492410 + }, + { + "epoch": 1.903558008999397, + "grad_norm": 0.16705606877803802, + "learning_rate": 7.729357415126948e-05, + "loss": 2.1039, + "step": 492420 + }, + { + "epoch": 1.9035966662027803, + "grad_norm": 0.16231025755405426, + "learning_rate": 7.71895569245522e-05, + "loss": 2.1105, + "step": 492430 + }, + { + "epoch": 1.9036353234061636, + "grad_norm": 0.161398783326149, + "learning_rate": 7.708554532449364e-05, + "loss": 2.1272, + "step": 492440 + }, + { + "epoch": 1.9036739806095468, + "grad_norm": 0.45710331201553345, + "learning_rate": 7.698153935018093e-05, + "loss": 2.1006, + "step": 492450 + }, + { + "epoch": 1.90371263781293, + "grad_norm": 0.17735600471496582, + "learning_rate": 7.687753900070105e-05, + "loss": 2.096, + "step": 492460 + }, + { + "epoch": 1.9037512950163133, + "grad_norm": 0.15921702980995178, + "learning_rate": 7.677354427514182e-05, + "loss": 2.1209, + "step": 492470 + }, + { + "epoch": 1.9037899522196966, + "grad_norm": 0.16269877552986145, + "learning_rate": 7.666955517259089e-05, + "loss": 2.1105, + "step": 492480 + }, + { + "epoch": 1.9038286094230799, + "grad_norm": 0.1621716171503067, + "learning_rate": 7.656557169213585e-05, + "loss": 2.1091, + "step": 492490 + }, + { + "epoch": 1.903867266626463, + "grad_norm": 0.1870255023241043, + "learning_rate": 7.646159383286543e-05, + "loss": 2.1107, + "step": 492500 + }, + { + "epoch": 1.9039059238298464, + "grad_norm": 0.16632509231567383, + "learning_rate": 7.635762159386816e-05, + "loss": 2.1009, + "step": 492510 + }, + { + "epoch": 1.9039445810332296, + "grad_norm": 0.17069365084171295, + "learning_rate": 7.625365497423231e-05, + "loss": 2.1023, + "step": 492520 + }, + { + "epoch": 1.9039832382366129, + "grad_norm": 0.15745419263839722, + "learning_rate": 7.614969397304727e-05, + "loss": 2.1124, + "step": 492530 + }, + { + "epoch": 1.9040218954399963, + "grad_norm": 0.16946125030517578, + "learning_rate": 7.604573858940201e-05, + "loss": 2.0966, + "step": 492540 + }, + { + "epoch": 1.9040605526433796, + "grad_norm": 0.17170938849449158, + "learning_rate": 7.59417888223859e-05, + "loss": 2.109, + "step": 492550 + }, + { + "epoch": 1.9040992098467628, + "grad_norm": 0.1586230844259262, + "learning_rate": 7.583784467108901e-05, + "loss": 2.1093, + "step": 492560 + }, + { + "epoch": 1.904137867050146, + "grad_norm": 0.18770702183246613, + "learning_rate": 7.573390613460118e-05, + "loss": 2.1032, + "step": 492570 + }, + { + "epoch": 1.9041765242535296, + "grad_norm": 0.16343411803245544, + "learning_rate": 7.562997321201249e-05, + "loss": 2.0975, + "step": 492580 + }, + { + "epoch": 1.9042151814569128, + "grad_norm": 0.1600416898727417, + "learning_rate": 7.552604590241342e-05, + "loss": 2.108, + "step": 492590 + }, + { + "epoch": 1.904253838660296, + "grad_norm": 0.16096729040145874, + "learning_rate": 7.54221242048947e-05, + "loss": 2.0926, + "step": 492600 + }, + { + "epoch": 1.9042924958636793, + "grad_norm": 0.16454388201236725, + "learning_rate": 7.531820811854705e-05, + "loss": 2.1128, + "step": 492610 + }, + { + "epoch": 1.9043311530670626, + "grad_norm": 0.15723025798797607, + "learning_rate": 7.521429764246212e-05, + "loss": 2.0927, + "step": 492620 + }, + { + "epoch": 1.9043698102704458, + "grad_norm": 0.16144847869873047, + "learning_rate": 7.511039277573107e-05, + "loss": 2.1037, + "step": 492630 + }, + { + "epoch": 1.904408467473829, + "grad_norm": 0.16370125114917755, + "learning_rate": 7.500649351744571e-05, + "loss": 2.0843, + "step": 492640 + }, + { + "epoch": 1.9044471246772123, + "grad_norm": 0.17126250267028809, + "learning_rate": 7.490259986669768e-05, + "loss": 2.0975, + "step": 492650 + }, + { + "epoch": 1.9044857818805956, + "grad_norm": 0.16658659279346466, + "learning_rate": 7.479871182257924e-05, + "loss": 2.0936, + "step": 492660 + }, + { + "epoch": 1.9045244390839788, + "grad_norm": 0.17625750601291656, + "learning_rate": 7.469482938418314e-05, + "loss": 2.1041, + "step": 492670 + }, + { + "epoch": 1.904563096287362, + "grad_norm": 0.17531739175319672, + "learning_rate": 7.459095255060167e-05, + "loss": 2.0967, + "step": 492680 + }, + { + "epoch": 1.9046017534907453, + "grad_norm": 0.15542291104793549, + "learning_rate": 7.448708132092774e-05, + "loss": 2.0852, + "step": 492690 + }, + { + "epoch": 1.9046404106941286, + "grad_norm": 0.15380744636058807, + "learning_rate": 7.438321569425454e-05, + "loss": 2.104, + "step": 492700 + }, + { + "epoch": 1.904679067897512, + "grad_norm": 0.1589040756225586, + "learning_rate": 7.427935566967547e-05, + "loss": 2.0979, + "step": 492710 + }, + { + "epoch": 1.9047177251008953, + "grad_norm": 0.16777478158473969, + "learning_rate": 7.417550124628436e-05, + "loss": 2.1063, + "step": 492720 + }, + { + "epoch": 1.9047563823042786, + "grad_norm": 0.1524752974510193, + "learning_rate": 7.407165242317481e-05, + "loss": 2.0863, + "step": 492730 + }, + { + "epoch": 1.9047950395076618, + "grad_norm": 0.16807228326797485, + "learning_rate": 7.396780919944113e-05, + "loss": 2.1105, + "step": 492740 + }, + { + "epoch": 1.9048336967110453, + "grad_norm": 0.15382082760334015, + "learning_rate": 7.38639715741778e-05, + "loss": 2.117, + "step": 492750 + }, + { + "epoch": 1.9048723539144286, + "grad_norm": 0.1633932739496231, + "learning_rate": 7.37601395464791e-05, + "loss": 2.1043, + "step": 492760 + }, + { + "epoch": 1.9049110111178118, + "grad_norm": 0.16407723724842072, + "learning_rate": 7.365631311543996e-05, + "loss": 2.1054, + "step": 492770 + }, + { + "epoch": 1.904949668321195, + "grad_norm": 0.17223787307739258, + "learning_rate": 7.35524922801556e-05, + "loss": 2.1032, + "step": 492780 + }, + { + "epoch": 1.9049883255245783, + "grad_norm": 0.1645432412624359, + "learning_rate": 7.344867703972136e-05, + "loss": 2.1154, + "step": 492790 + }, + { + "epoch": 1.9050269827279616, + "grad_norm": 0.15831483900547028, + "learning_rate": 7.334486739323265e-05, + "loss": 2.1056, + "step": 492800 + }, + { + "epoch": 1.9050656399313448, + "grad_norm": 0.1638144552707672, + "learning_rate": 7.324106333978552e-05, + "loss": 2.1131, + "step": 492810 + }, + { + "epoch": 1.905104297134728, + "grad_norm": 0.15509441494941711, + "learning_rate": 7.313726487847605e-05, + "loss": 2.1032, + "step": 492820 + }, + { + "epoch": 1.9051429543381113, + "grad_norm": 0.17044693231582642, + "learning_rate": 7.30334720084005e-05, + "loss": 2.1115, + "step": 492830 + }, + { + "epoch": 1.9051816115414946, + "grad_norm": 0.17525023221969604, + "learning_rate": 7.292968472865536e-05, + "loss": 2.1051, + "step": 492840 + }, + { + "epoch": 1.9052202687448778, + "grad_norm": 0.1675274521112442, + "learning_rate": 7.282590303833735e-05, + "loss": 2.0874, + "step": 492850 + }, + { + "epoch": 1.905258925948261, + "grad_norm": 0.16292007267475128, + "learning_rate": 7.272212693654367e-05, + "loss": 2.1031, + "step": 492860 + }, + { + "epoch": 1.9052975831516443, + "grad_norm": 0.16628581285476685, + "learning_rate": 7.26183564223717e-05, + "loss": 2.0982, + "step": 492870 + }, + { + "epoch": 1.9053362403550278, + "grad_norm": 0.1902909129858017, + "learning_rate": 7.251459149491879e-05, + "loss": 2.0997, + "step": 492880 + }, + { + "epoch": 1.905374897558411, + "grad_norm": 0.18070019781589508, + "learning_rate": 7.241083215328281e-05, + "loss": 2.0984, + "step": 492890 + }, + { + "epoch": 1.9054135547617943, + "grad_norm": 0.15961043536663055, + "learning_rate": 7.230707839656159e-05, + "loss": 2.0905, + "step": 492900 + }, + { + "epoch": 1.9054522119651776, + "grad_norm": 0.15894865989685059, + "learning_rate": 7.220333022385362e-05, + "loss": 2.1048, + "step": 492910 + }, + { + "epoch": 1.905490869168561, + "grad_norm": 0.16615809500217438, + "learning_rate": 7.209958763425739e-05, + "loss": 2.0916, + "step": 492920 + }, + { + "epoch": 1.9055295263719443, + "grad_norm": 0.1639198213815689, + "learning_rate": 7.199585062687164e-05, + "loss": 2.1091, + "step": 492930 + }, + { + "epoch": 1.9055681835753275, + "grad_norm": 0.1681961864233017, + "learning_rate": 7.189211920079531e-05, + "loss": 2.0868, + "step": 492940 + }, + { + "epoch": 1.9056068407787108, + "grad_norm": 0.1636681854724884, + "learning_rate": 7.178839335512755e-05, + "loss": 2.1019, + "step": 492950 + }, + { + "epoch": 1.905645497982094, + "grad_norm": 0.16586966812610626, + "learning_rate": 7.168467308896797e-05, + "loss": 2.0925, + "step": 492960 + }, + { + "epoch": 1.9056841551854773, + "grad_norm": 0.17896217107772827, + "learning_rate": 7.158095840141643e-05, + "loss": 2.1119, + "step": 492970 + }, + { + "epoch": 1.9057228123888605, + "grad_norm": 0.1667049378156662, + "learning_rate": 7.147724929157251e-05, + "loss": 2.0868, + "step": 492980 + }, + { + "epoch": 1.9057614695922438, + "grad_norm": 0.15626761317253113, + "learning_rate": 7.137354575853649e-05, + "loss": 2.0883, + "step": 492990 + }, + { + "epoch": 1.905800126795627, + "grad_norm": 0.1608096808195114, + "learning_rate": 7.126984780140888e-05, + "loss": 2.1107, + "step": 493000 + }, + { + "epoch": 1.9058387839990103, + "grad_norm": 0.16706053912639618, + "learning_rate": 7.116615541929061e-05, + "loss": 2.1116, + "step": 493010 + }, + { + "epoch": 1.9058774412023936, + "grad_norm": 0.17353135347366333, + "learning_rate": 7.10624686112824e-05, + "loss": 2.1139, + "step": 493020 + }, + { + "epoch": 1.9059160984057768, + "grad_norm": 0.165201336145401, + "learning_rate": 7.09587873764852e-05, + "loss": 2.096, + "step": 493030 + }, + { + "epoch": 1.90595475560916, + "grad_norm": 0.16129054129123688, + "learning_rate": 7.085511171400083e-05, + "loss": 2.1152, + "step": 493040 + }, + { + "epoch": 1.9059934128125435, + "grad_norm": 0.15986932814121246, + "learning_rate": 7.075144162293068e-05, + "loss": 2.1079, + "step": 493050 + }, + { + "epoch": 1.9060320700159268, + "grad_norm": 0.16731788218021393, + "learning_rate": 7.06477771023768e-05, + "loss": 2.0957, + "step": 493060 + }, + { + "epoch": 1.90607072721931, + "grad_norm": 0.16379481554031372, + "learning_rate": 7.054411815144123e-05, + "loss": 2.0806, + "step": 493070 + }, + { + "epoch": 1.9061093844226933, + "grad_norm": 0.17693264782428741, + "learning_rate": 7.044046476922628e-05, + "loss": 2.0933, + "step": 493080 + }, + { + "epoch": 1.9061480416260768, + "grad_norm": 0.2025172859430313, + "learning_rate": 7.033681695483463e-05, + "loss": 2.1058, + "step": 493090 + }, + { + "epoch": 1.90618669882946, + "grad_norm": 0.16205249726772308, + "learning_rate": 7.0233174707369e-05, + "loss": 2.1102, + "step": 493100 + }, + { + "epoch": 1.9062253560328433, + "grad_norm": 0.16341358423233032, + "learning_rate": 7.012953802593258e-05, + "loss": 2.0891, + "step": 493110 + }, + { + "epoch": 1.9062640132362265, + "grad_norm": 0.16129940748214722, + "learning_rate": 7.002590690962896e-05, + "loss": 2.0967, + "step": 493120 + }, + { + "epoch": 1.9063026704396098, + "grad_norm": 0.16581223905086517, + "learning_rate": 6.992228135756152e-05, + "loss": 2.1091, + "step": 493130 + }, + { + "epoch": 1.906341327642993, + "grad_norm": 0.185667023062706, + "learning_rate": 6.981866136883408e-05, + "loss": 2.1036, + "step": 493140 + }, + { + "epoch": 1.9063799848463763, + "grad_norm": 0.16112111508846283, + "learning_rate": 6.971504694255049e-05, + "loss": 2.0944, + "step": 493150 + }, + { + "epoch": 1.9064186420497595, + "grad_norm": 0.16273248195648193, + "learning_rate": 6.961143807781545e-05, + "loss": 2.0983, + "step": 493160 + }, + { + "epoch": 1.9064572992531428, + "grad_norm": 0.16150085628032684, + "learning_rate": 6.950783477373324e-05, + "loss": 2.0926, + "step": 493170 + }, + { + "epoch": 1.906495956456526, + "grad_norm": 0.17488321661949158, + "learning_rate": 6.940423702940857e-05, + "loss": 2.0894, + "step": 493180 + }, + { + "epoch": 1.9065346136599093, + "grad_norm": 0.16473519802093506, + "learning_rate": 6.930064484394683e-05, + "loss": 2.1142, + "step": 493190 + }, + { + "epoch": 1.9065732708632925, + "grad_norm": 0.17141981422901154, + "learning_rate": 6.919705821645272e-05, + "loss": 2.1066, + "step": 493200 + }, + { + "epoch": 1.9066119280666758, + "grad_norm": 0.16588962078094482, + "learning_rate": 6.909347714603232e-05, + "loss": 2.1022, + "step": 493210 + }, + { + "epoch": 1.9066505852700593, + "grad_norm": 0.179043710231781, + "learning_rate": 6.898990163179097e-05, + "loss": 2.1046, + "step": 493220 + }, + { + "epoch": 1.9066892424734425, + "grad_norm": 0.20257137715816498, + "learning_rate": 6.888633167283498e-05, + "loss": 2.0871, + "step": 493230 + }, + { + "epoch": 1.9067278996768258, + "grad_norm": 0.17999424040317535, + "learning_rate": 6.878276726827015e-05, + "loss": 2.1039, + "step": 493240 + }, + { + "epoch": 1.906766556880209, + "grad_norm": 0.176579087972641, + "learning_rate": 6.867920841720343e-05, + "loss": 2.0887, + "step": 493250 + }, + { + "epoch": 1.9068052140835925, + "grad_norm": 0.17156197130680084, + "learning_rate": 6.857565511874109e-05, + "loss": 2.1202, + "step": 493260 + }, + { + "epoch": 1.9068438712869757, + "grad_norm": 0.16644462943077087, + "learning_rate": 6.847210737199028e-05, + "loss": 2.1053, + "step": 493270 + }, + { + "epoch": 1.906882528490359, + "grad_norm": 0.17738530039787292, + "learning_rate": 6.836856517605816e-05, + "loss": 2.106, + "step": 493280 + }, + { + "epoch": 1.9069211856937422, + "grad_norm": 0.17238637804985046, + "learning_rate": 6.826502853005234e-05, + "loss": 2.0958, + "step": 493290 + }, + { + "epoch": 1.9069598428971255, + "grad_norm": 0.16516748070716858, + "learning_rate": 6.816149743307998e-05, + "loss": 2.0796, + "step": 493300 + }, + { + "epoch": 1.9069985001005088, + "grad_norm": 0.1585695445537567, + "learning_rate": 6.805797188424934e-05, + "loss": 2.1046, + "step": 493310 + }, + { + "epoch": 1.907037157303892, + "grad_norm": 0.15817442536354065, + "learning_rate": 6.795445188266847e-05, + "loss": 2.0786, + "step": 493320 + }, + { + "epoch": 1.9070758145072753, + "grad_norm": 0.1786375343799591, + "learning_rate": 6.785093742744586e-05, + "loss": 2.0931, + "step": 493330 + }, + { + "epoch": 1.9071144717106585, + "grad_norm": 0.18257126212120056, + "learning_rate": 6.774742851768978e-05, + "loss": 2.1082, + "step": 493340 + }, + { + "epoch": 1.9071531289140418, + "grad_norm": 0.1689731925725937, + "learning_rate": 6.764392515250962e-05, + "loss": 2.1119, + "step": 493350 + }, + { + "epoch": 1.907191786117425, + "grad_norm": 0.1615624576807022, + "learning_rate": 6.754042733101406e-05, + "loss": 2.096, + "step": 493360 + }, + { + "epoch": 1.9072304433208083, + "grad_norm": 0.16443485021591187, + "learning_rate": 6.743693505231252e-05, + "loss": 2.1194, + "step": 493370 + }, + { + "epoch": 1.9072691005241915, + "grad_norm": 0.1595473736524582, + "learning_rate": 6.733344831551458e-05, + "loss": 2.106, + "step": 493380 + }, + { + "epoch": 1.907307757727575, + "grad_norm": 0.16571669280529022, + "learning_rate": 6.722996711973006e-05, + "loss": 2.1058, + "step": 493390 + }, + { + "epoch": 1.9073464149309582, + "grad_norm": 0.169804185628891, + "learning_rate": 6.712649146406879e-05, + "loss": 2.1144, + "step": 493400 + }, + { + "epoch": 1.9073850721343415, + "grad_norm": 0.15742474794387817, + "learning_rate": 6.70230213476415e-05, + "loss": 2.0847, + "step": 493410 + }, + { + "epoch": 1.9074237293377247, + "grad_norm": 0.16608522832393646, + "learning_rate": 6.691955676955841e-05, + "loss": 2.1136, + "step": 493420 + }, + { + "epoch": 1.9074623865411082, + "grad_norm": 0.1622288078069687, + "learning_rate": 6.681609772893049e-05, + "loss": 2.0849, + "step": 493430 + }, + { + "epoch": 1.9075010437444915, + "grad_norm": 0.1680872142314911, + "learning_rate": 6.671264422486845e-05, + "loss": 2.0969, + "step": 493440 + }, + { + "epoch": 1.9075397009478747, + "grad_norm": 0.17885079979896545, + "learning_rate": 6.660919625648365e-05, + "loss": 2.097, + "step": 493450 + }, + { + "epoch": 1.907578358151258, + "grad_norm": 0.1733819842338562, + "learning_rate": 6.65057538228877e-05, + "loss": 2.0908, + "step": 493460 + }, + { + "epoch": 1.9076170153546412, + "grad_norm": 0.1674387902021408, + "learning_rate": 6.640231692319199e-05, + "loss": 2.1053, + "step": 493470 + }, + { + "epoch": 1.9076556725580245, + "grad_norm": 0.16638167202472687, + "learning_rate": 6.629888555650876e-05, + "loss": 2.0994, + "step": 493480 + }, + { + "epoch": 1.9076943297614077, + "grad_norm": 0.18338817358016968, + "learning_rate": 6.619545972195007e-05, + "loss": 2.0999, + "step": 493490 + }, + { + "epoch": 1.907732986964791, + "grad_norm": 0.1649668663740158, + "learning_rate": 6.60920394186284e-05, + "loss": 2.1204, + "step": 493500 + }, + { + "epoch": 1.9077716441681742, + "grad_norm": 0.18231862783432007, + "learning_rate": 6.598862464565625e-05, + "loss": 2.1075, + "step": 493510 + }, + { + "epoch": 1.9078103013715575, + "grad_norm": 0.1718713939189911, + "learning_rate": 6.588521540214676e-05, + "loss": 2.0866, + "step": 493520 + }, + { + "epoch": 1.9078489585749407, + "grad_norm": 0.16804492473602295, + "learning_rate": 6.578181168721287e-05, + "loss": 2.119, + "step": 493530 + }, + { + "epoch": 1.907887615778324, + "grad_norm": 0.16353757679462433, + "learning_rate": 6.567841349996817e-05, + "loss": 2.0941, + "step": 493540 + }, + { + "epoch": 1.9079262729817072, + "grad_norm": 0.16315865516662598, + "learning_rate": 6.557502083952605e-05, + "loss": 2.1108, + "step": 493550 + }, + { + "epoch": 1.9079649301850907, + "grad_norm": 0.17387837171554565, + "learning_rate": 6.547163370500053e-05, + "loss": 2.1107, + "step": 493560 + }, + { + "epoch": 1.908003587388474, + "grad_norm": 0.16080337762832642, + "learning_rate": 6.536825209550546e-05, + "loss": 2.1074, + "step": 493570 + }, + { + "epoch": 1.9080422445918572, + "grad_norm": 0.16609761118888855, + "learning_rate": 6.526487601015529e-05, + "loss": 2.1042, + "step": 493580 + }, + { + "epoch": 1.9080809017952405, + "grad_norm": 0.16756406426429749, + "learning_rate": 6.516150544806454e-05, + "loss": 2.0849, + "step": 493590 + }, + { + "epoch": 1.908119558998624, + "grad_norm": 0.16301332414150238, + "learning_rate": 6.505814040834812e-05, + "loss": 2.0829, + "step": 493600 + }, + { + "epoch": 1.9081582162020072, + "grad_norm": 0.16793601214885712, + "learning_rate": 6.495478089012097e-05, + "loss": 2.103, + "step": 493610 + }, + { + "epoch": 1.9081968734053905, + "grad_norm": 0.17147155106067657, + "learning_rate": 6.485142689249823e-05, + "loss": 2.0983, + "step": 493620 + }, + { + "epoch": 1.9082355306087737, + "grad_norm": 0.15886180102825165, + "learning_rate": 6.474807841459574e-05, + "loss": 2.1134, + "step": 493630 + }, + { + "epoch": 1.908274187812157, + "grad_norm": 0.1755588799715042, + "learning_rate": 6.464473545552885e-05, + "loss": 2.091, + "step": 493640 + }, + { + "epoch": 1.9083128450155402, + "grad_norm": 0.2920505106449127, + "learning_rate": 6.454139801441383e-05, + "loss": 2.1106, + "step": 493650 + }, + { + "epoch": 1.9083515022189235, + "grad_norm": 0.17460133135318756, + "learning_rate": 6.443806609036674e-05, + "loss": 2.0824, + "step": 493660 + }, + { + "epoch": 1.9083901594223067, + "grad_norm": 0.16135340929031372, + "learning_rate": 6.43347396825038e-05, + "loss": 2.0928, + "step": 493670 + }, + { + "epoch": 1.90842881662569, + "grad_norm": 0.16903318464756012, + "learning_rate": 6.423141878994221e-05, + "loss": 2.105, + "step": 493680 + }, + { + "epoch": 1.9084674738290732, + "grad_norm": 0.22488346695899963, + "learning_rate": 6.412810341179865e-05, + "loss": 2.1125, + "step": 493690 + }, + { + "epoch": 1.9085061310324565, + "grad_norm": 0.16939231753349304, + "learning_rate": 6.402479354719004e-05, + "loss": 2.0891, + "step": 493700 + }, + { + "epoch": 1.9085447882358397, + "grad_norm": 0.16648931801319122, + "learning_rate": 6.392148919523399e-05, + "loss": 2.0998, + "step": 493710 + }, + { + "epoch": 1.908583445439223, + "grad_norm": 0.1556132584810257, + "learning_rate": 6.381819035504832e-05, + "loss": 2.1057, + "step": 493720 + }, + { + "epoch": 1.9086221026426065, + "grad_norm": 0.15572424232959747, + "learning_rate": 6.37148970257504e-05, + "loss": 2.1056, + "step": 493730 + }, + { + "epoch": 1.9086607598459897, + "grad_norm": 0.18413236737251282, + "learning_rate": 6.36116092064587e-05, + "loss": 2.0767, + "step": 493740 + }, + { + "epoch": 1.908699417049373, + "grad_norm": 0.1694517433643341, + "learning_rate": 6.350832689629149e-05, + "loss": 2.0982, + "step": 493750 + }, + { + "epoch": 1.9087380742527562, + "grad_norm": 0.17999476194381714, + "learning_rate": 6.340505009436726e-05, + "loss": 2.092, + "step": 493760 + }, + { + "epoch": 1.9087767314561397, + "grad_norm": 0.166117861866951, + "learning_rate": 6.330177879980492e-05, + "loss": 2.0978, + "step": 493770 + }, + { + "epoch": 1.908815388659523, + "grad_norm": 0.1705954372882843, + "learning_rate": 6.319851301172319e-05, + "loss": 2.1018, + "step": 493780 + }, + { + "epoch": 1.9088540458629062, + "grad_norm": 0.15947870910167694, + "learning_rate": 6.309525272924166e-05, + "loss": 2.0963, + "step": 493790 + }, + { + "epoch": 1.9088927030662894, + "grad_norm": 0.156826451420784, + "learning_rate": 6.299199795147992e-05, + "loss": 2.0929, + "step": 493800 + }, + { + "epoch": 1.9089313602696727, + "grad_norm": 0.15414969623088837, + "learning_rate": 6.288874867755712e-05, + "loss": 2.0806, + "step": 493810 + }, + { + "epoch": 1.908970017473056, + "grad_norm": 0.15578539669513702, + "learning_rate": 6.278550490659395e-05, + "loss": 2.0833, + "step": 493820 + }, + { + "epoch": 1.9090086746764392, + "grad_norm": 0.1562417447566986, + "learning_rate": 6.268226663771026e-05, + "loss": 2.1098, + "step": 493830 + }, + { + "epoch": 1.9090473318798225, + "grad_norm": 0.16142068803310394, + "learning_rate": 6.25790338700265e-05, + "loss": 2.1012, + "step": 493840 + }, + { + "epoch": 1.9090859890832057, + "grad_norm": 0.1723729819059372, + "learning_rate": 6.247580660266339e-05, + "loss": 2.1185, + "step": 493850 + }, + { + "epoch": 1.909124646286589, + "grad_norm": 0.16679581999778748, + "learning_rate": 6.237258483474184e-05, + "loss": 2.0856, + "step": 493860 + }, + { + "epoch": 1.9091633034899722, + "grad_norm": 0.16381263732910156, + "learning_rate": 6.226936856538279e-05, + "loss": 2.0949, + "step": 493870 + }, + { + "epoch": 1.9092019606933555, + "grad_norm": 0.15974077582359314, + "learning_rate": 6.216615779370805e-05, + "loss": 2.0889, + "step": 493880 + }, + { + "epoch": 1.9092406178967387, + "grad_norm": 0.1609431654214859, + "learning_rate": 6.206295251883898e-05, + "loss": 2.1091, + "step": 493890 + }, + { + "epoch": 1.9092792751001222, + "grad_norm": 0.18349827826023102, + "learning_rate": 6.19597527398974e-05, + "loss": 2.1102, + "step": 493900 + }, + { + "epoch": 1.9093179323035054, + "grad_norm": 0.15824545919895172, + "learning_rate": 6.185655845600536e-05, + "loss": 2.0911, + "step": 493910 + }, + { + "epoch": 1.9093565895068887, + "grad_norm": 0.15692760050296783, + "learning_rate": 6.175336966628508e-05, + "loss": 2.1116, + "step": 493920 + }, + { + "epoch": 1.909395246710272, + "grad_norm": 0.1580006331205368, + "learning_rate": 6.165018636985953e-05, + "loss": 2.0933, + "step": 493930 + }, + { + "epoch": 1.9094339039136554, + "grad_norm": 0.17121177911758423, + "learning_rate": 6.154700856585117e-05, + "loss": 2.1123, + "step": 493940 + }, + { + "epoch": 1.9094725611170387, + "grad_norm": 0.1740797758102417, + "learning_rate": 6.144383625338312e-05, + "loss": 2.0984, + "step": 493950 + }, + { + "epoch": 1.909511218320422, + "grad_norm": 0.1799916923046112, + "learning_rate": 6.134066943157856e-05, + "loss": 2.1022, + "step": 493960 + }, + { + "epoch": 1.9095498755238052, + "grad_norm": 0.18135613203048706, + "learning_rate": 6.123750809956108e-05, + "loss": 2.0922, + "step": 493970 + }, + { + "epoch": 1.9095885327271884, + "grad_norm": 0.1724577099084854, + "learning_rate": 6.113435225645403e-05, + "loss": 2.0952, + "step": 493980 + }, + { + "epoch": 1.9096271899305717, + "grad_norm": 0.16730991005897522, + "learning_rate": 6.10312019013819e-05, + "loss": 2.1004, + "step": 493990 + }, + { + "epoch": 1.909665847133955, + "grad_norm": 0.17100889980793, + "learning_rate": 6.092805703346849e-05, + "loss": 2.0855, + "step": 494000 + }, + { + "epoch": 1.9097045043373382, + "grad_norm": 0.16315822303295135, + "learning_rate": 6.0824917651838285e-05, + "loss": 2.1015, + "step": 494010 + }, + { + "epoch": 1.9097431615407214, + "grad_norm": 0.15623895823955536, + "learning_rate": 6.072178375561599e-05, + "loss": 2.1154, + "step": 494020 + }, + { + "epoch": 1.9097818187441047, + "grad_norm": 0.15995801985263824, + "learning_rate": 6.061865534392652e-05, + "loss": 2.0926, + "step": 494030 + }, + { + "epoch": 1.909820475947488, + "grad_norm": 0.1566266119480133, + "learning_rate": 6.0515532415894805e-05, + "loss": 2.1107, + "step": 494040 + }, + { + "epoch": 1.9098591331508712, + "grad_norm": 0.2434413731098175, + "learning_rate": 6.0412414970646424e-05, + "loss": 2.1014, + "step": 494050 + }, + { + "epoch": 1.9098977903542547, + "grad_norm": 0.1829388439655304, + "learning_rate": 6.030930300730675e-05, + "loss": 2.1143, + "step": 494060 + }, + { + "epoch": 1.909936447557638, + "grad_norm": 0.16349373757839203, + "learning_rate": 6.020619652500181e-05, + "loss": 2.1026, + "step": 494070 + }, + { + "epoch": 1.9099751047610212, + "grad_norm": 0.1656537652015686, + "learning_rate": 6.010309552285742e-05, + "loss": 2.0957, + "step": 494080 + }, + { + "epoch": 1.9100137619644044, + "grad_norm": 0.16540515422821045, + "learning_rate": 6.0000000000000056e-05, + "loss": 2.1104, + "step": 494090 + }, + { + "epoch": 1.9100524191677877, + "grad_norm": 0.17239612340927124, + "learning_rate": 5.9896909955555964e-05, + "loss": 2.1086, + "step": 494100 + }, + { + "epoch": 1.9100910763711711, + "grad_norm": 0.1950691193342209, + "learning_rate": 5.979382538865208e-05, + "loss": 2.1026, + "step": 494110 + }, + { + "epoch": 1.9101297335745544, + "grad_norm": 0.17566072940826416, + "learning_rate": 5.969074629841531e-05, + "loss": 2.0855, + "step": 494120 + }, + { + "epoch": 1.9101683907779377, + "grad_norm": 0.15897676348686218, + "learning_rate": 5.958767268397303e-05, + "loss": 2.0936, + "step": 494130 + }, + { + "epoch": 1.910207047981321, + "grad_norm": 0.16137805581092834, + "learning_rate": 5.948460454445237e-05, + "loss": 2.1013, + "step": 494140 + }, + { + "epoch": 1.9102457051847042, + "grad_norm": 0.19188115000724792, + "learning_rate": 5.938154187898137e-05, + "loss": 2.0886, + "step": 494150 + }, + { + "epoch": 1.9102843623880874, + "grad_norm": 0.1605629324913025, + "learning_rate": 5.9278484686687396e-05, + "loss": 2.0894, + "step": 494160 + }, + { + "epoch": 1.9103230195914707, + "grad_norm": 0.17366410791873932, + "learning_rate": 5.9175432966699136e-05, + "loss": 2.0834, + "step": 494170 + }, + { + "epoch": 1.910361676794854, + "grad_norm": 0.16272400319576263, + "learning_rate": 5.9072386718144635e-05, + "loss": 2.0966, + "step": 494180 + }, + { + "epoch": 1.9104003339982372, + "grad_norm": 0.17019392549991608, + "learning_rate": 5.8969345940152356e-05, + "loss": 2.1097, + "step": 494190 + }, + { + "epoch": 1.9104389912016204, + "grad_norm": 0.16691423952579498, + "learning_rate": 5.886631063185144e-05, + "loss": 2.0996, + "step": 494200 + }, + { + "epoch": 1.9104776484050037, + "grad_norm": 0.17281699180603027, + "learning_rate": 5.876328079237081e-05, + "loss": 2.0976, + "step": 494210 + }, + { + "epoch": 1.910516305608387, + "grad_norm": 0.17887260019779205, + "learning_rate": 5.866025642083961e-05, + "loss": 2.1068, + "step": 494220 + }, + { + "epoch": 1.9105549628117704, + "grad_norm": 0.16713376343250275, + "learning_rate": 5.855723751638764e-05, + "loss": 2.0981, + "step": 494230 + }, + { + "epoch": 1.9105936200151536, + "grad_norm": 0.16143283247947693, + "learning_rate": 5.8454224078144494e-05, + "loss": 2.1031, + "step": 494240 + }, + { + "epoch": 1.910632277218537, + "grad_norm": 0.16023682057857513, + "learning_rate": 5.835121610524019e-05, + "loss": 2.1022, + "step": 494250 + }, + { + "epoch": 1.9106709344219202, + "grad_norm": 0.17260730266571045, + "learning_rate": 5.824821359680477e-05, + "loss": 2.0983, + "step": 494260 + }, + { + "epoch": 1.9107095916253034, + "grad_norm": 0.1591053307056427, + "learning_rate": 5.8145216551968916e-05, + "loss": 2.1008, + "step": 494270 + }, + { + "epoch": 1.9107482488286869, + "grad_norm": 0.16962049901485443, + "learning_rate": 5.804222496986311e-05, + "loss": 2.0997, + "step": 494280 + }, + { + "epoch": 1.9107869060320701, + "grad_norm": 0.16175496578216553, + "learning_rate": 5.793923884961827e-05, + "loss": 2.0857, + "step": 494290 + }, + { + "epoch": 1.9108255632354534, + "grad_norm": 0.1648041307926178, + "learning_rate": 5.7836258190365755e-05, + "loss": 2.0961, + "step": 494300 + }, + { + "epoch": 1.9108642204388366, + "grad_norm": 0.1681378185749054, + "learning_rate": 5.773328299123648e-05, + "loss": 2.1007, + "step": 494310 + }, + { + "epoch": 1.9109028776422199, + "grad_norm": 0.1582988053560257, + "learning_rate": 5.7630313251362474e-05, + "loss": 2.0963, + "step": 494320 + }, + { + "epoch": 1.9109415348456031, + "grad_norm": 0.1673489660024643, + "learning_rate": 5.752734896987533e-05, + "loss": 2.1051, + "step": 494330 + }, + { + "epoch": 1.9109801920489864, + "grad_norm": 0.17708514630794525, + "learning_rate": 5.742439014590728e-05, + "loss": 2.0814, + "step": 494340 + }, + { + "epoch": 1.9110188492523696, + "grad_norm": 0.16237381100654602, + "learning_rate": 5.7321436778590586e-05, + "loss": 2.1114, + "step": 494350 + }, + { + "epoch": 1.911057506455753, + "grad_norm": 0.16191226243972778, + "learning_rate": 5.721848886705749e-05, + "loss": 2.0904, + "step": 494360 + }, + { + "epoch": 1.9110961636591361, + "grad_norm": 0.17363980412483215, + "learning_rate": 5.711554641044092e-05, + "loss": 2.0971, + "step": 494370 + }, + { + "epoch": 1.9111348208625194, + "grad_norm": 0.16366997361183167, + "learning_rate": 5.701260940787378e-05, + "loss": 2.1044, + "step": 494380 + }, + { + "epoch": 1.9111734780659027, + "grad_norm": 0.16033147275447845, + "learning_rate": 5.6909677858489216e-05, + "loss": 2.109, + "step": 494390 + }, + { + "epoch": 1.9112121352692861, + "grad_norm": 0.17024444043636322, + "learning_rate": 5.680675176142103e-05, + "loss": 2.1059, + "step": 494400 + }, + { + "epoch": 1.9112507924726694, + "grad_norm": 0.17983657121658325, + "learning_rate": 5.670383111580235e-05, + "loss": 2.1084, + "step": 494410 + }, + { + "epoch": 1.9112894496760526, + "grad_norm": 0.15747319161891937, + "learning_rate": 5.660091592076744e-05, + "loss": 2.1059, + "step": 494420 + }, + { + "epoch": 1.9113281068794359, + "grad_norm": 0.1654488891363144, + "learning_rate": 5.649800617545031e-05, + "loss": 2.0944, + "step": 494430 + }, + { + "epoch": 1.9113667640828194, + "grad_norm": 0.16729286313056946, + "learning_rate": 5.639510187898522e-05, + "loss": 2.0839, + "step": 494440 + }, + { + "epoch": 1.9114054212862026, + "grad_norm": 0.16759923100471497, + "learning_rate": 5.629220303050686e-05, + "loss": 2.0828, + "step": 494450 + }, + { + "epoch": 1.9114440784895859, + "grad_norm": 0.1605670154094696, + "learning_rate": 5.6189309629150146e-05, + "loss": 2.0979, + "step": 494460 + }, + { + "epoch": 1.9114827356929691, + "grad_norm": 0.16231954097747803, + "learning_rate": 5.608642167404976e-05, + "loss": 2.0906, + "step": 494470 + }, + { + "epoch": 1.9115213928963524, + "grad_norm": 0.15948623418807983, + "learning_rate": 5.598353916434129e-05, + "loss": 2.0949, + "step": 494480 + }, + { + "epoch": 1.9115600500997356, + "grad_norm": 0.17366357147693634, + "learning_rate": 5.58806620991601e-05, + "loss": 2.1019, + "step": 494490 + }, + { + "epoch": 1.9115987073031189, + "grad_norm": 0.16815611720085144, + "learning_rate": 5.577779047764198e-05, + "loss": 2.1088, + "step": 494500 + }, + { + "epoch": 1.9116373645065021, + "grad_norm": 0.16711877286434174, + "learning_rate": 5.5674924298922735e-05, + "loss": 2.0901, + "step": 494510 + }, + { + "epoch": 1.9116760217098854, + "grad_norm": 0.17813915014266968, + "learning_rate": 5.557206356213862e-05, + "loss": 2.0913, + "step": 494520 + }, + { + "epoch": 1.9117146789132686, + "grad_norm": 0.17400996387004852, + "learning_rate": 5.54692082664261e-05, + "loss": 2.0951, + "step": 494530 + }, + { + "epoch": 1.9117533361166519, + "grad_norm": 0.17161357402801514, + "learning_rate": 5.536635841092163e-05, + "loss": 2.0766, + "step": 494540 + }, + { + "epoch": 1.9117919933200351, + "grad_norm": 0.16898822784423828, + "learning_rate": 5.526351399476237e-05, + "loss": 2.0943, + "step": 494550 + }, + { + "epoch": 1.9118306505234184, + "grad_norm": 0.16697081923484802, + "learning_rate": 5.516067501708522e-05, + "loss": 2.0733, + "step": 494560 + }, + { + "epoch": 1.9118693077268019, + "grad_norm": 0.17042747139930725, + "learning_rate": 5.5057841477027304e-05, + "loss": 2.0913, + "step": 494570 + }, + { + "epoch": 1.911907964930185, + "grad_norm": 0.17008812725543976, + "learning_rate": 5.495501337372666e-05, + "loss": 2.0995, + "step": 494580 + }, + { + "epoch": 1.9119466221335684, + "grad_norm": 0.1692279428243637, + "learning_rate": 5.485219070632064e-05, + "loss": 2.0928, + "step": 494590 + }, + { + "epoch": 1.9119852793369516, + "grad_norm": 0.16472196578979492, + "learning_rate": 5.474937347394748e-05, + "loss": 2.092, + "step": 494600 + }, + { + "epoch": 1.912023936540335, + "grad_norm": 0.16748890280723572, + "learning_rate": 5.464656167574522e-05, + "loss": 2.1186, + "step": 494610 + }, + { + "epoch": 1.9120625937437183, + "grad_norm": 0.15886230766773224, + "learning_rate": 5.454375531085254e-05, + "loss": 2.101, + "step": 494620 + }, + { + "epoch": 1.9121012509471016, + "grad_norm": 0.15965144336223602, + "learning_rate": 5.44409543784079e-05, + "loss": 2.096, + "step": 494630 + }, + { + "epoch": 1.9121399081504848, + "grad_norm": 0.1668359488248825, + "learning_rate": 5.4338158877550446e-05, + "loss": 2.0932, + "step": 494640 + }, + { + "epoch": 1.912178565353868, + "grad_norm": 0.16767774522304535, + "learning_rate": 5.4235368807419085e-05, + "loss": 2.0969, + "step": 494650 + }, + { + "epoch": 1.9122172225572514, + "grad_norm": 0.16439062356948853, + "learning_rate": 5.413258416715339e-05, + "loss": 2.0968, + "step": 494660 + }, + { + "epoch": 1.9122558797606346, + "grad_norm": 0.16456596553325653, + "learning_rate": 5.402980495589294e-05, + "loss": 2.1074, + "step": 494670 + }, + { + "epoch": 1.9122945369640179, + "grad_norm": 0.168012335896492, + "learning_rate": 5.392703117277753e-05, + "loss": 2.0912, + "step": 494680 + }, + { + "epoch": 1.912333194167401, + "grad_norm": 0.15883708000183105, + "learning_rate": 5.382426281694697e-05, + "loss": 2.0853, + "step": 494690 + }, + { + "epoch": 1.9123718513707844, + "grad_norm": 0.1676301658153534, + "learning_rate": 5.372149988754193e-05, + "loss": 2.0935, + "step": 494700 + }, + { + "epoch": 1.9124105085741676, + "grad_norm": 0.15922707319259644, + "learning_rate": 5.361874238370246e-05, + "loss": 2.0872, + "step": 494710 + }, + { + "epoch": 1.9124491657775509, + "grad_norm": 0.1661909520626068, + "learning_rate": 5.351599030456966e-05, + "loss": 2.1026, + "step": 494720 + }, + { + "epoch": 1.9124878229809341, + "grad_norm": 0.18675149977207184, + "learning_rate": 5.341324364928446e-05, + "loss": 2.0991, + "step": 494730 + }, + { + "epoch": 1.9125264801843176, + "grad_norm": 0.165910005569458, + "learning_rate": 5.331050241698798e-05, + "loss": 2.0866, + "step": 494740 + }, + { + "epoch": 1.9125651373877008, + "grad_norm": 0.16928835213184357, + "learning_rate": 5.320776660682158e-05, + "loss": 2.0892, + "step": 494750 + }, + { + "epoch": 1.912603794591084, + "grad_norm": 0.15087957680225372, + "learning_rate": 5.310503621792684e-05, + "loss": 2.0986, + "step": 494760 + }, + { + "epoch": 1.9126424517944673, + "grad_norm": 0.15772745013237, + "learning_rate": 5.3002311249445764e-05, + "loss": 2.1019, + "step": 494770 + }, + { + "epoch": 1.9126811089978508, + "grad_norm": 0.16974887251853943, + "learning_rate": 5.289959170052039e-05, + "loss": 2.112, + "step": 494780 + }, + { + "epoch": 1.912719766201234, + "grad_norm": 0.1652223765850067, + "learning_rate": 5.279687757029317e-05, + "loss": 2.1034, + "step": 494790 + }, + { + "epoch": 1.9127584234046173, + "grad_norm": 0.17936939001083374, + "learning_rate": 5.269416885790634e-05, + "loss": 2.0964, + "step": 494800 + }, + { + "epoch": 1.9127970806080006, + "grad_norm": 0.16628216207027435, + "learning_rate": 5.259146556250283e-05, + "loss": 2.0861, + "step": 494810 + }, + { + "epoch": 1.9128357378113838, + "grad_norm": 0.16931118071079254, + "learning_rate": 5.248876768322597e-05, + "loss": 2.0966, + "step": 494820 + }, + { + "epoch": 1.912874395014767, + "grad_norm": 0.1719769537448883, + "learning_rate": 5.238607521921846e-05, + "loss": 2.1026, + "step": 494830 + }, + { + "epoch": 1.9129130522181503, + "grad_norm": 0.1746273785829544, + "learning_rate": 5.228338816962386e-05, + "loss": 2.1045, + "step": 494840 + }, + { + "epoch": 1.9129517094215336, + "grad_norm": 0.1762704700231552, + "learning_rate": 5.2180706533585976e-05, + "loss": 2.0949, + "step": 494850 + }, + { + "epoch": 1.9129903666249168, + "grad_norm": 0.1763325184583664, + "learning_rate": 5.207803031024882e-05, + "loss": 2.1006, + "step": 494860 + }, + { + "epoch": 1.9130290238283, + "grad_norm": 0.1622261106967926, + "learning_rate": 5.19753594987562e-05, + "loss": 2.102, + "step": 494870 + }, + { + "epoch": 1.9130676810316833, + "grad_norm": 0.16686417162418365, + "learning_rate": 5.187269409825279e-05, + "loss": 2.0961, + "step": 494880 + }, + { + "epoch": 1.9131063382350666, + "grad_norm": 0.1586511880159378, + "learning_rate": 5.177003410788283e-05, + "loss": 2.0864, + "step": 494890 + }, + { + "epoch": 1.9131449954384498, + "grad_norm": 0.16049085557460785, + "learning_rate": 5.1667379526791456e-05, + "loss": 2.0958, + "step": 494900 + }, + { + "epoch": 1.9131836526418333, + "grad_norm": 0.1666175276041031, + "learning_rate": 5.156473035412335e-05, + "loss": 2.1041, + "step": 494910 + }, + { + "epoch": 1.9132223098452166, + "grad_norm": 0.17487689852714539, + "learning_rate": 5.1462086589024074e-05, + "loss": 2.1052, + "step": 494920 + }, + { + "epoch": 1.9132609670485998, + "grad_norm": 0.16758491098880768, + "learning_rate": 5.135944823063898e-05, + "loss": 2.0988, + "step": 494930 + }, + { + "epoch": 1.913299624251983, + "grad_norm": 0.17294859886169434, + "learning_rate": 5.1256815278113654e-05, + "loss": 2.0941, + "step": 494940 + }, + { + "epoch": 1.9133382814553666, + "grad_norm": 0.17001833021640778, + "learning_rate": 5.11541877305941e-05, + "loss": 2.1009, + "step": 494950 + }, + { + "epoch": 1.9133769386587498, + "grad_norm": 0.16715086996555328, + "learning_rate": 5.105156558722679e-05, + "loss": 2.0983, + "step": 494960 + }, + { + "epoch": 1.913415595862133, + "grad_norm": 0.16920140385627747, + "learning_rate": 5.09489488471575e-05, + "loss": 2.0909, + "step": 494970 + }, + { + "epoch": 1.9134542530655163, + "grad_norm": 0.17399254441261292, + "learning_rate": 5.0846337509533384e-05, + "loss": 2.0813, + "step": 494980 + }, + { + "epoch": 1.9134929102688996, + "grad_norm": 0.15936678647994995, + "learning_rate": 5.0743731573500874e-05, + "loss": 2.0899, + "step": 494990 + }, + { + "epoch": 1.9135315674722828, + "grad_norm": 0.1558026522397995, + "learning_rate": 5.0641131038207336e-05, + "loss": 2.0827, + "step": 495000 + }, + { + "epoch": 1.913570224675666, + "grad_norm": 0.15770964324474335, + "learning_rate": 5.0538535902799665e-05, + "loss": 2.1019, + "step": 495010 + }, + { + "epoch": 1.9136088818790493, + "grad_norm": 0.16118407249450684, + "learning_rate": 5.043594616642566e-05, + "loss": 2.1088, + "step": 495020 + }, + { + "epoch": 1.9136475390824326, + "grad_norm": 0.15688639879226685, + "learning_rate": 5.033336182823289e-05, + "loss": 2.0929, + "step": 495030 + }, + { + "epoch": 1.9136861962858158, + "grad_norm": 0.1677384227514267, + "learning_rate": 5.0230782887369596e-05, + "loss": 2.0859, + "step": 495040 + }, + { + "epoch": 1.913724853489199, + "grad_norm": 0.16379329562187195, + "learning_rate": 5.0128209342983566e-05, + "loss": 2.1021, + "step": 495050 + }, + { + "epoch": 1.9137635106925823, + "grad_norm": 0.1766320914030075, + "learning_rate": 5.002564119422326e-05, + "loss": 2.1001, + "step": 495060 + }, + { + "epoch": 1.9138021678959656, + "grad_norm": 0.16742245852947235, + "learning_rate": 4.992307844023758e-05, + "loss": 2.0897, + "step": 495070 + }, + { + "epoch": 1.913840825099349, + "grad_norm": 0.1736634373664856, + "learning_rate": 4.982052108017499e-05, + "loss": 2.1047, + "step": 495080 + }, + { + "epoch": 1.9138794823027323, + "grad_norm": 0.16338351368904114, + "learning_rate": 4.9717969113185044e-05, + "loss": 2.0953, + "step": 495090 + }, + { + "epoch": 1.9139181395061156, + "grad_norm": 0.16212569177150726, + "learning_rate": 4.9615422538416445e-05, + "loss": 2.0836, + "step": 495100 + }, + { + "epoch": 1.9139567967094988, + "grad_norm": 0.15982629358768463, + "learning_rate": 4.951288135501919e-05, + "loss": 2.0809, + "step": 495110 + }, + { + "epoch": 1.9139954539128823, + "grad_norm": 0.16392414271831512, + "learning_rate": 4.941034556214263e-05, + "loss": 2.0885, + "step": 495120 + }, + { + "epoch": 1.9140341111162655, + "grad_norm": 0.16265085339546204, + "learning_rate": 4.930781515893701e-05, + "loss": 2.0921, + "step": 495130 + }, + { + "epoch": 1.9140727683196488, + "grad_norm": 0.1701953262090683, + "learning_rate": 4.9205290144552326e-05, + "loss": 2.0745, + "step": 495140 + }, + { + "epoch": 1.914111425523032, + "grad_norm": 0.17344048619270325, + "learning_rate": 4.9102770518139274e-05, + "loss": 2.0906, + "step": 495150 + }, + { + "epoch": 1.9141500827264153, + "grad_norm": 0.16372932493686676, + "learning_rate": 4.90002562788483e-05, + "loss": 2.1067, + "step": 495160 + }, + { + "epoch": 1.9141887399297985, + "grad_norm": 0.16525626182556152, + "learning_rate": 4.8897747425830086e-05, + "loss": 2.0958, + "step": 495170 + }, + { + "epoch": 1.9142273971331818, + "grad_norm": 0.16770386695861816, + "learning_rate": 4.879524395823598e-05, + "loss": 2.1081, + "step": 495180 + }, + { + "epoch": 1.914266054336565, + "grad_norm": 0.17800508439540863, + "learning_rate": 4.8692745875217107e-05, + "loss": 2.0979, + "step": 495190 + }, + { + "epoch": 1.9143047115399483, + "grad_norm": 0.1625628024339676, + "learning_rate": 4.8590253175925246e-05, + "loss": 2.0977, + "step": 495200 + }, + { + "epoch": 1.9143433687433316, + "grad_norm": 0.17888452112674713, + "learning_rate": 4.848776585951176e-05, + "loss": 2.1001, + "step": 495210 + }, + { + "epoch": 1.9143820259467148, + "grad_norm": 0.15721629559993744, + "learning_rate": 4.838528392512886e-05, + "loss": 2.095, + "step": 495220 + }, + { + "epoch": 1.914420683150098, + "grad_norm": 0.1689646989107132, + "learning_rate": 4.8282807371928586e-05, + "loss": 2.0966, + "step": 495230 + }, + { + "epoch": 1.9144593403534813, + "grad_norm": 0.16454440355300903, + "learning_rate": 4.8180336199063593e-05, + "loss": 2.0996, + "step": 495240 + }, + { + "epoch": 1.9144979975568648, + "grad_norm": 0.15929938852787018, + "learning_rate": 4.8077870405686566e-05, + "loss": 2.0977, + "step": 495250 + }, + { + "epoch": 1.914536654760248, + "grad_norm": 0.16392184793949127, + "learning_rate": 4.7975409990949957e-05, + "loss": 2.097, + "step": 495260 + }, + { + "epoch": 1.9145753119636313, + "grad_norm": 0.1720370650291443, + "learning_rate": 4.787295495400712e-05, + "loss": 2.1052, + "step": 495270 + }, + { + "epoch": 1.9146139691670145, + "grad_norm": 0.17082928121089935, + "learning_rate": 4.777050529401139e-05, + "loss": 2.0957, + "step": 495280 + }, + { + "epoch": 1.914652626370398, + "grad_norm": 0.16387730836868286, + "learning_rate": 4.766806101011611e-05, + "loss": 2.0829, + "step": 495290 + }, + { + "epoch": 1.9146912835737813, + "grad_norm": 0.16461658477783203, + "learning_rate": 4.756562210147508e-05, + "loss": 2.089, + "step": 495300 + }, + { + "epoch": 1.9147299407771645, + "grad_norm": 0.1617818921804428, + "learning_rate": 4.746318856724252e-05, + "loss": 2.1012, + "step": 495310 + }, + { + "epoch": 1.9147685979805478, + "grad_norm": 0.16729439795017242, + "learning_rate": 4.736076040657222e-05, + "loss": 2.102, + "step": 495320 + }, + { + "epoch": 1.914807255183931, + "grad_norm": 0.1714024394750595, + "learning_rate": 4.725833761861886e-05, + "loss": 2.1033, + "step": 495330 + }, + { + "epoch": 1.9148459123873143, + "grad_norm": 0.16891701519489288, + "learning_rate": 4.7155920202536896e-05, + "loss": 2.0894, + "step": 495340 + }, + { + "epoch": 1.9148845695906975, + "grad_norm": 0.16476847231388092, + "learning_rate": 4.705350815748144e-05, + "loss": 2.1021, + "step": 495350 + }, + { + "epoch": 1.9149232267940808, + "grad_norm": 0.16403105854988098, + "learning_rate": 4.6951101482607614e-05, + "loss": 2.0897, + "step": 495360 + }, + { + "epoch": 1.914961883997464, + "grad_norm": 0.1697767972946167, + "learning_rate": 4.6848700177070324e-05, + "loss": 2.0851, + "step": 495370 + }, + { + "epoch": 1.9150005412008473, + "grad_norm": 0.16083191335201263, + "learning_rate": 4.6746304240025354e-05, + "loss": 2.0728, + "step": 495380 + }, + { + "epoch": 1.9150391984042305, + "grad_norm": 0.17566703259944916, + "learning_rate": 4.664391367062826e-05, + "loss": 2.1028, + "step": 495390 + }, + { + "epoch": 1.9150778556076138, + "grad_norm": 0.17670664191246033, + "learning_rate": 4.6541528468035277e-05, + "loss": 2.1006, + "step": 495400 + }, + { + "epoch": 1.915116512810997, + "grad_norm": 0.15895481407642365, + "learning_rate": 4.643914863140242e-05, + "loss": 2.0972, + "step": 495410 + }, + { + "epoch": 1.9151551700143805, + "grad_norm": 0.16283272206783295, + "learning_rate": 4.633677415988613e-05, + "loss": 2.0997, + "step": 495420 + }, + { + "epoch": 1.9151938272177638, + "grad_norm": 0.16882751882076263, + "learning_rate": 4.623440505264287e-05, + "loss": 2.1107, + "step": 495430 + }, + { + "epoch": 1.915232484421147, + "grad_norm": 0.16597266495227814, + "learning_rate": 4.613204130882998e-05, + "loss": 2.0941, + "step": 495440 + }, + { + "epoch": 1.9152711416245303, + "grad_norm": 0.15502455830574036, + "learning_rate": 4.60296829276039e-05, + "loss": 2.086, + "step": 495450 + }, + { + "epoch": 1.9153097988279137, + "grad_norm": 0.16382566094398499, + "learning_rate": 4.592732990812243e-05, + "loss": 2.0944, + "step": 495460 + }, + { + "epoch": 1.915348456031297, + "grad_norm": 0.16698609292507172, + "learning_rate": 4.582498224954268e-05, + "loss": 2.1043, + "step": 495470 + }, + { + "epoch": 1.9153871132346802, + "grad_norm": 0.1581621915102005, + "learning_rate": 4.572263995102266e-05, + "loss": 2.0727, + "step": 495480 + }, + { + "epoch": 1.9154257704380635, + "grad_norm": 0.1631089299917221, + "learning_rate": 4.562030301172038e-05, + "loss": 2.101, + "step": 495490 + }, + { + "epoch": 1.9154644276414468, + "grad_norm": 0.1734185665845871, + "learning_rate": 4.551797143079361e-05, + "loss": 2.1079, + "step": 495500 + }, + { + "epoch": 1.91550308484483, + "grad_norm": 0.17021867632865906, + "learning_rate": 4.541564520740127e-05, + "loss": 2.0877, + "step": 495510 + }, + { + "epoch": 1.9155417420482133, + "grad_norm": 0.17814646661281586, + "learning_rate": 4.531332434070135e-05, + "loss": 2.0873, + "step": 495520 + }, + { + "epoch": 1.9155803992515965, + "grad_norm": 0.1666654497385025, + "learning_rate": 4.521100882985318e-05, + "loss": 2.1061, + "step": 495530 + }, + { + "epoch": 1.9156190564549798, + "grad_norm": 0.1610458493232727, + "learning_rate": 4.510869867401568e-05, + "loss": 2.1013, + "step": 495540 + }, + { + "epoch": 1.915657713658363, + "grad_norm": 0.2371460646390915, + "learning_rate": 4.500639387234817e-05, + "loss": 2.0977, + "step": 495550 + }, + { + "epoch": 1.9156963708617463, + "grad_norm": 0.19147741794586182, + "learning_rate": 4.490409442401e-05, + "loss": 2.0862, + "step": 495560 + }, + { + "epoch": 1.9157350280651295, + "grad_norm": 0.1672198474407196, + "learning_rate": 4.4801800328161166e-05, + "loss": 2.0855, + "step": 495570 + }, + { + "epoch": 1.9157736852685128, + "grad_norm": 0.16370943188667297, + "learning_rate": 4.4699511583961236e-05, + "loss": 2.0843, + "step": 495580 + }, + { + "epoch": 1.9158123424718962, + "grad_norm": 0.17125527560710907, + "learning_rate": 4.4597228190570437e-05, + "loss": 2.09, + "step": 495590 + }, + { + "epoch": 1.9158509996752795, + "grad_norm": 0.1694050133228302, + "learning_rate": 4.449495014714944e-05, + "loss": 2.0969, + "step": 495600 + }, + { + "epoch": 1.9158896568786628, + "grad_norm": 0.169652059674263, + "learning_rate": 4.4392677452858464e-05, + "loss": 2.1017, + "step": 495610 + }, + { + "epoch": 1.915928314082046, + "grad_norm": 0.16704246401786804, + "learning_rate": 4.4290410106858417e-05, + "loss": 2.1011, + "step": 495620 + }, + { + "epoch": 1.9159669712854295, + "grad_norm": 0.16456107795238495, + "learning_rate": 4.41881481083104e-05, + "loss": 2.095, + "step": 495630 + }, + { + "epoch": 1.9160056284888127, + "grad_norm": 0.17001429200172424, + "learning_rate": 4.408589145637576e-05, + "loss": 2.084, + "step": 495640 + }, + { + "epoch": 1.916044285692196, + "grad_norm": 0.18315348029136658, + "learning_rate": 4.398364015021583e-05, + "loss": 2.0921, + "step": 495650 + }, + { + "epoch": 1.9160829428955792, + "grad_norm": 0.17084050178527832, + "learning_rate": 4.38813941889924e-05, + "loss": 2.0921, + "step": 495660 + }, + { + "epoch": 1.9161216000989625, + "grad_norm": 0.16386601328849792, + "learning_rate": 4.3779153571867234e-05, + "loss": 2.0905, + "step": 495670 + }, + { + "epoch": 1.9161602573023457, + "grad_norm": 0.16962173581123352, + "learning_rate": 4.367691829800258e-05, + "loss": 2.0901, + "step": 495680 + }, + { + "epoch": 1.916198914505729, + "grad_norm": 0.16165487468242645, + "learning_rate": 4.3574688366560645e-05, + "loss": 2.0929, + "step": 495690 + }, + { + "epoch": 1.9162375717091122, + "grad_norm": 0.1727007031440735, + "learning_rate": 4.347246377670433e-05, + "loss": 2.0901, + "step": 495700 + }, + { + "epoch": 1.9162762289124955, + "grad_norm": 0.1733340471982956, + "learning_rate": 4.337024452759586e-05, + "loss": 2.1096, + "step": 495710 + }, + { + "epoch": 1.9163148861158787, + "grad_norm": 0.16607238352298737, + "learning_rate": 4.326803061839857e-05, + "loss": 2.1001, + "step": 495720 + }, + { + "epoch": 1.916353543319262, + "grad_norm": 0.165201336145401, + "learning_rate": 4.3165822048275795e-05, + "loss": 2.0872, + "step": 495730 + }, + { + "epoch": 1.9163922005226453, + "grad_norm": 0.17623762786388397, + "learning_rate": 4.306361881639087e-05, + "loss": 2.1111, + "step": 495740 + }, + { + "epoch": 1.9164308577260285, + "grad_norm": 0.1635725051164627, + "learning_rate": 4.296142092190736e-05, + "loss": 2.0938, + "step": 495750 + }, + { + "epoch": 1.916469514929412, + "grad_norm": 0.17206135392189026, + "learning_rate": 4.285922836398903e-05, + "loss": 2.0796, + "step": 495760 + }, + { + "epoch": 1.9165081721327952, + "grad_norm": 0.19353315234184265, + "learning_rate": 4.2757041141800345e-05, + "loss": 2.0963, + "step": 495770 + }, + { + "epoch": 1.9165468293361785, + "grad_norm": 0.16452445089817047, + "learning_rate": 4.265485925450552e-05, + "loss": 2.1097, + "step": 495780 + }, + { + "epoch": 1.9165854865395617, + "grad_norm": 0.16897344589233398, + "learning_rate": 4.2552682701268776e-05, + "loss": 2.1021, + "step": 495790 + }, + { + "epoch": 1.9166241437429452, + "grad_norm": 0.15898294746875763, + "learning_rate": 4.2450511481255e-05, + "loss": 2.0935, + "step": 495800 + }, + { + "epoch": 1.9166628009463285, + "grad_norm": 0.15929338335990906, + "learning_rate": 4.2348345593629325e-05, + "loss": 2.0823, + "step": 495810 + }, + { + "epoch": 1.9167014581497117, + "grad_norm": 0.1751093715429306, + "learning_rate": 4.224618503755684e-05, + "loss": 2.0861, + "step": 495820 + }, + { + "epoch": 1.916740115353095, + "grad_norm": 0.16839739680290222, + "learning_rate": 4.2144029812202886e-05, + "loss": 2.0883, + "step": 495830 + }, + { + "epoch": 1.9167787725564782, + "grad_norm": 0.17080606520175934, + "learning_rate": 4.204187991673325e-05, + "loss": 2.1052, + "step": 495840 + }, + { + "epoch": 1.9168174297598615, + "grad_norm": 0.16607625782489777, + "learning_rate": 4.193973535031348e-05, + "loss": 2.0959, + "step": 495850 + }, + { + "epoch": 1.9168560869632447, + "grad_norm": 0.16274487972259521, + "learning_rate": 4.183759611211002e-05, + "loss": 2.0859, + "step": 495860 + }, + { + "epoch": 1.916894744166628, + "grad_norm": 0.16844266653060913, + "learning_rate": 4.173546220128888e-05, + "loss": 2.0837, + "step": 495870 + }, + { + "epoch": 1.9169334013700112, + "grad_norm": 0.1613076776266098, + "learning_rate": 4.163333361701649e-05, + "loss": 2.0909, + "step": 495880 + }, + { + "epoch": 1.9169720585733945, + "grad_norm": 0.16152922809123993, + "learning_rate": 4.153121035845975e-05, + "loss": 2.0916, + "step": 495890 + }, + { + "epoch": 1.9170107157767777, + "grad_norm": 0.1656840741634369, + "learning_rate": 4.142909242478532e-05, + "loss": 2.1162, + "step": 495900 + }, + { + "epoch": 1.917049372980161, + "grad_norm": 0.16846653819084167, + "learning_rate": 4.132697981516076e-05, + "loss": 2.0992, + "step": 495910 + }, + { + "epoch": 1.9170880301835445, + "grad_norm": 0.17735540866851807, + "learning_rate": 4.1224872528752954e-05, + "loss": 2.0961, + "step": 495920 + }, + { + "epoch": 1.9171266873869277, + "grad_norm": 0.16371974349021912, + "learning_rate": 4.112277056472991e-05, + "loss": 2.0876, + "step": 495930 + }, + { + "epoch": 1.917165344590311, + "grad_norm": 0.16141781210899353, + "learning_rate": 4.1020673922258944e-05, + "loss": 2.0841, + "step": 495940 + }, + { + "epoch": 1.9172040017936942, + "grad_norm": 0.16501149535179138, + "learning_rate": 4.091858260050873e-05, + "loss": 2.093, + "step": 495950 + }, + { + "epoch": 1.9172426589970775, + "grad_norm": 0.16135872900485992, + "learning_rate": 4.081649659864684e-05, + "loss": 2.0859, + "step": 495960 + }, + { + "epoch": 1.917281316200461, + "grad_norm": 0.17437012493610382, + "learning_rate": 4.071441591584213e-05, + "loss": 2.0881, + "step": 495970 + }, + { + "epoch": 1.9173199734038442, + "grad_norm": 0.170584574341774, + "learning_rate": 4.061234055126306e-05, + "loss": 2.1035, + "step": 495980 + }, + { + "epoch": 1.9173586306072274, + "grad_norm": 0.1637963205575943, + "learning_rate": 4.051027050407852e-05, + "loss": 2.0935, + "step": 495990 + }, + { + "epoch": 1.9173972878106107, + "grad_norm": 0.16705727577209473, + "learning_rate": 4.040820577345761e-05, + "loss": 2.0915, + "step": 496000 + }, + { + "epoch": 1.917435945013994, + "grad_norm": 0.17151454091072083, + "learning_rate": 4.030614635856966e-05, + "loss": 2.0944, + "step": 496010 + }, + { + "epoch": 1.9174746022173772, + "grad_norm": 0.16552621126174927, + "learning_rate": 4.020409225858423e-05, + "loss": 2.0838, + "step": 496020 + }, + { + "epoch": 1.9175132594207605, + "grad_norm": 0.17188087105751038, + "learning_rate": 4.010204347267088e-05, + "loss": 2.1022, + "step": 496030 + }, + { + "epoch": 1.9175519166241437, + "grad_norm": 0.1699785590171814, + "learning_rate": 4.000000000000004e-05, + "loss": 2.1053, + "step": 496040 + }, + { + "epoch": 1.917590573827527, + "grad_norm": 0.17038536071777344, + "learning_rate": 3.989796183974126e-05, + "loss": 2.0966, + "step": 496050 + }, + { + "epoch": 1.9176292310309102, + "grad_norm": 0.16701827943325043, + "learning_rate": 3.979592899106543e-05, + "loss": 2.0907, + "step": 496060 + }, + { + "epoch": 1.9176678882342935, + "grad_norm": 0.16330760717391968, + "learning_rate": 3.9693901453143e-05, + "loss": 2.1054, + "step": 496070 + }, + { + "epoch": 1.9177065454376767, + "grad_norm": 0.1569678634405136, + "learning_rate": 3.959187922514462e-05, + "loss": 2.0949, + "step": 496080 + }, + { + "epoch": 1.9177452026410602, + "grad_norm": 0.16981206834316254, + "learning_rate": 3.948986230624141e-05, + "loss": 2.0887, + "step": 496090 + }, + { + "epoch": 1.9177838598444434, + "grad_norm": 0.17285099625587463, + "learning_rate": 3.938785069560491e-05, + "loss": 2.083, + "step": 496100 + }, + { + "epoch": 1.9178225170478267, + "grad_norm": 0.16686466336250305, + "learning_rate": 3.928584439240623e-05, + "loss": 2.0834, + "step": 496110 + }, + { + "epoch": 1.91786117425121, + "grad_norm": 0.16524849832057953, + "learning_rate": 3.918384339581693e-05, + "loss": 2.1053, + "step": 496120 + }, + { + "epoch": 1.9178998314545932, + "grad_norm": 0.17060840129852295, + "learning_rate": 3.908184770500945e-05, + "loss": 2.0982, + "step": 496130 + }, + { + "epoch": 1.9179384886579767, + "grad_norm": 0.1700611114501953, + "learning_rate": 3.897985731915532e-05, + "loss": 2.1048, + "step": 496140 + }, + { + "epoch": 1.91797714586136, + "grad_norm": 0.1688399761915207, + "learning_rate": 3.887787223742745e-05, + "loss": 2.0942, + "step": 496150 + }, + { + "epoch": 1.9180158030647432, + "grad_norm": 0.35818973183631897, + "learning_rate": 3.877589245899804e-05, + "loss": 2.1014, + "step": 496160 + }, + { + "epoch": 1.9180544602681264, + "grad_norm": 0.18928390741348267, + "learning_rate": 3.867391798303976e-05, + "loss": 2.1023, + "step": 496170 + }, + { + "epoch": 1.9180931174715097, + "grad_norm": 0.17422586679458618, + "learning_rate": 3.857194880872572e-05, + "loss": 2.0758, + "step": 496180 + }, + { + "epoch": 1.918131774674893, + "grad_norm": 0.159042626619339, + "learning_rate": 3.846998493522924e-05, + "loss": 2.0892, + "step": 496190 + }, + { + "epoch": 1.9181704318782762, + "grad_norm": 0.17417706549167633, + "learning_rate": 3.836802636172343e-05, + "loss": 2.108, + "step": 496200 + }, + { + "epoch": 1.9182090890816594, + "grad_norm": 0.16420163214206696, + "learning_rate": 3.826607308738228e-05, + "loss": 2.0893, + "step": 496210 + }, + { + "epoch": 1.9182477462850427, + "grad_norm": 0.15608297288417816, + "learning_rate": 3.8164125111379125e-05, + "loss": 2.0745, + "step": 496220 + }, + { + "epoch": 1.918286403488426, + "grad_norm": 0.20413453876972198, + "learning_rate": 3.8062182432888616e-05, + "loss": 2.0949, + "step": 496230 + }, + { + "epoch": 1.9183250606918092, + "grad_norm": 0.16319338977336884, + "learning_rate": 3.796024505108453e-05, + "loss": 2.0946, + "step": 496240 + }, + { + "epoch": 1.9183637178951924, + "grad_norm": 0.16638115048408508, + "learning_rate": 3.785831296514153e-05, + "loss": 2.0847, + "step": 496250 + }, + { + "epoch": 1.918402375098576, + "grad_norm": 0.1976384073495865, + "learning_rate": 3.775638617423449e-05, + "loss": 2.0923, + "step": 496260 + }, + { + "epoch": 1.9184410323019592, + "grad_norm": 0.16054300963878632, + "learning_rate": 3.765446467753808e-05, + "loss": 2.0683, + "step": 496270 + }, + { + "epoch": 1.9184796895053424, + "grad_norm": 0.1680409014225006, + "learning_rate": 3.75525484742274e-05, + "loss": 2.1037, + "step": 496280 + }, + { + "epoch": 1.9185183467087257, + "grad_norm": 0.16273841261863708, + "learning_rate": 3.745063756347778e-05, + "loss": 2.0923, + "step": 496290 + }, + { + "epoch": 1.918557003912109, + "grad_norm": 0.17158521711826324, + "learning_rate": 3.7348731944464974e-05, + "loss": 2.09, + "step": 496300 + }, + { + "epoch": 1.9185956611154924, + "grad_norm": 0.18540428578853607, + "learning_rate": 3.724683161636455e-05, + "loss": 2.0847, + "step": 496310 + }, + { + "epoch": 1.9186343183188757, + "grad_norm": 0.1612491011619568, + "learning_rate": 3.714493657835249e-05, + "loss": 2.0827, + "step": 496320 + }, + { + "epoch": 1.918672975522259, + "grad_norm": 0.1789754182100296, + "learning_rate": 3.704304682960502e-05, + "loss": 2.0907, + "step": 496330 + }, + { + "epoch": 1.9187116327256422, + "grad_norm": 0.15669743716716766, + "learning_rate": 3.694116236929878e-05, + "loss": 2.1034, + "step": 496340 + }, + { + "epoch": 1.9187502899290254, + "grad_norm": 0.15760864317417145, + "learning_rate": 3.6839283196610006e-05, + "loss": 2.0776, + "step": 496350 + }, + { + "epoch": 1.9187889471324087, + "grad_norm": 0.166031152009964, + "learning_rate": 3.6737409310715784e-05, + "loss": 2.0842, + "step": 496360 + }, + { + "epoch": 1.918827604335792, + "grad_norm": 0.16171342134475708, + "learning_rate": 3.6635540710793005e-05, + "loss": 2.0912, + "step": 496370 + }, + { + "epoch": 1.9188662615391752, + "grad_norm": 0.16252924501895905, + "learning_rate": 3.6533677396019205e-05, + "loss": 2.0846, + "step": 496380 + }, + { + "epoch": 1.9189049187425584, + "grad_norm": 0.1602102369070053, + "learning_rate": 3.6431819365571496e-05, + "loss": 2.1002, + "step": 496390 + }, + { + "epoch": 1.9189435759459417, + "grad_norm": 0.16001828014850616, + "learning_rate": 3.632996661862764e-05, + "loss": 2.1107, + "step": 496400 + }, + { + "epoch": 1.918982233149325, + "grad_norm": 0.16714394092559814, + "learning_rate": 3.622811915436586e-05, + "loss": 2.0719, + "step": 496410 + }, + { + "epoch": 1.9190208903527082, + "grad_norm": 0.16935458779335022, + "learning_rate": 3.612627697196391e-05, + "loss": 2.0813, + "step": 496420 + }, + { + "epoch": 1.9190595475560916, + "grad_norm": 0.16367019712924957, + "learning_rate": 3.6024440070600016e-05, + "loss": 2.0756, + "step": 496430 + }, + { + "epoch": 1.919098204759475, + "grad_norm": 0.19588987529277802, + "learning_rate": 3.592260844945305e-05, + "loss": 2.0907, + "step": 496440 + }, + { + "epoch": 1.9191368619628582, + "grad_norm": 0.15941786766052246, + "learning_rate": 3.58207821077019e-05, + "loss": 2.0899, + "step": 496450 + }, + { + "epoch": 1.9191755191662414, + "grad_norm": 0.16553384065628052, + "learning_rate": 3.5718961044524986e-05, + "loss": 2.0874, + "step": 496460 + }, + { + "epoch": 1.9192141763696249, + "grad_norm": 0.17352290451526642, + "learning_rate": 3.561714525910209e-05, + "loss": 2.0852, + "step": 496470 + }, + { + "epoch": 1.9192528335730081, + "grad_norm": 0.15746286511421204, + "learning_rate": 3.5515334750612085e-05, + "loss": 2.1012, + "step": 496480 + }, + { + "epoch": 1.9192914907763914, + "grad_norm": 0.1638580858707428, + "learning_rate": 3.5413529518234735e-05, + "loss": 2.0964, + "step": 496490 + }, + { + "epoch": 1.9193301479797746, + "grad_norm": 0.16750258207321167, + "learning_rate": 3.531172956115003e-05, + "loss": 2.0837, + "step": 496500 + }, + { + "epoch": 1.9193688051831579, + "grad_norm": 0.1676705926656723, + "learning_rate": 3.5209934878537744e-05, + "loss": 2.1003, + "step": 496510 + }, + { + "epoch": 1.9194074623865411, + "grad_norm": 0.16694529354572296, + "learning_rate": 3.51081454695783e-05, + "loss": 2.0749, + "step": 496520 + }, + { + "epoch": 1.9194461195899244, + "grad_norm": 0.17172051966190338, + "learning_rate": 3.500636133345192e-05, + "loss": 2.1072, + "step": 496530 + }, + { + "epoch": 1.9194847767933076, + "grad_norm": 0.16720014810562134, + "learning_rate": 3.49045824693397e-05, + "loss": 2.1067, + "step": 496540 + }, + { + "epoch": 1.919523433996691, + "grad_norm": 0.15233376622200012, + "learning_rate": 3.480280887642206e-05, + "loss": 2.0892, + "step": 496550 + }, + { + "epoch": 1.9195620912000741, + "grad_norm": 0.16428092122077942, + "learning_rate": 3.470104055388057e-05, + "loss": 2.0915, + "step": 496560 + }, + { + "epoch": 1.9196007484034574, + "grad_norm": 0.15058358013629913, + "learning_rate": 3.459927750089609e-05, + "loss": 2.0904, + "step": 496570 + }, + { + "epoch": 1.9196394056068407, + "grad_norm": 0.16439568996429443, + "learning_rate": 3.449751971665016e-05, + "loss": 2.1122, + "step": 496580 + }, + { + "epoch": 1.919678062810224, + "grad_norm": 0.16111652553081512, + "learning_rate": 3.439576720032478e-05, + "loss": 2.1009, + "step": 496590 + }, + { + "epoch": 1.9197167200136074, + "grad_norm": 0.16498912870883942, + "learning_rate": 3.4294019951101705e-05, + "loss": 2.0852, + "step": 496600 + }, + { + "epoch": 1.9197553772169906, + "grad_norm": 0.1640535145998001, + "learning_rate": 3.4192277968163156e-05, + "loss": 2.0939, + "step": 496610 + }, + { + "epoch": 1.9197940344203739, + "grad_norm": 0.16125431656837463, + "learning_rate": 3.4090541250691334e-05, + "loss": 2.0965, + "step": 496620 + }, + { + "epoch": 1.9198326916237571, + "grad_norm": 0.17195944488048553, + "learning_rate": 3.398880979786889e-05, + "loss": 2.0957, + "step": 496630 + }, + { + "epoch": 1.9198713488271406, + "grad_norm": 0.16381843388080597, + "learning_rate": 3.388708360887871e-05, + "loss": 2.1053, + "step": 496640 + }, + { + "epoch": 1.9199100060305239, + "grad_norm": 0.16488052904605865, + "learning_rate": 3.378536268290389e-05, + "loss": 2.0911, + "step": 496650 + }, + { + "epoch": 1.9199486632339071, + "grad_norm": 0.15774224698543549, + "learning_rate": 3.36836470191273e-05, + "loss": 2.0915, + "step": 496660 + }, + { + "epoch": 1.9199873204372904, + "grad_norm": 0.15254941582679749, + "learning_rate": 3.358193661673248e-05, + "loss": 2.0879, + "step": 496670 + }, + { + "epoch": 1.9200259776406736, + "grad_norm": 0.1649966835975647, + "learning_rate": 3.34802314749032e-05, + "loss": 2.0844, + "step": 496680 + }, + { + "epoch": 1.9200646348440569, + "grad_norm": 0.1633954644203186, + "learning_rate": 3.3378531592823225e-05, + "loss": 2.094, + "step": 496690 + }, + { + "epoch": 1.9201032920474401, + "grad_norm": 0.1792258471250534, + "learning_rate": 3.327683696967654e-05, + "loss": 2.0967, + "step": 496700 + }, + { + "epoch": 1.9201419492508234, + "grad_norm": 0.17409303784370422, + "learning_rate": 3.317514760464757e-05, + "loss": 2.0952, + "step": 496710 + }, + { + "epoch": 1.9201806064542066, + "grad_norm": 0.16607849299907684, + "learning_rate": 3.307346349692053e-05, + "loss": 2.1021, + "step": 496720 + }, + { + "epoch": 1.9202192636575899, + "grad_norm": 0.16597460210323334, + "learning_rate": 3.297178464568029e-05, + "loss": 2.1005, + "step": 496730 + }, + { + "epoch": 1.9202579208609731, + "grad_norm": 0.1616782397031784, + "learning_rate": 3.287011105011173e-05, + "loss": 2.0854, + "step": 496740 + }, + { + "epoch": 1.9202965780643564, + "grad_norm": 0.16032592952251434, + "learning_rate": 3.276844270939971e-05, + "loss": 2.1086, + "step": 496750 + }, + { + "epoch": 1.9203352352677396, + "grad_norm": 0.16161899268627167, + "learning_rate": 3.2666779622730017e-05, + "loss": 2.1006, + "step": 496760 + }, + { + "epoch": 1.9203738924711231, + "grad_norm": 0.1673613041639328, + "learning_rate": 3.2565121789287725e-05, + "loss": 2.0802, + "step": 496770 + }, + { + "epoch": 1.9204125496745064, + "grad_norm": 0.16218461096286774, + "learning_rate": 3.2463469208258826e-05, + "loss": 2.0987, + "step": 496780 + }, + { + "epoch": 1.9204512068778896, + "grad_norm": 0.15713539719581604, + "learning_rate": 3.23618218788293e-05, + "loss": 2.091, + "step": 496790 + }, + { + "epoch": 1.9204898640812729, + "grad_norm": 0.16224777698516846, + "learning_rate": 3.226017980018514e-05, + "loss": 2.0786, + "step": 496800 + }, + { + "epoch": 1.9205285212846563, + "grad_norm": 0.1700608879327774, + "learning_rate": 3.215854297151277e-05, + "loss": 2.0889, + "step": 496810 + }, + { + "epoch": 1.9205671784880396, + "grad_norm": 0.16657862067222595, + "learning_rate": 3.205691139199862e-05, + "loss": 2.0996, + "step": 496820 + }, + { + "epoch": 1.9206058356914228, + "grad_norm": 0.16817116737365723, + "learning_rate": 3.1955285060829564e-05, + "loss": 2.1092, + "step": 496830 + }, + { + "epoch": 1.920644492894806, + "grad_norm": 0.15900260210037231, + "learning_rate": 3.185366397719292e-05, + "loss": 2.0807, + "step": 496840 + }, + { + "epoch": 1.9206831500981894, + "grad_norm": 0.16846583783626556, + "learning_rate": 3.175204814027555e-05, + "loss": 2.0803, + "step": 496850 + }, + { + "epoch": 1.9207218073015726, + "grad_norm": 0.16676563024520874, + "learning_rate": 3.1650437549265e-05, + "loss": 2.0818, + "step": 496860 + }, + { + "epoch": 1.9207604645049559, + "grad_norm": 0.17843875288963318, + "learning_rate": 3.154883220334881e-05, + "loss": 2.0833, + "step": 496870 + }, + { + "epoch": 1.920799121708339, + "grad_norm": 0.1726779043674469, + "learning_rate": 3.144723210171474e-05, + "loss": 2.0926, + "step": 496880 + }, + { + "epoch": 1.9208377789117224, + "grad_norm": 0.1588815450668335, + "learning_rate": 3.1345637243550997e-05, + "loss": 2.0811, + "step": 496890 + }, + { + "epoch": 1.9208764361151056, + "grad_norm": 0.16358856856822968, + "learning_rate": 3.1244047628046e-05, + "loss": 2.0864, + "step": 496900 + }, + { + "epoch": 1.9209150933184889, + "grad_norm": 0.16491766273975372, + "learning_rate": 3.1142463254387746e-05, + "loss": 2.0988, + "step": 496910 + }, + { + "epoch": 1.9209537505218721, + "grad_norm": 0.23247385025024414, + "learning_rate": 3.104088412176531e-05, + "loss": 2.0923, + "step": 496920 + }, + { + "epoch": 1.9209924077252554, + "grad_norm": 0.1721058338880539, + "learning_rate": 3.0939310229367135e-05, + "loss": 2.0808, + "step": 496930 + }, + { + "epoch": 1.9210310649286388, + "grad_norm": 0.15375511348247528, + "learning_rate": 3.083774157638297e-05, + "loss": 2.0904, + "step": 496940 + }, + { + "epoch": 1.921069722132022, + "grad_norm": 0.15841242671012878, + "learning_rate": 3.0736178162001474e-05, + "loss": 2.0963, + "step": 496950 + }, + { + "epoch": 1.9211083793354053, + "grad_norm": 0.1550142467021942, + "learning_rate": 3.0634619985412617e-05, + "loss": 2.0928, + "step": 496960 + }, + { + "epoch": 1.9211470365387886, + "grad_norm": 0.1543474793434143, + "learning_rate": 3.053306704580594e-05, + "loss": 2.1007, + "step": 496970 + }, + { + "epoch": 1.921185693742172, + "grad_norm": 0.1605292558670044, + "learning_rate": 3.0431519342371205e-05, + "loss": 2.0966, + "step": 496980 + }, + { + "epoch": 1.9212243509455553, + "grad_norm": 0.16049958765506744, + "learning_rate": 3.0329976874298837e-05, + "loss": 2.0857, + "step": 496990 + }, + { + "epoch": 1.9212630081489386, + "grad_norm": 0.15806148946285248, + "learning_rate": 3.022843964077904e-05, + "loss": 2.088, + "step": 497000 + }, + { + "epoch": 1.9213016653523218, + "grad_norm": 0.15949906408786774, + "learning_rate": 3.0126907641002456e-05, + "loss": 2.0815, + "step": 497010 + }, + { + "epoch": 1.921340322555705, + "grad_norm": 0.16571669280529022, + "learning_rate": 3.0025380874159737e-05, + "loss": 2.0862, + "step": 497020 + }, + { + "epoch": 1.9213789797590883, + "grad_norm": 0.16648319363594055, + "learning_rate": 2.992385933944153e-05, + "loss": 2.0836, + "step": 497030 + }, + { + "epoch": 1.9214176369624716, + "grad_norm": 0.16171889007091522, + "learning_rate": 2.982234303603981e-05, + "loss": 2.0988, + "step": 497040 + }, + { + "epoch": 1.9214562941658548, + "grad_norm": 0.16150999069213867, + "learning_rate": 2.9720831963145457e-05, + "loss": 2.0727, + "step": 497050 + }, + { + "epoch": 1.921494951369238, + "grad_norm": 0.16288860142230988, + "learning_rate": 2.9619326119949996e-05, + "loss": 2.0962, + "step": 497060 + }, + { + "epoch": 1.9215336085726213, + "grad_norm": 0.16144005954265594, + "learning_rate": 2.9517825505645414e-05, + "loss": 2.0893, + "step": 497070 + }, + { + "epoch": 1.9215722657760046, + "grad_norm": 0.15442635118961334, + "learning_rate": 2.9416330119423684e-05, + "loss": 2.0967, + "step": 497080 + }, + { + "epoch": 1.9216109229793878, + "grad_norm": 0.15948958694934845, + "learning_rate": 2.9314839960477014e-05, + "loss": 2.0991, + "step": 497090 + }, + { + "epoch": 1.921649580182771, + "grad_norm": 0.17574442923069, + "learning_rate": 2.9213355027998045e-05, + "loss": 2.0934, + "step": 497100 + }, + { + "epoch": 1.9216882373861546, + "grad_norm": 0.15875700116157532, + "learning_rate": 2.9111875321178983e-05, + "loss": 2.0884, + "step": 497110 + }, + { + "epoch": 1.9217268945895378, + "grad_norm": 0.16926713287830353, + "learning_rate": 2.9010400839212915e-05, + "loss": 2.0772, + "step": 497120 + }, + { + "epoch": 1.921765551792921, + "grad_norm": 0.16093897819519043, + "learning_rate": 2.890893158129293e-05, + "loss": 2.0844, + "step": 497130 + }, + { + "epoch": 1.9218042089963043, + "grad_norm": 0.15793362259864807, + "learning_rate": 2.880746754661234e-05, + "loss": 2.0753, + "step": 497140 + }, + { + "epoch": 1.9218428661996878, + "grad_norm": 0.16137558221817017, + "learning_rate": 2.870600873436424e-05, + "loss": 2.081, + "step": 497150 + }, + { + "epoch": 1.921881523403071, + "grad_norm": 0.16008661687374115, + "learning_rate": 2.8604555143742827e-05, + "loss": 2.0733, + "step": 497160 + }, + { + "epoch": 1.9219201806064543, + "grad_norm": 0.17916366457939148, + "learning_rate": 2.8503106773941635e-05, + "loss": 2.0863, + "step": 497170 + }, + { + "epoch": 1.9219588378098376, + "grad_norm": 0.16702742874622345, + "learning_rate": 2.840166362415486e-05, + "loss": 2.0998, + "step": 497180 + }, + { + "epoch": 1.9219974950132208, + "grad_norm": 0.1600625365972519, + "learning_rate": 2.8300225693576932e-05, + "loss": 2.0912, + "step": 497190 + }, + { + "epoch": 1.922036152216604, + "grad_norm": 0.1780082732439041, + "learning_rate": 2.8198792981402042e-05, + "loss": 2.0917, + "step": 497200 + }, + { + "epoch": 1.9220748094199873, + "grad_norm": 0.16587743163108826, + "learning_rate": 2.8097365486825065e-05, + "loss": 2.0816, + "step": 497210 + }, + { + "epoch": 1.9221134666233706, + "grad_norm": 0.16313040256500244, + "learning_rate": 2.7995943209041086e-05, + "loss": 2.0881, + "step": 497220 + }, + { + "epoch": 1.9221521238267538, + "grad_norm": 0.16506393253803253, + "learning_rate": 2.7894526147244748e-05, + "loss": 2.0968, + "step": 497230 + }, + { + "epoch": 1.922190781030137, + "grad_norm": 0.1675456464290619, + "learning_rate": 2.779311430063203e-05, + "loss": 2.0927, + "step": 497240 + }, + { + "epoch": 1.9222294382335203, + "grad_norm": 0.1661851704120636, + "learning_rate": 2.7691707668398015e-05, + "loss": 2.0833, + "step": 497250 + }, + { + "epoch": 1.9222680954369036, + "grad_norm": 0.1600489616394043, + "learning_rate": 2.759030624973846e-05, + "loss": 2.0867, + "step": 497260 + }, + { + "epoch": 1.9223067526402868, + "grad_norm": 0.15950718522071838, + "learning_rate": 2.7488910043849792e-05, + "loss": 2.0789, + "step": 497270 + }, + { + "epoch": 1.9223454098436703, + "grad_norm": 0.4952988624572754, + "learning_rate": 2.7387519049927534e-05, + "loss": 2.0819, + "step": 497280 + }, + { + "epoch": 1.9223840670470536, + "grad_norm": 0.1711588203907013, + "learning_rate": 2.7286133267168333e-05, + "loss": 2.0835, + "step": 497290 + }, + { + "epoch": 1.9224227242504368, + "grad_norm": 0.1545725017786026, + "learning_rate": 2.7184752694769054e-05, + "loss": 2.0985, + "step": 497300 + }, + { + "epoch": 1.92246138145382, + "grad_norm": 0.16545742750167847, + "learning_rate": 2.7083377331925896e-05, + "loss": 2.0882, + "step": 497310 + }, + { + "epoch": 1.9225000386572035, + "grad_norm": 0.17229430377483368, + "learning_rate": 2.6982007177836166e-05, + "loss": 2.0788, + "step": 497320 + }, + { + "epoch": 1.9225386958605868, + "grad_norm": 0.16649629175662994, + "learning_rate": 2.6880642231697172e-05, + "loss": 2.0769, + "step": 497330 + }, + { + "epoch": 1.92257735306397, + "grad_norm": 0.15658745169639587, + "learning_rate": 2.6779282492706005e-05, + "loss": 2.0893, + "step": 497340 + }, + { + "epoch": 1.9226160102673533, + "grad_norm": 0.1738753765821457, + "learning_rate": 2.6677927960060412e-05, + "loss": 2.0972, + "step": 497350 + }, + { + "epoch": 1.9226546674707365, + "grad_norm": 0.15449616312980652, + "learning_rate": 2.657657863295837e-05, + "loss": 2.0907, + "step": 497360 + }, + { + "epoch": 1.9226933246741198, + "grad_norm": 0.16673849523067474, + "learning_rate": 2.6475234510597857e-05, + "loss": 2.0871, + "step": 497370 + }, + { + "epoch": 1.922731981877503, + "grad_norm": 0.15777698159217834, + "learning_rate": 2.6373895592176845e-05, + "loss": 2.0953, + "step": 497380 + }, + { + "epoch": 1.9227706390808863, + "grad_norm": 0.5646606087684631, + "learning_rate": 2.6272561876893752e-05, + "loss": 2.089, + "step": 497390 + }, + { + "epoch": 1.9228092962842696, + "grad_norm": 0.17194247245788574, + "learning_rate": 2.6171233363947667e-05, + "loss": 2.0926, + "step": 497400 + }, + { + "epoch": 1.9228479534876528, + "grad_norm": 0.16004107892513275, + "learning_rate": 2.606991005253678e-05, + "loss": 2.0919, + "step": 497410 + }, + { + "epoch": 1.922886610691036, + "grad_norm": 0.16976672410964966, + "learning_rate": 2.5968591941860853e-05, + "loss": 2.0906, + "step": 497420 + }, + { + "epoch": 1.9229252678944193, + "grad_norm": 0.165140762925148, + "learning_rate": 2.5867279031118294e-05, + "loss": 2.0898, + "step": 497430 + }, + { + "epoch": 1.9229639250978026, + "grad_norm": 0.16237983107566833, + "learning_rate": 2.5765971319509306e-05, + "loss": 2.0822, + "step": 497440 + }, + { + "epoch": 1.923002582301186, + "grad_norm": 0.15686587989330292, + "learning_rate": 2.5664668806233414e-05, + "loss": 2.087, + "step": 497450 + }, + { + "epoch": 1.9230412395045693, + "grad_norm": 0.16701488196849823, + "learning_rate": 2.556337149049015e-05, + "loss": 2.0851, + "step": 497460 + }, + { + "epoch": 1.9230798967079525, + "grad_norm": 0.16209928691387177, + "learning_rate": 2.546207937147993e-05, + "loss": 2.0915, + "step": 497470 + }, + { + "epoch": 1.9231185539113358, + "grad_norm": 0.17187343537807465, + "learning_rate": 2.5360792448402726e-05, + "loss": 2.0916, + "step": 497480 + }, + { + "epoch": 1.9231572111147193, + "grad_norm": 0.15735509991645813, + "learning_rate": 2.525951072045918e-05, + "loss": 2.0987, + "step": 497490 + }, + { + "epoch": 1.9231958683181025, + "grad_norm": 0.16227933764457703, + "learning_rate": 2.5158234186850148e-05, + "loss": 2.0824, + "step": 497500 + }, + { + "epoch": 1.9232345255214858, + "grad_norm": 0.168210968375206, + "learning_rate": 2.5056962846776276e-05, + "loss": 2.0978, + "step": 497510 + }, + { + "epoch": 1.923273182724869, + "grad_norm": 0.15861283242702484, + "learning_rate": 2.4955696699438647e-05, + "loss": 2.1101, + "step": 497520 + }, + { + "epoch": 1.9233118399282523, + "grad_norm": 0.1637565791606903, + "learning_rate": 2.4854435744038785e-05, + "loss": 2.0775, + "step": 497530 + }, + { + "epoch": 1.9233504971316355, + "grad_norm": 0.15976262092590332, + "learning_rate": 2.4753179979777774e-05, + "loss": 2.0916, + "step": 497540 + }, + { + "epoch": 1.9233891543350188, + "grad_norm": 0.16702741384506226, + "learning_rate": 2.4651929405857808e-05, + "loss": 2.0942, + "step": 497550 + }, + { + "epoch": 1.923427811538402, + "grad_norm": 0.16337959468364716, + "learning_rate": 2.4550684021480642e-05, + "loss": 2.0812, + "step": 497560 + }, + { + "epoch": 1.9234664687417853, + "grad_norm": 0.15780958533287048, + "learning_rate": 2.4449443825848238e-05, + "loss": 2.0807, + "step": 497570 + }, + { + "epoch": 1.9235051259451685, + "grad_norm": 0.16224455833435059, + "learning_rate": 2.4348208818163242e-05, + "loss": 2.077, + "step": 497580 + }, + { + "epoch": 1.9235437831485518, + "grad_norm": 0.1588510423898697, + "learning_rate": 2.4246978997627843e-05, + "loss": 2.0938, + "step": 497590 + }, + { + "epoch": 1.923582440351935, + "grad_norm": 0.19597147405147552, + "learning_rate": 2.4145754363444906e-05, + "loss": 2.0695, + "step": 497600 + }, + { + "epoch": 1.9236210975553183, + "grad_norm": 0.15698018670082092, + "learning_rate": 2.4044534914817507e-05, + "loss": 2.0844, + "step": 497610 + }, + { + "epoch": 1.9236597547587018, + "grad_norm": 0.16293266415596008, + "learning_rate": 2.394332065094851e-05, + "loss": 2.0815, + "step": 497620 + }, + { + "epoch": 1.923698411962085, + "grad_norm": 0.1649063676595688, + "learning_rate": 2.3842111571041658e-05, + "loss": 2.0947, + "step": 497630 + }, + { + "epoch": 1.9237370691654683, + "grad_norm": 0.16432203352451324, + "learning_rate": 2.3740907674299816e-05, + "loss": 2.1011, + "step": 497640 + }, + { + "epoch": 1.9237757263688515, + "grad_norm": 0.16237296164035797, + "learning_rate": 2.3639708959927398e-05, + "loss": 2.0957, + "step": 497650 + }, + { + "epoch": 1.923814383572235, + "grad_norm": 0.17158690094947815, + "learning_rate": 2.353851542712837e-05, + "loss": 2.0853, + "step": 497660 + }, + { + "epoch": 1.9238530407756183, + "grad_norm": 0.16722534596920013, + "learning_rate": 2.3437327075106263e-05, + "loss": 2.0818, + "step": 497670 + }, + { + "epoch": 1.9238916979790015, + "grad_norm": 0.16253255307674408, + "learning_rate": 2.3336143903066155e-05, + "loss": 2.0944, + "step": 497680 + }, + { + "epoch": 1.9239303551823848, + "grad_norm": 0.16351984441280365, + "learning_rate": 2.3234965910212235e-05, + "loss": 2.0991, + "step": 497690 + }, + { + "epoch": 1.923969012385768, + "grad_norm": 0.18320319056510925, + "learning_rate": 2.3133793095749146e-05, + "loss": 2.0812, + "step": 497700 + }, + { + "epoch": 1.9240076695891513, + "grad_norm": 0.16540493071079254, + "learning_rate": 2.3032625458882405e-05, + "loss": 2.0781, + "step": 497710 + }, + { + "epoch": 1.9240463267925345, + "grad_norm": 0.16712576150894165, + "learning_rate": 2.2931462998816655e-05, + "loss": 2.0957, + "step": 497720 + }, + { + "epoch": 1.9240849839959178, + "grad_norm": 0.1669510304927826, + "learning_rate": 2.2830305714757417e-05, + "loss": 2.0826, + "step": 497730 + }, + { + "epoch": 1.924123641199301, + "grad_norm": 0.15979358553886414, + "learning_rate": 2.2729153605910213e-05, + "loss": 2.0826, + "step": 497740 + }, + { + "epoch": 1.9241622984026843, + "grad_norm": 0.15580396354198456, + "learning_rate": 2.2628006671481016e-05, + "loss": 2.0935, + "step": 497750 + }, + { + "epoch": 1.9242009556060675, + "grad_norm": 0.1805225908756256, + "learning_rate": 2.2526864910676016e-05, + "loss": 2.0809, + "step": 497760 + }, + { + "epoch": 1.9242396128094508, + "grad_norm": 0.1583130806684494, + "learning_rate": 2.2425728322700957e-05, + "loss": 2.0912, + "step": 497770 + }, + { + "epoch": 1.9242782700128342, + "grad_norm": 0.16896626353263855, + "learning_rate": 2.232459690676225e-05, + "loss": 2.087, + "step": 497780 + }, + { + "epoch": 1.9243169272162175, + "grad_norm": 0.16624216735363007, + "learning_rate": 2.222347066206698e-05, + "loss": 2.0922, + "step": 497790 + }, + { + "epoch": 1.9243555844196008, + "grad_norm": 0.17013777792453766, + "learning_rate": 2.2122349587821554e-05, + "loss": 2.0903, + "step": 497800 + }, + { + "epoch": 1.924394241622984, + "grad_norm": 0.15819691121578217, + "learning_rate": 2.2021233683232834e-05, + "loss": 2.0977, + "step": 497810 + }, + { + "epoch": 1.9244328988263673, + "grad_norm": 0.16058465838432312, + "learning_rate": 2.192012294750856e-05, + "loss": 2.0806, + "step": 497820 + }, + { + "epoch": 1.9244715560297507, + "grad_norm": 0.1678999662399292, + "learning_rate": 2.1819017379855587e-05, + "loss": 2.1012, + "step": 497830 + }, + { + "epoch": 1.924510213233134, + "grad_norm": 0.1562078595161438, + "learning_rate": 2.1717916979481888e-05, + "loss": 2.0852, + "step": 497840 + }, + { + "epoch": 1.9245488704365172, + "grad_norm": 0.16949644684791565, + "learning_rate": 2.1616821745595206e-05, + "loss": 2.102, + "step": 497850 + }, + { + "epoch": 1.9245875276399005, + "grad_norm": 0.15762582421302795, + "learning_rate": 2.1515731677403506e-05, + "loss": 2.0957, + "step": 497860 + }, + { + "epoch": 1.9246261848432837, + "grad_norm": 0.1545087844133377, + "learning_rate": 2.1414646774114977e-05, + "loss": 2.0911, + "step": 497870 + }, + { + "epoch": 1.924664842046667, + "grad_norm": 0.1587749570608139, + "learning_rate": 2.1313567034938254e-05, + "loss": 2.0966, + "step": 497880 + }, + { + "epoch": 1.9247034992500502, + "grad_norm": 0.16813033819198608, + "learning_rate": 2.1212492459081746e-05, + "loss": 2.0915, + "step": 497890 + }, + { + "epoch": 1.9247421564534335, + "grad_norm": 0.1571056842803955, + "learning_rate": 2.1111423045754306e-05, + "loss": 2.0834, + "step": 497900 + }, + { + "epoch": 1.9247808136568167, + "grad_norm": 0.15674418210983276, + "learning_rate": 2.101035879416502e-05, + "loss": 2.0793, + "step": 497910 + }, + { + "epoch": 1.9248194708602, + "grad_norm": 0.20706580579280853, + "learning_rate": 2.0909299703522954e-05, + "loss": 2.0638, + "step": 497920 + }, + { + "epoch": 1.9248581280635833, + "grad_norm": 0.15839233994483948, + "learning_rate": 2.0808245773037858e-05, + "loss": 2.0728, + "step": 497930 + }, + { + "epoch": 1.9248967852669665, + "grad_norm": 0.1598314344882965, + "learning_rate": 2.070719700191903e-05, + "loss": 2.1033, + "step": 497940 + }, + { + "epoch": 1.92493544247035, + "grad_norm": 0.15691877901554108, + "learning_rate": 2.060615338937666e-05, + "loss": 2.0766, + "step": 497950 + }, + { + "epoch": 1.9249740996737332, + "grad_norm": 0.16243983805179596, + "learning_rate": 2.0505114934620484e-05, + "loss": 2.0885, + "step": 497960 + }, + { + "epoch": 1.9250127568771165, + "grad_norm": 0.16956232488155365, + "learning_rate": 2.0404081636860917e-05, + "loss": 2.0959, + "step": 497970 + }, + { + "epoch": 1.9250514140804997, + "grad_norm": 0.16090889275074005, + "learning_rate": 2.0303053495308364e-05, + "loss": 2.0777, + "step": 497980 + }, + { + "epoch": 1.925090071283883, + "grad_norm": 0.1664169579744339, + "learning_rate": 2.0202030509173463e-05, + "loss": 2.0855, + "step": 497990 + }, + { + "epoch": 1.9251287284872665, + "grad_norm": 0.15896333754062653, + "learning_rate": 2.010101267766684e-05, + "loss": 2.0926, + "step": 498000 + }, + { + "epoch": 1.9251673856906497, + "grad_norm": 0.17546389997005463, + "learning_rate": 2.000000000000002e-05, + "loss": 2.0943, + "step": 498010 + }, + { + "epoch": 1.925206042894033, + "grad_norm": 0.15868791937828064, + "learning_rate": 1.9898992475383847e-05, + "loss": 2.0868, + "step": 498020 + }, + { + "epoch": 1.9252447000974162, + "grad_norm": 0.15706300735473633, + "learning_rate": 1.979799010303007e-05, + "loss": 2.0817, + "step": 498030 + }, + { + "epoch": 1.9252833573007995, + "grad_norm": 0.1700574904680252, + "learning_rate": 1.9696992882149767e-05, + "loss": 2.0733, + "step": 498040 + }, + { + "epoch": 1.9253220145041827, + "grad_norm": 0.1693497747182846, + "learning_rate": 1.959600081195556e-05, + "loss": 2.0879, + "step": 498050 + }, + { + "epoch": 1.925360671707566, + "grad_norm": 0.15419720113277435, + "learning_rate": 1.9495013891658974e-05, + "loss": 2.0951, + "step": 498060 + }, + { + "epoch": 1.9253993289109492, + "grad_norm": 0.15349604189395905, + "learning_rate": 1.9394032120472417e-05, + "loss": 2.0951, + "step": 498070 + }, + { + "epoch": 1.9254379861143325, + "grad_norm": 0.6273776888847351, + "learning_rate": 1.9293055497608513e-05, + "loss": 2.0877, + "step": 498080 + }, + { + "epoch": 1.9254766433177157, + "grad_norm": 0.17664119601249695, + "learning_rate": 1.9192084022279675e-05, + "loss": 2.0932, + "step": 498090 + }, + { + "epoch": 1.925515300521099, + "grad_norm": 0.16162990033626556, + "learning_rate": 1.9091117693698758e-05, + "loss": 2.0929, + "step": 498100 + }, + { + "epoch": 1.9255539577244822, + "grad_norm": 0.1592620462179184, + "learning_rate": 1.8990156511078826e-05, + "loss": 2.0871, + "step": 498110 + }, + { + "epoch": 1.9255926149278657, + "grad_norm": 0.16288858652114868, + "learning_rate": 1.8889200473633406e-05, + "loss": 2.0784, + "step": 498120 + }, + { + "epoch": 1.925631272131249, + "grad_norm": 0.16478540003299713, + "learning_rate": 1.8788249580575567e-05, + "loss": 2.0965, + "step": 498130 + }, + { + "epoch": 1.9256699293346322, + "grad_norm": 0.15600290894508362, + "learning_rate": 1.8687303831119053e-05, + "loss": 2.087, + "step": 498140 + }, + { + "epoch": 1.9257085865380155, + "grad_norm": 0.15345881879329681, + "learning_rate": 1.8586363224478043e-05, + "loss": 2.0777, + "step": 498150 + }, + { + "epoch": 1.9257472437413987, + "grad_norm": 0.16258493065834045, + "learning_rate": 1.848542775986628e-05, + "loss": 2.0898, + "step": 498160 + }, + { + "epoch": 1.9257859009447822, + "grad_norm": 0.16715885698795319, + "learning_rate": 1.838449743649817e-05, + "loss": 2.0847, + "step": 498170 + }, + { + "epoch": 1.9258245581481654, + "grad_norm": 0.15741226077079773, + "learning_rate": 1.828357225358812e-05, + "loss": 2.079, + "step": 498180 + }, + { + "epoch": 1.9258632153515487, + "grad_norm": 0.15472184121608734, + "learning_rate": 1.818265221035076e-05, + "loss": 2.0921, + "step": 498190 + }, + { + "epoch": 1.925901872554932, + "grad_norm": 0.1586213856935501, + "learning_rate": 1.8081737306000713e-05, + "loss": 2.0709, + "step": 498200 + }, + { + "epoch": 1.9259405297583152, + "grad_norm": 0.16578112542629242, + "learning_rate": 1.798082753975372e-05, + "loss": 2.0695, + "step": 498210 + }, + { + "epoch": 1.9259791869616985, + "grad_norm": 0.16239455342292786, + "learning_rate": 1.7879922910824186e-05, + "loss": 2.1012, + "step": 498220 + }, + { + "epoch": 1.9260178441650817, + "grad_norm": 0.1665370762348175, + "learning_rate": 1.7779023418428296e-05, + "loss": 2.0751, + "step": 498230 + }, + { + "epoch": 1.926056501368465, + "grad_norm": 0.1582210808992386, + "learning_rate": 1.767812906178112e-05, + "loss": 2.0871, + "step": 498240 + }, + { + "epoch": 1.9260951585718482, + "grad_norm": 0.15214033424854279, + "learning_rate": 1.757723984009907e-05, + "loss": 2.0826, + "step": 498250 + }, + { + "epoch": 1.9261338157752315, + "grad_norm": 0.156508669257164, + "learning_rate": 1.7476355752597872e-05, + "loss": 2.0822, + "step": 498260 + }, + { + "epoch": 1.9261724729786147, + "grad_norm": 0.16116832196712494, + "learning_rate": 1.737547679849372e-05, + "loss": 2.092, + "step": 498270 + }, + { + "epoch": 1.926211130181998, + "grad_norm": 0.1606052964925766, + "learning_rate": 1.727460297700323e-05, + "loss": 2.0812, + "step": 498280 + }, + { + "epoch": 1.9262497873853814, + "grad_norm": 0.15373031795024872, + "learning_rate": 1.7173734287343033e-05, + "loss": 2.0877, + "step": 498290 + }, + { + "epoch": 1.9262884445887647, + "grad_norm": 0.1582830548286438, + "learning_rate": 1.7072870728729985e-05, + "loss": 2.0819, + "step": 498300 + }, + { + "epoch": 1.926327101792148, + "grad_norm": 0.15733368694782257, + "learning_rate": 1.6972012300381145e-05, + "loss": 2.0806, + "step": 498310 + }, + { + "epoch": 1.9263657589955312, + "grad_norm": 0.15600501000881195, + "learning_rate": 1.6871159001513594e-05, + "loss": 2.0883, + "step": 498320 + }, + { + "epoch": 1.9264044161989147, + "grad_norm": 0.15862053632736206, + "learning_rate": 1.677031083134506e-05, + "loss": 2.1008, + "step": 498330 + }, + { + "epoch": 1.926443073402298, + "grad_norm": 0.16201405227184296, + "learning_rate": 1.666946778909284e-05, + "loss": 2.0811, + "step": 498340 + }, + { + "epoch": 1.9264817306056812, + "grad_norm": 0.15795743465423584, + "learning_rate": 1.6568629873975116e-05, + "loss": 2.0822, + "step": 498350 + }, + { + "epoch": 1.9265203878090644, + "grad_norm": 0.16118592023849487, + "learning_rate": 1.6467797085209844e-05, + "loss": 2.0861, + "step": 498360 + }, + { + "epoch": 1.9265590450124477, + "grad_norm": 0.15352848172187805, + "learning_rate": 1.636696942201521e-05, + "loss": 2.094, + "step": 498370 + }, + { + "epoch": 1.926597702215831, + "grad_norm": 0.15276506543159485, + "learning_rate": 1.6266146883609835e-05, + "loss": 2.0752, + "step": 498380 + }, + { + "epoch": 1.9266363594192142, + "grad_norm": 0.16707296669483185, + "learning_rate": 1.616532946921212e-05, + "loss": 2.0875, + "step": 498390 + }, + { + "epoch": 1.9266750166225974, + "grad_norm": 0.1634710729122162, + "learning_rate": 1.6064517178040916e-05, + "loss": 2.0779, + "step": 498400 + }, + { + "epoch": 1.9267136738259807, + "grad_norm": 0.16529691219329834, + "learning_rate": 1.5963710009315514e-05, + "loss": 2.0951, + "step": 498410 + }, + { + "epoch": 1.926752331029364, + "grad_norm": 0.21218083798885345, + "learning_rate": 1.5862907962254757e-05, + "loss": 2.0855, + "step": 498420 + }, + { + "epoch": 1.9267909882327472, + "grad_norm": 0.16146108508110046, + "learning_rate": 1.5762111036078385e-05, + "loss": 2.0816, + "step": 498430 + }, + { + "epoch": 1.9268296454361304, + "grad_norm": 0.16276432573795319, + "learning_rate": 1.566131923000591e-05, + "loss": 2.0884, + "step": 498440 + }, + { + "epoch": 1.9268683026395137, + "grad_norm": 0.15712717175483704, + "learning_rate": 1.5560532543257512e-05, + "loss": 2.1012, + "step": 498450 + }, + { + "epoch": 1.9269069598428972, + "grad_norm": 0.1626255065202713, + "learning_rate": 1.54597509750527e-05, + "loss": 2.0731, + "step": 498460 + }, + { + "epoch": 1.9269456170462804, + "grad_norm": 0.1566796898841858, + "learning_rate": 1.535897452461188e-05, + "loss": 2.0855, + "step": 498470 + }, + { + "epoch": 1.9269842742496637, + "grad_norm": 1.1111677885055542, + "learning_rate": 1.5258203191155672e-05, + "loss": 2.0843, + "step": 498480 + }, + { + "epoch": 1.927022931453047, + "grad_norm": 0.16669593751430511, + "learning_rate": 1.5157436973904481e-05, + "loss": 2.0786, + "step": 498490 + }, + { + "epoch": 1.9270615886564304, + "grad_norm": 0.15540610253810883, + "learning_rate": 1.505667587207915e-05, + "loss": 2.1004, + "step": 498500 + }, + { + "epoch": 1.9271002458598137, + "grad_norm": 0.16334572434425354, + "learning_rate": 1.495591988490097e-05, + "loss": 2.0952, + "step": 498510 + }, + { + "epoch": 1.927138903063197, + "grad_norm": 0.1581311970949173, + "learning_rate": 1.4855169011590785e-05, + "loss": 2.0911, + "step": 498520 + }, + { + "epoch": 1.9271775602665802, + "grad_norm": 0.1556026041507721, + "learning_rate": 1.4754423251370109e-05, + "loss": 2.097, + "step": 498530 + }, + { + "epoch": 1.9272162174699634, + "grad_norm": 0.34120458364486694, + "learning_rate": 1.4653682603460672e-05, + "loss": 2.085, + "step": 498540 + }, + { + "epoch": 1.9272548746733467, + "grad_norm": 0.15523135662078857, + "learning_rate": 1.4552947067084211e-05, + "loss": 2.0765, + "step": 498550 + }, + { + "epoch": 1.92729353187673, + "grad_norm": 0.15660513937473297, + "learning_rate": 1.44522166414629e-05, + "loss": 2.0961, + "step": 498560 + }, + { + "epoch": 1.9273321890801132, + "grad_norm": 0.15083718299865723, + "learning_rate": 1.43514913258187e-05, + "loss": 2.0884, + "step": 498570 + }, + { + "epoch": 1.9273708462834964, + "grad_norm": 0.16068926453590393, + "learning_rate": 1.4250771119374228e-05, + "loss": 2.0815, + "step": 498580 + }, + { + "epoch": 1.9274095034868797, + "grad_norm": 0.1572009027004242, + "learning_rate": 1.4150056021351886e-05, + "loss": 2.0845, + "step": 498590 + }, + { + "epoch": 1.927448160690263, + "grad_norm": 0.15406206250190735, + "learning_rate": 1.404934603097452e-05, + "loss": 2.0754, + "step": 498600 + }, + { + "epoch": 1.9274868178936462, + "grad_norm": 0.15755899250507355, + "learning_rate": 1.3948641147465191e-05, + "loss": 2.0842, + "step": 498610 + }, + { + "epoch": 1.9275254750970294, + "grad_norm": 0.15822505950927734, + "learning_rate": 1.384794137004719e-05, + "loss": 2.0812, + "step": 498620 + }, + { + "epoch": 1.927564132300413, + "grad_norm": 0.15680760145187378, + "learning_rate": 1.3747246697943583e-05, + "loss": 2.0832, + "step": 498630 + }, + { + "epoch": 1.9276027895037962, + "grad_norm": 0.16229304671287537, + "learning_rate": 1.3646557130378101e-05, + "loss": 2.0784, + "step": 498640 + }, + { + "epoch": 1.9276414467071794, + "grad_norm": 0.15850351750850677, + "learning_rate": 1.3545872666574699e-05, + "loss": 2.0978, + "step": 498650 + }, + { + "epoch": 1.9276801039105627, + "grad_norm": 0.15322215855121613, + "learning_rate": 1.344519330575733e-05, + "loss": 2.0911, + "step": 498660 + }, + { + "epoch": 1.9277187611139461, + "grad_norm": 0.15965676307678223, + "learning_rate": 1.334451904714995e-05, + "loss": 2.0949, + "step": 498670 + }, + { + "epoch": 1.9277574183173294, + "grad_norm": 0.1568049192428589, + "learning_rate": 1.3243849889976956e-05, + "loss": 2.0873, + "step": 498680 + }, + { + "epoch": 1.9277960755207126, + "grad_norm": 0.1586667150259018, + "learning_rate": 1.3143185833463188e-05, + "loss": 2.085, + "step": 498690 + }, + { + "epoch": 1.927834732724096, + "grad_norm": 0.15746428072452545, + "learning_rate": 1.3042526876833271e-05, + "loss": 2.0733, + "step": 498700 + }, + { + "epoch": 1.9278733899274791, + "grad_norm": 0.1772247552871704, + "learning_rate": 1.2941873019312045e-05, + "loss": 2.0868, + "step": 498710 + }, + { + "epoch": 1.9279120471308624, + "grad_norm": 0.1588650941848755, + "learning_rate": 1.2841224260124795e-05, + "loss": 2.1039, + "step": 498720 + }, + { + "epoch": 1.9279507043342456, + "grad_norm": 0.15893688797950745, + "learning_rate": 1.2740580598496809e-05, + "loss": 2.0938, + "step": 498730 + }, + { + "epoch": 1.927989361537629, + "grad_norm": 0.1555604785680771, + "learning_rate": 1.2639942033653373e-05, + "loss": 2.1021, + "step": 498740 + }, + { + "epoch": 1.9280280187410122, + "grad_norm": 0.15307071805000305, + "learning_rate": 1.2539308564820884e-05, + "loss": 2.092, + "step": 498750 + }, + { + "epoch": 1.9280666759443954, + "grad_norm": 0.15516358613967896, + "learning_rate": 1.2438680191224849e-05, + "loss": 2.0851, + "step": 498760 + }, + { + "epoch": 1.9281053331477787, + "grad_norm": 0.15749025344848633, + "learning_rate": 1.2338056912091445e-05, + "loss": 2.0726, + "step": 498770 + }, + { + "epoch": 1.928143990351162, + "grad_norm": 0.16453056037425995, + "learning_rate": 1.2237438726647066e-05, + "loss": 2.0833, + "step": 498780 + }, + { + "epoch": 1.9281826475545452, + "grad_norm": 0.1561942994594574, + "learning_rate": 1.2136825634118331e-05, + "loss": 2.0889, + "step": 498790 + }, + { + "epoch": 1.9282213047579286, + "grad_norm": 0.15563102066516876, + "learning_rate": 1.2036217633731638e-05, + "loss": 2.0907, + "step": 498800 + }, + { + "epoch": 1.9282599619613119, + "grad_norm": 0.16227789223194122, + "learning_rate": 1.1935614724714273e-05, + "loss": 2.0934, + "step": 498810 + }, + { + "epoch": 1.9282986191646951, + "grad_norm": 0.1553933024406433, + "learning_rate": 1.1835016906293073e-05, + "loss": 2.0954, + "step": 498820 + }, + { + "epoch": 1.9283372763680784, + "grad_norm": 0.16049771010875702, + "learning_rate": 1.1734424177695547e-05, + "loss": 2.0913, + "step": 498830 + }, + { + "epoch": 1.9283759335714619, + "grad_norm": 0.16263166069984436, + "learning_rate": 1.16338365381492e-05, + "loss": 2.0843, + "step": 498840 + }, + { + "epoch": 1.9284145907748451, + "grad_norm": 0.16481728851795197, + "learning_rate": 1.1533253986881543e-05, + "loss": 2.1019, + "step": 498850 + }, + { + "epoch": 1.9284532479782284, + "grad_norm": 0.16423772275447845, + "learning_rate": 1.1432676523120522e-05, + "loss": 2.0942, + "step": 498860 + }, + { + "epoch": 1.9284919051816116, + "grad_norm": 0.15550151467323303, + "learning_rate": 1.1332104146094536e-05, + "loss": 2.094, + "step": 498870 + }, + { + "epoch": 1.9285305623849949, + "grad_norm": 0.1563386619091034, + "learning_rate": 1.1231536855031533e-05, + "loss": 2.0742, + "step": 498880 + }, + { + "epoch": 1.9285692195883781, + "grad_norm": 0.1562238335609436, + "learning_rate": 1.1130974649160131e-05, + "loss": 2.0853, + "step": 498890 + }, + { + "epoch": 1.9286078767917614, + "grad_norm": 0.1578655242919922, + "learning_rate": 1.1030417527708946e-05, + "loss": 2.0899, + "step": 498900 + }, + { + "epoch": 1.9286465339951446, + "grad_norm": 0.15448294579982758, + "learning_rate": 1.092986548990682e-05, + "loss": 2.0888, + "step": 498910 + }, + { + "epoch": 1.9286851911985279, + "grad_norm": 0.15214316546916962, + "learning_rate": 1.0829318534983035e-05, + "loss": 2.0823, + "step": 498920 + }, + { + "epoch": 1.9287238484019111, + "grad_norm": 0.15848049521446228, + "learning_rate": 1.072877666216665e-05, + "loss": 2.0806, + "step": 498930 + }, + { + "epoch": 1.9287625056052944, + "grad_norm": 0.1562502384185791, + "learning_rate": 1.0628239870687173e-05, + "loss": 2.0774, + "step": 498940 + }, + { + "epoch": 1.9288011628086776, + "grad_norm": 0.1534113883972168, + "learning_rate": 1.052770815977433e-05, + "loss": 2.0889, + "step": 498950 + }, + { + "epoch": 1.928839820012061, + "grad_norm": 0.15739640593528748, + "learning_rate": 1.0427181528657847e-05, + "loss": 2.0891, + "step": 498960 + }, + { + "epoch": 1.9288784772154444, + "grad_norm": 0.16295334696769714, + "learning_rate": 1.0326659976567899e-05, + "loss": 2.0795, + "step": 498970 + }, + { + "epoch": 1.9289171344188276, + "grad_norm": 0.1570284366607666, + "learning_rate": 1.0226143502734654e-05, + "loss": 2.0894, + "step": 498980 + }, + { + "epoch": 1.9289557916222109, + "grad_norm": 0.1605892926454544, + "learning_rate": 1.0125632106388505e-05, + "loss": 2.0887, + "step": 498990 + }, + { + "epoch": 1.9289944488255941, + "grad_norm": 0.15600307285785675, + "learning_rate": 1.002512578676007e-05, + "loss": 2.0777, + "step": 499000 + }, + { + "epoch": 1.9290331060289776, + "grad_norm": 0.1557171493768692, + "learning_rate": 9.924624543080408e-06, + "loss": 2.0935, + "step": 499010 + }, + { + "epoch": 1.9290717632323608, + "grad_norm": 0.15279081463813782, + "learning_rate": 9.824128374580132e-06, + "loss": 2.0806, + "step": 499020 + }, + { + "epoch": 1.929110420435744, + "grad_norm": 0.15580080449581146, + "learning_rate": 9.72363728049075e-06, + "loss": 2.0762, + "step": 499030 + }, + { + "epoch": 1.9291490776391274, + "grad_norm": 0.1522652804851532, + "learning_rate": 9.623151260043539e-06, + "loss": 2.0978, + "step": 499040 + }, + { + "epoch": 1.9291877348425106, + "grad_norm": 0.15721817314624786, + "learning_rate": 9.522670312470006e-06, + "loss": 2.0829, + "step": 499050 + }, + { + "epoch": 1.9292263920458939, + "grad_norm": 0.16213186085224152, + "learning_rate": 9.422194437002097e-06, + "loss": 2.0832, + "step": 499060 + }, + { + "epoch": 1.929265049249277, + "grad_norm": 0.157264843583107, + "learning_rate": 9.321723632871982e-06, + "loss": 2.0887, + "step": 499070 + }, + { + "epoch": 1.9293037064526604, + "grad_norm": 0.15679962933063507, + "learning_rate": 9.221257899311385e-06, + "loss": 2.0768, + "step": 499080 + }, + { + "epoch": 1.9293423636560436, + "grad_norm": 0.15212687849998474, + "learning_rate": 9.120797235552925e-06, + "loss": 2.0873, + "step": 499090 + }, + { + "epoch": 1.9293810208594269, + "grad_norm": 0.1606370061635971, + "learning_rate": 9.020341640829432e-06, + "loss": 2.0871, + "step": 499100 + }, + { + "epoch": 1.9294196780628101, + "grad_norm": 0.17525170743465424, + "learning_rate": 8.91989111437308e-06, + "loss": 2.0926, + "step": 499110 + }, + { + "epoch": 1.9294583352661934, + "grad_norm": 0.15746445953845978, + "learning_rate": 8.819445655417146e-06, + "loss": 2.0828, + "step": 499120 + }, + { + "epoch": 1.9294969924695766, + "grad_norm": 0.1544865220785141, + "learning_rate": 8.719005263194912e-06, + "loss": 2.0888, + "step": 499130 + }, + { + "epoch": 1.92953564967296, + "grad_norm": 0.15426436066627502, + "learning_rate": 8.618569936939436e-06, + "loss": 2.0765, + "step": 499140 + }, + { + "epoch": 1.9295743068763433, + "grad_norm": 0.15780779719352722, + "learning_rate": 8.51813967588444e-06, + "loss": 2.0805, + "step": 499150 + }, + { + "epoch": 1.9296129640797266, + "grad_norm": 0.15918251872062683, + "learning_rate": 8.41771447926365e-06, + "loss": 2.075, + "step": 499160 + }, + { + "epoch": 1.9296516212831099, + "grad_norm": 0.15235045552253723, + "learning_rate": 8.317294346310789e-06, + "loss": 2.0832, + "step": 499170 + }, + { + "epoch": 1.9296902784864933, + "grad_norm": 0.15264438092708588, + "learning_rate": 8.216879276260248e-06, + "loss": 2.0832, + "step": 499180 + }, + { + "epoch": 1.9297289356898766, + "grad_norm": 0.15535661578178406, + "learning_rate": 8.116469268345971e-06, + "loss": 2.0835, + "step": 499190 + }, + { + "epoch": 1.9297675928932598, + "grad_norm": 0.16374972462654114, + "learning_rate": 8.016064321802797e-06, + "loss": 2.0684, + "step": 499200 + }, + { + "epoch": 1.929806250096643, + "grad_norm": 0.1522805392742157, + "learning_rate": 7.915664435865332e-06, + "loss": 2.0914, + "step": 499210 + }, + { + "epoch": 1.9298449073000263, + "grad_norm": 0.1582413613796234, + "learning_rate": 7.815269609768417e-06, + "loss": 2.0801, + "step": 499220 + }, + { + "epoch": 1.9298835645034096, + "grad_norm": 0.15402774512767792, + "learning_rate": 7.714879842747102e-06, + "loss": 2.0744, + "step": 499230 + }, + { + "epoch": 1.9299222217067928, + "grad_norm": 0.15144217014312744, + "learning_rate": 7.61449513403667e-06, + "loss": 2.078, + "step": 499240 + }, + { + "epoch": 1.929960878910176, + "grad_norm": 0.14982645213603973, + "learning_rate": 7.514115482872397e-06, + "loss": 2.0867, + "step": 499250 + }, + { + "epoch": 1.9299995361135593, + "grad_norm": 0.15028296411037445, + "learning_rate": 7.41374088849045e-06, + "loss": 2.084, + "step": 499260 + }, + { + "epoch": 1.9300381933169426, + "grad_norm": 0.16110238432884216, + "learning_rate": 7.313371350126108e-06, + "loss": 2.0796, + "step": 499270 + }, + { + "epoch": 1.9300768505203258, + "grad_norm": 0.15005861222743988, + "learning_rate": 7.213006867015981e-06, + "loss": 2.0954, + "step": 499280 + }, + { + "epoch": 1.930115507723709, + "grad_norm": 0.1562095582485199, + "learning_rate": 7.112647438395792e-06, + "loss": 2.0881, + "step": 499290 + }, + { + "epoch": 1.9301541649270924, + "grad_norm": 0.15073002874851227, + "learning_rate": 7.012293063502151e-06, + "loss": 2.0852, + "step": 499300 + }, + { + "epoch": 1.9301928221304758, + "grad_norm": 0.15659964084625244, + "learning_rate": 6.911943741571669e-06, + "loss": 2.0964, + "step": 499310 + }, + { + "epoch": 1.930231479333859, + "grad_norm": 0.15236833691596985, + "learning_rate": 6.811599471841179e-06, + "loss": 2.0645, + "step": 499320 + }, + { + "epoch": 1.9302701365372423, + "grad_norm": 0.15685366094112396, + "learning_rate": 6.711260253547735e-06, + "loss": 2.0672, + "step": 499330 + }, + { + "epoch": 1.9303087937406256, + "grad_norm": 0.15089979767799377, + "learning_rate": 6.610926085928392e-06, + "loss": 2.0858, + "step": 499340 + }, + { + "epoch": 1.930347450944009, + "grad_norm": 0.15144990384578705, + "learning_rate": 6.51059696822065e-06, + "loss": 2.0766, + "step": 499350 + }, + { + "epoch": 1.9303861081473923, + "grad_norm": 0.14963634312152863, + "learning_rate": 6.4102728996620065e-06, + "loss": 2.0737, + "step": 499360 + }, + { + "epoch": 1.9304247653507756, + "grad_norm": 0.15470971167087555, + "learning_rate": 6.309953879490404e-06, + "loss": 2.085, + "step": 499370 + }, + { + "epoch": 1.9304634225541588, + "grad_norm": 0.1591530442237854, + "learning_rate": 6.2096399069435646e-06, + "loss": 2.0996, + "step": 499380 + }, + { + "epoch": 1.930502079757542, + "grad_norm": 0.15408135950565338, + "learning_rate": 6.1093309812596534e-06, + "loss": 2.0892, + "step": 499390 + }, + { + "epoch": 1.9305407369609253, + "grad_norm": 0.15867233276367188, + "learning_rate": 6.009027101677278e-06, + "loss": 2.0822, + "step": 499400 + }, + { + "epoch": 1.9305793941643086, + "grad_norm": 0.15423956513404846, + "learning_rate": 5.908728267434604e-06, + "loss": 2.0953, + "step": 499410 + }, + { + "epoch": 1.9306180513676918, + "grad_norm": 0.16282016038894653, + "learning_rate": 5.808434477770685e-06, + "loss": 2.0738, + "step": 499420 + }, + { + "epoch": 1.930656708571075, + "grad_norm": 0.14597930014133453, + "learning_rate": 5.708145731924352e-06, + "loss": 2.0794, + "step": 499430 + }, + { + "epoch": 1.9306953657744583, + "grad_norm": 0.15457600355148315, + "learning_rate": 5.607862029134436e-06, + "loss": 2.0842, + "step": 499440 + }, + { + "epoch": 1.9307340229778416, + "grad_norm": 0.15878304839134216, + "learning_rate": 5.507583368640656e-06, + "loss": 2.0962, + "step": 499450 + }, + { + "epoch": 1.9307726801812248, + "grad_norm": 1.0090843439102173, + "learning_rate": 5.407309749682288e-06, + "loss": 2.0813, + "step": 499460 + }, + { + "epoch": 1.930811337384608, + "grad_norm": 0.15068727731704712, + "learning_rate": 5.3070411714990495e-06, + "loss": 2.0946, + "step": 499470 + }, + { + "epoch": 1.9308499945879916, + "grad_norm": 0.1503390669822693, + "learning_rate": 5.206777633330662e-06, + "loss": 2.1053, + "step": 499480 + }, + { + "epoch": 1.9308886517913748, + "grad_norm": 0.14822939038276672, + "learning_rate": 5.106519134417509e-06, + "loss": 2.0816, + "step": 499490 + }, + { + "epoch": 1.930927308994758, + "grad_norm": 0.15106622874736786, + "learning_rate": 5.006265673999755e-06, + "loss": 2.0756, + "step": 499500 + }, + { + "epoch": 1.9309659661981413, + "grad_norm": 0.15554742515087128, + "learning_rate": 4.906017251317563e-06, + "loss": 2.0865, + "step": 499510 + }, + { + "epoch": 1.9310046234015248, + "grad_norm": 0.14966271817684174, + "learning_rate": 4.805773865611984e-06, + "loss": 2.0888, + "step": 499520 + }, + { + "epoch": 1.931043280604908, + "grad_norm": 0.1527647078037262, + "learning_rate": 4.705535516123405e-06, + "loss": 2.0945, + "step": 499530 + }, + { + "epoch": 1.9310819378082913, + "grad_norm": 0.15576697885990143, + "learning_rate": 4.605302202093098e-06, + "loss": 2.0841, + "step": 499540 + }, + { + "epoch": 1.9311205950116745, + "grad_norm": 0.14974485337734222, + "learning_rate": 4.505073922762337e-06, + "loss": 2.0797, + "step": 499550 + }, + { + "epoch": 1.9311592522150578, + "grad_norm": 0.14997202157974243, + "learning_rate": 4.404850677372396e-06, + "loss": 2.0903, + "step": 499560 + }, + { + "epoch": 1.931197909418441, + "grad_norm": 0.15646487474441528, + "learning_rate": 4.3046324651649925e-06, + "loss": 2.0843, + "step": 499570 + }, + { + "epoch": 1.9312365666218243, + "grad_norm": 0.15585726499557495, + "learning_rate": 4.2044192853818445e-06, + "loss": 2.0945, + "step": 499580 + }, + { + "epoch": 1.9312752238252076, + "grad_norm": 0.1496593952178955, + "learning_rate": 4.104211137264891e-06, + "loss": 2.0853, + "step": 499590 + }, + { + "epoch": 1.9313138810285908, + "grad_norm": 0.15382583439350128, + "learning_rate": 4.004008020056071e-06, + "loss": 2.0881, + "step": 499600 + }, + { + "epoch": 1.931352538231974, + "grad_norm": 0.14952684938907623, + "learning_rate": 3.903809932998215e-06, + "loss": 2.1048, + "step": 499610 + }, + { + "epoch": 1.9313911954353573, + "grad_norm": 0.15660254657268524, + "learning_rate": 3.803616875333704e-06, + "loss": 2.0571, + "step": 499620 + }, + { + "epoch": 1.9314298526387406, + "grad_norm": 0.15601907670497894, + "learning_rate": 3.7034288463049236e-06, + "loss": 2.0852, + "step": 499630 + }, + { + "epoch": 1.9314685098421238, + "grad_norm": 0.15216785669326782, + "learning_rate": 3.6032458451551454e-06, + "loss": 2.092, + "step": 499640 + }, + { + "epoch": 1.9315071670455073, + "grad_norm": 0.15855903923511505, + "learning_rate": 3.5030678711274187e-06, + "loss": 2.0718, + "step": 499650 + }, + { + "epoch": 1.9315458242488905, + "grad_norm": 0.15953026711940765, + "learning_rate": 3.4028949234650164e-06, + "loss": 2.0856, + "step": 499660 + }, + { + "epoch": 1.9315844814522738, + "grad_norm": 0.1549243927001953, + "learning_rate": 3.302727001411432e-06, + "loss": 2.0979, + "step": 499670 + }, + { + "epoch": 1.931623138655657, + "grad_norm": 0.15357156097888947, + "learning_rate": 3.2025641042103817e-06, + "loss": 2.0899, + "step": 499680 + }, + { + "epoch": 1.9316617958590405, + "grad_norm": 0.15665321052074432, + "learning_rate": 3.1024062311058033e-06, + "loss": 2.0889, + "step": 499690 + }, + { + "epoch": 1.9317004530624238, + "grad_norm": 0.15755918622016907, + "learning_rate": 3.0022533813414134e-06, + "loss": 2.0704, + "step": 499700 + }, + { + "epoch": 1.931739110265807, + "grad_norm": 0.1516823172569275, + "learning_rate": 2.9021055541618156e-06, + "loss": 2.089, + "step": 499710 + }, + { + "epoch": 1.9317777674691903, + "grad_norm": 0.1494276374578476, + "learning_rate": 2.801962748811393e-06, + "loss": 2.095, + "step": 499720 + }, + { + "epoch": 1.9318164246725735, + "grad_norm": 0.1531931608915329, + "learning_rate": 2.7018249645347493e-06, + "loss": 2.0815, + "step": 499730 + }, + { + "epoch": 1.9318550818759568, + "grad_norm": 0.1523408740758896, + "learning_rate": 2.601692200576711e-06, + "loss": 2.0761, + "step": 499740 + }, + { + "epoch": 1.93189373907934, + "grad_norm": 0.14666548371315002, + "learning_rate": 2.5015644561821038e-06, + "loss": 2.0641, + "step": 499750 + }, + { + "epoch": 1.9319323962827233, + "grad_norm": 0.15638861060142517, + "learning_rate": 2.401441730596421e-06, + "loss": 2.0839, + "step": 499760 + }, + { + "epoch": 1.9319710534861065, + "grad_norm": 0.15035462379455566, + "learning_rate": 2.3013240230647102e-06, + "loss": 2.0846, + "step": 499770 + }, + { + "epoch": 1.9320097106894898, + "grad_norm": 0.15132653713226318, + "learning_rate": 2.2012113328329085e-06, + "loss": 2.0848, + "step": 499780 + }, + { + "epoch": 1.932048367892873, + "grad_norm": 0.1532151997089386, + "learning_rate": 2.1011036591465082e-06, + "loss": 2.0807, + "step": 499790 + }, + { + "epoch": 1.9320870250962563, + "grad_norm": 0.27533096075057983, + "learning_rate": 2.0010010012516676e-06, + "loss": 2.0879, + "step": 499800 + }, + { + "epoch": 1.9321256822996398, + "grad_norm": 0.15347447991371155, + "learning_rate": 1.9009033583945456e-06, + "loss": 2.0861, + "step": 499810 + }, + { + "epoch": 1.932164339503023, + "grad_norm": 0.15102313458919525, + "learning_rate": 1.8008107298210784e-06, + "loss": 2.0871, + "step": 499820 + }, + { + "epoch": 1.9322029967064063, + "grad_norm": 0.1492982655763626, + "learning_rate": 1.700723114778313e-06, + "loss": 2.0826, + "step": 499830 + }, + { + "epoch": 1.9322416539097895, + "grad_norm": 0.14892347157001495, + "learning_rate": 1.60064051251263e-06, + "loss": 2.086, + "step": 499840 + }, + { + "epoch": 1.9322803111131728, + "grad_norm": 0.14762091636657715, + "learning_rate": 1.5005629222708539e-06, + "loss": 2.0986, + "step": 499850 + }, + { + "epoch": 1.9323189683165563, + "grad_norm": 0.1522284299135208, + "learning_rate": 1.4004903433004756e-06, + "loss": 2.0931, + "step": 499860 + }, + { + "epoch": 1.9323576255199395, + "grad_norm": 0.14866903424263, + "learning_rate": 1.30042277484832e-06, + "loss": 2.0816, + "step": 499870 + }, + { + "epoch": 1.9323962827233228, + "grad_norm": 0.148747518658638, + "learning_rate": 1.2003602161620997e-06, + "loss": 2.0782, + "step": 499880 + }, + { + "epoch": 1.932434939926706, + "grad_norm": 0.15060488879680634, + "learning_rate": 1.1003026664895277e-06, + "loss": 2.069, + "step": 499890 + }, + { + "epoch": 1.9324735971300893, + "grad_norm": 0.14562411606311798, + "learning_rate": 1.000250125078095e-06, + "loss": 2.0864, + "step": 499900 + }, + { + "epoch": 1.9325122543334725, + "grad_norm": 0.15331986546516418, + "learning_rate": 9.002025911764023e-07, + "loss": 2.077, + "step": 499910 + }, + { + "epoch": 1.9325509115368558, + "grad_norm": 0.14756985008716583, + "learning_rate": 8.001600640319406e-07, + "loss": 2.0719, + "step": 499920 + }, + { + "epoch": 1.932589568740239, + "grad_norm": 0.1471695899963379, + "learning_rate": 7.001225428937552e-07, + "loss": 2.0673, + "step": 499930 + }, + { + "epoch": 1.9326282259436223, + "grad_norm": 0.15674185752868652, + "learning_rate": 6.00090027010225e-07, + "loss": 2.0643, + "step": 499940 + }, + { + "epoch": 1.9326668831470055, + "grad_norm": 0.14780041575431824, + "learning_rate": 5.000625156297289e-07, + "loss": 2.0786, + "step": 499950 + }, + { + "epoch": 1.9327055403503888, + "grad_norm": 0.1498531550168991, + "learning_rate": 4.0004000800197835e-07, + "loss": 2.0835, + "step": 499960 + }, + { + "epoch": 1.932744197553772, + "grad_norm": 0.14761725068092346, + "learning_rate": 3.0002250337557434e-07, + "loss": 2.0839, + "step": 499970 + }, + { + "epoch": 1.9327828547571555, + "grad_norm": 0.1464381068944931, + "learning_rate": 2.000100010000061e-07, + "loss": 2.0842, + "step": 499980 + }, + { + "epoch": 1.9328215119605388, + "grad_norm": 0.1600688099861145, + "learning_rate": 1.0000250012498491e-07, + "loss": 2.0756, + "step": 499990 + }, + { + "epoch": 1.932860169163922, + "grad_norm": 0.1499907523393631, + "learning_rate": 0.0, + "loss": 2.109, + "step": 500000 + } + ], + "logging_steps": 10, + "max_steps": 500000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2721816869275194e+21, + "train_batch_size": 13, + "trial_name": null, + "trial_params": null +}